# Distance results

One way to correct for distance is to optimize the relationship between expression and distance, which reveals the genes supporting that relationship, and then to remove those genes from contention in the final analyses.

In [1]:
""" Specify and read files containing log-proximity results. """

import os
import pandas as pd
from pygest.convenience import get_ranks_from_file


# Path information to locate files of interest.
# NOTE(review): base_path is an absolute, machine-specific location; point it at
# your own derivatives directory before running.
base_path = "/home/mike/ge_data/derivatives/sub-all_hem-A_samp-glasser_prob-fornito"
split_path = "parby-wellid_splby-wellid_batch-train00{}"
algo_dir = "tgt-max_algo-smrt"
filename = "sub-all_comp-{}_mask-{}_norm-srs_adj-none.tsv"

# Record and load files: one result per (mask, comp, split) whose tsv exists on disk.
masks = ['none', '16', '32', '64', ]
results = []
for mask in masks:
    for comp in ["hcpniftismoothconnsim", "glasserwellidslogproximity", ]:
        for split_seed in range(401, 433):
            full_path = os.path.join(
                base_path, split_path.format(split_seed), algo_dir,
                filename.format(comp, mask)
            )
            rank_col_name = "s{}rank".format(split_seed)
            # Missing files are skipped without complaint; the pivot table at the
            # end of this cell is what exposes any gaps.
            if os.path.isfile(full_path):
                one_df = get_ranks_from_file(full_path, column_name=rank_col_name)
                result = {
                    'tsv': full_path,
                    'comp': comp,
                    # The unmasked case is labelled "00" so it reads like the other masks.
                    'mask': "00" if mask == "none" else mask,
                    'split': split_seed,
                    'df': one_df,
                    'ranks': one_df[rank_col_name].astype(int),
                }
                results.append(result)

df = pd.DataFrame(results)

# We should have a result for each comp-mask-split combination:
# 2 comps * 4 masks * 32 splits = 256 files, i.e. 32 in every cell of the table below.
# Only the hashable 'split' and 'tsv' columns are counted. The 'df' and 'ranks'
# columns hold pandas objects, which nunique cannot hash; relying on pandas to drop
# them implicitly is fragile across pandas versions, so the values are named here.
pd.pivot_table(data=df, values=['split', 'tsv'], index='comp', columns=['mask', ], aggfunc=pd.Series.nunique)


Unnamed: 0_level_0,split,split,split,split,tsv,tsv,tsv,tsv
mask,00,16,32,64,00,16,32,64
comp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
glasserwellidslogproximity,32,32,32,32,32,32,32,32
hcpniftismoothconnsim,32,32,32,32,32,32,32,32


In [2]:
""" Average ranks over splits for each probe_id. """

from pygest.rawdata.miscellaneous import map_pid_to_eid_fornito


# One table of ranks per (mask, comp) pair, with the per-row mean across splits.
rankings_by_mask = {}
for mask in df['mask'].unique():
    mask_rows = df['mask'] == mask
    for comp in df['comp'].unique():
        # Pick this combination's runs, ordered by split seed.
        selected = df.loc[mask_rows & (df['comp'] == comp)].sort_values('split', ascending=True)
        # Each stored rank Series becomes a row; transposing gives one column per split.
        sdf = pd.DataFrame(data=list(selected['ranks'])).T
        sdf['mean_rank'] = sdf.mean(axis='columns')
        sdf['entrez_id'] = sdf.index.map(map_pid_to_eid_fornito)
        # Alphabetical column order, rows sorted by ascending mean rank.
        ordered_columns = sorted(sdf.columns)
        sdf = sdf[ordered_columns].sort_values('mean_rank', ascending=True)
        rankings_by_mask[(mask, comp)] = sdf
        n_rows, n_cols = sdf.shape
        print("Created [{} x {}] dataframe for mask == {} and comp == {}".format(n_rows, n_cols, mask, comp))


Created [15745 x 34] dataframe for mask == 00 and comp == hcpniftismoothconnsim
Created [15745 x 34] dataframe for mask == 00 and comp == glasserwellidslogproximity
Created [15745 x 34] dataframe for mask == 16 and comp == hcpniftismoothconnsim
Created [15745 x 34] dataframe for mask == 16 and comp == glasserwellidslogproximity
Created [15745 x 34] dataframe for mask == 32 and comp == hcpniftismoothconnsim
Created [15745 x 34] dataframe for mask == 32 and comp == glasserwellidslogproximity
Created [15745 x 34] dataframe for mask == 64 and comp == hcpniftismoothconnsim
Created [15745 x 34] dataframe for mask == 64 and comp == glasserwellidslogproximity


In [3]:
""" Save results to disk. """

os.makedirs("rankings", exist_ok=True)

# Output filename template for each known comparison; anything else falls back
# to a conspicuous junk name.
templates_by_comp = {
    "glasserwellidslogproximity": "logproximity_mask-{}.tsv",
    "hcpniftismoothconnsim": "hcpconnsim_mask-{}.tsv",
}

for (mask, comp), final_result in rankings_by_mask.items():
    filename = templates_by_comp.get(comp, "BADFILE_{}.junk")
    # Keep every column whose name contains "rank", indexed by entrez_id.
    rank_columns = [col for col in final_result.columns if "rank" in col]
    out_path = os.path.join("rankings", filename.format(mask))
    final_result.set_index("entrez_id")[rank_columns].to_csv(out_path, sep="\t")
