# Distance results

One way of correcting for distance is to optimize the relationship between expression and distance, which reveals the genes that support that relationship, and then to remove those genes from contention in the final analyses.

In [1]:
""" Specify and read files containing log-proximity results. """

import os
import pandas as pd
from pygest.convenience import get_ranks_from_file


# Path information to locate files of interest
base_path = "/home/mike/ge_data/derivatives/sub-all_hem-A_samp-glasser_prob-fornito"
split_path = "parby-wellid_splby-wellid_batch-train00{}"
algo_dir = "tgt-max_algo-smrt"
filename = "sub-all_comp-glasserwellidslogproximity_mask-{}_norm-srs_adj-none.tsv"

# Record and load files
masks = ['none', '16', '32', '64', ]
results = []
missing_files = []  # expected result files that were not found on disk
for mask in masks:
    for split_seed in range(401, 433):
        full_path = os.path.join(base_path, split_path.format(split_seed), algo_dir, filename.format(mask))
        rank_col_name = "s{}rank".format(split_seed)
        if not os.path.isfile(full_path):
            # Carry on without this split, but remember the path so that a
            # short count in the summary below can be traced to specific files.
            missing_files.append(full_path)
            continue
        one_df = get_ranks_from_file(full_path, column_name=rank_col_name)
        results.append({
            'tsv': full_path,
            'mask': "00" if mask == "none" else mask,  # the 'none' mask is recorded as "00"
            'split': split_seed,
            'df': one_df,
            'ranks': one_df[rank_col_name].astype(int),
        })

# Name the columns explicitly: if no files were found, `results` is empty and
# an unnamed frame would have no 'mask' column, making the groupby below raise
# KeyError instead of simply reporting zero results.
df = pd.DataFrame(results, columns=['tsv', 'mask', 'split', 'df', 'ranks'])

# We should have a result for each split-mask combination, 4 masks * 32 splits = 128 files
# (any shortfall is listed in `missing_files`)
df.groupby('mask')['tsv'].count()


mask
00    32
16    32
32    32
64    32
Name: tsv, dtype: int64

In [2]:
""" Average ranks over splits for each probe_id. """

from pygest.rawdata.miscellaneous import map_pid_to_eid_fornito


# One table per mask value: a column of ranks for every split, plus the
# across-split mean rank and an entrez_id looked up from the row index.
rankings_by_mask = {}
for mask in df['mask'].unique():
    # Rows of `df` belonging to this mask, ordered by ascending split seed
    mask_rows = df[df['mask'] == mask].sort_values('split', ascending=True)
    rank_series = list(mask_rows['ranks'])

    # Each Series is loaded as a row; transposing turns each one into a column
    sdf = pd.DataFrame(data=rank_series).T

    # Row-wise mean, taken while the frame still holds only the rank columns
    sdf['mean_rank'] = sdf.mean(axis='columns')
    sdf['entrez_id'] = sdf.index.map(map_pid_to_eid_fornito)

    # Columns in sorted name order, rows from lowest to highest mean rank
    column_order = sorted(sdf.columns)
    sdf = sdf[column_order].sort_values('mean_rank', ascending=True)
    rankings_by_mask[mask] = sdf


In [3]:
""" Save results to disk. """

# Write one tab-separated file per mask into the "rankings" directory,
# creating that directory first if it does not exist yet.
output_dir = "rankings"
os.makedirs(output_dir, exist_ok=True)

for mask, final_result in rankings_by_mask.items():
    output_name = "log_proximity_mask-{}.tsv".format(mask)
    final_result.to_csv(os.path.join(output_dir, output_name), sep="\t")
    