# Compare some SRS-normalized results to our AHBA-direct results.

In [1]:
""" Set up the context and globals. """

# Result files were copied to this directory, so no need for a path.
srs_file = "sub-all_comp-hcpniftismoothgrandmeansim_mask-16_norm-none_adj-none.tsv"
raw_file = "sub-all_comp-hcpniftismoothgrandmeansim_mask-16_norm-srs_adj-none.tsv"

In [2]:
""" Load the files. """

import pandas as pd

srs_df = pd.read_csv(srs_file, sep="\t")
srs_df.rename(columns={'Unnamed: 0': 'whacked_at'}, inplace=True)
raw_df = pd.read_csv(raw_file, sep="\t")
raw_df.rename(columns={'Unnamed: 0': 'whacked_at'}, inplace=True)


In [19]:
""" Perhaps if we drop the bottom half, 3/4, 7/8, ...
    we can remove the noise and measure just the top genes. """

from scipy.stats import kendalltau

for threshold in range(8):
    """ Use sequentially smaller portions of the tops of the ranked lists, calculating Kendall tau and overlap each time. """
    t = int(len(raw_df) / 2**threshold)
    raw_ranks = raw_df[['whacked_at', 'probe_id']][:t].set_index('probe_id').sort_index()['whacked_at']
    srs_ranks = srs_df[['whacked_at', 'probe_id']][:t].set_index('probe_id').sort_index()['whacked_at']
    tau, p = kendalltau(raw_ranks, srs_ranks)
    
    overlap = len(set(raw_ranks.index).intersection(set(srs_ranks.index))) / len(raw_ranks.index)
    # print("{} raw, {} srs, {} intersection".format(
    #     len(set(raw_ranks.index)), len(set(srs_ranks.index)),
    #     len(set(raw_ranks.index).intersection(set(srs_ranks.index)))
    # ))
    print("Top {}. tau = {:0.4f}, p = {:0.4f}; {:0.2%} overlap".format(t, tau, p, overlap))

Top 15745. tau = 0.8646, p = 0.0000; 100.00% overlap
Top 7872. tau = 0.0353, p = 0.0000; 94.92% overlap
Top 3936. tau = 0.0240, p = 0.0239; 91.23% overlap
Top 1968. tau = 0.0145, p = 0.3346; 84.81% overlap
Top 984. tau = 0.0828, p = 0.0001; 85.47% overlap
Top 492. tau = 0.0335, p = 0.2661; 84.35% overlap
Top 246. tau = 0.1326, p = 0.0019; 78.05% overlap
Top 123. tau = 0.1713, p = 0.0050; 79.67% overlap


In [4]:
""" K. Maybe not. What about the average rank of the top 100 genes vs the bottom 100? """

import numpy as np

top_100_srs = srs_df['probe_id'][0:100]
mid_100_srs = srs_df['probe_id'][int(len(srs_df)/2) - 50:int(len(srs_df)/2) + 50]
bot_100_srs = srs_df['probe_id'][-100:]
print("The mean raw ranking for the top 100 SRS genes: {}; mid 100: {}; bottom 100: {}.".format(
    np.mean(raw_df[raw_df['probe_id'].isin(top_100_srs)].index),
    np.mean(raw_df[raw_df['probe_id'].isin(mid_100_srs)].index),
    np.mean(raw_df[raw_df['probe_id'].isin(bot_100_srs)].index),
))

top_100_raw = raw_df['probe_id'][0:100]
mid_100_raw = raw_df['probe_id'][int(len(raw_df)/2) - 50:int(len(raw_df)/2) + 50]
bot_100_raw = raw_df['probe_id'][-100:]
print("The mean SRS ranking for the top 100 raw genes: {}; mid 100: {}; bottom 100: {}.".format(
    np.mean(srs_df[srs_df['probe_id'].isin(top_100_raw)].index),
    np.mean(srs_df[srs_df['probe_id'].isin(mid_100_raw)].index),
    np.mean(srs_df[srs_df['probe_id'].isin(bot_100_raw)].index),
))


The mean raw ranking for the top 100 SRS genes: 98.32; mid 100: 7627.56; bottom 100: 15606.63.
The mean SRS ranking for the top 100 raw genes: 207.17; mid 100: 8012.04; bottom 100: 15674.95.
