# Compare gene ontology from real results to those from shuffles

In [1]:
import os

# Root of the PyGEST data tree. A PYGEST_DATA environment variable, when set, takes
# precedence, so the notebook can run on machines that lack this exact mount point.
PYGEST_DATA = os.environ.get("PYGEST_DATA", "/home/mike/mnt/tendril_ge_data")


In [2]:
""" One of these kids is doing his own thing. This file stands alone, and was never split.
    It was also calculated using Leon French's software instead of ours, but has been formatted to match ours.
"""

import os


# Path to the single result from the "batch-full" run (algo-leon, shuf-none) under PYGEST_DATA.
_whole_run_dir = (
    "derivatives/sub-all_hem-A_samp-glasser_prob-fornito/"
    "parby-wellid_splby-wellid_batch-full/tgt-max_algo-leon_shuf-none"
)
_whole_run_file = "sub-all_comp-hcpniftismoothconnsim_mask-none_norm-srs_adj-none.tsv"
whole_result = os.path.join(PYGEST_DATA, _whole_run_dir, _whole_run_file)


## Collect results for real and shuffled data within each split-quarter

In [3]:
""" Collect lists of files. """

import glob
import pandas as pd
from datetime import datetime
from erminej import dataframe_from_erminej_results, p_real_v_shuffles


def dictify_result(split, file_override=None, verbose=True):
    """ From a single result file, collect information about the result and its gene ontology.

    :param split: numeric split id; fills the 'batch-train00{split}' path component and labels shuffle columns
    :param file_override: path to a real (unshuffled) result .tsv, used instead of the path built from split
    :param bool verbose: print progress and timing to stdout
    :return: dict with keys 'split', 'real_file' and 'real_go' (each a path, or None if not on disk),
             'description', and for each shuffle type in agno/dist/be04: '<shuf>_files' and '<shuf>_gos'
             (sorted lists of paths) plus, only when the real GO file exists, '<shuf>_pvals'
             (a DataFrame indexed by GO 'ID' with the real p-values in column 'p').
    """

    shuffle_types = ["agno", "dist", "be04", ]

    start_time = datetime.now()
    if verbose:
        print("Seeking files for split {} at {}...".format(split, start_time))

    # Determine which results are available on disk...

    if file_override is None:
        real_file = os.path.join(
            PYGEST_DATA, "derivatives", "sub-all_hem-A_samp-glasser_prob-fornito",
            "parby-wellid_splby-wellid_batch-train00{}".format(split), "tgt-max_algo-smrt_shuf-none",
            "sub-all_comp-hcpniftismoothconnsim_mask-none_norm-srs_adj-none.tsv"
        )
    else:
        real_file = file_override
    real_go_file = real_file.replace(".tsv", ".ejgo_roc")
    split_data = {
        'split': split,
        'real_file': real_file if os.path.isfile(real_file) else None,
        'real_go': real_go_file if os.path.isfile(real_go_file) else None,
    }
    for shuf in shuffle_types:
        glob_pattern = real_file.replace("shuf-none", "shuf-{}".format(shuf)).replace(".tsv", "_seed-*.tsv")
        # glob returns matches in arbitrary order; sorting keeps the lists, and the
        # position-based column labels built from them below, reproducible between runs.
        split_data[shuf + "_files"] = sorted(glob.glob(glob_pattern))
        split_data[shuf + "_gos"] = sorted(glob.glob(glob_pattern.replace(".tsv", ".ejgo_roc")))

    # Generate a string to describe these data without back-calculating and counting

    split_data['description'] = "Split {} :  {}/{} real, {}/{} agno-, {}/{} dist-, {}/{} be04".format(
        split,
        "-" if split_data['real_file'] is None else "+",
        "-" if split_data['real_go'] is None else "+",
        len(split_data["agno_files"]), len(split_data["agno_gos"]),
        len(split_data["dist_files"]), len(split_data["dist_gos"]),
        len(split_data["be04_files"]), len(split_data["be04_gos"]),
    )
    if verbose:
        print("        " + split_data['description'] + " @ " + str(datetime.now()))

    # Load and parse GO data for each result.
    # Without a real GO file there is nothing to compare against, so no '<shuf>_pvals' keys are created.

    if split_data["real_go"] is not None:
        real_p = dataframe_from_erminej_results(split_data["real_go"])
        real_p = real_p.set_index("ID").sort_index()[["Pval", ]].rename(columns={"Pval": "p"})
        for shuffle_type in shuffle_types:
            shuffled_go_files = split_data[shuffle_type + "_gos"]
            if len(shuffled_go_files) > 0:
                if verbose:
                    print("        {:,} {} p-values @ {}".format(len(shuffled_go_files), shuffle_type, datetime.now()))
                # Gather the real column and one column per shuffled result, then concatenate once;
                # re-concatenating inside the loop would copy the growing frame on every pass.
                p_columns = [real_p.copy(), ]
                for position, go_file in enumerate(shuffled_go_files, start=1):
                    # The trailing number is the file's position in the sorted list,
                    # not the seed embedded in its file name.
                    column_name = "p_{}_{:03}_{:03}".format(shuffle_type, split_data['split'], position)
                    shuffled_df = dataframe_from_erminej_results(go_file)
                    p_columns.append(
                        shuffled_df.set_index("ID").sort_index()[["Pval", ]].rename(columns={"Pval": column_name})
                    )
                all_p = pd.concat(p_columns, axis="columns")
                # p_real_v_shuffles is applied to each GO term's row; its per-row result is appended as extra columns.
                shuffle_summary = all_p.apply(p_real_v_shuffles, axis="columns")
                split_data[shuffle_type + "_pvals"] = pd.concat([all_p, shuffle_summary], axis="columns")
            else:
                split_data[shuffle_type + "_pvals"] = real_p.copy()

    end_time = datetime.now()
    if verbose:
        print("        finished split {} at {}".format(split, end_time))
        print("        ...took {}".format(end_time - start_time))

    return split_data


In [4]:
# Split 100 is read from the stand-alone whole-data file; every other split is located from its id.
results = {100: dictify_result(100, file_override=whole_result)}
results.update({
    split_id: dictify_result(split_id)
    for split_id in [*range(200, 216), *range(401, 433)]
})

print("Found {:,} real results.".format(len(results)))


Seeking files for split 100 at 2020-07-28 21:05:33.912653...
        Split 100 :  +/+ real, 0/0 agno-, 0/0 dist-, 0/0 be04 @ 2020-07-28 21:05:33.987827
        finished split 100 at 2020-07-28 21:05:34.524996
        ...took 0:00:00.612343
Seeking files for split 200 at 2020-07-28 21:05:34.525449...
        Split 200 :  +/+ real, 24/24 agno-, 24/24 dist-, 24/24 be04 @ 2020-07-28 21:05:35.560100
        24 agno p-values @ 2020-07-28 21:05:36.051360
        24 dist p-values @ 2020-07-28 21:05:49.845403
        24 be04 p-values @ 2020-07-28 21:06:04.266912
        finished split 200 at 2020-07-28 21:06:18.165608
        ...took 0:00:43.640159
Seeking files for split 201 at 2020-07-28 21:06:18.168040...
        Split 201 :  +/+ real, 24/24 agno-, 24/24 dist-, 24/24 be04 @ 2020-07-28 21:06:18.880318
        24 agno p-values @ 2020-07-28 21:06:19.288555
        24 dist p-values @ 2020-07-28 21:06:33.078626
        24 be04 p-values @ 2020-07-28 21:06:48.413509
        finished split 201 at 20

In [14]:
""" Just to save our work at a cut-off point... """

import pickle


# A context manager guarantees the file is flushed and closed once the dump finishes.
with open("results_0005-0128.dict.pickle", "wb") as pickle_file:
    pickle.dump(results, pickle_file)


## Compare results

At this point, p-values are calculated within each of 32 split-quarters. This allows the real samples to be compared fairly to shuffles arising from them and not others. To attain an overall p-value, the 'hits' (GO terms scoring better in shuffled data than real) can be summed across splits and divided by the overall number of GO runs.

In [6]:
""" Generate a collection of just the reals.
    We ignore all other p-values, numerators, and denominators, because they are all relative to shuffles.
"""

real_lists = {"whole": [], "halves": [], "quarters": [], }
real_dfs = {}


def append_p_to_list(which_seed, which_list):
    """ Append the real 'p' column of results[which_seed], named str(which_seed), to real_lists[which_list]. """
    # Only column 'p' is kept, taken here from the 'agno_pvals' frame after ordering it by index.
    real_p = results[which_seed]["agno_pvals"].sort_index()["p"].rename(str(which_seed))
    real_lists[which_list].append(real_p)


# Which split ids feed each group of real results.
split_groups = {"whole": [100, ], "halves": range(200, 216), "quarters": range(401, 433), }
for group_name, split_ids in split_groups.items():
    for split_id in split_ids:
        append_p_to_list(split_id, group_name)

# One frame per group: a column per split, plus the row-wise mean across those columns.
for group_name, p_columns in real_lists.items():
    group_df = pd.concat(p_columns, axis="columns")
    group_df["mean_p"] = group_df.mean(axis="columns")
    real_dfs[group_name] = group_df


In [7]:
real_dfs["whole"]

Unnamed: 0_level_0,100,mean_p
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
GO:0000002,0.493540,0.493540
GO:0000012,0.826676,0.826676
GO:0000014,0.708340,0.708340
GO:0000018,0.021567,0.021567
GO:0000019,0.861522,0.861522
...,...,...
GO:2001267,0.948086,0.948086
GO:2001268,0.788256,0.788256
GO:2001269,0.925895,0.925895
GO:2001279,0.942997,0.942997


In [8]:
real_dfs["halves"]

Unnamed: 0_level_0,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,mean_p
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
GO:0000002,0.426472,0.525824,0.557235,0.425244,0.484430,0.548144,0.407009,0.578040,0.566694,0.204187,0.315019,0.447450,0.456899,0.582128,0.782902,0.798155,0.506614
GO:0000012,0.137470,0.167648,0.212906,0.056494,0.101690,0.051293,0.031359,0.072861,0.020189,0.026894,0.130897,0.083597,0.078572,0.188559,0.119008,0.043397,0.095177
GO:0000014,0.148879,0.041214,0.038255,0.047466,0.078432,0.352514,0.091059,0.022613,0.072983,0.141581,0.163877,0.296699,0.091287,0.115044,0.069358,0.244744,0.126000
GO:0000018,0.954870,0.968621,0.982382,0.871303,0.872366,0.945333,0.981527,0.958989,0.967652,0.992874,0.873960,0.800205,0.979756,0.987981,0.864125,0.649788,0.915733
GO:0000019,0.388126,0.205988,0.186071,0.109009,0.137671,0.133497,0.139647,0.186244,0.102475,0.125812,0.039463,0.129979,0.338853,0.152291,0.027387,0.200932,0.162715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:2001267,0.060681,0.536475,0.204365,0.026858,0.005230,0.041583,0.154601,0.099778,0.149451,0.153995,0.030128,0.077814,0.144063,0.192104,0.020709,0.116377,0.125888
GO:2001268,0.183283,0.379632,0.157952,0.091663,0.136729,0.105869,0.594944,0.134524,0.202028,0.179653,0.153510,0.125499,0.245909,0.173057,0.186879,0.419252,0.216899
GO:2001269,0.102431,0.643718,0.402319,0.084189,0.007965,0.111558,0.066619,0.222946,0.254807,0.285264,0.056701,0.183259,0.209048,0.361743,0.028506,0.086272,0.194209
GO:2001279,0.228373,0.124898,0.014034,0.148803,0.106991,0.196296,0.184882,0.594821,0.135905,0.260718,0.308199,0.104555,0.035191,0.101317,0.026493,0.085748,0.166076


In [9]:
real_dfs["quarters"]

Unnamed: 0_level_0,401,402,403,404,405,406,407,408,409,410,...,424,425,426,427,428,429,430,431,432,mean_p
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GO:0000002,0.650164,0.778572,0.682236,0.355619,0.487600,0.485390,0.437101,0.279645,0.786145,0.437347,...,0.437803,0.417947,0.489368,0.621389,0.202824,0.882904,0.482624,0.838063,0.777999,0.572731
GO:0000012,0.280707,0.446418,0.313941,0.018879,0.068577,0.276094,0.354092,0.034158,0.057409,0.155026,...,0.168265,0.509913,0.324575,0.008251,0.157074,0.096429,0.006348,0.188142,0.017685,0.155338
GO:0000014,0.059665,0.501610,0.484538,0.234712,0.173035,0.710818,0.285314,0.255786,0.130110,0.119055,...,0.191600,0.471220,0.313750,0.075363,0.374092,0.348028,0.067393,0.312428,0.339480,0.249680
GO:0000018,0.971475,0.999768,0.837348,0.973329,0.907089,0.997311,0.996148,0.945024,0.992988,0.928897,...,0.975437,0.958373,0.812120,0.863500,0.907555,0.847776,0.948560,0.963762,0.890923,0.932271
GO:0000019,0.406523,0.524989,0.188068,0.220548,0.116489,0.522158,0.327928,0.521643,0.143707,0.235217,...,0.248529,0.120071,0.111565,0.356479,0.063639,0.101362,0.256823,0.023885,0.317493,0.270939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:2001267,0.169865,0.161704,0.249664,0.078732,0.038389,0.111839,0.258595,0.063402,0.195555,0.606040,...,0.081290,0.041218,0.102166,0.037290,0.309801,0.120157,0.073656,0.171760,0.082325,0.137643
GO:2001268,0.205591,0.466505,0.584356,0.065713,0.172479,0.470297,0.476559,0.074506,0.320005,0.842404,...,0.463535,0.280805,0.085071,0.046387,0.365606,0.211858,0.133335,0.376156,0.406276,0.278033
GO:2001269,0.287340,0.113924,0.147799,0.276898,0.065964,0.066491,0.215357,0.215239,0.234217,0.318127,...,0.041391,0.038393,0.306196,0.177146,0.359339,0.194258,0.165870,0.167294,0.054455,0.179921
GO:2001279,0.409854,0.150353,0.048254,0.303245,0.205718,0.112967,0.502402,0.185998,0.012258,0.587117,...,0.203599,0.385333,0.750753,0.063843,0.504772,0.001325,0.257831,0.641981,0.040834,0.280943


In [10]:
""" Turn those mean p-values into ranks and sort by them. """

go_rank_dfs = []
for group_name, group_df in real_dfs.items():
    p_col = group_name + "_p"
    rank_col = group_name + "_rank"
    # Order by mean p so that position (1 = smallest mean p) becomes the rank...
    ranked = group_df[["mean_p"]].rename(columns={"mean_p": p_col}).sort_values(p_col, ascending=True)
    ranked[rank_col] = range(1, len(ranked) + 1)
    # ...then restore index order before the groups are placed side by side.
    go_rank_dfs.append(ranked.sort_index())

df_ranks = pd.concat(go_rank_dfs, axis="columns")

df_ranks


Unnamed: 0_level_0,whole_p,whole_rank,halves_p,halves_rank,quarters_p,quarters_rank
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GO:0000002,0.493540,4416,0.506614,5504,0.572731,6342
GO:0000012,0.826676,6889,0.095177,975,0.155338,1224
GO:0000014,0.708340,5944,0.126000,1304,0.249680,2280
GO:0000018,0.021567,457,0.915733,9192,0.932271,9418
GO:0000019,0.861522,7178,0.162715,1703,0.270939,2530
...,...,...,...,...,...,...
GO:2001267,0.948086,8173,0.125888,1303,0.137643,1023
GO:2001268,0.788256,6526,0.216899,2303,0.278033,2630
GO:2001269,0.925895,7873,0.194209,2049,0.179921,1479
GO:2001279,0.942997,8098,0.166076,1748,0.280943,2665


In [11]:
""" Compare rankings with Kendall tau """

from scipy.stats import kendalltau


def _rank_tau(first, second):
    """ Kendall tau result for the '<first>_rank' and '<second>_rank' columns of df_ranks. """
    return kendalltau(df_ranks[first + "_rank"], df_ranks[second + "_rank"])


# Each pairing is computed in both directions so the two tau values can be checked against each other.
whole_to_halves_r, whole_to_halves_p = _rank_tau("whole", "halves")
halves_to_whole_r, halves_to_whole_p = _rank_tau("halves", "whole")
whole_to_quarters_r, whole_to_quarters_p = _rank_tau("whole", "quarters")
quarters_to_whole_r, quarters_to_whole_p = _rank_tau("quarters", "whole")
halves_to_quarters_r, halves_to_quarters_p = _rank_tau("halves", "quarters")
quarters_to_halves_r, quarters_to_halves_p = _rank_tau("quarters", "halves")

_tau_report = "    ktau = {:0.3f} (p={:0.5f}), should match {:0.3f}"
for heading, tau, tau_p, mirrored_tau in [
    ("Whole (re-rank every iteration) vs halves (smart-re-ranking):",
     whole_to_halves_r, whole_to_halves_p, halves_to_whole_r),
    ("Whole (re-rank every iteration) vs quarters (smart-re-ranking):",
     whole_to_quarters_r, whole_to_quarters_p, quarters_to_whole_r),
    ("Halves (smart-re-ranking) vs quarters (smart-re-ranking):",
     halves_to_quarters_r, halves_to_quarters_p, quarters_to_halves_r),
]:
    print(heading)
    print(_tau_report.format(tau, tau_p, mirrored_tau))


Whole (re-rank every iteration) vs halves (smart-re-ranking):
    ktau = -0.527 (p=0.00000), should match -0.527
Whole (re-rank every iteration) vs quarters (smart-re-ranking):
    ktau = -0.581 (p=0.00000), should match -0.581
Halves (smart-re-ranking) vs quarters (smart-re-ranking):
    ktau = 0.833 (p=0.00000), should match 0.833


In [12]:
""" The same thing, put another way... """

df_ranks.sort_values("whole_rank", ascending=True).iloc[:10, :]


Unnamed: 0_level_0,whole_p,whole_rank,halves_p,halves_rank,quarters_p,quarters_rank
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GO:0032543,8.48e-10,1,0.675388,7190,0.790706,8590
GO:0070125,2.715e-08,2,0.438105,4763,0.674079,7506
GO:0072599,3.456e-08,3,0.074557,739,0.206378,1762
GO:0015934,1.387e-07,4,0.125759,1300,0.284717,2717
GO:0006414,1.404e-07,5,0.478827,5210,0.678307,7554
GO:0000313,1.43e-07,6,0.532754,5766,0.713923,7926
GO:0005761,1.43e-07,7,0.532754,5765,0.713923,7927
GO:0006415,1.441e-07,8,0.397797,4320,0.644477,7172
GO:0045047,2.5e-07,9,0.053079,494,0.178765,1467
GO:0070126,4.892e-07,10,0.3691,4030,0.610301,6774


In [13]:
df_ranks.sort_values("halves_rank", ascending=True).iloc[:10, :]


Unnamed: 0_level_0,whole_p,whole_rank,halves_p,halves_rank,quarters_p,quarters_rank
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GO:0061687,0.999184,9380,2.8e-05,1,0.000414,2
GO:0010273,0.998262,9305,5.3e-05,2,0.00075,4
GO:1990169,0.998262,9306,5.3e-05,3,0.00075,3
GO:0097501,0.9957,9180,0.00018,4,0.001303,5
GO:0017153,0.999338,9389,0.000256,5,0.000333,1
GO:0032036,0.996879,9227,0.000538,6,0.011788,28
GO:0102991,0.996175,9201,0.000563,7,0.002375,7
GO:0019886,0.995617,9174,0.000564,8,0.004287,8
GO:0002495,0.994903,9149,0.000669,9,0.005222,12
GO:0002504,0.994903,9148,0.000669,10,0.005222,11


In [152]:
# Print every index label of df_ranks that contains the substring "726".
matching_ids = [go_id for go_id in df_ranks.index if "726" in go_id]
for go_id in matching_ids:
    print(go_id)
        

GO:0000726
GO:0002726
GO:0004726
GO:0005726
GO:0007260
GO:0007263
GO:0007266
GO:0007269
GO:0035726
GO:0042726
GO:0061726
GO:0072643
GO:0072655
GO:0072656
GO:0072662
GO:0072663
GO:0072665
GO:0072666
GO:0072669
GO:0072673
GO:0072674
GO:0072675
GO:0072676
GO:0072677
GO:0072678
GO:0072683
GO:0072686
GO:0072687
GO:0072698
GO:0097264
GO:1903726
GO:1990726
GO:2000726


## Comparison to Fulcher, et al. 2020

The most reported GO term in the Fulcher survey is

    - "GO:0007268", 'chemical synaptic transmission', #1 (15 citations).         NOT included in our GO because it has 415 genes, multifunc rank 0.92

Additional terms mentioned in results are:

    - "GO:0071805", 'potassium ion transmembrane transport', #2 (10 citations),  NOT included in our GO because it has 157 genes, multifunc rank 0.78
    - "GO:0007267", 'cell-cell signaling', #3 (8 citations),                     NOT included in our GO because it has 1079 genes, multifunc rank 0.93
    - "GO:0007611", 'learning or memory', #27 (5 citations),                     NOT included in our GO because it has 258 genes, multifunc rank 0.98
    - "GO:0022900", 'electron transport chain', #47 (4 citations),               NOT included in our GO because it has 168 genes, multifunc rank 0.90
   
### Most importantly, ordered by CFPR vs human SBPSpatial:

    - "GO:0006614" SRP-dependent cotranslational protein targeting to membrane (SBPSpatial=0.36, SBPRandom=0.25), 95 genes, multifunc rank 0.27
    - "GO:0045047" protein targeting to ER (SBPSpatial=0.36, SBPRandom=0.25), 108 genes, multifunc rank 0.43
    - "GO:0019882" antigen processing and presentation (SBPSpatial=0.36, SBPRandom=0.16), 214 genes, multifunc rank 0.93
    - "GO:0006613" cotranslational protein targeting to membrane (SBPSpatial=0.36, SBPRandom=0.25), 99 genes, multifunc rank 0.26
    - "GO:0072599" establishment of protein localization to endoplasmic reticulum (SBPSpatial=0.36, SBPRandom=0.25), 112 genes, multifunc rank 0.51
    - "GO:0006612" protein targeting to membrane (SBPSpatial=0.35, SBPRandom=0.23), 157 genes, multifunc rank 0.58
    

In [155]:
""" Where do false positive terms rank in our data?
"""

cited_go_list = [
    "GO:0007268",
    "GO:0071805",
    "GO:0007267",
    "GO:0007611",
    "GO:0022900",
]
cfpr_go_list = [
    "GO:0006614",
    "GO:0045047",
    "GO:0019882",
    "GO:0006613",
    "GO:0072599",
    "GO:0006612",
]

# Keep list order, but drop ids that are absent from df_ranks so the .loc lookup cannot fail on them.
present_terms = [term for term in cited_go_list + cfpr_go_list if term in df_ranks.index]
df_ranks.loc[present_terms, :]



Unnamed: 0_level_0,whole_p,whole_rank,halves_p,halves_rank,quarters_p,quarters_rank
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GO:0071805,0.9197752,7806,0.756087,7976,0.573858,6354
GO:0006614,2.346e-06,13,0.04182,389,0.156844,1235
GO:0045047,2.5e-07,9,0.053079,494,0.178765,1467
GO:0006613,9.009e-07,11,0.044534,409,0.163454,1302
GO:0072599,3.456e-08,3,0.074557,739,0.206378,1762


In [53]:
# NOTE(review): `finals` is not assigned in any cell visible in this notebook, so this cell
# fails on a fresh kernel. Judging by the printed keys it is presumably keyed by
# (shuffle_type, split) — confirm where it is built.
for k, v in finals.items():
    print(k)

# 'none'-shuffles aren't necessary. Every dataframe in the dict has real values embedded as column 'p'. The shuffled values follow.

('agno', 401)
('dist', 401)
('be04', 401)
('agno', 402)
('dist', 402)
('be04', 402)
('agno', 403)
('dist', 403)
('be04', 403)
('agno', 404)
('dist', 404)
('be04', 404)
('agno', 405)
('dist', 405)
('be04', 405)
('agno', 406)
('dist', 406)
('be04', 406)
('agno', 407)
('dist', 407)
('be04', 407)
('agno', 408)
('dist', 408)
('be04', 408)
('agno', 409)
('dist', 409)
('be04', 409)
('agno', 410)
('dist', 410)
('be04', 410)
('agno', 411)
('dist', 411)
('be04', 411)
('agno', 412)
('dist', 412)
('be04', 412)
('agno', 413)
('dist', 413)
('be04', 413)
('agno', 414)
('dist', 414)
('be04', 414)
('agno', 415)
('dist', 415)
('be04', 415)
('agno', 416)
('dist', 416)
('be04', 416)
('agno', 417)
('dist', 417)
('be04', 417)
('agno', 418)
('dist', 418)
('be04', 418)
('agno', 419)
('dist', 419)
('be04', 419)
('agno', 420)
('dist', 420)
('be04', 420)
('agno', 421)
('dist', 421)
('be04', 421)
('agno', 422)
('dist', 422)
('be04', 422)
('agno', 423)
('dist', 423)
('be04', 423)
('agno', 424)
('dist', 424)
('be04

In [54]:
finals[("agno", 402)]


Unnamed: 0_level_0,p,p_agno_402_001,p_agno_402_002,p_agno_402_003,p_agno_402_004,p_agno_402_005,p_agno_402_006,p_agno_402_007,p_agno_402_008,p_agno_402_009,...,p_agno_402_044,p_agno_402_045,p_agno_402_046,p_agno_402_047,p_agno_402_048,p_agno_402_049,p_agno_402_050,numerator,denominator,new_p
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GO:0000002,0.778572,0.678843,0.790110,0.727893,0.697876,0.619297,0.912719,0.269952,0.728341,0.687306,...,0.445831,0.177739,0.522926,0.745311,0.798751,0.779445,0.851763,28.0,50.0,0.56
GO:0000012,0.446418,0.498293,0.359762,0.580676,0.407942,0.590255,0.241997,0.903390,0.885351,0.219388,...,0.851329,0.015194,0.097858,0.278442,0.735112,0.647464,0.067179,27.0,50.0,0.54
GO:0000014,0.501610,0.315237,0.078376,0.865124,0.946506,0.491796,0.772408,0.175188,0.647627,0.716004,...,0.891390,0.726061,0.264759,0.524415,0.908828,0.599697,0.804659,11.0,50.0,0.22
GO:0000018,0.999768,0.894492,0.834463,0.657378,0.235051,0.076197,0.380962,0.251679,0.861483,0.999547,...,0.818055,0.288718,0.273273,0.215581,0.923326,0.884258,0.076997,50.0,50.0,1.00
GO:0000019,0.524989,0.718658,0.519326,0.785314,0.136136,0.063094,0.678658,0.229756,0.507046,0.990179,...,0.917345,0.307694,0.193163,0.606996,0.883004,0.168177,0.294530,25.0,50.0,0.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:2001267,0.161704,0.099953,0.724556,0.489703,0.427183,0.386299,0.538353,0.593881,0.410854,0.239835,...,0.400725,0.239874,0.161415,0.248205,0.758888,0.536228,0.653474,15.0,50.0,0.30
GO:2001268,0.466505,0.305522,0.673180,0.202000,0.445612,0.076359,0.630467,0.309427,0.131000,0.353146,...,0.752011,0.722005,0.333051,0.339867,0.358714,0.079179,0.693305,31.0,50.0,0.62
GO:2001269,0.113924,0.105974,0.656680,0.741546,0.450167,0.789611,0.441282,0.762024,0.734040,0.272720,...,0.189284,0.083248,0.177814,0.294329,0.885425,0.899731,0.538597,9.0,50.0,0.18
GO:2001279,0.150353,0.615373,0.085671,0.199916,0.845675,0.060276,0.598427,0.488472,0.280917,0.514505,...,0.092894,0.095487,0.000944,0.255400,0.508903,0.602458,0.043127,19.0,50.0,0.38


In [55]:
finals[("dist", 402)]


Unnamed: 0_level_0,p,p_dist_402_001,p_dist_402_002,p_dist_402_003,p_dist_402_004,p_dist_402_005,p_dist_402_006,p_dist_402_007,p_dist_402_008,p_dist_402_009,...,p_dist_402_044,p_dist_402_045,p_dist_402_046,p_dist_402_047,p_dist_402_048,p_dist_402_049,p_dist_402_050,numerator,denominator,new_p
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GO:0000002,0.778572,0.895196,0.677928,0.863467,0.939987,0.767387,0.689215,0.669868,0.976278,0.943602,...,0.603504,0.778658,0.875594,0.601757,0.647465,0.846804,0.455428,22.0,50.0,0.44
GO:0000012,0.446418,0.767701,0.651579,0.099236,0.467788,0.601467,0.494807,0.089971,0.149771,0.609001,...,0.527613,0.593224,0.347142,0.455102,0.240097,0.154203,0.500872,24.0,50.0,0.48
GO:0000014,0.501610,0.426702,0.054009,0.712638,0.538953,0.756689,0.554764,0.216462,0.251688,0.762048,...,0.622990,0.139457,0.339341,0.407885,0.147154,0.361578,0.370375,23.0,50.0,0.46
GO:0000018,0.999768,0.443075,0.998301,0.781827,0.678155,0.976013,0.674489,0.992313,0.999655,0.816317,...,0.993926,0.926921,0.808601,0.894167,0.937026,0.990157,0.919199,49.0,50.0,0.98
GO:0000019,0.524989,0.269724,0.199243,0.267735,0.110566,0.370383,0.168719,0.173122,0.313970,0.113417,...,0.944084,0.232939,0.214246,0.120849,0.378588,0.802707,0.694650,35.0,50.0,0.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:2001267,0.161704,0.113960,0.209017,0.248993,0.191849,0.451485,0.344187,0.048537,0.210253,0.649702,...,0.258113,0.519818,0.311734,0.239912,0.110097,0.124713,0.133256,19.0,50.0,0.38
GO:2001268,0.466505,0.524655,0.551395,0.666485,0.632729,0.509177,0.540135,0.362505,0.216819,0.313494,...,0.492784,0.726473,0.732104,0.679221,0.572934,0.669981,0.330767,22.0,50.0,0.44
GO:2001269,0.113924,0.054592,0.124914,0.110101,0.081969,0.430092,0.274270,0.031314,0.344443,0.814109,...,0.205216,0.334477,0.127292,0.100027,0.041671,0.032210,0.140815,19.0,50.0,0.38
GO:2001279,0.150353,0.109588,0.006044,0.226967,0.128131,0.320048,0.343113,0.356970,0.317583,0.023850,...,0.025280,0.437167,0.216651,0.156663,0.277785,0.904869,0.313872,15.0,50.0,0.30


In [19]:
hit_sums = pd.DataFrame()

# Copy each frame's numerator and denominator into columns labelled by shuffle type and split,
# and report how many rows of that frame have new_p below 0.05.
for (shuffle_type, split_id), pvals in finals.items():
    hit_sums["{}_num_{:03}".format(shuffle_type, split_id)] = pvals["numerator"]
    hit_sums["{}_den_{:03}".format(shuffle_type, split_id)] = pvals["denominator"]
    significant = pvals[pvals["new_p"] < 0.05]
    print("Split {}, {}-shuffled: {} significants".format(split_id, shuffle_type, len(significant)))


Split 402, agno-shuffled: 723 significants
Split 402, dist-shuffled: 506 significants
Split 402, be04-shuffled: 2626 significants
Split 405, agno-shuffled: 809 significants
Split 405, dist-shuffled: 408 significants
Split 405, be04-shuffled: 2596 significants
Split 406, agno-shuffled: 613 significants
Split 406, dist-shuffled: 427 significants
Split 406, be04-shuffled: 2433 significants
Split 410, agno-shuffled: 697 significants
Split 410, dist-shuffled: 357 significants
Split 410, be04-shuffled: 2524 significants
Split 411, agno-shuffled: 583 significants
Split 411, dist-shuffled: 466 significants
Split 411, be04-shuffled: 2813 significants
Split 412, agno-shuffled: 669 significants
Split 412, dist-shuffled: 491 significants
Split 412, be04-shuffled: 3443 significants
Split 415, agno-shuffled: 616 significants
Split 415, dist-shuffled: 430 significants
Split 415, be04-shuffled: 2511 significants
Split 416, agno-shuffled: 728 significants
Split 416, dist-shuffled: 632 significants
Spli

In [31]:
for shuffle_term in ("agno", "dist", "be04"):
    numerator_col = shuffle_term + "_numerator"
    denominator_col = shuffle_term + "_denominator"
    overall_col = shuffle_term + "_p_overall"

    # Sum the per-split '<shuffle>_num_*' and '<shuffle>_den_*' columns row-wise, then take their ratio.
    per_split_num = [col for col in hit_sums.columns if shuffle_term + "_num_" in col]
    hit_sums[numerator_col] = hit_sums[per_split_num].sum(axis="columns")
    per_split_den = [col for col in hit_sums.columns if shuffle_term + "_den_" in col]
    hit_sums[denominator_col] = hit_sums[per_split_den].sum(axis="columns")
    hit_sums[overall_col] = hit_sums[numerator_col] / hit_sums[denominator_col]

    display_df = hit_sums[[numerator_col, denominator_col, overall_col, ]]
    significant = display_df[display_df[overall_col] < 0.05]
    print("For {}-, {} terms are p<0.05".format(shuffle_term, len(significant.index)))
    print(display_df.sort_values(overall_col, ascending=True).iloc[0:10, :])


For agno-, 67 terms are p<0.05
            agno_numerator  agno_denominator  agno_p_overall
ID                                                          
GO:0017153             3.0             650.0        0.004615
GO:0008637             4.0             650.0        0.006154
GO:0036473             4.0             650.0        0.006154
GO:0071565             5.0             650.0        0.007692
GO:0097501             5.0             650.0        0.007692
GO:0061687             7.0             650.0        0.010769
GO:0050750             7.0             650.0        0.010769
GO:0071564             7.0             650.0        0.010769
GO:0036109             7.0             650.0        0.010769
GO:0102991            10.0             650.0        0.015385
For dist-, 0 terms are p<0.05
            dist_numerator  dist_denominator  dist_p_overall
ID                                                          
GO:1902950            42.0             630.0        0.066667
GO:0017153            42

In [24]:
hit_sums[[col for col in hit_sums.columns if "agno_num_" in col]].sum(axis="columns")

ID
GO:0000002    254.0
GO:0000012    155.0
GO:0000014    105.0
GO:0000018    517.0
GO:0000019    223.0
              ...  
GO:2001267    120.0
GO:2001268    249.0
GO:2001269    111.0
GO:2001279    312.0
GO:2001280    348.0
Length: 9544, dtype: float64

## Create tables of original complete GO rankings

and write them out with original whac-a-probe and GO results.

In [None]:
# NOTE(review): `whole_go_file` is not assigned in any cell visible in this notebook — confirm where it comes from.
whole_go = dataframe_from_erminej_results(whole_go_file)

# Write out the terms whose CorrectedPvalue is below 0.05, ordered by raw Pval.
whole_go[whole_go['CorrectedPvalue'] < 0.05][["ID", "Name", "Pval", "CorrectedPvalue"]].sort_values("Pval").to_csv(
    "/home/mike/Dropbox/Projects/GE-Conn/gene_ontology/leons_java/java_glasser1280_whole_results.significant.csv"
)

# Rank every term by ascending Pval (1 = smallest), then keep only the 'rank' column, ordered by ID.
# NOTE(review): 'Pval' is dropped on the third line below, yet the following cell selects
# whole_go_ranks[['Pval', 'rank']] — confirm which form is intended.
whole_go_ranks = whole_go.sort_values("Pval").set_index("ID")
whole_go_ranks['rank'] = range(1, len(whole_go_ranks) + 1)
whole_go_ranks = whole_go_ranks.sort_index()[['rank']]
whole_go_ranks.to_csv("/home/mike/Dropbox/Projects/GE-Conn/gene_ontology/leons_java/java_glasser1280_whole_results.ranks.csv")


and compare them to split-halves and split-quarters

In [78]:
whole_go_ranks[['Pval', 'rank']]

Unnamed: 0_level_0,Pval,rank
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
GO:0032543,8.480000e-10,1
GO:0070125,2.715000e-08,2
GO:0072599,3.456000e-08,3
GO:0015934,1.387000e-07,4
GO:0006414,1.404000e-07,5
...,...,...
GO:0050906,1.000000e+00,9540
GO:0004984,1.000000e+00,9541
GO:0050911,1.000000e+00,9542
GO:0009593,1.000000e+00,9543


In [None]:
from scipy.stats import kendalltau


kendalltau(whole_go_ranks.index

## Scratch

In [None]:
# All three GO result files sit under the same train00422 directory;
# only the shuffle folder and the file-name suffix differ.
_split_422_dir = os.path.join(
    PYGEST_DATA, "derivatives", "sub-all_hem-A_samp-glasser_prob-fornito",
    "parby-wellid_splby-wellid_batch-train00422"
)
_go_file_stem = "sub-all_comp-hcpniftismoothconnsim_mask-none_norm-srs_adj-none"

test_real_df = dataframe_from_erminej_results(
    os.path.join(_split_422_dir, "tgt-max_algo-smrt_shuf-none", _go_file_stem + ".ejgo_roc")
)
test_shuf_df_a = dataframe_from_erminej_results(
    os.path.join(_split_422_dir, "tgt-max_algo-smrt_shuf-agno", _go_file_stem + "_seed-00002.ejgo_roc")
)
test_shuf_df_b = dataframe_from_erminej_results(
    os.path.join(_split_422_dir, "tgt-max_algo-smrt_shuf-agno", _go_file_stem + "_seed-00012.ejgo_roc")
)


In [None]:
test_real_df.sort_values("Pval", ascending=True)


In [None]:
test_real_df.sort_values("Pval", ascending=True)["ID"]

In [None]:
test_shuf_df_a.sort_values("Pval", ascending=True)["ID"]

In [None]:
test_real_df.sort_values("ID", ascending=True).index


In [None]:
test_shuf_df_a.sort_values("Pval", ascending=True).sort_values("ID")["ID"]

In [None]:
test_real_df[["ID", "Pval", "CorrectedPvalue"]].loc[4332]


In [None]:
test_shuf_df_b.set_index("ID").sort_index()[["Pval", ]].rename(columns={"Pval": "p_012"})

In [None]:
test_real_df.set_index("ID").sort_index()[["Pval", ]].rename(columns={"Pval": "p_real"})

In [None]:
import pandas as pd

# Real p-values plus the two shuffled examples, each indexed by GO ID, side by side.
r = test_real_df.set_index("ID").sort_index()[["Pval", ]].rename(columns={"Pval": "p"})
sa = test_shuf_df_a.set_index("ID").sort_index()[["Pval", ]].rename(columns={"Pval": "p_agno_002"})
# test_shuf_df_b is read from the seed-00012 file, so its column is labelled 012 (it was mislabelled 010).
sb = test_shuf_df_b.set_index("ID").sort_index()[["Pval", ]].rename(columns={"Pval": "p_agno_012"})
df_all = pd.concat([r, sa, sb, ], axis='columns')

df_all


In [None]:
row = df_all.loc["GO:2001280", :]
[row["p"] > row[p] for p in [col for col in df_all.columns if "p_agno" in col]]

In [None]:
sum([False, True])

In [None]:
def new_p(row, prefix="p_agno"):
    """ Count how many shuffled p-values are lower than the real p-value in one row.

    :param row: Series with the real p-value under "p" and shuffled p-values under labels containing prefix
    :param str prefix: substring that identifies the shuffled p-value labels; defaults to "p_agno"
    :return: Series indexed by "numerator" (number of shuffled p-values lower than the real one),
             "denominator" (number of shuffled columns found), and "new_p" (numerator / denominator,
             or NaN when no shuffled columns are present)
    """
    shuffled_columns = [col for col in row.index if prefix in col]
    numerator = int(sum(row["p"] > row[col] for col in shuffled_columns))
    denominator = len(shuffled_columns)
    # With no shuffled columns there is no ratio to report; return NaN rather than dividing by zero.
    ratio = numerator / denominator if denominator > 0 else float("nan")
    return pd.Series([numerator, denominator, ratio], index=["numerator", "denominator", "new_p", ])

df_p = df_all.apply(new_p, axis="columns")
new_df = pd.concat([df_all, df_p], axis="columns")
new_df


In [None]:
df_all.iloc[:4, :].apply(lambda row: print([i for i in row.index]), axis="columns")
