In [46]:
import pandas as pd
import re
import time
import numpy as np

# Set up mapping (Entrez <-> Ensembl <-> UniProt <-> eggNOG)

In [2]:
e2u_file = '../ppi_ml/annotations/uniprot/EntrezToUniprot_clean.csv'
u2og_file = '../ppi_ml/data/og_proteomes/nog_mapping/human.euNOG.diamond.mapping.2759'

In [3]:
u2og = pd.read_csv(u2og_file, sep='\t')
u2og['uniprot_id'] = u2og['ProteinID'].str.extract(r'(?<=\|)(.*)(?=\|)')    
u2og.head()

Unnamed: 0,ProteinID,ID,uniprot_id
0,tr|A0A024R1R8|A0A024R1R8_HUMAN,KOG4766,A0A024R1R8
1,sp|A0A024RBG1|NUD4B_HUMAN,KOG2839,A0A024RBG1
2,tr|A0A024RCN7|A0A024RCN7_HUMAN,ENOG502R0UR,A0A024RCN7
3,tr|A0A075B6H5|A0A075B6H5_HUMAN,ENOG502SA48,A0A075B6H5
4,sp|A0A075B6H7|KV37_HUMAN,ENOG502S3KF,A0A075B6H7


In [4]:
og_dict = dict()
og_issues = []
og_dupes = []
for i in range(len(u2og)):
    up = u2og['uniprot_id'][i]
    og = u2og['ID'][i]
    if up not in og_dict.keys():
        og_dict.update({up: og})
    else:
        if u2og['ID'][i] != og_dict[up]:
            og_issues.append(up)
            og_dict.pop(up)
        if up not in og_dupes:
            og_dupes.append(up)
print("# of total duplicated UniProt IDs = ", len(og_dupes))
print("# of UniProt IDs that map to multiple eggNOG IDs = ", len(set(og_issues)))

# of total duplicated UniProt IDs =  347
# of UniProt IDs that map to multiple eggNOG IDs =  347


In [5]:
e2u = pd.read_csv(e2u_file)
e2u['entrez_id'] = e2u['entrez_id'].astype(str).replace('\.0', '', regex=True)
e2u.dropna(inplace=True)
e2u.reset_index(drop=True, inplace=True)
e2u.head()

Unnamed: 0,gene_stable_id,gene_stable_id_version,entrez_id,uniprot_id,uniprot_sp_id
0,ENSG00000198888,ENSG00000198888.2,4535,P03886,P03886
1,ENSG00000198888,ENSG00000198888.2,4535,U5Z754,P03886
2,ENSG00000198763,ENSG00000198763.3,4536,P03891,P03891
3,ENSG00000198763,ENSG00000198763.3,4536,Q7GXY9,P03891
4,ENSG00000198763,ENSG00000198763.3,4536,A0A1X7RBG6,P03891


In [6]:
ensembl_dict = dict()
ensembl_issues = []
ensembl_dupes = []
for i in range(len(e2u)):
    up = e2u['uniprot_sp_id'][i]
    ens = e2u['gene_stable_id'][i]
    if ens in ensembl_issues:
        continue
    elif ens not in ensembl_dict.keys():
        ensembl_dict.update({ens: up})
    else:
        if e2u['uniprot_sp_id'][i] != ensembl_dict[ens]:
            ensembl_issues.append(ens)
            ensembl_dict.pop(ens)
        if ens not in ensembl_dupes:
            ensembl_dupes.append(ens)
print("# of total Ensembl IDs uniquely mapped to UniProt IDs = ", len(ensembl_dict))
print("# of total duplicated Ensembl IDs = ", len(ensembl_dupes))
print("# of Ensembl IDs that map to multiple UniProt IDs = ", len(set(ensembl_issues)))

# of total Ensembl IDs uniquely mapped to UniProt IDs =  21236
# of total duplicated Ensembl IDs =  15456
# of Ensembl IDs that map to multiple UniProt IDs =  41


In [21]:
entrez_dict = dict()
entrez_issues = []
entrez_dupes = []
for i in range(len(e2u)):
    up = e2u['uniprot_sp_id'][i]
    enz = e2u['entrez_id'][i]
    if enz in entrez_issues:
        continue
    elif enz not in entrez_dict.keys():
        entrez_dict.update({enz: up})
    else:
        if e2u['uniprot_sp_id'][i] != entrez_dict[enz]:
            entrez_issues.append(enz)
            entrez_dict.pop(enz)
        if enz not in entrez_dupes:
            entrez_dupes.append(enz)
print("# of total Entrez IDs uniquely mapped to UniProt IDs = ", len(entrez_dict))
print("# of total duplicated Entrez IDs = ", len(entrez_dupes))
print("# of Entrez IDs that map to multiple UniProt IDs = ", len(set(entrez_issues)))

# of total Entrez IDs uniquely mapped to UniProt IDs =  19105
# of total duplicated Entrez IDs =  14189
# of Entrez IDs that map to multiple UniProt IDs =  110


In [24]:
entrez_dict

{'4535': 'P03886',
 '4536': 'P03891',
 '4512': 'P00395',
 '4513': 'P00403',
 '4509': 'P03928',
 '4508': 'P00846',
 '4514': 'P00414',
 '4537': 'P03897',
 '4539': 'P03901',
 '4538': 'P03905',
 '4540': 'P03915',
 '4541': 'P03923',
 '4519': 'P00156',
 '55251': 'Q9NV79',
 '3804': 'P43628',
 '2832': 'P48146',
 '3809': 'P43632',
 '100132285': 'P43631',
 '124900573': 'P43631',
 '124900575': 'P43631',
 '3805': 'Q99706',
 '124900568': 'Q99706',
 '3803': 'P43627',
 '3808': 'Q14952',
 '4661': 'Q01538',
 '3811': 'P43629',
 '65123': 'Q68E01',
 '124900569': 'Q8N109',
 '2204': 'P24071',
 '3812': 'P43630',
 '553128': 'Q8NHK3',
 '57459': 'Q8WXI9',
 '80212': 'Q53HC0',
 '3806': 'Q14954',
 '3802': 'P43626',
 '11000': 'Q5K4L6',
 '7639': 'Q03923',
 '115653': 'Q8N743',
 '54474': 'P35900',
 '283189': 'Q8NGQ1',
 '83896': 'Q9BYR8',
 '83897': 'Q9BYR7',
 '85294': 'Q9BYR9',
 '25984': 'Q9C075',
 '83899': 'Q9BYQ4',
 '85280': 'Q9BYQ2',
 '83900': 'Q9BYQ3',
 '163227': 'Q8IYN0',
 '100533177': 'A8MX34',
 '100507608': 'A8M

# Map validation data sets to eggNOG IDs

In [91]:
from itertools import combinations
import pandas as pd
import re
import time
import numpy as np
from sklearn.metrics import log_loss

In [116]:
def map_ids(x, d):
    try:
        new_id = d[x]
        return(new_id)
    except:
        return(x)
    
def logloss(true_label, predicted, eps=1e-15):
    p = np.clip(predicted, eps, 1 - eps)
    if true_label == 1:
        return -np.log(p)
    else:
        return -np.log(1 - p)
    
def loglike(true_label, predicted, eps=1e-15):
    p = np.clip(predicted, eps, 1 - eps)
    if true_label == 1:
        return(np.log(p))
    else:
        return(np.log(1 - p))

In [81]:
# my data
leca_file = '../ppi_ml/results/ppi_predict/feature_sweep/100/scored_interactions_all_LinearSVC.csv'
leca_nogs_file = '../ppi_ml/annotations/lists/leca_nogs.txt'

# external data
xlms_file = '../ppi_ml/data/validation/bartolec_xlms_human.csv'
y2h_file = '../ppi_ml/data/validation/luck_HuRI_y2h_pairs.tsv'
cx_file = '../ppi_ml/data/validation/kim_hn3_coexp.tsv'

In [20]:
xlms = pd.read_csv(xlms_file)
xlms = xlms[xlms['URP Type'] == 'Inter']
xlms = xlms[['Protein1','Protein2','Total number of URPs','Total number of CSMs']]
xlms['ID1'] = [map_ids(i, og_dict) for i in xlms['Protein1']]
xlms['ID2'] = [map_ids(i, og_dict) for i in xlms['Protein2']]
xlms

Unnamed: 0,Protein1,Protein2,Total number of URPs,Total number of CSMs,ID1,ID2
1,Q9Y5J9,Q9Y5L4,2,4,KOG3489,KOG1733
2,Q9Y3U8,Q9Y676,1,1,KOG3452,KOG4021
3,Q9Y399,Q9Y3D9,1,2,KOG0832,ENOG502RZIC
4,Q9Y2Q9,Q9Y399,1,2,KOG4078,KOG0832
5,Q9Y2Q9,Q9Y3D9,2,3,KOG4078,ENOG502RZIC
...,...,...,...,...,...,...
2105,A1L167,P50914,1,1,KOG0897,KOG3421
2106,A0FGR8,Q9BSJ8,1,1,KOG1012,KOG1012
2107,A0A1B0GUW6,Q9UM21,1,1,ENOG502TDYW,ENOG502QPQJ
2108,A0A0B4J271,P13667,1,1,ENOG502SVXB,KOG0190


In [26]:
y2h = pd.read_csv(y2h_file, sep='\t', header=None, names=['ensembl_1','ensembl_2'])
y2h['ID1'] = [map_ids(map_ids(str(i), ensembl_dict), og_dict) for i in y2h['ensembl_1']]
y2h['ID2'] = [map_ids(map_ids(str(i), ensembl_dict), og_dict) for i in y2h['ensembl_2']]
y2h

Unnamed: 0,ensembl_1,ensembl_2,ID1,ID2
0,ENSG00000000005,ENSG00000061656,ENOG502QPTP,KOG2687
1,ENSG00000000005,ENSG00000099968,ENOG502QPTP,ENOG502R6AP
2,ENSG00000000005,ENSG00000104765,ENOG502QPTP,ENOG502R8Q5
3,ENSG00000000005,ENSG00000105383,ENOG502QPTP,ENOG502S41V
4,ENSG00000000005,ENSG00000114455,ENOG502QPTP,ENOG502S3IN
...,...,...,...,...
52543,ENSG00000273899,ENSG00000273899,KOG4709,KOG4709
52544,ENSG00000275302,ENSG00000278619,ENOG502S8M4,KOG0838
52545,ENSG00000275774,ENSG00000275774,KOG0118,KOG0118
52546,ENSG00000276070,ENSG00000278619,ENOG502S8M4,KOG0838


In [27]:
cx = pd.read_csv(cx_file, sep='\t', header=None, names=['entrez_1','entrez_2','score'])
cx['ID1'] = [map_ids(map_ids(str(i), entrez_dict), og_dict) for i in cx['entrez_1']]
cx['ID2'] = [map_ids(map_ids(str(i), entrez_dict), og_dict) for i in cx['entrez_2']]
cx

Unnamed: 0,entrez_1,entrez_2,score,ID1,ID2
0,6232,7178,4.575464,KOG1779,KOG1727
1,1915,7178,4.570955,KOG0052,KOG1727
2,6143,6167,4.564302,KOG1696,KOG3475
3,6147,6228,4.557484,KOG1751,KOG1749
4,6191,6228,4.554408,KOG0378,KOG1749
...,...,...,...,...,...
81059,55032,8195,1.585753,KOG2234,KOG0360
81060,10159,51765,1.585749,KOG4737,KOG0201
81061,10799,1798,1.585744,ENOG502QSAV,KOG2788
81062,4998,55635,1.585742,KOG1514,ENOG502QR00


In [96]:
leca_nogs = [line.strip() for line in open(leca_nogs_file, 'r')]
leca_nogs = set(leca_nogs)
len(leca_nogs)

10092

In [106]:
xlms_fs = [get_leca_pairs(i, j, leca_nogs) for i, j in zip(xlms['ID1'], xlms['ID2'])]
set(xlms_fs)

{frozenset({'KOG1569', 'KOG4607'}),
 frozenset({'KOG1723', 'KOG3434'}),
 frozenset({'KOG0327', 'KOG0401'}),
 frozenset({'KOG1245', 'KOG3048'}),
 frozenset({'KOG2013', 'KOG2014'}),
 frozenset({'KOG1769', 'KOG2013'}),
 frozenset({'KOG3833', 'KOG4380'}),
 frozenset({'KOG3366', 'KOG3976'}),
 frozenset({'KOG0318', 'KOG0676'}),
 frozenset({'KOG0118', 'KOG0127'}),
 frozenset({'KOG0101', 'KOG0676'}),
 frozenset({'KOG0319', 'KOG1539'}),
 frozenset({'KOG1697', 'KOG3291'}),
 frozenset({'KOG0052', 'KOG1628'}),
 frozenset({'KOG0446', 'KOG2480'}),
 frozenset({'KOG1628', 'KOG3273'}),
 frozenset({'KOG0845', 'KOG1835'}),
 frozenset({'KOG1494', 'KOG1997'}),
 frozenset({'KOG0105', 'KOG0106'}),
 frozenset({'KOG2291', 'KOG3593'}),
 frozenset({'ENOG502RRRX', 'KOG0488'}),
 frozenset({'KOG0019', 'KOG1144'}),
 frozenset({'KOG1708', 'KOG3331'}),
 frozenset({'KOG3168', 'KOG3172'}),
 frozenset({'KOG1753', 'KOG3411'}),
 frozenset({'KOG0829', 'KOG3184'}),
 frozenset({'KOG2074', 'KOG2179'}),
 frozenset({'KOG1465', '

In [117]:
#xlms_fs = [x for x in get_leca_pairs(i, j, leca_nogs) for i, j in zip(xlms['ID1'], xlms['ID2'])]
xlms_fs = [frozenset({i, j}) for i, j in zip(xlms['ID1'], xlms['ID2']) if i in leca_nogs and j in leca_nogs]
y2h_fs = [frozenset({i, j}) for i, j in zip(y2h['ID1'], y2h['ID2']) if i in leca_nogs and j in leca_nogs]
cx_fs = [frozenset({i, j}) for i, j in zip(cx['ID1'], cx['ID2']) if i in leca_nogs and j in leca_nogs]
print(f'# total XLMS pairs: {len(xlms_fs)}')
print(f'# total Y2H pairs: {len(y2h_fs)}')
print(f'# total CX pairs: {len(cx_fs)}')

# total XLMS pairs: 1587
# total Y2H pairs: 20568
# total CX pairs: 41650


In [202]:
leca = pd.read_csv(leca_file)
leca[['ID1', 'ID2']] = leca['ID'].str.split(' ', n=1, expand=True)
leca['fs'] = [frozenset({i, j}) for i, j in zip(leca['ID1'], leca['ID2'])]
leca_fs = [frozenset({i, j}) for i, j in zip(leca['ID1'], leca['ID2'])]

In [203]:
leca_gs = leca[(leca.set == 'train') | (leca.set == 'test')]
leca_fsgs = [frozenset({i, j}) for i, j in zip(leca_gs['ID1'], leca_gs['ID2'])]

In [204]:
xlms_true = set(leca_fs) & set(xlms_fs)
y2h_true = set(leca_fs) & set(y2h_fs)
cx_true = set(leca_fs) & set(cx_fs)
print(f'# XLMS overlap: {len(xlms_true)}')
print(f'# Y2H overlap: {len(y2h_true)}')
print(f'# CX overlap: {len(cx_true)}')

# XLMS overlap: 912
# Y2H overlap: 4610
# CX overlap: 17501


In [205]:
xlms_true_gs = set(leca_fsgs) & set(xlms_fs)
y2h_true_gs = set(leca_fsgs) & set(y2h_fs)
cx_true_gs = set(leca_fsgs) & set(cx_fs)
print(f'# XLMS overlap: {len(xlms_true)}')
print(f'# Y2H overlap: {len(y2h_true)}')
print(f'# CX overlap: {len(cx_true)}')

# XLMS overlap: 912
# Y2H overlap: 4610
# CX overlap: 17501


In [206]:
t0 = time.time()
leca['xlms'] = [1 if i in xlms_true else 0 for i in leca['fs']]
rt_xlms = round(time.time() - t0, 2)
print(f'XLMS match time: {rt_xlms} seconds')
t1 = time.time()
leca['y2h'] = [1 if i in y2h_true else 0 for i in leca['fs']]
rt_y2h = round(time.time() - t1, 2)
print(f'Y2H match time: {rt_y2h} seconds')
t2 = time.time()
leca['cx'] = [1 if i in cx_true else 0 for i in leca['fs']]
rt_cx = round(time.time() - t2, 2)
print(f'CX match time: {rt_cx} seconds')

XLMS match time: 1.72 seconds
Y2H match time: 1.82 seconds
CX match time: 2.01 seconds


In [207]:
leca['bin'] = np.divmod(np.arange(len(leca)),1000)[0]+1

## Calculate logloss per bin for the intersection of LECA PPIs and external sets

In [147]:
def calc_cumu_overlap(df, col):
    cumu_frac = 0
    cumu_lst = []
    df = df.sort_values('mean_ppi_score').reset_index(drop=True)
    for i in range(len(df)):
        cumu_frac += df[col][i]
        cumu_lst.append(cumu_frac)
    cumu_lst.reverse()
    return(cumu_lst)

def calc_binned_logloss(df, target, real_total=None):
    df = df[df[target] == 1]
    overlap = len(df)
    gb = df.groupby(['bin'])
    n = gb.size().to_frame(name='int_size')
    ll = gb.apply(lambda x: log_loss(x[target], x['ppi_score'], labels=[1,0])).to_frame(name='logloss')
    avg = gb.agg({'ppi_score':'mean'}).rename(columns={'ppi_score':'mean_ppi_score'})
    res = n.join(ll).join(avg)
    res['int_frac'] = res['int_size']/overlap
    res['int_cumu_overlap'] = calc_cumu_overlap(res, 'int_frac')
    if real_total:
        res['total_frac'] = res['int_size']/real_total
        res['total_cumu_overlap'] = calc_cumu_overlap(res, 'total_frac')
    res['set'] = target
    return(res)

In [148]:
xlms_ll = calc_binned_logloss(leca, 'xlms', real_total=len(xlms_fs))
y2h_ll = calc_binned_logloss(leca, 'y2h', real_total=len(y2h_fs))
cx_ll = calc_binned_logloss(leca, 'cx', real_total=len(cx_fs))

In [150]:
ll_out = pd.concat([xlms_ll, y2h_ll, cx_ll])
ll_out.to_csv('../ppi_ml/results/ext_val_logloss.csv')

In [151]:
ll_out

Unnamed: 0_level_0,int_size,logloss,mean_ppi_score,int_frac,int_cumu_overlap,total_frac,total_cumu_overlap,set
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,81,3.215782e-07,1.000000,0.088816,1.000000,0.051040,0.574669,xlms
2,51,8.321853e-06,0.999992,0.055921,0.911184,0.032136,0.523629,xlms
3,43,5.750111e-05,0.999943,0.047149,0.855263,0.027095,0.491493,xlms
4,36,1.993283e-04,0.999801,0.039474,0.808114,0.022684,0.464398,xlms
5,25,5.976943e-04,0.999402,0.027412,0.768640,0.015753,0.441714,xlms
...,...,...,...,...,...,...,...,...
4488,2,2.850650e+00,0.057807,0.000114,0.000800,0.000048,0.000336,cx
4489,4,2.860023e+00,0.057268,0.000229,0.000686,0.000096,0.000288,cx
4490,3,2.872887e+00,0.056536,0.000171,0.000457,0.000072,0.000192,cx
4491,2,2.890372e+00,0.055556,0.000114,0.000286,0.000048,0.000120,cx


# Calculate odds for LECA PPIs and external PPIs

In [190]:
def calc_binned_odds(df, target, overlap):
    
    # calc ext odds
    df_ext = df[df[target] == 1]
    gb = df_ext.groupby(['bin'])
    n_ext = gb.size().to_frame(name='int_size')
    n_ext['ext_odds'] = n_ext['int_size']/(overlap - n_ext['int_size'])

    # calc leca odds
    df_leca = df[(df.set == 'test') | (df.set == 'train')]
    gs_total = len(df_leca)
    gb = df_leca.groupby(['bin'])
    n_leca = gb.size().to_frame(name='gs_size')
    n_leca['leca_odds'] = n_leca['gs_size']/(gs_total - n_leca['gs_size'])
    
    # calc avg ppi score
    avg = gb.agg({'ppi_score':'mean'}).rename(columns={'ppi_score':'mean_ppi_score'})
    res = n_ext.join(n_leca).join(avg)
    
    # calc odds ratio
    res['odds_ratio'] = res['ext_odds']/res['leca_odds']
    res['set'] = target
    res['total_overlap'] = overlap
    print(res)
    return(res)

In [191]:
xlms_lo = calc_binned_odds(leca, 'xlms', overlap=len(xlms_true))
y2h_lo = calc_binned_odds(leca, 'y2h', overlap=len(y2h_true))
cx_lo = calc_binned_odds(leca, 'cx', overlap=len(cx_true))

      int_size  ext_odds  gs_size  leca_odds  mean_ppi_score  odds_ratio  \
bin                                                                        
1           81  0.097473       73   0.002761        1.000000   35.307898   
2           51  0.059233      108   0.004090        0.999992   14.483675   
3           43  0.049482      122   0.004622        0.999941   10.705182   
4           36  0.041096      141   0.005346        0.999781    7.687263   
5           25  0.028185      154   0.005842        0.999438    4.824741   
...        ...       ...      ...        ...             ...         ...   
4210         1  0.001098        2   0.000075        0.070133   14.552141   
4220         1  0.001098        4   0.000151        0.069941    7.275521   
4276         1  0.001098        2   0.000075        0.068760   14.552141   
4318         1  0.001098        5   0.000189        0.067763    5.820198   
4401         1  0.001098        2   0.000075        0.065252   14.552141   

       set 

In [192]:
lo_out = pd.concat([xlms_lo, y2h_lo, cx_lo])
lo_out.to_csv('../ppi_ml/results/ext_val_odds_ratio.csv')

# Calculate log odds ratio

In [265]:
from datetime import datetime as dt
from tqdm import tqdm
tqdm.pandas()

In [194]:
# total leca
leca = pd.read_csv(leca_file)
leca[['ID1', 'ID2']] = leca['ID'].str.split(' ', n=1, expand=True)
leca['fs'] = [frozenset({i, j}) for i, j in zip(leca['ID1'], leca['ID2'])]
leca_fs = [frozenset({i, j}) for i, j in zip(leca['ID1'], leca['ID2'])]

In [193]:
# total external
xlms_fs = [frozenset({i, j}) for i, j in zip(xlms['ID1'], xlms['ID2']) if i in leca_nogs and j in leca_nogs]
y2h_fs = [frozenset({i, j}) for i, j in zip(y2h['ID1'], y2h['ID2']) if i in leca_nogs and j in leca_nogs]
cx_fs = [frozenset({i, j}) for i, j in zip(cx['ID1'], cx['ID2']) if i in leca_nogs and j in leca_nogs]
print(f'# total XLMS pairs: {len(xlms_fs)}')
print(f'# total Y2H pairs: {len(y2h_fs)}')
print(f'# total CX pairs: {len(cx_fs)}')

# total XLMS pairs: 1587
# total Y2H pairs: 20568
# total CX pairs: 41650


In [195]:
# intersection of leca and external
xlms_int = set(leca_fs) & set(xlms_fs)
y2h_int = set(leca_fs) & set(y2h_fs)
cx_int = set(leca_fs) & set(cx_fs)
print(f'# XLMS overlap: {len(xlms_int)}')
print(f'# Y2H overlap: {len(y2h_int)}')
print(f'# CX overlap: {len(cx_int)}')

# XLMS overlap: 912
# Y2H overlap: 4610
# CX overlap: 17501


In [267]:
def calc_binned_log_odds(df, target, ext_set):
    
    all_leca_ppis = set([frozenset({i, j}) for i, j in zip(df['ID1'], df['ID2'])])
    all_prots = set([p for pair in all_leca_ppis for p in list(pair)])
    all_ppis = set([frozenset({i, j}) for i,j in list(combinations(list(all_prots), 2))])
    all_assayable_ppis = all_ppis.difference(all_leca_ppis)

    leca_only = len(all_leca_ppis.difference(ext_set))
    ext_only = len(all_assayable_ppis.intersection(ext_set))
    neither = len(all_assayable_ppis.difference(ext_set))
    both = len(df[df[target] == 1])
    odds_ratio = (both/ext_only)/(leca_only/neither)
    
    return(odds_ratio)

In [262]:
leca_binned = leca.groupby('bin')
avg = leca_binned.agg({'ppi_score':'mean'}).rename(columns={'ppi_score':'mean_ppi_score'})

In [268]:
xlms_or = leca_binned.progress_apply(lambda x: calc_binned_log_odds(x, 'xlms', xlms_fs))

  4%|███                                                                                   | 163/4492 [03:42<1:38:28,  1.36s/it]


KeyboardInterrupt: 

In [269]:
y2h_or = leca_binned.progress_apply(lambda x: calc_binned_log_odds(x, 'y2h', y2h_fs))

 17%|███████████████                                                                       | 786/4492 [27:03<2:07:36,  2.07s/it]


KeyboardInterrupt: 

In [239]:
cx_or = leca_binned.apply(lambda x: calc_binned_log_odds(x, 'cx', cx_fs))

Unnamed: 0_level_0,xlms_or
bin,Unnamed: 1_level_1
1,30.246464
2,12.815109
3,13.082196
4,13.873444
5,12.274084
...,...
4488,0.000000
4489,0.000000
4490,0.000000
4491,0.000000


In [None]:
res = avg.join(xlms_or).join(y2h_or).join(cx_or)

In [None]:
res.to_csv('../ppi_ml/results/ext_val_odds_ratio.csv')

In [240]:
len(y2h_fs)

20568

### set up for a janky parallelization on the command line

In [270]:
import pickle

In [271]:
with open('../ppi_ml/results/ext_val/xlms_set.pkl', 'wb') as f:
    pickle.dump(xlms_fs, f)
with open('../ppi_ml/results/ext_val/y2h_set.pkl', 'wb') as f:
    pickle.dump(y2h_fs, f)
with open('../ppi_ml/results/ext_val/cx_set.pkl', 'wb') as f:
    pickle.dump(cx_fs, f)

In [272]:
leca.to_pickle('../ppi_ml/results/ext_val/leca_ext_val.pkl')

In [274]:
!ls /stor/work/Marcotte/project/rmcox/leca/ppi_ml/results/ext_val/

cx_set.pkl  leca_ext_val.pkl  xlms_set.pkl  y2h_set.pkl


In [None]:
infile = sys.argv[1]
extfile = sys.argv[2]
target = sys.argv[3]
outfile = f'/stor/work/Marcotte/project/rmcox/leca/ppi_ml/results/ext_val/{target}_or.pkl'

In [279]:
for i in exps:
    print(f'python3 janky_parallel_or.py leca_ext_val.pkl {i}_set.pkl {i}')

python3 janky_parallel_or.py leca_ext_val.pkl xlms_set.pkl xlms
python3 janky_parallel_or.py leca_ext_val.pkl y2h_set.pkl y2h
python3 janky_parallel_or.py leca_ext_val.pkl cx_set.pkl cx


### results

In [281]:
res_list = []
for exp in exps:
    with open(f'../ppi_ml/results/ext_val/{exp}_or.pkl', 'rb') as f:
        results = pickle.load(f)
        print(results)
        res_list.append(results)
res_list

Empty DataFrame
Columns: []
Index: []
Empty DataFrame
Columns: []
Index: []
Empty DataFrame
Columns: []
Index: []


[Empty DataFrame
 Columns: []
 Index: [],
 Empty DataFrame
 Columns: []
 Index: [],
 Empty DataFrame
 Columns: []
 Index: []]

In [283]:
with open('../ppi_ml/results/ext_val/cx_set.pkl', 'rb') as f:
    test = pickle.load(f)
test

[frozenset({'KOG1727', 'KOG1779'}),
 frozenset({'KOG0052', 'KOG1727'}),
 frozenset({'KOG1696', 'KOG3475'}),
 frozenset({'KOG1749', 'KOG1751'}),
 frozenset({'KOG0378', 'KOG1749'}),
 frozenset({'KOG1727', 'KOG3311'}),
 frozenset({'KOG3475', 'KOG3499'}),
 frozenset({'KOG1728', 'KOG3475'}),
 frozenset({'KOG1646', 'KOG1749'}),
 frozenset({'KOG1728', 'KOG2988'}),
 frozenset({'KOG1570', 'KOG3320'}),
 frozenset({'KOG0397', 'KOG1646'}),
 frozenset({'KOG1779', 'KOG2988'}),
 frozenset({'KOG0052', 'KOG1749'}),
 frozenset({'KOG0402', 'KOG1749'}),
 frozenset({'KOG1727', 'KOG1762'}),
 frozenset({'KOG1696', 'KOG1728'}),
 frozenset({'KOG1570', 'KOG1646'}),
 frozenset({'KOG1646', 'KOG1728'}),
 frozenset({'KOG0397', 'KOG3475'}),
 frozenset({'KOG1646', 'KOG1751'}),
 frozenset({'KOG0378', 'KOG1646'}),
 frozenset({'KOG1751', 'KOG2988'}),
 frozenset({'KOG3486', 'KOG3499'}),
 frozenset({'KOG1646', 'KOG1696'}),
 frozenset({'KOG0402', 'KOG3311'}),
 frozenset({'KOG1696', 'KOG1751'}),
 frozenset({'KOG0402', 'KOG2