In [61]:
import pandas as pd
import datetime as dt
import pickle
import random
from itertools import combinations

In [88]:
def flatten(lst):
    """Return one flat list holding the items of every sub-iterable in ``lst``.

    Only a single level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for sublist in lst:
        flat.extend(sublist)
    return flat

def make_fset(x, drop=True):
    """Convert a space-separated PPI ID string into a frozenset of protein IDs.

    Parameters
    ----------
    x : str
        PPI identifier of the form ``'ID1 ID2'`` (IDs separated by a single space).
    drop : bool, default True
        How to treat an ID whose tokens are all identical (a self-self PPI).
        If True, the pair is dropped and an empty tuple ``()`` is returned.
        If False, a warning is printed and a one-element frozenset is returned.

    Returns
    -------
    frozenset or tuple
        ``frozenset({ID1, ID2})`` for a regular pair, ``frozenset({ID1})`` for a
        self-self pair that is kept, or ``()`` for a self-self pair that is dropped.
    """
    ids = x.split(' ')
    if len(set(ids)) < 2:
        if drop:
            # Falsy sentinel; kept as an empty tuple so existing callers see the same value.
            return ()
        # The timestamp has to be computed here: the warning used to reference a
        # name `ct` that was never assigned, which raised NameError on this path.
        ct = dt.datetime.now()
        print(f"[{ct}] WARNING: Features for '{x}' (self-self PPI) detected ...")
        return frozenset({ids[0]})
    # Only the first two tokens are used, regardless of how many the string holds.
    return frozenset({ids[0], ids[1]})

In [4]:
# Input/output locations for the positive-PPI export.
# gold_std_file is handed to make_gs_dict and is also read directly by the complex check;
# pos_outfile is the file the positive PPI pairs are written to.
# NOTE(review): both are absolute, machine-specific paths.
gold_std_file = '/stor/work/Marcotte/project/rmcox/leca/ppi_ml/data/gold_stds/all.gold.cmplx.noRibos.merged.txt'
pos_outfile = '/stor/work/Marcotte/project/rmcox/leca/ppi_ml/data/unit_testing/all_pos_ppis.txt'

In [5]:
# NOTE(review): make_gs_dict is neither defined nor imported in the visible cells, so this
# line depends on state from outside this view. Presumably it returns a dict whose values
# are iterables of frozenset PPIs (that is how gs_dict is consumed below) -- TODO confirm.
gs_dict = make_gs_dict(gold_std_file)

In [20]:
# Gather every value stored in the gold-standard dict.
fset_lst = list(gs_dict.values())
# Pool the PPIs of all values into one set, which removes PPIs that occur more than once.
flat_fsets = set(flatten(fset_lst))
# Keep each unique PPI as a plain list of its members.
all_pos_ppis = [list(ppi) for ppi in flat_fsets]

In [22]:
# Number of unique positive PPIs collected from the gold-standard dict.
len(all_pos_ppis)

16064

In [8]:
# Write one positive PPI per line as two tab-separated protein IDs.
# The pairs are taken from all_pos_ppis, the list built from the gold-standard dict.
# The name used here before, pos_ppis, is not assigned in any visible cell, so the
# cell failed with NameError on a fresh kernel.
# NOTE(review): confirm all_pos_ppis is the intended source for this file.
with open(pos_outfile, 'w') as f:
    for ppi in all_pos_ppis:
        pair = list(ppi)
        f.write(f'{pair[0]}\t{pair[1]}\n')

In [9]:
# NOTE(review): get_neg_ppis is neither defined nor imported in the visible cells, so this
# line depends on state from outside this view. Its result is later intersected with a set
# of frozensets using `&`, so it is presumably a set of frozenset pairs -- TODO confirm.
neg_ppis = get_neg_ppis(gs_dict)

--> # total possible gold standard PPIs = 16064
--> # unique gold standard prots = 2958
--> # total possible negative PPIs = 4357339


In [25]:
# Sanity check: PPIs present in both neg_ppis and the pooled gold-standard set.
# The recorded output is an empty set, i.e. no overlap.
neg_ppis & flat_fsets

set()

In [103]:
# File locations used by the checks below.
# fmat_file: pickled feature matrix, loaded with pickle.load.
# gold_std_file: re-assigned here with the same value it was given in an earlier cell.
# ml_results_file: CSV of PPI scores, loaded with pd.read_csv.
# NOTE(review): fmat_file and ml_results_file are relative paths, so they resolve against
# the kernel's working directory, while gold_std_file is absolute.
fmat_file = 'ppi_ml/data/featmats/featmat_labeled.pkl'
gold_std_file = '/stor/work/Marcotte/project/rmcox/leca/ppi_ml/data/gold_stds/all.gold.cmplx.noRibos.merged.txt'
ml_results_file = 'ppi_ml/results/test_walktrap/archive/ppi_scores_all.csv'

In [104]:
# Load the pickled feature matrix.
# NOTE: pickle.load can execute arbitrary code, so only use it on trusted files.
with open(fmat_file, 'rb') as handle:
    fmat = pickle.load(handle)

# Every PPI ID must occur exactly once in the labeled feature matrix.
ppi_counts = fmat.groupby(['ID']).size().sort_values(ascending=False)
assert not (ppi_counts > 1).any(), "Non-unique PPI labels detected."

In [76]:
# Assert that every small gold-standard complex expands into the expected number of
# distinct pairwise PPIs, i.e. that no complex lists the same protein ID twice.
cmplx_list = []

with open(gold_std_file, 'r') as f:
    cmplx_lines = f.read().splitlines()

# NOTE(review): num_test is not referenced by any visible cell; kept in case another cell reads it.
num_test = 5

for line in cmplx_lines:
    ogs = line.split(' ')
    # Only complexes with 3 or 4 members are checked.
    if 2 < len(ogs) < 5:
        cmplx_list.append(ogs)

for cmplx in cmplx_list:
    num_prots = len(cmplx)
    expected_ppi_number = num_prots * (num_prots - 1) // 2
    ppis = [frozenset(pair) for pair in combinations(cmplx, 2)]
    # combinations() pairs elements by position, so len(ppis) is always n*(n-1)/2 even
    # when a subunit is repeated. Counting the distinct frozensets is what exposes a
    # repeat: a duplicated ID yields a one-element frozenset and/or duplicate pairs.
    unique_ppis = set(ppis)
    assert expected_ppi_number == len(unique_ppis), "Problem with gold standard complexes; make sure each complex contains unique protein IDs (no repeated subunits)."

In [79]:
# assert that there is no overlap between train and test ppis
# Step 1: load the scored PPI table from CSV.
# NOTE(review): the frame displayed further down has 'Unnamed: 0' and 'Unnamed: 0.1'
# columns, which suggests saved index columns are being read back as data -- confirm
# whether read_csv should be given index_col here.
res = pd.read_csv(ml_results_file)

In [89]:
# Keep only the rows whose 'set' value is 'test' or 'train'.
tt_df = res[res['set'].isin(['test', 'train'])]

In [90]:
# Display the train/test subset (the rendered frame is truncated in the recorded output).
tt_df

Unnamed: 0.1,Unnamed: 0,ID,label,ppi_score,set
0,0,KOG3409 KOG3013,1,1.0,test
1,1,KOG0358 KOG0362,1,1.0,train
2,2,KOG2072 KOG0643,1,1.0,train
3,3,KOG3677 KOG1560,1,1.0,train
4,4,KOG2314 KOG0643,1,1.0,train
...,...,...,...,...,...
4490157,4490157,KOG4845 KOG1666,-1,0.0,train
4490158,4490158,KOG3229 KOG4795,-1,0.0,train
4490159,4490159,KOG3432 KOG4168,-1,0.0,train
4490160,4490160,KOG2741 KOG4322,-1,0.0,train


In [87]:
# Each PPI ID may occur only once across the combined train/test rows.
# NOTE(review): this compares the raw ID values, so 'A B' and 'B A' count as different IDs.
tt_ppi_counts = tt_df.groupby(['ID']).size().sort_values(ascending=False)
assert not (tt_ppi_counts > 1).any(), "Non-unique PPI labels detected."

In [102]:
# Turn the train and test IDs into order-independent frozensets so that
# 'A B' and 'B A' are recognised as the same PPI.
train_df = tt_df[tt_df['set'] == 'train']
train_ppis = [make_fset(i, drop=True) for i in train_df['ID']]

test_df = tt_df[tt_df['set'] == 'test']
test_ppis = [make_fset(i, drop=True) for i in test_df['ID']]

# make_fset(..., drop=True) returns the empty tuple () for a self-self PPI. If each
# split contains at least one, () lands in both sets and would be reported as an
# overlap even though no real PPI is shared, so the sentinel is removed first.
shared_ppis = (set(test_ppis) & set(train_ppis)) - {()}
assert len(shared_ppis) == 0, "Overlap between train and test PPIs detected."

In [107]:
# Rows of the feature matrix whose super_group value is greater than zero.
labeled = fmat.loc[fmat['super_group'] > 0]
labeled

Unnamed: 0,ID,super_group,label,drome.iex_4.150p.pearsonR.feat,valid.values,human.sec_1.150p.pearsonR.feat,caeel.iex_2.150p.euclidean.feat,caeel.beads_iex_2.150p.pearsonR.feat,cocnu.sec_1.150p.euclidean.feat,tetts.sec_3.150p.braycurtis.feat,...,human.iex_5.150p.euclidean.feat,cerri.sec_1.150p.euclidean.feat,human.iex_7.150p.braycurtis.feat,caeel.beads_iex_8.150p.spearmanR.feat,soybn.sec_xlink_1.150p.euclidean.feat,plaf7.bng_4.150p.pearsonR.feat,mouse.sec_1.150p.braycurtis.feat,human.iex_24.150p.euclidean.feat,tsar_concat.raw.150p.euclidean.feat,plants_concat.raw.150p.euclidean.feat
3573,ENOG502QR6E ENOG502QPKB,511,1,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0000,4.2230,0.0,0.0,6.6111,0.0000,0.0000,0.0000,0.7413,0.7131
3650,ENOG502QPKB KOG0263,1,-1,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0000,5.0303,0.0,0.0,9.5940,0.0000,0.0000,0.0000,0.6036,0.5781
3794,KOG0803 ENOG502QPKB,31,-1,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0000,4.0322,0.0,0.0,8.9051,0.0000,0.0000,0.0000,0.7509,0.7303
3989,KOG1556 ENOG502QPKB,1,-1,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0000,4.3448,0.0,0.0,7.7875,0.0000,0.0000,0.0000,0.6222,0.6099
4345,KOG2937 ENOG502QPKB,47,-1,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0000,3.7210,0.0,0.0,5.8842,0.0000,0.0000,0.0000,0.6645,0.7165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4488124,KOG4725 KOG1153,1,-1,0.0,0.0,0.2035,0.0,0.0,0.0,0.0,...,5.8269,0.0000,0.0,0.0,0.0000,-0.0450,0.9987,7.9117,0.5762,0.6945
4490540,KOG4201 KOG0303,1,-1,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0000,0.0000,0.0,0.0,0.0000,-0.0250,0.0000,0.0000,0.6443,0.0000
4490826,KOG1894 KOG1597,1,1,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0000,0.0000,0.0,0.0,0.0000,-0.0113,0.0000,0.0000,0.2738,0.0000
4490840,KOG1894 KOG2691,1,1,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0000,0.0000,0.0,0.0,0.0000,0.1752,0.0000,0.0000,0.2940,0.0000
