In [19]:
import pandas as pd
import pickle
import numpy as np
import os
import re
import random
import time
from functools import reduce
from itertools import combinations
from itertools import accumulate
from collections import defaultdict

In [2]:
!ls ../ppi_ml/data/feat_files/

featmat_allconcat	  featmat_animals.pkl	featmat_plants
featmat_allconcat.pkl	  featmat_excavate	featmat_plants.pkl
featmat_allexps_p3c2	  featmat_excavate.pkl	featmat_tsar
featmat_allexps_p3c2.pkl  featmat_humap		featmat_tsar.pkl
featmat_animals		  featmat_humap.pkl


In [3]:
!head ../ppi_ml/data/gold_stds/all.gold.cmplx.noRibos.merged.txt

KOG3599 KOG4203
KOG0730 KOG0101 KOG0676 KOG0985
KOG0374 ENOG502R17J
KOG1737 KOG0439
KOG4593 KOG3285
KOG1724 KOG3701 KOG2166
KOG4031 KOG4728
KOG0266 KOG1273
ENOG502QV4A KOG0904
KOG0660 KOG0941


In [None]:
def match_label(x, pos_ppi_dict, neg_ppi_dict):
    group_list = []
    for k, v in pos_ppi_dict.items():
        if x in v:
            group_list.append(k)
    if len(group_list) > 1:
        return(1, group_list)
    elif len(group_list) == 1:
        return(1, group_list[0])
    elif x in neg_ppi_dict.keys():
        return(-1, neg_ppi_dict.get(x))
    else:
        return(pd.nan, pd.nan)

In [5]:
data_dir = '../ppi_ml/data/feat_files/'
lmat_file = 'featmat_allexps_p3c2.pkl'
fmat_files = [f for f in os.listdir(data_dir) if re.match('.*.pkl', f) and f != lmat_file]
gold_std_file = '../ppi_ml/data/gold_stds/all.gold.cmplx.noRibos.merged.txt'
random.seed(13)

In [6]:
print('Generating grouped positive PPI labels from gold standard complexes ...')
pos_ppi_dict = dict()
group_no = 0
dupes = []
with open(gold_std_file, 'r') as f:
    ppis = f.read().splitlines() 
    for p in ppis:
        ogs = p.split(' ')
        fsets = [frozenset({i, j}) for i,j in list(combinations(ogs, 2))]
        pos_ppi_dict.update({group_no: fsets})
        group_no += 1

Generating grouped positive PPI labels from gold standard complexes ...


In [7]:
len(set(dupes))

0

In [9]:
print('Getting random proteins from positive PPIs to generate negative PPIs ...')
random_prots = []
for group_no, fsets in pos_ppi_dict.items():
    prot_list = []
    for pair in fsets:
        prot_list.append(list(pair)[0])
        prot_list.append(list(pair)[1])
    prot_list = list(set(prot_list))
    neg_prot = random.sample(prot_list, n=2)
    random_prots.append(neg_prot)

Getting random proteins from positive PPIs to generate negative PPIs ...


In [11]:
neg_ppis = [frozenset({i, j}) for i,j in list(combinations(random_prots, 2))]

In [62]:
## TODO: check & make sure this is working correctly
print('Removing overlap between random negative PPIs & positive PPIs ...')
overlap_count = 0
for fset in neg_ppis:
    if any(fset in vals for vals in pos_ppi_dict.values()):
        neg_ppis.remove(fset)
        overlap_count += 1
print('# overlapping negative PPIs found & removed =', overlap_count)

Removing overlap between random negative PPIs & positive PPIs ...


KeyboardInterrupt: 

In [13]:
num_pos_groups = group_no
print(f'Randomly splitting negative PPIs into {num_pos_groups} groups ...')
neg_cmplx_sizes = [random.randint(2, 30) for x in range(num_pos_groups)]
neg_ppi_grouped = [neg_ppis[x - y: x] for x, y in zip(
        accumulate(neg_cmplx_sizes), neg_cmplx_sizes)]

Randomly splitting negative PPIs into 1498 groups ...


In [14]:
print('# of negative PPI groups =', len(neg_ppi_grouped))

# of negative PPI groups = 1498


In [15]:
print('Generating grouped negative PPI labels ...')
neg_ppi_dict = dict()
group_sizes = []
group_no = 0
for group in neg_ppi_grouped:
    group_sizes.append(len(group))
    for pair in group:
        neg_ppi_dict.update({pair: group_no})
    group_no += 1

Generating grouped negative PPI labels ...


In [16]:
print('Loading feature matrix ...')
with open(data_dir+lmat_file, 'rb') as handle:
    lmat = pickle.load(handle)

Loading feature matrix ...


In [17]:
lmat[['ID1','ID2']] = lmat['ID'].str.split(' ',expand=True)
lmat.head()

Unnamed: 0,ID,arath.iex_1.150p.braycurtis.feat,arath.iex_1.150p.euclidean.feat,arath.iex_1.150p.pearsonR.feat,arath.iex_1.150p.spearmanR.feat,arath.iex_2.150p.braycurtis.feat,arath.iex_2.150p.euclidean.feat,arath.iex_2.150p.pearsonR.feat,arath.iex_2.150p.spearmanR.feat,arath.iex_3.150p.braycurtis.feat,...,xenla.sucrose_4.150p.braycurtis.feat,xenla.sucrose_4.150p.euclidean.feat,xenla.sucrose_4.150p.pearsonR.feat,xenla.sucrose_4.150p.spearmanR.feat,yeast.iex_1.150p.braycurtis.feat,yeast.iex_1.150p.euclidean.feat,yeast.iex_1.150p.pearsonR.feat,yeast.iex_1.150p.spearmanR.feat,ID1,ID2
0,ENOG502QPIC ENOG502QR7H,0.980388,5.157008,-0.075497,-0.066312,0.971325,9.577124,-0.06719,-0.015749,0.997534,...,,,,,,,,,ENOG502QPIC,ENOG502QR7H
1,ENOG502QPIC KOG0253,0.997407,5.492013,-0.046335,-0.059675,,,,,0.855288,...,,,,,,,,,ENOG502QPIC,KOG0253
2,ENOG502QPIC KOG0734,0.865005,5.916432,0.103333,0.095018,0.998996,11.567026,-0.13919,-0.171415,0.636175,...,,,,,,,,,ENOG502QPIC,KOG0734
3,ENOG502QPIC KOG0739,0.930706,5.596031,0.075926,0.071345,0.899831,9.242962,0.084084,0.11591,0.879471,...,,,,,,,,,ENOG502QPIC,KOG0739
4,ENOG502QPIC KOG0935,0.996862,4.694391,-0.036563,-0.041946,,,,,0.997371,...,,,,,,,,,ENOG502QPIC,KOG0935


In [24]:
t0 = time.time()
print('Labeling feature matrix ...')
lmat[['label','group']] = [match_label(frozenset({i, j}), pos_ppi_dict, neg_ppi_dict) for i, j in zip(lmat['ID1'], lmat['ID2'])]
print(f"Total time to label {len(lmat)} rows: {time.time() - t0} seconds")

Labeling feature matrix ...
Total time to label 4491718 rows: 2953.346905708313 seconds


In [28]:
pos_check = random.choice(list(pos_ppi_dict.values()))
neg_check = random.choice(list(neg_ppi_dict.keys()))

In [29]:
pos_check

[frozenset({'KOG0274', 'KOG1724'}),
 frozenset({'ENOG502RDJD', 'KOG1724'}),
 frozenset({'ENOG502RDJD', 'KOG0274'})]

In [30]:
neg_check

frozenset({'ENOG502QU6B', 'KOG1222'})

In [36]:
lmat

Unnamed: 0,ID,arath.iex_1.150p.braycurtis.feat,arath.iex_1.150p.euclidean.feat,arath.iex_1.150p.pearsonR.feat,arath.iex_1.150p.spearmanR.feat,arath.iex_2.150p.braycurtis.feat,arath.iex_2.150p.euclidean.feat,arath.iex_2.150p.pearsonR.feat,arath.iex_2.150p.spearmanR.feat,arath.iex_3.150p.braycurtis.feat,...,xenla.sucrose_4.150p.euclidean.feat,xenla.sucrose_4.150p.pearsonR.feat,xenla.sucrose_4.150p.spearmanR.feat,yeast.iex_1.150p.braycurtis.feat,yeast.iex_1.150p.euclidean.feat,yeast.iex_1.150p.pearsonR.feat,yeast.iex_1.150p.spearmanR.feat,ID1,ID2,"(label, group)"
0,ENOG502QPIC ENOG502QR7H,0.980388,5.157008,-0.075497,-0.066312,0.971325,9.577124,-0.067190,-0.015749,0.997534,...,,,,,,,,ENOG502QPIC,ENOG502QR7H,"(None, None)"
1,ENOG502QPIC KOG0253,0.997407,5.492013,-0.046335,-0.059675,,,,,0.855288,...,,,,,,,,ENOG502QPIC,KOG0253,"(None, None)"
2,ENOG502QPIC KOG0734,0.865005,5.916432,0.103333,0.095018,0.998996,11.567026,-0.139190,-0.171415,0.636175,...,,,,,,,,ENOG502QPIC,KOG0734,"(None, None)"
3,ENOG502QPIC KOG0739,0.930706,5.596031,0.075926,0.071345,0.899831,9.242962,0.084084,0.115910,0.879471,...,,,,,,,,ENOG502QPIC,KOG0739,"(None, None)"
4,ENOG502QPIC KOG0935,0.996862,4.694391,-0.036563,-0.041946,,,,,0.997371,...,,,,,,,,ENOG502QPIC,KOG0935,"(None, None)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4491713,ENOG502RYI9 KOG3271,,,,,,,,,,...,,,,0.591567,1.930056,0.797882,0.696994,ENOG502RYI9,KOG3271,"(None, None)"
4491714,ENOG502RYI9 KOG3274,,,,,,,,,,...,,,,0.647338,3.135001,0.531502,0.456474,ENOG502RYI9,KOG3274,"(None, None)"
4491715,ENOG502RYI9 KOG3394,,,,,,,,,,...,,,,0.715621,3.625529,0.439617,0.354524,ENOG502RYI9,KOG3394,"(None, None)"
4491716,ENOG502RYI9 KOG3855,,,,,,,,,,...,,,,0.612288,2.614478,0.523113,0.519076,ENOG502RYI9,KOG3855,"(None, None)"


In [51]:
lmat[['label', 'group']] = pd.DataFrame(lmat.iloc[:,-1].tolist(), index=lmat.index)

In [59]:
len(lmat[lmat['label'] == 1])

6629

In [60]:
len(lmat[lmat['label'] == -1])

5655

In [58]:
for k, v in pos_ppi_dict.items():
    if k == 320:
        print(k, v)
        #if 'ENOG502QPKB' in v:
            #print(k, v)

320 [frozenset({'KOG4452', 'KOG2291'}), frozenset({'KOG4452', 'KOG2292'}), frozenset({'KOG4452', 'KOG1746'}), frozenset({'KOG3593', 'KOG4452'}), frozenset({'KOG4452', 'KOG2754'}), frozenset({'KOG4452', 'KOG2603'}), frozenset({'ENOG502SDSY', 'KOG4452'}), frozenset({'KOG3356', 'KOG4452'}), frozenset({'KOG4452', 'KOG2447'}), frozenset({'KOG2291', 'KOG2292'}), frozenset({'KOG2291', 'KOG1746'}), frozenset({'KOG3593', 'KOG2291'}), frozenset({'KOG2291', 'KOG2754'}), frozenset({'KOG2291', 'KOG2603'}), frozenset({'ENOG502SDSY', 'KOG2291'}), frozenset({'KOG3356', 'KOG2291'}), frozenset({'KOG2291', 'KOG2447'}), frozenset({'KOG1746', 'KOG2292'}), frozenset({'KOG3593', 'KOG2292'}), frozenset({'KOG2754', 'KOG2292'}), frozenset({'KOG2292', 'KOG2603'}), frozenset({'ENOG502SDSY', 'KOG2292'}), frozenset({'KOG3356', 'KOG2292'}), frozenset({'KOG2292', 'KOG2447'}), frozenset({'KOG3593', 'KOG1746'}), frozenset({'KOG1746', 'KOG2754'}), frozenset({'KOG1746', 'KOG2603'}), frozenset({'ENOG502SDSY', 'KOG1746'}),