In [1]:
import pandas as pd
import pickle
import numpy as np
import os
import re
import random
import time
from itertools import combinations
from itertools import accumulate
from collections import defaultdict
random.seed(13)

In [2]:
def match_label(x, pos_ppi_dict, neg_ppi_dict):
    group_list = []
    for k, v in pos_ppi_dict.items():
        if x in v:
            group_list.append(k)
    if len(group_list) > 1:
        return(1, group_list)
    elif len(group_list) == 1:
        return(1, group_list[0])
    elif x in neg_ppi_dict.keys():
        return(-1, neg_ppi_dict.get(x))
    else:
        return(0, np.nan)

In [3]:
data_dir = '../ppi_ml/data/featmats/'
fmat_file = 'featmat_final.pkl'
outfile = 'featmat_final_labeled'
gold_std_file = '../ppi_ml/data/gold_stds/all.gold.cmplx.noRibos.merged.txt'

## Make positive label dictionary

In [4]:
## TO DO (IMPORTANT): if a PPI belongs to multiple groups, give all of those groups the same group #
print('Generating grouped positive PPI labels from gold standard complexes ...')
pos_ppi_dict = dict()
group_no = 0
dupes = []
with open(gold_std_file, 'r') as f:
    ppis = f.read().splitlines() 
    for p in ppis:
        ogs = p.split(' ')
        fsets = [frozenset({i, j}) for i,j in list(combinations(ogs, 2))]
#             for ppi in fsets:
#                 for group, fset in pos_ppi_dict.items():
#                     if ppi in fset:
        pos_ppi_dict.update({group_no: fsets})
        group_no += 1

Generating grouped positive PPI labels from gold standard complexes ...


## Make negative label dictionary

In [5]:
print('Getting random proteins from positive PPIs to generate negative PPIs ...')
random_prots = set()
for group_no, fsets in pos_ppi_dict.items():
    prot_set = set()
    for pair in fsets:
        prot_set.add(list(pair)[0])
        prot_set.add(list(pair)[1])
    neg_prot = random.sample(prot_set, 2)
    random_prots.add(neg_prot[0])
    random_prots.add(neg_prot[1])
    
print('Generating negative PPIs ...')
neg_ppis = [frozenset({i, j}) for i,j in list(combinations(random_prots, 2))]

Getting random proteins from positive PPIs to generate negative PPIs ...
Generating negative PPIs ...


since Python 3.9 and will be removed in a subsequent version.
  neg_prot = random.sample(prot_set, 2)


In [6]:
## TODO: check & make sure this is working correctly
#  update: I believe it's working as intended but needs to be more efficient
print('Removing overlap between random negative PPIs & positive PPIs ...')
t0 = time.time()
overlap_count = 0
for fset in neg_ppis:
    if any(fset in vals for vals in pos_ppi_dict.values()):
        neg_ppis.remove(fset)
        overlap_count += 1
print(f'# overlapping negative PPIs found & removed = {overlap_count}; total time: {time.time() - t0} seconds)')

Removing overlap between random negative PPIs & positive PPIs ...
# overlapping negative PPIs found & removed = 5848; total time: 1370.866881608963 seconds)


In [7]:
num_pos_groups = group_no
print(f'Randomly splitting negative PPIs into {num_pos_groups} groups ...')
neg_cmplx_sizes = [random.randint(2, 30) for x in range(num_pos_groups)]
neg_ppi_grouped = [neg_ppis[x - y: x] for x, y in zip(
        accumulate(neg_cmplx_sizes), neg_cmplx_sizes)]
print('# of negative PPI groups =', len(neg_ppi_grouped))

Randomly splitting negative PPIs into 1498 groups ...
# of negative PPI groups = 1498


In [8]:
print('Generating grouped negative PPI labels ...')
neg_ppi_dict = dict()
group_sizes = []
group_no = 0
for group in neg_ppi_grouped:
    group_sizes.append(len(group))
    for pair in group:
        neg_ppi_dict.update({pair: group_no})
    group_no += 1

Generating grouped negative PPI labels ...


## Label feature matrix

In [9]:
print(f'Loading features from {fmat_file}...')
with open(data_dir+fmat_file, 'rb') as handle:
    fmat = pickle.load(handle)

print('Formatting feature matrix ID columns & rows ...')
fmat[['ID1','ID2']] = fmat['ID'].str.split(' ',expand=True)
fmat = fmat[fmat['ID2'].notna()]

Loading features from featmat_final.pkl...
Formatting feature matrix ID columns & rows ...


In [10]:
t0 = time.time()
print('Labeling feature matrix ...')
fmat[['label','group']] = [match_label(frozenset({i, j}), pos_ppi_dict, neg_ppi_dict) for i, j in zip(fmat['ID1'], fmat['ID2'])]
print(f"Total time to label {len(fmat)} rows: {time.time() - t0} seconds")

Labeling feature matrix ...


  return asarray(a).ndim


Total time to label 4905920 rows: 3053.4434237480164 seconds


In [11]:
print('Formatting labeled feature matrix ...')
# fmat[['label', 'group']] = pd.DataFrame(fmat.iloc[:,-1].tolist(), index=fmat.index)
# apparently I don't need this anymore?
# this is what happens when you write code when you're tired
final_fmat = fmat.explode('group')

Formatting labeled feature matrix ...


In [12]:
final_fmat.reset_index(drop=True)

Unnamed: 0,ID,arath.iex_1.150p.braycurtis.feat,arath.iex_1.150p.euclidean.feat,arath.iex_1.150p.pearsonR.feat,arath.iex_1.150p.spearmanR.feat,arath.iex_2.150p.braycurtis.feat,arath.iex_2.150p.euclidean.feat,arath.iex_2.150p.pearsonR.feat,arath.iex_2.150p.spearmanR.feat,arath.iex_3.150p.braycurtis.feat,...,all.norm.150p.pearsonR.feat,all.norm.150p.euclidean.feat,all.norm.150p.braycurtis.feat,all.norm.150p.spearmanR_weighted.feat,all.norm.150p.covariance.feat,all.norm.150p.spearmanR.feat,ID1,ID2,label,group
0,ENOG502QPIC ENOG502QR7H,0.980388,5.157008,-0.075497,-0.066312,0.971325,9.577124,-0.067190,-0.015749,0.997534,...,0.084792,16.020943,0.922872,0.092672,0.000964,0.092672,ENOG502QPIC,ENOG502QR7H,,
1,ENOG502QPIC KOG0253,0.997407,5.492013,-0.046335,-0.059675,0.000000,0.000000,0.000000,0.000000,0.855288,...,0.045951,20.657592,0.931287,0.046892,0.000976,0.046892,ENOG502QPIC,KOG0253,,
2,ENOG502QPIC KOG0734,0.865005,5.916432,0.103333,0.095018,0.998996,11.567026,-0.139190,-0.171415,0.636175,...,0.054817,26.813569,0.913607,0.060166,0.001697,0.060166,ENOG502QPIC,KOG0734,,
3,ENOG502QPIC KOG0739,0.930706,5.596031,0.075926,0.071345,0.899831,9.242962,0.084084,0.115910,0.879471,...,0.026507,25.106546,0.924551,0.016978,0.000738,0.018129,ENOG502QPIC,KOG0739,,
4,ENOG502QPIC KOG0935,0.996862,4.694391,-0.036563,-0.041946,0.000000,0.000000,0.000000,0.000000,0.997371,...,0.083056,20.047832,0.884782,0.104665,0.001732,0.104665,ENOG502QPIC,KOG0935,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4909444,ENOG502RYI9 KOG3271,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.076095,23.269955,0.933585,0.136094,0.000875,0.136094,ENOG502RYI9,KOG3271,,
4909445,ENOG502RYI9 KOG3274,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.149089,16.385381,0.853965,0.314815,0.001299,0.314815,ENOG502RYI9,KOG3274,,
4909446,ENOG502RYI9 KOG3394,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.136270,19.947022,0.881158,0.298281,0.001433,0.298281,ENOG502RYI9,KOG3394,,
4909447,ENOG502RYI9 KOG3855,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.107709,22.084680,0.900136,0.335223,0.001242,0.335223,ENOG502RYI9,KOG3855,,


In [13]:
t0 = time.time()
final_fmat.to_pickle(data_dir+outfile+'.pkl')
final_fmat.to_csv(data_dir+outfile, index=False)
print(f"Total time to write feature matrix of shape {final_fmat.shape}: {time.time() - t0} seconds")

In [29]:
traintest = fmat[(fmat['label'] == 1) | (fmat['label'] == -1)]
traintest.reset_index(drop=True, inplace=True)
traintest

Unnamed: 0,ID,arath.iex_1.150p.braycurtis.feat,arath.iex_1.150p.euclidean.feat,arath.iex_1.150p.pearsonR.feat,arath.iex_1.150p.spearmanR.feat,arath.iex_2.150p.braycurtis.feat,arath.iex_2.150p.euclidean.feat,arath.iex_2.150p.pearsonR.feat,arath.iex_2.150p.spearmanR.feat,arath.iex_3.150p.braycurtis.feat,...,all.norm.150p.pearsonR.feat,all.norm.150p.euclidean.feat,all.norm.150p.braycurtis.feat,all.norm.150p.spearmanR_weighted.feat,all.norm.150p.covariance.feat,all.norm.150p.spearmanR.feat,ID1,ID2,label,group
0,ENOG502QPKB ENOG502QR6E,0.991141,4.629059,-0.017727,-0.020866,0.996961,6.541499,-0.039241,-0.044258,0.918232,...,0.163667,14.212332,0.820479,0.324410,0.001886,0.324410,ENOG502QPKB,ENOG502QR6E,1.0,510
1,ENOG502QPKK KOG0217,0.904316,7.923271,-0.084660,-0.089603,0.841520,7.512258,0.077270,0.061780,0.847915,...,0.047714,28.773405,0.866353,0.054372,0.001887,0.054372,ENOG502QPKK,KOG0217,-1.0,1086
2,ENOG502QPKK KOG0543,0.834664,7.655461,-0.012730,0.187082,0.740612,7.184413,0.058533,0.178202,0.887328,...,0.070146,27.910029,0.839593,0.089879,0.002658,0.089879,ENOG502QPKK,KOG0543,-1.0,1204
3,ENOG502QPKK KOG1010,0.881270,6.775107,0.122434,0.022435,0.997989,6.211178,-0.045498,-0.058684,0.958449,...,0.106118,26.054951,0.832354,0.142634,0.003807,0.142634,ENOG502QPKK,KOG1010,1.0,1123
4,ENOG502QPKK KOG1342,0.913148,9.151854,-0.027414,-0.022579,0.943018,7.738496,-0.020960,-0.066114,0.834587,...,0.076809,29.055032,0.842580,0.131878,0.003153,0.131878,ENOG502QPKK,KOG1342,1.0,1123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14758,KOG1746 KOG3593,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.225024,14.573284,0.765552,0.319363,0.002834,0.319363,KOG1746,KOG3593,1.0,320
14759,KOG0916 KOG1690,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.029378,22.605125,0.969016,-0.060988,-0.000691,-0.060988,KOG0916,KOG1690,-1.0,783
14760,KOG1597 KOG1894,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.007452,21.764704,0.970387,-0.011462,0.000138,-0.013672,KOG1597,KOG1894,1.0,1443
14761,KOG1894 KOG2691,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.029312,19.900526,0.955195,0.016888,0.000490,0.018053,KOG1894,KOG2691,1.0,1443


In [30]:
t0 = time.time()
traintest.to_pickle(data_dir+outfile+'_traintest'+'.pkl')
traintest.to_csv(data_dir+outfile+'_traintest', index=False)
print(f"Total time to write traint/test matrix of shape {final_fmat.shape}: {time.time() - t0} seconds")

Total time to write traint/test matrix of shape (4909449, 698): 9.822200775146484 seconds


## Checks & balances below here

In [32]:
index_list = []
multi_count = 0
for i in range(len(fmat)):
    group = fmat['group'][i]
    if type(group) == list:
        index_list.append(i)
        multi_count += 1

In [33]:
multi_count

2070

In [15]:
pos_check = random.choice(list(pos_ppi_dict.values()))
neg_check = random.choice(list(neg_ppi_dict.keys()))
print(pos_check)
print(neg_check)

[frozenset({'KOG3801', 'KOG1549'})]
frozenset({'ENOG502QRRE', 'KOG1567'})


In [17]:
len(fmat[fmat['label'] == 1])

8150

In [20]:
fmat[fmat['label'] == 1].head()

Unnamed: 0,ID,arath.iex_1.150p.braycurtis.feat,arath.iex_1.150p.euclidean.feat,arath.iex_1.150p.pearsonR.feat,arath.iex_1.150p.spearmanR.feat,arath.iex_2.150p.braycurtis.feat,arath.iex_2.150p.euclidean.feat,arath.iex_2.150p.pearsonR.feat,arath.iex_2.150p.spearmanR.feat,arath.iex_3.150p.braycurtis.feat,...,all.norm.150p.pearsonR.feat,all.norm.150p.euclidean.feat,all.norm.150p.braycurtis.feat,all.norm.150p.spearmanR_weighted.feat,all.norm.150p.covariance.feat,all.norm.150p.spearmanR.feat,ID1,ID2,label,group
3573,ENOG502QPKB ENOG502QR6E,0.991141,4.629059,-0.017727,-0.020866,0.996961,6.541499,-0.039241,-0.044258,0.918232,...,0.163667,14.212332,0.820479,0.32441,0.001886,0.32441,ENOG502QPKB,ENOG502QR6E,1.0,510
5048,ENOG502QPKK KOG1010,0.88127,6.775107,0.122434,0.022435,0.997989,6.211178,-0.045498,-0.058684,0.958449,...,0.106118,26.054951,0.832354,0.142634,0.003807,0.142634,ENOG502QPKK,KOG1010,1.0,1123
5199,ENOG502QPKK KOG1342,0.913148,9.151854,-0.027414,-0.022579,0.943018,7.738496,-0.02096,-0.066114,0.834587,...,0.076809,29.055032,0.84258,0.131878,0.003153,0.131878,ENOG502QPKK,KOG1342,1.0,1123
5872,ENOG502QPKK KOG2577,0.943066,7.732929,-0.037749,-0.041866,0.756016,6.088625,0.34017,0.378,0.883751,...,0.081174,25.225641,0.8677,0.111333,0.002678,0.111333,ENOG502QPKK,KOG2577,1.0,"[580, 1123]"
6269,ENOG502QPKK KOG3378,0.925078,7.029331,-0.013101,0.04033,0.916539,6.634619,-0.003357,-0.027489,0.968065,...,0.029063,27.44461,0.886788,0.024398,0.001057,0.024682,ENOG502QPKK,KOG3378,1.0,792


In [21]:
multigroup = 'ENOG502QPKK KOG2577'

In [23]:
final_fmat[final_fmat['ID'] == multigroup]

Unnamed: 0,ID,arath.iex_1.150p.braycurtis.feat,arath.iex_1.150p.euclidean.feat,arath.iex_1.150p.pearsonR.feat,arath.iex_1.150p.spearmanR.feat,arath.iex_2.150p.braycurtis.feat,arath.iex_2.150p.euclidean.feat,arath.iex_2.150p.pearsonR.feat,arath.iex_2.150p.spearmanR.feat,arath.iex_3.150p.braycurtis.feat,...,all.norm.150p.pearsonR.feat,all.norm.150p.euclidean.feat,all.norm.150p.braycurtis.feat,all.norm.150p.spearmanR_weighted.feat,all.norm.150p.covariance.feat,all.norm.150p.spearmanR.feat,ID1,ID2,label,group
5872,ENOG502QPKK KOG2577,0.943066,7.732929,-0.037749,-0.041866,0.756016,6.088625,0.34017,0.378,0.883751,...,0.081174,25.225641,0.8677,0.111333,0.002678,0.111333,ENOG502QPKK,KOG2577,1.0,580
5872,ENOG502QPKK KOG2577,0.943066,7.732929,-0.037749,-0.041866,0.756016,6.088625,0.34017,0.378,0.883751,...,0.081174,25.225641,0.8677,0.111333,0.002678,0.111333,ENOG502QPKK,KOG2577,1.0,1123


In [18]:
len(fmat[fmat['label'] == -1])

6613

In [19]:
for k, v in pos_ppi_dict.items():
    if k == 320:
        print(k, v)
        #if 'ENOG502QPKB' in v:
            #print(k, v)

320 [frozenset({'KOG4452', 'KOG2291'}), frozenset({'KOG4452', 'KOG2292'}), frozenset({'KOG1746', 'KOG4452'}), frozenset({'KOG4452', 'KOG3593'}), frozenset({'KOG2754', 'KOG4452'}), frozenset({'KOG4452', 'KOG2603'}), frozenset({'ENOG502SDSY', 'KOG4452'}), frozenset({'KOG4452', 'KOG3356'}), frozenset({'KOG2447', 'KOG4452'}), frozenset({'KOG2291', 'KOG2292'}), frozenset({'KOG1746', 'KOG2291'}), frozenset({'KOG2291', 'KOG3593'}), frozenset({'KOG2754', 'KOG2291'}), frozenset({'KOG2291', 'KOG2603'}), frozenset({'ENOG502SDSY', 'KOG2291'}), frozenset({'KOG2291', 'KOG3356'}), frozenset({'KOG2447', 'KOG2291'}), frozenset({'KOG1746', 'KOG2292'}), frozenset({'KOG2292', 'KOG3593'}), frozenset({'KOG2754', 'KOG2292'}), frozenset({'KOG2603', 'KOG2292'}), frozenset({'ENOG502SDSY', 'KOG2292'}), frozenset({'KOG2292', 'KOG3356'}), frozenset({'KOG2447', 'KOG2292'}), frozenset({'KOG1746', 'KOG3593'}), frozenset({'KOG1746', 'KOG2754'}), frozenset({'KOG1746', 'KOG2603'}), frozenset({'KOG1746', 'ENOG502SDSY'}),