In [38]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sbn
import networkx as nx 
import datetime as dt
import time
import random
import pickle as pkl
from scipy.sparse import csr_matrix

# reproducibility
random.seed(0)
np.random.seed(0)

In [3]:
fi = pd.read_csv('../data/raw/FIsInGene_020720_with_annotations.txt', sep='\t', header=0)
fi.head()

Unnamed: 0,Gene1,Gene2,Annotation,Direction,Score
0,16-5-5,CDC42,predicted,-,0.98
1,16-5-5,PARD3,predicted,-,1.0
2,16-5-5,PARD3B,predicted,-,1.0
3,A1CF,APOBEC1,catalyzed by; complex; input,<-,1.0
4,A1CF,EP300,expression regulated by,<-,1.0


In [4]:
with open('../../GENEID_MAPPING/data/processed/all2genesymbol.pkl', 'rb') as f: 
    all2symb = pkl.load(f)

In [5]:
edgetype = {
    'activated by; reaction'                                    :'activate',
    'activate; inhibited by; reaction'                          :'activate;inhibit',
    'activated by; complex; input; reaction'                    :'activate;complex',
    'PPrel: activation; complex; input; reaction'               :'activate;complex',
    'PPrel: inhibition, dephosphorylation'                      :'inhibit' ,
    'PPrel: activation; PPrel: activation, indirect effect'     :'activate;uncertain',
    'PPrel: binding/association; predicted'                     :'complex;uncertain',
    'PPrel: activation; catalyze; complex; input'               :'activate;catalyze;complex',
    'PPrel: activation; catalyze; catalyzed by; complex; input' :'activate;catalyze;complex',
    'GErel: expression; expression regulates'                   :'regulate',
    'catalyze; complex; input; reaction'                        :'catalyze;complex',
    'PPrel: activated by; PPrel: activated by, phosphorylation' :'activate',
    'catalyze; catalyzed by; complex; inhibit; input'           :'catalyze;complex;inhibit',
    'catalyze; catalyzed by; complex; inhibite; input'          :'catalyze;complex;inhibit',
    'PPrel: activated by; activated by'                         :'activate',
    'complex; inhibite; input'                                  :'complex;inhibit',
    'PPrel: activation; PPrel: activation, phosphorylation'     :'activate',
    'activated by; complex; inhibited by; input'                :'activate;complex;inhibit',
    'activate; activated by'                                    :'activate',
    'PPrel: activation; activate'                               :'activate',
    'PPrel: activation; PPrel: binding/association'             :'activate;complex',
    'PCrel: binding/association'                                :'complex',
    'activate; reaction'                                        :'activate',
    'complex; inhibit; input'                                   :'complex;inhibit',
    'PPrel: activated by, phosphorylation; catalyzed by'        :'activate;catalyze',
    'PCrel: inhibited by; PPrel; PPrel: activated by; PPrel: inhibited by':'inhibit;activate',
    'PPrel: activation; complex'                        :'activate;complex',
    'activated by; input'                               :'activate',
    'PPrel: inhibition, indirect effect'                :'inhibit',
    'catalyze; inhibited by'                            :'catalyze;inhibit',
    'PPrel: inhibited, indirect effect'                 :'inhibit',
    'PPrel: activated by, dephosphorylation'            :'activate',
    'catalyzed by; inhibited by'                        :'catalyze;inhibit',
    'PPrel: phosphorylated by'                          :'activate;inhibit;uncertain',
    'GErel: expression by; expression regulated by'     :'regulate',
    'PPrel: activated by; PPrel: activated by, phosphorylation; complex; input' :'activate;complex',
    'complex; inhibited by; input'                      :'complex;inhibit',
    'catalyze; catalyzed by; complex; input; reaction'  :'catalyze;complex',
    'GErel: expression, indirect effect'                :'regulate;uncertain',
    'PPrel'                                             :'uncertain',
    'catalyze; complex'                                 :'catalyze;complex',
    'PPrel: activated binding/association'              :'activate;complex',
    'activated by; catalyzed by'                        :'activate;catalyze',
    'activated by; inhibite'                            :'activate;inhibit',
    'PPrel: binding/association; complex'               :'complex',
    'activate; catalyze; complex; input'                :'activate;catalyze;complex',
    'activated by; inhibit'                             :'activate;inhibit',
    'catalyzed by; input'                               :'catalyze',
    'PPrel: indirect effect'                            :'uncertain',
    'activated by; catalyzed by; complex; input'        :'activate;catalyze;complex',
    'activate; catalyze; catalyzed by; complex; input'  :'activate;catalyze;complex', 
    'activated by; inhibited by'                        :'activate;inhibit',
    'PPrel: activation, phosphorylation'                :'activate',
    'PPrel: inhibition'                                 :'inhibit',
    'activate; catalyze'                                :'activate;catalyze',
    'complex; input; predicted'                         :'complex;uncertain',
    'PPrel: compound'                                   :'uncertain',
    'inhibit'                                           :'inhibit',
    'PPrel: activation, binding/association'            :'activate',
    'activate; input'                                   :'activate',
    'PPrel: inhibited by, dephosphorylation'            :'inhibit',
    'inhibite'                                          :'inhibit',
    'PPrel: activated, indirect effect'                 :'activate',
    'PPrel: binding/association; complex; input'        :'complex',
    'catalyze; input'                                   :'catalyze',
    'complex; reaction'                                 :'complex',
    'PPrel: activation, indirect effect'                :'activate',
    'activate; inhibited by'                            :'activate',
    'PPrel: inhibited by'                               :'inhibit',
    'PPrel: activated by, phosphorylation'              :'activate',
    'complex; predicted'                                :'complex;uncertain',
    'activate; complex; input'                          :'complex;activate',
    'activated by; complex; input'                      :'complex;activate',
    'PPrel: activation; complex; input'                 :'complex;activate',
    'inhibited by'                                      :'inhibit',
    'PPrel: inhibited by, phosphorylated by'            :'inhibit',
    'PPrel: activated by; complex; input'               :'complex;activate',
    'complex; input; reaction'                          :'complex;activate',
    'PPrel: inhibition, phosphorylation'                :'inhibit',
    'GErel: expression'                                 :'regulate',
    'GErel: expression by'                              :'regulate',
    'ECrel: compound'                                   :'uncertain',
    'PPrel: binding/association'                        :'complex',
    'reaction'                                          :'uncertain',
    'catalyze; complex; input'                          :'catalyze;complex',
    'catalyzed by; complex; input'                      :'catalyze;complex',
    'expression regulates'                              :'regulate',
    'PPrel: activated by'                               :'activate',
    'expression regulated by'                           :'regulate',
    'PPrel: activation'                                 :'activate',
    'catalyze; catalyzed by; input'                     :'catalyze',
    'activate'                                          :'activate',
    'activated by'                                      :'activate',
    'catalyze; catalyzed by; complex; input'            :'catalyze',
    'input'                                             :'uncertain',
    'catalyze'                                          :'catalyze',
    'catalyzed by'                                      :'catalyze',
    'predicted'                                         :'uncertain',
    'complex'                                           :'complex',
    'complex; input'                                    :'complex'
}

In [36]:
fi2     = fi.assign(edgetype = [edgetype[x] if x in edgetype else 'uncertain' for x in fi.Annotation.tolist()])
fi2     = fi2[lambda x: ~x.edgetype.isna()]
print(f'# of edges dropped in mapping: {fi.shape[0] - fi2.shape[0]} [{100*(fi.shape[0] - fi2.shape[0])/fi.shape[0]:.1f}%]')

max_prob = 0.95
fi2 = fi2.assign(catalyze   =   [max_prob if 'catalyze' in x else 0. for x in fi2.edgetype],
                 complex    =   [max_prob if 'complex' in x else 0. for x in fi2.edgetype], 
                 regulate   =   [max_prob if 'regulate' in x else 0. for x in fi2.edgetype], 
                 activate   =   [max_prob if 'activate' in x else 0. for x in fi2.edgetype], 
                 inhibit    =   [max_prob if 'inhibit' in x else 0. for x in fi2.edgetype])

# add uncertain edges 
for et in ['catalyze', 'complex', 'regulate', 'activate', 'inhibit']: 
    fi2[et] = fi2[et].values + fi2['edgetype'].str.contains('uncertain').values * 1.

# normalize probability
fi2[['catalyze', 'complex', 'regulate', 'activate', 'inhibit']] = fi2[['catalyze', 'complex', 'regulate', 'activate', 'inhibit']].values / fi2[['catalyze', 'complex', 'regulate', 'activate', 'inhibit']].values.sum(axis=1).reshape(-1,1)

fi2.head()

# of edges dropped in mapping: 0 [0.0%]


Unnamed: 0,Gene1,Gene2,Annotation,Direction,Score,edgetype,catalyze,complex,regulate,activate,inhibit
0,16-5-5,CDC42,predicted,-,0.98,uncertain,0.2,0.2,0.2,0.2,0.2
1,16-5-5,PARD3,predicted,-,1.0,uncertain,0.2,0.2,0.2,0.2,0.2
2,16-5-5,PARD3B,predicted,-,1.0,uncertain,0.2,0.2,0.2,0.2,0.2
3,A1CF,APOBEC1,catalyzed by; complex; input,<-,1.0,catalyze;complex,0.5,0.5,0.0,0.0,0.0
4,A1CF,EP300,expression regulated by,<-,1.0,regulate,0.0,0.0,1.0,0.0,0.0


In [7]:
fi2.shape

(268857, 11)

In [8]:
fi2.groupby('edgetype').count()['Gene1'].sort_values()


edgetype
activate;uncertain               86
activate;complex;inhibit        101
inhibit;activate                125
activate;inhibit;uncertain      159
catalyze;complex;inhibit        192
regulate;uncertain              197
catalyze;inhibit                288
complex;inhibit                 412
activate;catalyze               802
activate;complex                835
activate;inhibit               1034
activate;catalyze;complex      1183
complex;uncertain              1371
complex;activate               5858
catalyze;complex               7533
inhibit                        7793
regulate                      13398
activate                      27675
catalyze                      36581
uncertain                     79710
complex                       83524
Name: Gene1, dtype: int64

In [9]:
# how many of each edge type do we have (non-zero values for)? 

(fi2[['catalyze', 'complex', 'regulate', 'activate', 'inhibit']] > 0.).sum(axis=0)

catalyze    128102
complex     181161
regulate     94921
activate    119136
inhibit      91468
dtype: int64

In [10]:
# TODO: check for redundant edges, as is, will overwrite edges - probably should append

In [11]:
1e8 * 32 / 8 / 1000 / 1000

400.0

In [12]:
G = nx.Graph()

for i, row in fi2.iterrows(): 
    if row.Gene1.upper() in all2symb and row.Gene2.upper() in all2symb:
        G.add_edge(all2symb[row.Gene1.upper()][0], all2symb[row.Gene2.upper()][0], catalyze=row.catalyze, complex=row.complex, regulate=row.regulate, activate=row.activate, inhibit=row.inhibit)

# add self edges - DO NOT ADD SELF EDGES
#for node in G.nodes(): 
#    G.add_edge(node, node, catalyze=1., complex=1., regulate=1., activate=1., inhibit=1.)

print('# nodes in G', len(G))
print('# edges in G', len(G.edges()))

# nodes in G 13565
# edges in G 272060


In [13]:
# one large comp, rest are small (2-4 nodes)
largest_comp = next(nx.connected_components(G))
print('larget comp size:', len(largest_comp))

toremove = G.nodes()-largest_comp
G.remove_nodes_from(toremove)
print('size G:', len(G))

larget comp size: 13369
size G: 13369


In [14]:
PPI_nodelist = list(G.nodes())

catalyze_adj = nx.adjacency_matrix(G, nodelist=PPI_nodelist, weight='catalyze')
complex_adj  = nx.adjacency_matrix(G, nodelist=PPI_nodelist, weight='complex')
regulate_adj = nx.adjacency_matrix(G, nodelist=PPI_nodelist, weight='regulate')
activate_adj = nx.adjacency_matrix(G, nodelist=PPI_nodelist, weight='activate')
inhibit_adj  = nx.adjacency_matrix(G, nodelist=PPI_nodelist, weight='inhibit')

In [45]:
# remove edges that have probability of zero
new_graphs = []
for graph in [catalyze_adj, complex_adj, regulate_adj, activate_adj, inhibit_adj]: 
    coo = graph.tocoo() 
    sel = coo.data > 0
    new_row = coo.row[sel]
    new_col = coo.col[sel]
    new_data = coo.data[sel]
    new_graphs.append( csr_matrix((new_data, (new_row, new_col))) ) 

for graph in new_graphs: 
    print('# edges: len(graph.tocoo().row) -->', len(graph.tocoo().row) * 4 / 1000, 'MB')

# edges: len(graph.tocoo().row) --> 968.936 MB
# edges: len(graph.tocoo().row) --> 1396.016 MB
# edges: len(graph.tocoo().row) --> 724.144 MB
# edges: len(graph.tocoo().row) --> 911.208 MB
# edges: len(graph.tocoo().row) --> 697.128 MB


In [None]:
catalyze    128102
complex     181161
regulate     94921
activate    119136
inhibit      91468

In [15]:
PPI_nodelist[0:5]

['A1CF', 'APOBEC1', 'EP300', 'KHSRP', 'A2M']

In [27]:
len(PPI_nodelist)**2

178730161

In [25]:
catalyze_adj.indices.shape

(530109,)

In [23]:
complex_adj.todense()

matrix([[1. , 0.5, 0. , ..., 0. , 0. , 0. ],
        [0.5, 1. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 1. , ..., 0. , 0. , 0. ],
        ...,
        [0. , 0. , 0. , ..., 1. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 1. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 1. ]])

In [72]:
PPI_graph = {'nodelist'         : PPI_nodelist,                              # adjacency ordered gene list 
             'graphs'           : {
                    'catalyze'         : catalyze_adj,                           # adjacency matrices (with self edges)
                    'complex'          : complex_adj,                            # stored as scipy sparse matrices 
                    'regulate'         : regulate_adj,                           # edge values (probability of edge) 
                    'activate'         : activate_adj,                                  #                            
                    'inhibit'          : inhibit_adj,                                   #              
             },
             
             'genemap'          : {k:v[0] for k,v in all2symb.items()},          # all gene ids -> valid node labels 
             'meta'             : 'protein-protein interaction network; based on the reactome functional interactions (Wu et al.)',
             'creation_date'    :dt.datetime.now().__str__()}         

with open('../../PPI_graph.pkl', 'wb') as f: 
    pkl.dump(PPI_graph, f)