In [1]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt 
import networkx as nx
import pickle as pkl
import datetime as dt

In [2]:
# Load the three tab-separated raw inputs in one pass: the graph's edge and
# node tables, followed by the LINCS compound table.
edges, nodes, lincs_cmp = (
    pd.read_csv(path, sep='\t')
    for path in (
        '../data/raw/edges.txt',
        '../data/raw/nodes.txt',
        './../../LINCS/LVL3/data/raw/compoundinfo_beta.txt',
    )
)

In [3]:
# Lower-cased node names, built once and matched against both LINCS name columns.
node_names = set(nodes.Name.str.lower().unique())

# NOTE(review): the LINCS columns are compared as-is (not lower-cased), while the
# node names are lower-cased — confirm this asymmetry is intended.
overlap1 = set(lincs_cmp.cmap_name.unique()) & node_names
overlap2 = set(lincs_cmp.compound_aliases.unique()) & node_names

# A node counts as overlapping if it matches either the cmap name or the alias column.
overlap = overlap1 | overlap2
len(overlap)

1274

In [4]:
# Add `name`: a lower-cased copy of the `Name` column.
nodes = nodes.assign(name=nodes.Name.str.lower())

# Lookup table from node `idx` to its lower-cased name.
name_by_idx = nodes.set_index('idx')['name']
namedict = name_by_idx.to_dict()

# Translate both edge endpoints (`Var1`, `Var2`) into node names via the lookup.
edges = edges.assign(
    edge_from=edges.Var1.map(namedict),
    edge_to=edges.Var2.map(namedict),
)

In [5]:
# Quantile used as the cutoff: only edges whose `value` is at or below the
# q-th quantile are kept (these are treated as the most similar edges).
q = 0.05

# Restrict to edges whose two endpoints are both in the LINCS overlap.
both_in_overlap = edges.edge_from.isin(overlap) & edges.edge_to.isin(overlap)
edges2 = edges[both_in_overlap]

# The cutoff is computed on the overlap-restricted edges, then applied to them.
value_cutoff = edges2['value'].quantile(q)
edges2 = edges2[edges2['value'] <= value_cutoff]

# Edge probability is the complement of `value`.
edges2 = edges2.assign(edge_prob=1 - edges2['value'])

In [6]:
def _print_components(graph):
    """Print how many connected components `graph` has and the size of each.

    Returns the components as a list of node sets, in the order networkx
    yields them (which is not guaranteed to be sorted by size).
    """
    components = list(nx.connected_components(graph))
    print('# components: ', len(components))
    for i, component in enumerate(components):
        print(f'\t comp: {i} : {len(component)}')
    return components


# Undirected graph over drug names; each edge carries its `edge_prob`.
G = nx.from_pandas_edgelist(edges2, source='edge_from', target='edge_to', edge_attr='edge_prob')

# add self edges (edge_prob of 1 on every node's self-loop)
for node in list(G.nodes()):
    G.add_edge(node, node, edge_prob=1)

print('# nodes:', len(G))
print('# edges:', len(G.edges()))
# NOTE: the self-loops added above are counted in the edge total used by nx.density.
print('network density:', nx.density(G))

comps = _print_components(G)

# select just the largest component
# connected_components does not order its output by size, so pick the largest
# explicitly instead of assuming it is the first one returned.
largest_comp = max(comps, key=len, default=set())
G.remove_nodes_from(set(G.nodes()) - largest_comp)

print('\n after filter')
comps = _print_components(G)


# nodes: 1225
# edges: 48069
network density: 0.06411764705882353
# components:  2
	 comp: 0 : 1224
	 comp: 1 : 1

 after filter
# components:  1
	 comp: 0 : 1224


In [7]:
# Freeze the node order once so the adjacency rows/columns line up with `drugs`.
drugs = [node for node in G.nodes()]
drugdrug_adj = nx.adjacency_matrix(G, weight='edge_prob', nodelist=drugs)

In [8]:
# Show the dense form of the adjacency matrix as a quick visual sanity check
# (the self edges added earlier appear as 1s on the diagonal).
drugdrug_adj.todense()

matrix([[1.        , 0.49871469, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.49871469, 1.        , 0.58027875, ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.58027875, 1.        , ..., 0.        , 0.50139972,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.50139972, ..., 0.        , 1.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         1.        ]])

In [9]:
# Bundle the adjacency matrix with the information needed to interpret it.
DDS_graph = {
    # drug node list, in the same order as the adjacency rows/columns
    'nodelist': drugs,
    # adjacency matrices (with self edges); entries are the `edge_prob` edge attribute
    'graphs': {
        'structural': drugdrug_adj,
    },
    'meta': 'drug-drug similarity graph based on structural similarity as calculated by Sirci et al. for the CHEMANTRA network.',
    # timestamp of when this bundle was built, stored as a string
    'creation_date': str(dt.datetime.now()),
}

# Persist the bundle as a pickle.
with open('../../DDS_graph.pkl', 'wb') as out_file:
    pkl.dump(DDS_graph, out_file)