# How to calculate network distance

In [1]:
from multidimvis_main import *

In [None]:
################################################
#
# H U M A N 
#
################################################

organism = 'Human'

G = nx.read_edgelist('input/ppi_elist.txt',data=False)
# d_ent_sym, d_sym_ent = genent2sym()

d_gene_do = pickle.load( open( "input/d_gene_do.pkl", "rb" ) )
d_do_genes = pickle.load( open( "input/d_do_genes.pkl", "rb" ) )
d_do_names = pickle.load( open( "input/DO_names.pkl", "rb" ) )
d_names_do = {y:x for x,y in d_do_names.items()}
    
# Gene Symbols 
df_gene_sym = pd.read_csv('_output_csv/DF_gene_symbol_Human.csv', index_col=0)
sym = list(df_gene_sym['0'])
l_features = []
for i in sym:
    l_features.append(i[2:-2])
d_gene_sym = dict(zip(G.nodes(),l_features))


# ESSENTIALITY 
# get dataframe with ENSG-ID and essentiality state 
df_human_ess = pd.read_table('input/human_essentiality.txt', delim_whitespace=True)

# create dict with ENSG-ID:essentiality state 
ensg_id = list(set(df_human_ess['sciName']))
gene_ess = list(df_human_ess['locus'])
d_ensg_ess = dict(zip(ensg_id, gene_ess))
 
# match ENSG-ID with entrezID
# "engs_to_entrezid": entrezIDs were matched with "ensg_id.txt" via "DAVID Database" (https://david.ncifcrf.gov/conversion.jsp)
df_human_ensg_entrez = pd.read_table('input/ensg_to_entrezid.txt')# delim_whitespace=False)
df_human_ensg_entrez.dropna()

df = df_human_ensg_entrez
df['To'] = df['To'].fillna(0)
df['To'] = df['To'].astype(int)
df_human_ensg_entrez = df

# create dict with ENGS-ID: entrezID
ensgid = list(df_human_ensg_entrez['From']) #engs ID
entrezid = list(df_human_ensg_entrez['To']) #entrez ID 

# dict with engsid : entrezid
d_ensg_entrez = dict(zip(ensgid, entrezid))

# create dict with entrezID:essentiality state 
d_id_ess_unsorted = {}
for ens,ent in d_ensg_entrez.items():
    for en, ess in d_ensg_ess.items():
        if ens == en:
            d_id_ess_unsorted[str(ent)] = ess
            
            
# check if G.nodes match entrezID in dict and sort according to G.nodes 
d_gid_ess = {}
for k,v in d_id_ess_unsorted.items():
    if k in G.nodes():
        d_gid_ess[k]=v
        
# create dict with rest of G.nodes not in dict (entrezID:essentiality)
d_gid_rest = {}
for g in G.nodes():
    if g not in d_gid_ess.keys():
        d_gid_rest[g]='not defined'
        
#print(len(d_gid_rest)+len(d_gid_ess)) # this should match G.nodes count 

# merge both dicts
d_gid_ess_all_unsorted = {**d_gid_ess, **d_gid_rest}

# sort -> G.nodes()
d_gID_all = {key:d_gid_ess_all_unsorted[key] for key in G.nodes()}

essential_genes = []
non_ess_genes = []
notdefined_genes = [] 
for k,v in d_gID_all.items():
    if v == 'E':
        essential_genes.append(k)
    elif v == 'NE':
        non_ess_genes.append(k)
    else:
        notdefined_genes.append(k)
        
        
# Centrality features 
df_centralities = pd.read_csv('_output_csv/Features_centralities_Dataframe_'+organism+'.csv', index_col=0)

d_deghubs = dict(zip(G.nodes(), df_centralities['degs']))
d_clos = dict(zip(G.nodes(), df_centralities['clos']))
d_betw = dict(zip(G.nodes(), df_centralities['betw']))
d_eigen = dict(zip(G.nodes(), df_centralities['eigen']))

d_centralities = dict(zip(list(G.nodes),zip(d_deghubs.values(),d_clos.values(),d_betw.values(),d_eigen.values())))

cent_features = []
for i in d_centralities.items():
    k=list(i)
    cent_features.append(k)

In [2]:
################################################
#
# Y E A S T 
#
################################################

organism = 'Yeast'

data = pickle.load( open( "input/BIOGRID-ORGANISM-Saccharomyces_cerevisiae_S288c-3.5.185.mitab.pickle", "rb" ) )

filter_score = data[
                    #(data['Interaction Types'] == 'psi-mi:"MI:0915"(physical association)') +
                    (data['Interaction Types'] == 'psi-mi:"MI:0407"(direct interaction)') 
                    #&
                    #(data['Taxid Interactor A'] == "taxid:559292") & 
                    #(data['Taxid Interactor B'] == "taxid:559292") 
]

g = nx.from_pandas_edgelist(filter_score, '#ID Interactor A', 'ID Interactor B')
g.remove_edges_from(nx.selfloop_edges(g)) #remove self loop

G_cere = g.subgraph(max(nx.connected_components(g), key=len)) # largest connected component (lcc)
G = G_cere

posG_entrez = []
for k in G.nodes():
    posG_entrez.append(k[22:])
    
df_gID_sym = pd.read_csv('input/Yeast_geneID_sym.csv', index_col=0)
gene_sym = list(df_gID_sym['Sym'])
gene_id = list(df_gID_sym.index)
g_ID_sym = dict(list(zip(gene_id, gene_sym)))
#len(g_ID_sym)
    
l_features = []
for i in g_ID_sym.values():
    l_features.append(i)
    
# ESSENTIALITY 

cere_gene =pd.read_csv("input/Saccharomyces cerevisiae.csv",
           delimiter= ',',
           skipinitialspace=True)
 
cere_sym = list(cere_gene['symbols'])
cere_ess = list(cere_gene['essentiality status'])
cere_sym_essentiality = dict(zip(cere_sym, cere_ess))

d_cere_ess = {}
d_cere_noess = {}
d_cere_unknown = {}

for node,es in cere_sym_essentiality.items():
    if es == 'E':
        d_cere_ess[node]=es
    elif es == 'NE':
        d_cere_noess[node]=es
        
d_cere_alless = {}
for nid, sym in g_ID_sym.items():
    for sy,ess in cere_sym_essentiality.items():
        if sym == sy:
            d_cere_alless[nid] = ess
            
d_cere_unknown = {} 
for g in G.nodes():
    if g not in d_cere_alless.keys():
        d_cere_unknown[g]='status unkonwn'
    
d_geneID_ess = {**d_cere_unknown, **d_cere_alless}

d_gID_ess = {}
d_gID_noess = {}
d_gID_notdef = {}

for k,i in d_geneID_ess.items():
    if i == 'E':
        d_gID_ess[k] = i
    elif i == 'NE':
        d_gID_noess[k] = i
    else: 
        d_gID_notdef[k] = 'not defined'

d_gID_all_unsorted = {**d_gID_ess, **d_gID_noess, **d_gID_notdef}
d_gID_all = {key:d_gID_all_unsorted[key] for key in G.nodes()}

essential_genes = []
non_ess_genes = []
notdefined_genes = [] 
for k,v in d_gID_all.items():
    if v == 'E':
        essential_genes.append(k)
    elif v == 'NE':
        non_ess_genes.append(k)
    else:
        notdefined_genes.append(k)
        
# Centrality features 
df_centralities = pd.read_csv('_output_csv/Features_centralities_Dataframe_'+organism+'.csv', index_col=0)

d_deghubs = dict(zip(G.nodes(), df_centralities['degs']))
d_clos = dict(zip(G.nodes(), df_centralities['clos']))
d_betw = dict(zip(G.nodes(), df_centralities['betw']))
d_eigen = dict(zip(G.nodes(), df_centralities['eigen']))

d_centralities = dict(zip(list(G.nodes),zip(d_deghubs.values(),d_clos.values(),d_betw.values(),d_eigen.values())))

cent_features = []
for i in d_centralities.items():
    k=list(i)
    cent_features.append(k)

In [None]:
#################################################################
#
#    SAMPLE SUP PPI NETWORK
#
#################################################################

rand_set = rd.sample(G.nodes(),200)

G_sub = nx.subgraph(G,rand_set)

G_ = G_sub.subgraph(max(nx.connected_components(G_sub), key=len))  # extract lcc graph


print(G_.number_of_nodes())
print(G_.number_of_edges())

# nx.write_edgelist(G,'subPPI_4testing.txt')
G=G_

### Network Distance method 1

In [6]:
%%time

d_idx_entz = {}
cc = 0
for entz in sorted(G.nodes()):
    d_idx_entz[cc] = entz
    cc += 1

Mspl = np.zeros(len(list(G.nodes())))

for n1 in range(len(list(G.nodes()))):
    vec = []
    for n2 in range(len(list(G.nodes()))):
        geneA = d_idx_entz[n1]
        geneB = d_idx_entz[n2]
        try:
            spl = nx.shortest_path_length(G,geneA,geneB)
            vec.append(spl)
        except nx.NetworkXNoPath:
            print('no path')
        
    Mspl = np.vstack((Mspl,vec))
Mspl = np.delete(Mspl, (0), axis=0)
DM_spl = pd.DataFrame(Mspl, index = list(G.nodes()), columns = list(G.nodes()))

CPU times: user 1h 24min 18s, sys: 4min 4s, total: 1h 28min 22s
Wall time: 1h 28min 41s


In [None]:
# from dataframe get a dict with all pairs and values 

d_DM_spl = DM_spl.to_dict()

d_SPL_pairs = {}
for k,d in d_DM_spl.items():
    for n,v in d.items():
        d_SPL_pairs[k,n]=v

In [7]:
DM_spl.to_csv(r'_output_csv/SPL_Dataframe_Yeast.csv', index = True)

### Network Distance method 2

In [383]:
%%time 

# NETWORK DISTANCE

dist_network2D = {}
for a in nx.shortest_path_length(G):
    for n,spl in a[1].items():
        dist_network2D[(a[0],n)] = spl

CPU times: user 3.29 ms, sys: 26 µs, total: 3.32 ms
Wall time: 3.39 ms


In [386]:
len(dist_network2D)

81

### Network distance precalculated

In [327]:
DM_spl_prec = pd.read_csv('_output_csv/SPL_Dataframe_'+organism+'.csv', index_col=0)
DM_spl_prec.index = list(G.nodes())
DM_spl_prec.columns = list(G.nodes())

In [328]:
DM_spl_prec

Unnamed: 0,66008,8473,2561,3759,22906,4928,1994,8481,81610,51361,...,10838,8001,51351,1551,51458,143903,10861,51471,221044,29965
66008,0.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,4.0,...,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0
8473,3.0,0.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,4.0,...,3.0,3.0,3.0,1.0,3.0,2.0,4.0,3.0,3.0,2.0
2561,3.0,3.0,0.0,3.0,3.0,3.0,2.0,4.0,3.0,4.0,...,2.0,3.0,3.0,2.0,3.0,2.0,3.0,3.0,3.0,3.0
3759,3.0,2.0,3.0,0.0,3.0,2.0,3.0,3.0,3.0,4.0,...,2.0,3.0,2.0,2.0,3.0,1.0,3.0,2.0,3.0,2.0
22906,3.0,3.0,3.0,3.0,0.0,3.0,3.0,4.0,3.0,4.0,...,2.0,3.0,3.0,2.0,3.0,2.0,3.0,3.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143903,3.0,2.0,2.0,1.0,2.0,2.0,2.0,3.0,2.0,4.0,...,2.0,2.0,2.0,2.0,3.0,0.0,3.0,3.0,2.0,2.0
10861,3.0,4.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,5.0,...,3.0,4.0,3.0,3.0,4.0,3.0,0.0,4.0,3.0,3.0
51471,3.0,3.0,3.0,2.0,3.0,3.0,3.0,4.0,3.0,5.0,...,2.0,3.0,3.0,3.0,3.0,3.0,4.0,0.0,3.0,3.0
221044,3.0,3.0,3.0,3.0,3.0,2.0,2.0,4.0,3.0,4.0,...,2.0,3.0,2.0,2.0,3.0,2.0,3.0,3.0,0.0,3.0


In [388]:
# from dataframe get a dict with all pairs and values 

d_DM_spl = DM_spl_prec.to_dict()

d_SPL_pairs = {}
for k,d in d_DM_spl.items():
    for n,v in d.items():
        d_SPL_pairs[k,n]=v