# HUMAN PPI 


### PIPELINE OVERVIEW
+ build a graph with genes as nodes and their interactions as edges
+ generate a matrix from the nodes based on specific parameters (features, random walk, shortest path, ...)
+ embed the multidimensional matrices into 2D or 3D with t-SNE (or UMAP)
----------
+ Visualization typologies:
+ 2D PORTRAIT
+ 3D PORTRAIT
+ 3D LANDSCAPE
+ 3D SPHERE
----------

In [1]:
from multidimvis_main import *

In [2]:
organism = 'Human'

In [3]:
# Load the PPI network from an edge list, ignoring any edge attributes.
G = nx.read_edgelist('input/ppi_elist.txt', data=False)
# d_ent_sym, d_sym_ent = genent2sym()

# Disease-ontology lookup tables. Each file is opened in a `with` block so the
# handle is closed again (the previous bare open() calls leaked it).
# NOTE: pickle executes arbitrary code on load - only use trusted local files.
with open("input/d_gene_do.pkl", "rb") as fh:
    d_gene_do = pickle.load(fh)
with open("input/d_do_genes.pkl", "rb") as fh:
    d_do_genes = pickle.load(fh)
with open("input/DO_names.pkl", "rb") as fh:
    d_do_names = pickle.load(fh)

# inverse lookup of d_do_names (values become keys)
d_names_do = {y: x for x, y in d_do_names.items()}

### PPI - DISEASE SUBGRAPH 

In [None]:
'''# SPECIFY SUB CATEGORY OF DISEASE

spec = 'DOID:5093'
# DOID:707 B-Cell Lymphoma (71 genes)
# DOID:9252 amino acids metabolic disorder (130 genes)
# DOID:14320 generalized anxiety disorder (112 genes)


spec_nodes = []
for k,v in d_do_genes.items():
    if spec == k:
        spec_nodes = v

set_spec_nodes = set(spec_nodes)


# SPECIFIED GRAPH 
G_sub = G_ppi.subgraph(set_spec_nodes)
G_sub_lcc = G_sub.subgraph(max(nx.connected_components(G_sub), key=len)) # largest connected component (lcc)
print('The lcc of the %s-sub PPI network contains %s genes.' %(disease_category,G_sub_lcc.number_of_nodes()))

G = G_sub_lcc

# Check which subcategory was chosen:
for k,v in sub_categories.items():
    if spec == k[1]:
        print(k,v)


G_sub = G_ppi.subgraph(set_disease_genes)
G = G_sub.subgraph(max(nx.connected_components(G_sub), key=len)) # largest connected component (lcc)
'''

### PPI RANDOM SUBGRAPH

In [None]:
'''PPI = nx.read_edgelist('input/ppi_elist.txt',data=False)
# d_ent_sym, d_sym_ent = genent2sym()

# RANDOM SAMPLE 

N = 1000
rand_set = rd.sample(PPI.nodes(), N)

PPI_sub = nx.subgraph(PPI, rand_set)
G = PPI_sub.subgraph(max(nx.connected_components(PPI_sub), key=len))  # extract lcc graph'''

# GRAPH PROPERTIES

In [4]:
# Basic size statistics of the network.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
print('Number of nodes i.e. genes: %s' % n_nodes)
print('Number of edges: %s' % n_edges)

# Density of an undirected graph in percent: 2E / (N * (N - 1)) * 100.
# The previous expression divided by (N*N - 1) because of operator precedence.
density = 200. * n_edges / (n_nodes * (n_nodes - 1)) if n_nodes > 1 else 0.
print('Network density: %.1f%%' % density)

Number of nodes i.e. genes: 16376
Number of edges: 309355
Network density: 0.2%


#### Degree Distribution 

In [None]:
# Degree of every node, read directly from the graph. (`degs` is rebound to a
# plain list in the centrality-import cell, so `degs.values()` is not reliable.)
l_k = [k for _, k in G.degree()]
set_k = set(l_k)
print('Degree min:', min(l_k))
print('Degree max:', max(l_k))

# degree -> number of nodes with that degree, counted in a single pass
uniq_k, counts_k = np.unique(l_k, return_counts=True)
d_k_f = dict(zip(uniq_k.tolist(), counts_k.tolist()))
#print('Degree counts:',d_k_f)

mean_deg = np.mean(l_k)
var_deg = np.var(l_k)
print('Degree Mean:', mean_deg)
print('Degree Variance:', var_deg)

# P(k): fraction of nodes having degree k, so the y-axis matches its label
p_k = [f / len(l_k) for f in d_k_f.values()]

plt.figure(figsize=(10,10))
plt.title('Degree Distribution', fontsize = 20)
plt.xlabel('Degree', fontsize = 14)
plt.ylabel('P(k)', fontsize = 14)
plt.loglog(list(d_k_f.keys()), p_k, 'o', c='#008CA0')
# saving has to happen before plt.show(), otherwise the written figure is empty
#plt.savefig('PPIdegreedist.png')
plt.show()

# FEATURE COLLECTIONS

### + STRUCTURAL : Centrality measures

##### based on: https://networkx.github.io/documentation/stable/reference/algorithms/centrality.html
+ degree centrality
+ closeness centrality
+ betweenness centrality
+ eigenvector centrality

In [None]:
'''%%time 

# DEGREE CENTRALITY - Node degree: important nodes being involved within high number of interactions
degs = dict(G.degree())
d_deghubs = {}
for node, de in sorted(degs.items(),key = lambda x: x[1], reverse = 1):
    d_deghubs[node] = round(float(de/max(degs.values())),4)

# CLOSENESS CENTRALITY - Measures how closely a node is connected to all other nodes to highlight f.ex. core-periphery structure, or identify central nodes
closeness = nx.closeness_centrality(G)
d_clos = {}
for node, cl in sorted(closeness.items(), key = lambda x: x[1], reverse = 1):
    d_clos[node] = round(cl,4)
    
# BETWEENESS CENTRALITY - How many shortest paths between pairs of other nodes in the network go through one node. High BC indicates "bottleneck nodes" in the network
betweens = nx.betweenness_centrality(G)
d_betw = {}
for node, be in sorted(betweens.items(), key = lambda x: x[1], reverse = 1):
     d_betw[node] = round(be,4)

# EIGENVECTOR CENTRALITY - Compute the eigenvector centrality for the graph
eigen = nx.eigenvector_centrality(G)
d_eigen = {}
for node, eig in sorted(eigen.items(), key = lambda x: x[1], reverse = 1):
     d_eigen[node] = round(eig,4)
    
    
d_deghubs_sorted = {key:d_deghubs[key] for key in sorted(d_deghubs.keys())}
d_clos_sorted = {key:d_clos[key] for key in sorted(d_clos.keys())}
d_betw_sorted = {key:d_betw[key] for key in sorted(d_betw.keys())}
d_eigen_sorted = {key:d_eigen[key] for key in sorted(d_eigen.keys())}

# feature collection
feature_dict = dict(zip(d_deghubs_sorted.keys(), zip(
                                                     d_deghubs_sorted.values(), 
                                                     d_clos_sorted.values(), 
                                                     d_betw_sorted.values(), 
                                                     d_eigen_sorted.values(),
                                                    )))

# IMPORTANT :
# sort all feature according to Graph node IDs
feature_dict_sorted = {key:feature_dict[key] for key in G.nodes()}
feature_df = pd.DataFrame.from_dict(feature_dict_sorted, orient = 'index', columns = ['degs', 
                                                                                      'clos', 
                                                                                      'betw', 
                                                                                      'eigen',
                                                                                      ]) 


l_features = [] 
for i in feature_dict_sorted.items():
    k = list(i)
    l_features.append(k)
    
feature_df.to_csv(r'output_csv/Features_centralities_Dataframe_'+organism+'.csv', index = True)
'''

In [5]:
# precalculated import

# Read the precalculated centrality table (one row per node, one column per measure).
df_centralities = pd.read_csv('output_csv/Features_centralities_Dataframe_'+organism+'.csv', index_col=0)


# one plain list per centrality measure
degs = list(df_centralities['degs'])
clos = list(df_centralities['clos'])
betw = list(df_centralities['betw'])
eigen = list(df_centralities['eigen'])

# node -> value lookups; rows are paired with G.nodes() purely by position
d_deghubs = dict(zip(G.nodes(), degs))
d_clos = dict(zip(G.nodes(), clos))
d_betw = dict(zip(G.nodes(), betw))
d_eigen = dict(zip(G.nodes(), eigen))

# node -> (degs, clos, betw, eigen) and the same data as [node, tuple] pairs
d_centralities = dict(zip(list(G.nodes), zip(degs, clos, betw, eigen)))
l_features = [list(entry) for entry in d_centralities.items()]

In [6]:
DM_centralities = pd.DataFrame(distance.squareform(distance.pdist(df_centralities, 'euclidean')))
# DM_centralities.to_csv(r'output_csv/DistanceMatrix_centralities_Dataframe_'+organism+'.csv', index = True)
# DM_centralities = pd.read_csv('output_csv/DistanceMatrix_centralities_Dataframe_'+organism+'.csv', index_col=0)

### + FUNCTIONAL : Molecular Function

+ Molecular Function - Feature Matrix 

In [None]:
'''df_MF_raw = pd.read_csv('input/GO_MF.csv')
df_MF_mod = df_MF_raw.drop(['evidence'],axis=1)
d_MF_gene_goids = df_MF_mod.groupby('entrezid')['go_id'].apply(list).to_dict()
d_MF_goid_genes = df_MF_mod.groupby('go_id')['entrezid'].apply(list).to_dict()

genes_MF = list(d_MF_gene_goids.keys())
genes_goids_MF = list(d_MF_gene_goids.values())

# Get a list with all available Disease Ontology IDs
all_goids_MF = list(d_MF_goid_genes.keys())

# Get a list of all matches = search for matches of GO ID list and gene list 
matches_MF = []
for i in genes_goids_MF:
    gene_i = []
    for j in i:
        for idx,value in enumerate(all_goids_MF):
            if j == value:
                gene_i.append(idx)
    matches_MF.append(gene_i)
    
all_match_MF = []
for sub in matches_MF:
    gene_match = [0] * len(all_goids_MF)
    for elem in sub:
        for idx,value in enumerate(gene_match):
            if elem == idx:
                gene_match[idx] = 1
    all_match_MF.append(gene_match)   
    
matrix_MF = np.row_stack(all_match_MF)
df_MF = pd.DataFrame(matrix_MF, columns = all_goids_MF, index = genes_MF, dtype = int)
df_MF.to_csv(r'output_csv/Features_GO_MolFunc_Dataframe_'+organism+'.csv', index = True)'''

In [None]:
# df_MF is only assigned inside the disabled (string-literal) cell above, so load
# the feature table that cell exports instead of relying on a stale variable.
df_MF = pd.read_csv('output_csv/Features_GO_MolFunc_Dataframe_'+organism+'.csv', index_col=0)
MF_array = df_MF.to_numpy()

# pairwise Euclidean distances between the rows of the feature matrix, as a square table
DM_MF = pd.DataFrame(distance.squareform(distance.pdist(MF_array, 'euclidean')))
DM_MF.to_csv(r'output_csv/DistanceMatrix_goMF_Dataframe_'+organism+'.csv', index = True)

In [None]:
# import Distance Matrix of Go term : Molecular Functions  

DM_MF = pd.read_csv('output_csv/DistanceMatrix_goMF_Dataframe_'+organism+'.csv', index_col=0)

### + FUNCTIONAL : Cellular Components

In [None]:
'''df_CC_raw = pd.read_csv('input/GO_CC.csv')

df_CC_mod = df_CC_raw.drop(['evidence'],axis=1)
d_CC_gene_goids = df_CC_mod.groupby('entrezid')['go_id'].apply(list).to_dict()
d_CC_goid_genes = df_CC_mod.groupby('go_id')['entrezid'].apply(list).to_dict()

genes_CC = list(d_CC_gene_goids.keys())
genes_goids_CC = list(d_CC_gene_goids.values())

# Get a list with all available Disease Ontology IDs
all_goids_CC = list(d_CC_goid_genes.keys())

# Get a list of all matches = search for matches of GO ID list and gene list 
matches_CC = []
for i in genes_goids_CC:
    gene_i = []
    for j in i:
        for idx,value in enumerate(all_goids_CC):
            if j == value:
                gene_i.append(idx)
    matches_CC.append(gene_i)
    
all_match_CC = []
for sub in matches_CC:
    gene_match = [0] * len(all_goids_CC)
    for elem in sub:
        for idx,value in enumerate(gene_match):
            if elem == idx:
                gene_match[idx] = 1
    all_match_CC.append(gene_match)   
    
matrix_CC = np.row_stack(all_match_CC)

df_CC = pd.DataFrame(matrix_CC, columns = all_goids_CC, index = genes_CC, dtype = int)
df_CC.to_csv(r'output_csv/Features_GO_CellComp_Dataframe_'+organism+'.csv', index = True)'''

In [None]:
# df_CC is only assigned inside the disabled (string-literal) cell above, so load
# the feature table that cell exports instead of relying on a stale variable.
df_CC = pd.read_csv('output_csv/Features_GO_CellComp_Dataframe_'+organism+'.csv', index_col=0)
CC_array = df_CC.to_numpy()

# pairwise Euclidean distances between the rows of the feature matrix, as a square table
DM_CC = pd.DataFrame(distance.squareform(distance.pdist(CC_array, 'euclidean')))
# DM_CC.to_csv(r'output_csv/DistanceMatrix_goCC_Dataframe_'+organism+'.csv', index = True)


# import Distance Matrix of Go term : Cellular Components 

# DM_CC = pd.read_csv('output_csv/DistanceMatrix_goCC_Dataframe_'+organism+'.csv', index_col=0)

### + FUNCTIONAL : Biological Processes

In [None]:
'''df_BP_raw = pd.read_csv('input/GO_BP.csv')

df_BP_mod = df_BP_raw.drop(['evidence'],axis=1)
d_BP_gene_goids = df_BP_mod.groupby('entrezid')['go_id'].apply(list).to_dict()
d_BP_goid_genes = df_BP_mod.groupby('go_id')['entrezid'].apply(list).to_dict()

genes_BP = list(d_BP_gene_goids.keys())
genes_goids_BP = list(d_BP_gene_goids.values())

# Get a list with all available Disease Ontology IDs
all_goids_BP = list(d_BP_goid_genes.keys())

# Get a list of all matches = search for matches of GO ID list and gene list 
matches_BP = []
for i in genes_goids_BP:
    gene_i = []
    for j in i:
        for idx,value in enumerate(all_goids_BP):
            if j == value:
                gene_i.append(idx)
    matches_BP.append(gene_i)
    
all_match_BP = []
for sub in matches_BP:
    gene_match = [0] * len(all_goids_BP)
    for elem in sub:
        for idx,value in enumerate(gene_match):
            if elem == idx:
                gene_match[idx] = 1
    all_match_BP.append(gene_match)   
    
matrix_BP = np.row_stack(all_match_BP)
df_BP = pd.DataFrame(matrix_BP, columns = all_goids_BP, index = genes_BP, dtype = int)'''

In [None]:
# NOTE(review): df_BP is only assigned inside the disabled (string-literal) cell
# above, and that cell does not export a feature CSV - df_BP has to be built
# before this cell can run.
BP_array = df_BP.to_numpy()
# pairwise Euclidean distances between the rows of the feature matrix, as a square table
DM_BP = pd.DataFrame(distance.squareform(distance.pdist(BP_array, 'euclidean')))
DM_BP.to_csv(r'output_csv/DistanceMatrix_goBP_Dataframe_'+organism+'.csv', index = True)

# import Distance Matrix of GO term: Biological Processes

# DM_BP = pd.read_csv('output_csv/DistanceMatrix_goBP_Dataframe_'+organism+'.csv', index_col=0)

### + FUNCTIONAL : Disease Annotation

In [None]:
'''all_doids = list(d_do_genes.keys())

genes_do = list(d_gene_do.keys())
genes_doids = d_gene_do.values()

# Get a list of all matches = search for matches of Disease ID list and gene list 
matches_DO = []
for i in genes_doids:
    gene_i = []
    for j in i:
        for idx,value in enumerate(all_doids):
            if j == value:
                gene_i.append(idx)
    matches_DO.append(gene_i)
    
all_match_DO = []
for sub in matches_DO:
    gene_match = [0] * len(all_doids)
    for elem in sub:
        for idx,value in enumerate(gene_match):
            if elem == idx:
                gene_match[idx] = 1
    all_match_DO.append(gene_match)   

matrix_DIS = np.row_stack(all_match_DO)

df_DIS = pd.DataFrame(matrix_DIS, columns = all_doids, index = genes_do, dtype = int)'''

In [7]:
df_DIS = pd.read_csv('output_csv/Features_Disease_Dataframe_'+organism+'.csv', index_col=0)
DIS_array = df_DIS.to_numpy()
DM_disease = pd.DataFrame(distance.squareform(distance.pdist(DIS_array, 'euclidean')))
# DM_disease.to_csv(r'output_csv/DistanceMatrix_Disease_Dataframe_'+organism+'.csv', index = True)

# import Distance Matrix of Disease Annotations

# DM_disease = pd.read_csv('output_csv/DistanceMatrix_Disease_Dataframe_'+organism+'.csv', index_col=0)

In [8]:
genes_dis = list(df_DIS.index)

### GRAPH MATRICES

In [13]:
%%time

A = nx.adjacency_matrix(G)
DM_adj = A.toarray()

CPU times: user 1.3 s, sys: 393 ms, total: 1.69 s
Wall time: 1.69 s


In [None]:
%%time

# Shortest-path-length matrix of G.
#
# Rows and columns follow G.nodes() order, like the adjacency matrix and the
# labels applied when the matrices are exported. (The previous version indexed
# by *sorted* node ids, so labels and values did not line up.)
l_nodes = list(G.nodes())
d_idx_entz = dict(enumerate(l_nodes))                  # matrix index -> node id
d_entz_idx = {entz: idx for idx, entz in d_idx_entz.items()}

# Start from NaN so that unreachable pairs stay marked as missing instead of
# silently shortening a row (which broke the stacking before).
Mspl = np.full((len(l_nodes), len(l_nodes)), np.nan)

# One breadth-first search per source node instead of one search per node pair.
for geneA, d_lengths in nx.all_pairs_shortest_path_length(G):
    row = d_entz_idx[geneA]
    for geneB, spl in d_lengths.items():
        Mspl[row, d_entz_idx[geneB]] = spl

DM_spl = Mspl

In [14]:
%%time

# Restart probability
# NOTE(review): the name `r` is reassigned later in the notebook (circle radius
# in the rest-node layout cell); re-run this cell before recomputing the walk.
r = .8 # originally 0.8

alpha = 1.0 # indicating "randomness" 
# random-walk matrix from the adjacency matrix A (helper defined outside this notebook)
DM_m = rnd_walk_matrix2(A, r, alpha, len(G.nodes()))

DM_m_df = pd.DataFrame(DM_m)
# DM_m_df.sum(axis=0)
# transpose and label the rows with the node ids -> used as a feature matrix
DM_m_transposed = DM_m_df.T
DM_m_transposed.index = list(G.nodes())

metric = "correlation" # "cosine" 
# pairwise distances between the rows of the transposed walk matrix, as a square table
DM_m_new = pd.DataFrame(distance.squareform(distance.pdist(DM_m_transposed, metric)))
# from here on DM_m is the *distance* matrix, no longer the raw walk matrix
DM_m = DM_m_new

#min_log = lambda t: -np.log(t)
#DM_mlog = np.array([min_log(x/max(x)) for x in DM_m])

CPU times: user 8min 53s, sys: 15.7 s, total: 9min 8s
Wall time: 2min 49s


In [31]:
DM_m_transposed # treat as feature matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16366,16367,16368,16369,16370,16371,16372,16373,16374,16375
66008,8.008658e-01,4.520070e-03,4.455932e-03,4.456926e-03,4.537024e-03,4.512701e-03,0.004506,4.491262e-03,4.454142e-03,4.454571e-03,...,1.504981e-09,5.508334e-09,6.287790e-09,5.471668e-10,2.775942e-09,6.287790e-09,6.287790e-09,1.908271e-09,1.024123e-08,1.346178e-08
8473,1.232746e-03,8.008761e-01,6.891988e-06,6.899931e-06,1.223150e-03,3.077520e-05,0.000122,2.179722e-05,8.638112e-06,7.172424e-06,...,2.302025e-09,6.326237e-09,1.736803e-09,8.766784e-10,2.348336e-09,1.736803e-09,1.736803e-09,9.241806e-09,2.198002e-09,3.031738e-08
2561,2.291622e-02,1.299632e-04,8.013103e-01,1.296763e-04,1.299687e-04,2.358130e-04,0.000142,1.303170e-04,1.275204e-04,1.275638e-04,...,5.986801e-10,4.734484e-10,8.979762e-07,1.762871e-10,6.156596e-10,8.979762e-07,8.979762e-07,2.461029e-09,1.429076e-09,3.435240e-09
3759,1.146067e-02,6.505649e-05,6.483813e-05,8.013944e-01,6.503713e-05,9.594821e-05,0.000158,6.647058e-05,6.389786e-05,6.393234e-05,...,9.006554e-10,2.954594e-07,1.111945e-09,1.274212e-10,8.306815e-09,1.111945e-09,1.111945e-09,7.633852e-10,1.651495e-09,2.865356e-09
22906,9.607815e-03,9.497399e-03,5.351651e-05,5.355999e-05,8.004302e-01,9.492829e-03,0.000104,5.708623e-05,5.369236e-05,5.644224e-05,...,1.185778e-09,2.266814e-08,1.491328e-09,2.082298e-10,2.944247e-09,1.491328e-09,1.491328e-09,7.595546e-09,7.574344e-09,1.993389e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143903,2.263604e-07,2.292580e-07,6.285833e-06,1.556723e-08,2.535257e-08,1.003349e-06,0.000032,7.171347e-07,6.942407e-08,7.203535e-08,...,1.947552e-10,4.032026e-10,2.498103e-04,8.580999e-11,1.726974e-09,8.002498e-01,2.498103e-04,1.472123e-08,1.416548e-08,5.030356e-09
10861,2.263604e-07,2.292580e-07,6.285833e-06,1.556723e-08,2.535257e-08,1.003349e-06,0.000032,7.171347e-07,6.942407e-08,7.203535e-08,...,1.947552e-10,4.032026e-10,2.498103e-04,8.580999e-11,1.726974e-09,2.498103e-04,8.002498e-01,1.472123e-08,1.416548e-08,5.030356e-09
51471,6.869777e-08,1.219918e-06,1.722720e-08,1.068739e-08,1.291243e-07,1.425650e-06,0.000199,1.306458e-06,5.204928e-08,1.754490e-07,...,3.980639e-09,1.633388e-09,1.472123e-08,1.513449e-10,4.037683e-09,1.472123e-08,1.472123e-08,8.001841e-01,4.076528e-10,2.713402e-09
221044,3.686844e-07,2.901363e-07,1.000353e-08,2.312094e-08,1.287638e-07,4.822233e-07,0.000021,2.428146e-06,4.180987e-08,9.072940e-07,...,3.545973e-10,2.821125e-10,1.416548e-08,2.620196e-10,3.761740e-09,1.416548e-08,1.416548e-08,4.076528e-10,8.001080e-01,2.159148e-08


In [32]:
metric = "correlation" # "cosine" 
DM_m_new = pd.DataFrame(distance.squareform(distance.pdist(DM_m_transposed, metric)))

In [33]:
DM_m_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16366,16367,16368,16369,16370,16371,16372,16373,16374,16375
0,0.000000,0.992881,0.965984,0.980232,0.982024,0.993420,0.994350,0.993694,0.986777,0.988068,...,1.000092,1.000091,1.000091,1.000092,1.000092,1.000091,1.000091,1.000091,1.000087,1.000090
1,0.992881,0.000000,0.999878,0.999981,0.986667,1.000007,0.999921,1.000041,0.999988,1.000033,...,1.000091,1.000088,1.000092,1.000092,1.000090,1.000092,1.000092,1.000085,1.000091,1.000085
2,0.965984,0.999878,0.000000,0.999429,0.999509,0.999728,0.999912,0.999904,0.999670,0.999713,...,1.000093,1.000093,0.999932,1.000093,1.000093,0.999932,0.999932,1.000092,1.000092,1.000093
3,0.980232,0.999981,0.999429,0.000000,0.999766,0.999934,0.999887,0.999994,0.999858,0.999885,...,1.000093,1.000051,1.000093,1.000093,1.000092,1.000093,1.000093,1.000093,1.000092,1.000093
4,0.982024,0.986667,0.999509,0.999766,0.000000,0.987178,0.999949,1.000002,0.999880,0.999826,...,1.000092,1.000068,1.000092,1.000093,1.000092,1.000092,1.000092,1.000085,1.000088,1.000087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16371,1.000091,1.000092,0.999932,1.000093,1.000092,1.000090,1.000050,1.000091,1.000092,1.000092,...,1.000092,1.000091,0.960515,1.000091,1.000091,0.000000,0.960515,1.000086,1.000084,1.000091
16372,1.000091,1.000092,0.999932,1.000093,1.000092,1.000090,1.000050,1.000091,1.000092,1.000092,...,1.000092,1.000091,0.960515,1.000091,1.000091,0.960515,0.000000,1.000086,1.000084,1.000091
16373,1.000091,1.000085,1.000092,1.000093,1.000085,1.000085,0.999822,1.000089,1.000092,1.000088,...,1.000089,1.000090,1.000086,1.000091,1.000089,1.000086,1.000086,0.000000,1.000091,1.000091
16374,1.000087,1.000091,1.000092,1.000092,1.000088,1.000091,1.000062,1.000086,1.000092,1.000078,...,1.000091,1.000091,1.000084,1.000091,1.000090,1.000084,1.000084,1.000091,0.000000,1.000086


In [37]:
DM_m_new.index = list(G.nodes())

In [40]:
DM_m_new.columns = list(G.nodes())

In [42]:
# save as csv
DM_m_new.to_csv(r'output_csv/RWR_Dataframe_'+organism+'.csv', index = True)

In [43]:
DM_m = DM_m_new

In [46]:
# `df_m` is only created in the export cell further down, so it does not exist
# yet at this point; DM_m holds the same random-walk distance table.
DM_m.to_csv(r'output_csv/RWR_Dataframe_'+organism+'.csv', index = True)

#### EXPORT GRAPH MATRICES TO CSV

In [None]:
organism = 'Human'

# Label every graph matrix with the node ids (rows and columns in G.nodes() order).
# np.asarray() strips any labels a matrix already carries: handing a DataFrame
# together with index=/columns= to pd.DataFrame *reindexes* it, which yields an
# all-NaN table whenever the existing labels (e.g. 0..N-1) differ from the node ids.
l_node_labels = list(G.nodes())

df_adj = pd.DataFrame(np.asarray(DM_adj), columns = l_node_labels, index = l_node_labels)
df_spl = pd.DataFrame(np.asarray(DM_spl), columns = l_node_labels, index = l_node_labels)
df_m = pd.DataFrame(np.asarray(DM_m), columns = l_node_labels, index = l_node_labels)

# -----------------------------------------------

# Without Score
df_adj.to_csv(r'output_csv/Adjacency_Dataframe_'+organism+'.csv', index = True)
df_spl.to_csv(r'output_csv/SPL_Dataframe_'+organism+'.csv', index = True)
df_m.to_csv(r'output_csv/RWR_Dataframe_'+organism+'.csv', index = True)

#### IMPORT GRAPH MATRICES (dataframe) FROM CSV

In [None]:
# Node pairs without a connecting path have no shortest-path length (NaN).
# Replace those entries with the largest finite length found anywhere in the matrix.

# accept both a plain array and a DataFrame (e.g. one re-imported from CSV)
DM_spl = pd.DataFrame(DM_spl)

# global maximum ignoring NaN; `list(DM_spl.max())[0]` was only the maximum of
# the first column
max_value = np.nanmax(DM_spl.to_numpy(dtype=float))
DM_spl_mod = DM_spl.fillna(max_value)

DM_spl = DM_spl_mod

### FEATURES BASED DISTANCE MATRICES 

In [None]:
# -------------------------------------------------
# CHOOSE FEATURE MATRIX (input = feature dataframe)
# -------------------------------------------------

# C E N T R A L I T I E S
#feature = 'centralities'
#DM_eucl = DM_centralities

# D I S E A S E S 
feature = 'disease'
DM_eucl = DM_disease 

# G O T E R M S (MF; BP; CC)
#feature = 'MF'
#DM_eucl = DM_MF

#feature = 'BP'
#DM_eucl = DM_BP

#feature = 'CC'
#DM_eucl = DM_CC

# WIP 

### CLUSTERING 

+ CLUSTER COLOURS

In [34]:
# Input for k-means: the distance matrix picked in the "CHOOSE FEATURE MATRIX" cell.
# DM_eucl has to be defined before this cell runs (see the NameError recorded below).
df = DM_eucl

number_of_clusters = 100
kmeans = KMeans(n_clusters = number_of_clusters)

NameError: name 'DM_eucl' is not defined

In [None]:
kmeans.fit(df)
labels = kmeans.predict(df)
centroids = kmeans.cluster_centers_

In [None]:
# Scatter plot of the clustered points and the cluster centroids.
# NOTE(review): this cell does not look runnable in its current form - confirm before use:
#   * `colmap` is not defined anywhere in the visible notebook
#   * `colors` is a lazy map object, not a list of colours
#   * `df` is bound to DM_eucl, a distance table built without 'x' / 'y' columns
#   * each centroid has as many coordinates as df has columns, but is unpacked
#     into plt.scatter as positional arguments
fig = plt.figure(figsize=(5,5))

colors = map(lambda x: colmap[x+1], labels)

plt.scatter(df['x'], df['y'], color=colors, alpha=0.5, edgecolor='k')
for idx, centroid in enumerate(centroids):
    plt.scatter(*centroid, color=colmap[idx+1])
plt.xlim(0, 80)
plt.ylim(0, 80)
plt.show()

### DISEASE SELECTION 

In [None]:
def color_disease_outgoingedges_(G, l_majorcolor_nodes, color):
    """Return an edge -> colour mapping that highlights edges touching given nodes.

    Parameters
    ----------
    G : graph object exposing ``edges()`` that yields 2-tuples of node ids
    l_majorcolor_nodes : iterable of node ids whose incident edges get ``color``
    color : colour assigned to every edge that has at least one endpoint in
        ``l_majorcolor_nodes``

    Returns
    -------
    dict
        One entry per edge of ``G`` in ``G.edges()`` order; highlighted edges
        map to ``color``, all others to ``'black'``.

    Notes
    -----
    The earlier implementation merged a "black" dictionary over the coloured
    one, so an edge was painted black as soon as *any* listed node was not one
    of its endpoints - with three or more nodes every edge ended up black, and
    an empty node list raised ``KeyError``.
    """
    # set membership keeps this at one pass over the edges
    set_major = set(l_majorcolor_nodes)

    return {
        e: (color if (e[0] in set_major or e[1] in set_major) else 'black')
        for e in G.edges()
    }

In [None]:
# Disease whose genes are highlighted, and the highlight colour.
disease = 'asthma'
disease_col = 'blue'
disease_genes = get_disease_genes(G, d_names_do, d_do_genes, disease)

# NODE COLOURS
# NOTE(review): this call passes five arguments, while the color_diseasecategory
# defined further down in this notebook takes three (G, disease_category, colour).
# Which definition is active depends on the order the cells were run - confirm.
colours = color_diseasecategory(G, d_names_do, d_do_genes, disease, disease_col)

# EDGE COLOURS
#edge_color = list(color_disease_outgoingedges_(G, disease_genes, disease_col).values())

____________
# VISUALIZATION SETTINGS

### NODE SIZE

In [None]:
# input d_node_size should be of same order as G.nodes()

def draw_node_size(G, d_node_size, scalef):
    """Return one marker size per node of ``G``, in ``G.nodes()`` order.

    Each size is ``scalef * (1 + value**1.1)``, where ``value`` is the entry of
    ``d_node_size`` for that node. Entries for nodes not in ``G`` are ignored;
    a node of ``G`` missing from ``d_node_size`` raises ``KeyError``.
    """
    return [scalef * (1 + d_node_size[node]**1.1) for node in G.nodes()]

In [None]:
def draw_node_size3D(G, d_node_size, scalef):
    """Return one normalised marker size per node of ``G`` for the 3D plots.

    For every node (in ``G.nodes()`` order) a raw size
    ``ring_frac * scalef * (1 + value**1.5)`` is computed from its entry in
    ``d_node_size``. The raw sizes are then divided by their maximum, passed
    through a fourth root and multiplied by ``scalef``.

    Returns an empty list when ``G`` has no nodes.
    """
    x = 20
    ring_frac = (x-1.)/x

    l_size = []
    for node in G.nodes():
        R = scalef * (1 + d_node_size[node]**1.5)
        l_size.append(ring_frac * R)

    if not l_size:
        return []

    # the maximum is the same for every node: compute it once instead of once
    # per loop iteration
    size_max = max(l_size)

    return [np.sqrt(np.sqrt(i/size_max)) * scalef for i in l_size]

In [None]:
# Node size e.g. DEGREE

node_factor = 2
size = draw_node_size(G, d_deghubs, node_factor) # node size based on degree

edge_color = 'grey'
edge_width = 0.1

opacity_nodes = 1
opacity_edges = 0.3
node_edge_col = 'dimgrey'

### NODE COLOURS

+ DISEASE COLOURED

In [32]:
def color_diseasecategory(G, disease_category, colour):
    """Return one colour per node of ``G``, highlighting a disease category.

    Uses the notebook-level lookups ``d_names_do`` (disease name -> id) and
    ``d_do_genes`` (disease id -> genes).

    Parameters
    ----------
    G : graph object exposing ``nodes()``
    disease_category : substring searched for in the disease names
    colour : colour for every node annotated to a matching disease

    Returns
    -------
    list
        Colours in ``G.nodes()`` order: ``colour`` for disease genes,
        ``'dimgrey'`` for all other nodes.
    """
    # collect the genes of every disease whose name contains the category
    set_disease_genes = set()
    for d_name, doid in d_names_do.items():
        if disease_category in d_name:
            # diseases without gene annotations are skipped on purpose; this
            # replaces a bare `except: pass` that hid every other error as well
            set_disease_genes.update(d_do_genes.get(doid, ()))

    # disease genes get the highlight colour, all remaining nodes stay grey
    colours = [colour if node in set_disease_genes else 'dimgrey' for node in G.nodes()]
    return colours

In [33]:
colours = color_diseasecategory(G, 'cancer', 'red')

+ CLOSENESS CENTRALITY COLOR 

In [None]:
# Define colour parameter: node -> value that decides the colour
d_to_be_coloured = d_clos


# Colouring: one palette entry per distinct value
colour_groups = set(d_to_be_coloured.values())
colour_count = len(colour_groups)
palette = sns.color_palette('Reds', colour_count)

# value -> nodes carrying that value, collected in a single pass over the nodes
d_colourgroups = {}
for node, val in d_to_be_coloured.items():
    d_colourgroups.setdefault(val, []).append(node)

d_colourgroups_sorted = {key:d_colourgroups[key] for key in sorted(d_colourgroups.keys())}

# the i-th smallest value gets the i-th palette colour
# (replaces two nested loops that searched for the matching index / value)
d_val_col = dict(zip(d_colourgroups_sorted, palette))

d_node_colour = {node: d_val_col[val] for node, val in d_to_be_coloured.items()}

# SORT dict based on G.nodes
d_node_colour_sorted = {key: d_node_colour[key] for key in G.nodes()}

l_col_clos = list(d_node_colour_sorted.values())

colours = l_col_clos 

# 2D PORTRAITS

#### 2D SPRING

In [None]:
%%time

posG_spring = nx.spring_layout(G)

plt.figure(figsize=(18,18))
plt.title('Organic spring', size=16)
plt.box(False)
nx.draw_networkx_nodes(G, posG_spring,
                       edgecolors = node_edge_col, 
                       linewidths = 0.5, 
                       node_color='lightgrey', node_size=1)
nx.draw_networkx_edges(G, pos = posG_spring, width = edge_width, edge_color = 'lightgrey', alpha = opacity_edges)
print('Network: ', organism)
print('Number of Nodes:', len(G.nodes()))

plt.savefig('output_plots/Organic_spring_layout2D_'+organism+'.png')
plt.show()

### tSNE EMBEDDING 2D

In [None]:
prplxty = 50 # range: 5-50 / default = 30
density = 1.2 # default 12.
l_rate = 200 # default 200.
steps = 250 # min 250
metric = 'euclidean'

In [None]:
# TO DO : check if get_posG function still works (G, replaced with list(G.nodes()))

In [None]:
%%time
tsne_adj = embed_tsne_2D(DM_adj, prplxty, density, l_rate, steps, metric)
posG_tsne_adj = get_posG(list(G.nodes()),tsne_adj)

In [None]:
%%time
tsne_spl = embed_tsne_2D(DM_spl, prplxty, density, l_rate, steps, metric)
posG_tsne_spl = get_posG(list(G.nodes()),tsne_spl)

In [None]:
%%time
tsne_m = embed_tsne_2D(DM_m.T, prplxty, density, l_rate, steps, metric)
posG_tsne_m = get_posG(list(G.nodes()),tsne_m)

In [None]:
%%time
tsne_eucl = embed_tsne_2D(DM_eucl, prplxty, density, l_rate, steps, metric)

In [None]:
def get_posG_2D(l_nodes, embed):
    """Pair node ids with the rows of a 2D embedding.

    The i-th entry of ``l_nodes`` is matched with row i of ``embed`` (indexed
    as ``embed[i, 0]`` and ``embed[i, 1]``).

    Returns a dict mapping ``str(node id)`` to an ``(x, y)`` tuple.
    """
    return {
        str(entz): (embed[row, 0], embed[row, 1])
        for row, entz in enumerate(l_nodes)
    }

In [None]:
posG_disease = get_posG_2D(genes_dis, tsne_eucl)

# WIP - how to plot rest nodes 

In [None]:
# might be replaced by mapping rest nodes to some geometry 

coords_disease = list(posG_disease.values())

# Bounding box of the embedded disease genes. max()/min() over the (x, y) tuples
# compares them lexicographically, which returned the y of the extreme-x point
# rather than the extreme y, so each axis is evaluated separately.
xs_disease, ys_disease = zip(*coords_disease)
x_max = max(xs_disease)
x_min = min(xs_disease)
y_max = max(ys_disease)
y_min = min(ys_disease)

# Nodes of G without an embedded position. Computed here because the number of
# random coordinates depends on it (posG_disease is keyed by str(node id)).
genes_rest = [str(g) for g in G.nodes() if str(g) not in posG_disease]

# one uniformly random position inside the bounding box per rest node
rand_coords = [(np.random.uniform(x_min,x_max), np.random.uniform(y_min,y_max)) for i in range(len(genes_rest))]

In [None]:
import numpy.linalg as la

t = np.linspace(0,2*np.pi,len(genes_rest))
xc = 1*np.random.rand() #x-coordinate of center of circle 
yc = 1*np.random.rand() #y-coordinate of center of circle
r = 6*np.random.rand() + 0.5 #radius of circle

x = r*np.cos(t) + np.random.normal(scale = 1.0/7, size = len(t)) + xc  
y = r*np.sin(t) + np.random.normal(scale = 1.0/7, size = len(t)) + yc

In [None]:
# Nodes of G that have no embedded position yet.
# posG_disease is keyed by str(node id), so the membership test has to use the
# string form as well; testing int(g) never matched, which turned every node
# into a "rest" node and overwrote the embedded positions below.
genes_rest = [str(g) for g in G.nodes() if str(g) not in posG_disease]

if len(rand_coords) < len(genes_rest):
    raise ValueError('rand_coords holds %s positions but %s rest nodes need one'
                     % (len(rand_coords), len(genes_rest)))

# give every rest node its own random position (the nested loop used before
# left all of them on the last coordinate of rand_coords)
posG_rest = dict(zip(genes_rest, rand_coords))

posG_all = {**posG_disease, **posG_rest}
# final layout: one position per node, in G.nodes() order
posG_tsne_eucl = {key:posG_all[str(key)] for key in G.nodes()}

### 2D tSNE PORTRAITS

In [None]:
# 2D t-SNE portrait of the adjacency-based embedding
plt.figure(figsize=(25, 25))
plt.title('Adj | t-SNE | Metric: '+ metric, size=16)

# shared drawing options for nodes and edges
node_style = dict(edgecolors=node_edge_col, linewidths=0.5, node_color=colours,
                  node_size=size, alpha=opacity_nodes)
edge_style = dict(width=edge_width, edge_color=edge_color, alpha=opacity_edges)

nx.draw_networkx_nodes(G, posG_tsne_adj, **node_style)
nx.draw_networkx_edges(G, pos=posG_tsne_adj, **edge_style)
#nx.draw_networkx_labels(G, pos = posG_adj, font_size = fontsize_labels, font_color = 'black')
plt.box(False)

# write the figure to disk before showing it
plt.savefig('output_plots/2Dtsne_adj_'+metric+'_'+organism+'.png')

plt.show()

In [None]:
# 2D t-SNE portrait of the shortest-path-length embedding
plt.figure(figsize=(25, 25))
plt.title('SPL | t-SNE | Metric: '+ metric, size=16)

# shared drawing options for nodes and edges
node_style = dict(edgecolors=node_edge_col, linewidths=0.5, node_color=colours,
                  node_size=size, alpha=opacity_nodes)
edge_style = dict(width=edge_width, edge_color=edge_color, alpha=opacity_edges)

nx.draw_networkx_nodes(G, posG_tsne_spl, **node_style)
nx.draw_networkx_edges(G, pos=posG_tsne_spl, **edge_style)
#nx.draw_networkx_labels(G, pos = posG_spl, font_size = fontsize_labels, font_color = 'black')
plt.box(False)

# write the figure to disk before showing it
plt.savefig('output_plots/2Dtsne_spl_'+metric+'_'+organism+'.png')

plt.show()

In [None]:
# 2D t-SNE portrait of the random-walk (Markov) embedding
plt.figure(figsize=(25, 25))
plt.title('Markov | t-SNE | Metric: '+metric, size=16)

# shared drawing options for nodes and edges
node_style = dict(edgecolors=node_edge_col, linewidths=0.5, node_color=colours,
                  node_size=size, alpha=opacity_nodes)
edge_style = dict(width=edge_width, edge_color=edge_color, alpha=opacity_edges)

nx.draw_networkx_nodes(G, posG_tsne_m, **node_style)
nx.draw_networkx_edges(G, pos=posG_tsne_m, **edge_style)
plt.box(False)

# write the figure to disk before showing it
plt.savefig('output_plots/2Dtsne_m_'+metric+'_'+ organism +'.png')

plt.show()

In [None]:
# 2D t-SNE portrait of the feature-based (pairwise distance) embedding
plt.figure(figsize=(25, 25))
plt.title('PDist euclidean | t-SNE | Metric: '+metric, size=16)

# shared drawing options for nodes and edges
node_style = dict(edgecolors=node_edge_col, linewidths=0.5, node_color=colours,
                  node_size=size, alpha=opacity_nodes)
edge_style = dict(width=edge_width, edge_color=edge_color, alpha=opacity_edges)

nx.draw_networkx_nodes(G, posG_tsne_eucl, **node_style)
nx.draw_networkx_edges(G, pos=posG_tsne_eucl, **edge_style)
plt.box(False)

# the file name carries the selected feature set
plt.savefig('output_plots/2Dtsne_'+feature+'_'+metric+'_'+organism+'.png')

plt.show()

### UMAP EMBEDDING 2D 
+https://umap-learn.readthedocs.io/en/latest/embedding_space.html

In [None]:
# UMAP hyper-parameters
n_neighbors = 10 # balance between local and global structure in the data
spread = 2.
min_dist = 0.5 # defines how dense points are stacked together 
n_components = 2 # for 2D

# NOTE(review): `metric` is a notebook-level variable assigned in several cells
# ("correlation" in the random-walk cells, 'euclidean' in the t-SNE settings);
# the value used here depends on which of those cells ran last.
U = umap.UMAP(
        n_neighbors = n_neighbors,
        spread = spread,
        min_dist = min_dist,
        n_components = n_components,
        metric = metric)

In [None]:
%%time 
# UMAP embedding of the adjacency matrix; get_posG presumably pairs the rows with node ids.
# NOTE(review): the t-SNE cells call get_posG(list(G.nodes()), ...) while this
# cell passes G itself - confirm which form get_posG expects.
umap_adj = U.fit_transform(DM_adj)
posG_umap_adj = get_posG(G,umap_adj)

In [None]:
%%time 
# UMAP embedding of the shortest-path matrix; get_posG presumably pairs the rows with node ids.
# NOTE(review): the t-SNE cells call get_posG(list(G.nodes()), ...) while this
# cell passes G itself - confirm which form get_posG expects.
umap_spl = U.fit_transform(DM_spl)
posG_umap_spl = get_posG(G,umap_spl)

In [None]:
%%time 
# UMAP embedding of the random-walk matrix; get_posG presumably pairs the rows with node ids.
# NOTE(review): the t-SNE cells call get_posG(list(G.nodes()), ...) while this
# cell passes G itself - confirm which form get_posG expects.
umap_m = U.fit_transform(DM_m)
posG_umap_m = get_posG(G, umap_m)

In [None]:
%%time 
# UMAP embedding of the selected feature-based distance matrix.
# NOTE(review): the rows of DM_eucl follow the chosen feature table (for
# 'disease' the t-SNE path pairs them with genes_dis via get_posG_2D), whereas
# this cell pairs them with the nodes of G - confirm that the two orders agree.
# NOTE(review): the t-SNE cells call get_posG(list(G.nodes()), ...) while this
# cell passes G itself - confirm which form get_posG expects.
umap_eucl = U.fit_transform(DM_eucl)
posG_umap_eucl = get_posG(G, umap_eucl)

### 2D UMAP PORTRAITS

In [None]:
# 2D UMAP portrait of the adjacency-based embedding
plt.figure(figsize=(25, 25))
plt.title('Adj | UMAP | Metric: '+metric, size=16)

# shared drawing options for nodes and edges
node_style = dict(edgecolors=node_edge_col, linewidths=0.5, node_color=colours,
                  node_size=size, alpha=opacity_nodes)
edge_style = dict(width=edge_width, edge_color=edge_color, alpha=opacity_edges)

nx.draw_networkx_nodes(G, posG_umap_adj, **node_style)
nx.draw_networkx_edges(G, pos=posG_umap_adj, **edge_style)
#nx.draw_networkx_labels(G, pos = posG_umap_adj, font_size = fontsize_labels, font_color = 'black')
plt.box(False)

# write the figure to disk before showing it
plt.savefig('output_plots/2Dumap_adj_'+metric+'_'+organism+'.png')

plt.show()

In [None]:
# 2D UMAP portrait of the shortest-path-length embedding
plt.figure(figsize=(25, 25))
plt.title('SPL | UMAP | Metric: '+metric, size=16)

# shared drawing options for nodes and edges
node_style = dict(edgecolors=node_edge_col, linewidths=0.5, node_color=colours,
                  node_size=size, alpha=opacity_nodes)
edge_style = dict(width=edge_width, edge_color=edge_color, alpha=opacity_edges)

nx.draw_networkx_nodes(G, posG_umap_spl, **node_style)
nx.draw_networkx_edges(G, pos=posG_umap_spl, **edge_style)
#nx.draw_networkx_labels(G, pos = posG_umap_spl, font_size = fontsize_labels, font_color = 'black')
plt.box(False)

# write the figure to disk before showing it
plt.savefig('output_plots/2Dumap_spl_'+metric+'_'+organism+'.png')

plt.show()

In [None]:
# 2D portrait of the UMAP layout computed from DM_m ('Markov'); saved as PNG and shown inline.
plt.figure(figsize=(25,25))
plt.title('Markov | UMAP | Metric: '+metric, size=16)

# Nodes and edges are both placed at the posG_umap_m coordinates.
nx.draw_networkx_nodes(G, posG_umap_m, edgecolors = node_edge_col, linewidths = 0.5, node_color=colours, node_size=size, alpha = opacity_nodes)
nx.draw_networkx_edges(G, pos = posG_umap_m, width = edge_width, edge_color = edge_color, alpha = opacity_edges)
# Optional node labels (disabled); uses the same positions as the nodes above.
#nx.draw_networkx_labels(G, pos = posG_umap_m, font_size = fontsize_labels, font_color = 'black')
plt.box(False)

plt.savefig('output_plots/2Dumap_m_'+metric+'_'+organism+'.png')

plt.show()

In [None]:
# 2D portrait of the UMAP layout computed from DM_eucl; saved as PNG and shown inline.
plt.figure(figsize=(25,25))
plt.title('PW dist EUCLIDEAN | UMAP | Metric: '+metric, size=16)

# Node opacity follows the shared `opacity_nodes` setting, like the other portraits
# (it was hard-coded to 1 here, so this panel ignored the notebook-wide setting).
nx.draw_networkx_nodes(G, posG_umap_eucl, edgecolors = node_edge_col, linewidths = 0.5, node_color=colours, node_size=size, alpha = opacity_nodes)
nx.draw_networkx_edges(G, pos = posG_umap_eucl, width = edge_width, edge_color = edge_color, alpha = opacity_edges)
# Optional node labels (disabled); uses the same positions as the nodes above.
#nx.draw_networkx_labels(G, pos = posG_umap_eucl, font_size = fontsize_labels, font_color = 'black')
plt.box(False)

plt.savefig('output_plots/2Dumap_eucl_'+metric+'_'+organism+'.png')

plt.show()

# LANDSCAPES

In [None]:
# Height (z) value per node for the landscape plots, looked up in d_clos and
# ordered like G.nodes(). Raises KeyError if a node of G is missing from d_clos.
# Alternative source (disabled): per-node essentiality scores.
#dict_z = {key:d_essentiality_scores[key] for key in G.nodes()}
dict_z = {key:d_clos[key] for key in G.nodes()}

# Plain list of the z values, in G.nodes() order, as passed to the landscape traces.
z_list = list(dict_z.values())

In [None]:
# Marker sizes for the 3D traces, derived from G and the scaling factor.
node_factor = 20 # node size factor
size3d = draw_node_degree_3D(G, node_factor) # node size based on degree
# Alternative (disabled): one constant size for all nodes.
#size3d = 5

### tSNE

In [None]:
%%time

# Landscape trace for the t-SNE layout posG_tsne_adj: x/y come from the layout,
# while the height passed to the trace is z_list (tsne_adj_z is not used here).
tsne_adj_x, tsne_adj_y, tsne_adj_z = get_coords_landscape(G, posG_tsne_adj)
tsne_adj_trace_z = get_trace_nodes_landscape(tsne_adj_x, tsne_adj_y, z_list, colours, size3d)
tsne_data_adj = [tsne_adj_trace_z] #,adj_edges]

In [None]:
%%time

# Landscape trace for the t-SNE layout posG_tsne_spl: x/y come from the layout,
# while the height passed to the trace is z_list (tsne_spl_z is not used here).
tsne_spl_x, tsne_spl_y, tsne_spl_z = get_coords_landscape(G, posG_tsne_spl)
tsne_spl_trace_z = get_trace_nodes_landscape(tsne_spl_x, tsne_spl_y, z_list, colours, size3d)
tsne_data_spl = [tsne_spl_trace_z] #, spl_edges]

In [None]:
%%time

# Landscape trace for the t-SNE layout posG_tsne_m: x/y come from the layout,
# while the height passed to the trace is z_list (tsne_m_z is not used here).
tsne_m_x, tsne_m_y, tsne_m_z = get_coords_landscape(G, posG_tsne_m)
tsne_m_trace_z = get_trace_nodes_landscape(tsne_m_x, tsne_m_y, z_list, colours, size3d)
tsne_data_m = [tsne_m_trace_z] #,mlog_edges]

In [None]:
%%time 

# Landscape trace for the t-SNE layout posG_tsne_eucl: x/y come from the layout,
# while the height passed to the trace is z_list (tsne_eucl_z is not used here).
tsne_eucl_x, tsne_eucl_y, tsne_eucl_z = get_coords_landscape(G, posG_tsne_eucl)
tsne_eucl_trace_z = get_trace_nodes_landscape(tsne_eucl_x, tsne_eucl_y, z_list, colours, size3d)
tsne_data_eucl = [tsne_eucl_trace_z] #,eucl_edges]

In [None]:
# Echo the t-SNE settings that belong to the embeddings shown in this figure
for setting, value in (('Perplexity:', prplxty),
                       ('Early Exaggeration:', density),
                       ('Learning rate:', l_rate),
                       ('Iterations:', steps)):
    print(setting, value)

# One 3D scene per distance matrix, arranged in a single row
panel_titles = tuple(prefix + metric for prefix in ('Adjacency | t-SNE | Metric: ',
                                                    'SPL | t-SNE | Metric: ',
                                                    'Markov | t-SNE | Metric: ',
                                                    'PW Euclidean | tSNE | Metric: '))
fig_1 = make_subplots(rows=1, cols=4,
                      specs=[[{'type': 'scatter3d'}] * 4],
                      print_grid=False,
                      subplot_titles=panel_titles)

# Column k (1-based) receives the traces of the k-th data list
panels = (tsne_data_adj, tsne_data_spl, tsne_data_m, tsne_data_eucl)
for column, traces in enumerate(panels, start=1):
    for trace in traces:
        fig_1.add_trace(trace, row=1, col=column)

# light alternative: fig_1.update_layout(template=None, height = 1000, width = 2200)
fig_1.update_layout(template='plotly_dark', height=1000, width=2200)
py.iplot(fig_1)

# Also write a standalone HTML copy without opening a browser
plotly.offline.plot(fig_1, filename='output_plots/Landscapes_tsne_' + metric + '_' + organism + '.html', auto_open=False)

### UMAP

In [None]:
%%time

# Landscape trace for the UMAP layout posG_umap_adj: x/y come from the layout,
# while the height passed to the trace is z_list (umap_adj_z is not used here).
umap_adj_x, umap_adj_y, umap_adj_z = get_coords_landscape(G, posG_umap_adj)
umap_adj_trace_z = get_trace_nodes_landscape(umap_adj_x, umap_adj_y, z_list, colours, size3d)
umap_data_adj = [umap_adj_trace_z]

In [None]:
%%time

# Landscape trace for the UMAP layout posG_umap_spl: x/y come from the layout,
# while the height passed to the trace is z_list (umap_spl_z is not used here).
umap_spl_x, umap_spl_y, umap_spl_z = get_coords_landscape(G, posG_umap_spl)
umap_spl_trace_z = get_trace_nodes_landscape(umap_spl_x, umap_spl_y, z_list, colours, size3d)
umap_data_spl = [umap_spl_trace_z]

In [None]:
%%time

# Landscape trace for the UMAP layout posG_umap_m: x/y come from the layout,
# while the height passed to the trace is z_list (umap_m_z is not used here).
umap_m_x, umap_m_y, umap_m_z = get_coords_landscape(G, posG_umap_m)
umap_m_trace_z = get_trace_nodes_landscape(umap_m_x, umap_m_y, z_list, colours, size3d)
umap_data_m = [umap_m_trace_z]

In [None]:
%%time

# Landscape trace for the UMAP layout posG_umap_eucl: x/y come from the layout,
# while the height passed to the trace is z_list (umap_eucl_z is not used here).
umap_eucl_x, umap_eucl_y, umap_eucl_z = get_coords_landscape(G, posG_umap_eucl)
umap_eucl_trace_z = get_trace_nodes_landscape(umap_eucl_x, umap_eucl_y, z_list, colours, size3d)
umap_data_eucl = [umap_eucl_trace_z]

In [None]:
# Echo the UMAP settings that belong to the embeddings shown in this figure
for setting, value in (('# Neighbors: ', n_neighbors),
                       ('Spread: ', spread),
                       ('Min. Distance: ', min_dist),
                       ('Metric: ', metric)):
    print(setting, value)

# One 3D scene per distance matrix, arranged in a single row
panel_titles = tuple(prefix + metric for prefix in ('Adjacency | UMAP | Metric: ',
                                                    'SPL | UMAP | Metric: ',
                                                    'Markov | UMAP | Metric: ',
                                                    'PW Euclidean | UMAP | Metric: '))
fig_1 = make_subplots(rows=1, cols=4,
                      specs=[[{'type': 'scatter3d'}] * 4],
                      print_grid=False,
                      subplot_titles=panel_titles)

# Column k (1-based) receives the traces of the k-th data list
panels = (umap_data_adj, umap_data_spl, umap_data_m, umap_data_eucl)
for column, traces in enumerate(panels, start=1):
    for trace in traces:
        fig_1.add_trace(trace, row=1, col=column)

# light alternative: fig_1.update_layout(template=None, height = 1000, width = 2200,)
fig_1.update_layout(template='plotly_dark', height=1000, width=2200)
py.iplot(fig_1)

# Also write a standalone HTML copy without opening a browser
plotly.offline.plot(fig_1, filename='output_plots/Landscapes_umap_' + metric + '_' + organism + '.html', auto_open=False)

### LANDSCAPE | EXPORT Coordinates x,y,z,rgba

In [None]:
# Disabled export cell, kept as a string literal so it does not run.
# Fixes relative to the previous text: the literal is now terminated, and the
# to_csv calls use the df_xyz_rgba_landscape_* frames that this cell builds.
'''
# NOTE(review): this overrides the notebook-wide `organism` - confirm before enabling.
organism = 'Yeast'

colours_rgba = []
for i in colours: 
    if i == ess_col:
        colours_rgba.append('rgba(66, 117, 154, 0.6)')
    elif i == no_ess_col:
        colours_rgba.append('rgba(162, 193, 216, 0.6)')
    else:
        colours_rgba.append('rgba(0,0,0,0.4)')


df_xyz_rgba_landscape_adj = pd.DataFrame(posG3d_adj, index=["x","y","z"]).T
df_xyz_rgba_landscape_adj['color'] = colours_rgba

df_xyz_rgba_landscape_spl = pd.DataFrame(posG3d_spl, index=["x","y","z"]).T
df_xyz_rgba_landscape_spl['color'] = colours_rgba

df_xyz_rgba_landscape_mlog = pd.DataFrame(posG3d_mlog, index=["x","y","z"]).T
df_xyz_rgba_landscape_mlog['color'] = colours_rgba

df_xyz_rgba_landscape_eucl = pd.DataFrame(posG3d_eucl, index=["x","y","z"]).T
df_xyz_rgba_landscape_eucl['color'] = colours_rgba


# -----------------------------------------------

df_xyz_rgba_landscape_adj.to_csv(r'output_csv/landscape__XYZ_RGBA_Adjacency_'+organism+'.csv', index = True)
df_xyz_rgba_landscape_spl.to_csv(r'output_csv/landscape__XYZ_RGBA_SPL_'+organism+'.csv', index = True)
df_xyz_rgba_landscape_mlog.to_csv(r'output_csv/landscape__XYZ_RGBA_RWRlog_'+organism+'.csv', index = True)
df_xyz_rgba_landscape_eucl.to_csv(r'output_csv/landscape__XYZ_RGBA_PW_eucl_'+organism+'.csv', index = True)
'''

____________
# 3D

### 3D Visualization Settings

In [8]:
# Visual settings shared by the 3D plots below.
node_factor = 20
size3d = draw_node_degree_3D(G, node_factor) # node size based on degree 

# Edge colour passed to get_trace_edges in the 3D cells.
edge_color = 'grey'
#edge_color = 'dimgrey'

## SPRING

In [None]:
%%time 

# 3D spring layout of G, rendered as a plotly figure and saved as HTML.
#Fruchterman-Reingold force-directed algorithm
# No seed is passed, so the layout can differ between runs.
posG3d_spring = nx.spring_layout(G, dim=3)

# Edge trace and node trace built from the same 3D positions.
spring_edges = get_trace_edges(G, posG3d_spring, edge_color) 
spring_nodes = get_trace_nodes(G, posG3d_spring,  l_features, colours, size3d)
data_spring = [spring_edges, spring_nodes]

# Edges are added first, then nodes.
fig_ = pgo.Figure()
for i in data_spring:
    fig_.add_trace(i)
fig_.update_layout(template = None, height = 800, width = 800)
py.iplot(fig_)
# Standalone HTML copy; auto_open=False keeps the browser closed.
plotly.offline.plot(fig_, filename = 'output_plots/3DSpring_'+organism+'.html', auto_open=False)

## PORTRAITS

In [9]:
# Settings handed to embed_tsne_3D in the cells below.
prplxty3d = 50 # perplexity; range: 5-50 / default = 30.
density3d = 1.2 # early exaggeration (printed under that label below); default 12.
l_rate3d = 5000 # learning rate; default 200.
steps3d = 1000 # iterations; min 250

# Rebinds the notebook-wide `metric`, which is also used in plot titles and output file names.
metric = 'euclidean'

#### tSNE

In [None]:
%%time
# 3D t-SNE positions for the nodes of G, computed from DM_adj with the 3D settings above.
tsne_posG3d_adj = embed_tsne_3D(G, DM_adj, prplxty3d, density3d, l_rate3d, steps3d, metric)

In [None]:
%%time
# 3D t-SNE positions for the nodes of G, computed from DM_spl with the 3D settings above.
tsne_posG3d_spl = embed_tsne_3D(G, DM_spl, prplxty3d, density3d, l_rate3d, steps3d, metric)

In [10]:
%%time
# 3D t-SNE positions computed from DM_m (the recorded run took over two hours,
# hence the CSV round trip below so the result can be reloaded).
tsne_posG3d_m = embed_tsne_3D(G, DM_m, prplxty3d, density3d, l_rate3d, steps3d, metric)

# EXPORT: one row per node, coordinate columns 0/1/2
markov_csv = 'output_csv/3DPortrait_XYZ_markov_' + organism + '.csv'
df_3Dportrait_markov = pd.DataFrame(tsne_posG3d_m).T
df_3Dportrait_markov.to_csv(markov_csv, index=True)

# IMPORT: read the file back; the column labels come back as the strings '0', '1', '2'
df_tsne_posG3d_m = pd.read_csv(markov_csv, index_col=0)
X, Y, Z = (df_tsne_posG3d_m[axis].tolist() for axis in ('0', '1', '2'))

# Rebuild {node: (x, y, z)}; rows are matched to nodes by position in G.nodes(),
# not by the index stored in the CSV.
tsne_posG3d_m = dict(zip(G.nodes(), zip(X, Y, Z)))

CPU times: user 2h 47min, sys: 23.4 s, total: 2h 47min 24s
Wall time: 2h 23min 12s


In [None]:
%%time
# 3D t-SNE positions for the nodes of G, computed from DM_eucl.
# `metric` is passed explicitly, as in the sibling calls for DM_adj / DM_spl / DM_m,
# so the embedding uses the metric that the figure titles and file names report.
tsne_posG3d_eucl = embed_tsne_3D(G, DM_eucl, prplxty3d, density3d, l_rate3d, steps3d, metric)

In [None]:
%%time 

# Edge and node traces at the 3D t-SNE positions from DM_adj.
# Rebinds tsne_data_adj, which the landscape section earlier set to a single node trace.
tsne_adj_edges = get_trace_edges(G, tsne_posG3d_adj, edge_color) 
tsne_adj_nodes = get_trace_nodes(G, tsne_posG3d_adj,  l_features, colours, size3d)
tsne_data_adj = [tsne_adj_edges, tsne_adj_nodes]

In [None]:
%%time 

# Edge and node traces at the 3D t-SNE positions from DM_spl.
# Rebinds tsne_data_spl, which the landscape section earlier set to a single node trace.
tsne_spl_edges = get_trace_edges(G, tsne_posG3d_spl, edge_color) 
tsne_spl_nodes = get_trace_nodes(G, tsne_posG3d_spl,  l_features, colours, size3d)
tsne_data_spl = [tsne_spl_edges, tsne_spl_nodes]

In [34]:
%%time

# Edge and node traces at the 3D t-SNE positions from DM_m.
# Rebinds tsne_data_m, which the landscape section earlier set to a single node trace.
tsne_m_edges = get_trace_edges(G, tsne_posG3d_m, edge_color) 
tsne_m_nodes = get_trace_nodes(G, tsne_posG3d_m, l_features, colours, size3d)
tsne_data_m = [tsne_m_edges, tsne_m_nodes]

CPU times: user 7.38 s, sys: 388 ms, total: 7.77 s
Wall time: 8.09 s


In [None]:
%%time

# Edge and node traces at the 3D t-SNE positions from DM_eucl.
# Rebinds tsne_data_eucl, which the landscape section earlier set to a single node trace.
tsne_eucl_edges = get_trace_edges(G, tsne_posG3d_eucl, edge_color) 
tsne_eucl_nodes = get_trace_nodes(G, tsne_posG3d_eucl,l_features, colours, size3d)
tsne_data_eucl = [tsne_eucl_edges, tsne_eucl_nodes]

In [None]:
# Echo the 3D t-SNE settings that belong to this figure
for setting, value in (('Perplexity:', prplxty3d),
                       ('Early Exaggeration:', density3d),
                       ('Learning rate:', l_rate3d),
                       ('Iterations:', steps3d)):
    print(setting, value)

# Stand-alone figure holding only the traces in tsne_data_m
fig = pgo.Figure()
for trace in tsne_data_m:
    fig.add_trace(trace)
fig.update_layout(template='plotly_dark', showlegend=False, width=2000, height=2000)
py.iplot(fig)

# Also write a standalone HTML copy without opening a browser
plotly.offline.plot(fig, filename='output_plots/3Dportrait_tsne_RWR_' + metric + '_' + organism + '.html', auto_open=False)

In [None]:
# Echo the 3D t-SNE settings that belong to this figure
for setting, value in (('Perplexity:', prplxty3d),
                       ('Early Exaggeration:', density3d),
                       ('Learning rate:', l_rate3d),
                       ('Iterations:', steps3d)):
    print(setting, value)

# One 3D scene per distance matrix, arranged in a single row
panel_titles = tuple(prefix + metric for prefix in ('Adjacency | tSNE | Metric: ',
                                                    'SPL | tSNE | Metric: ',
                                                    'RWR | tSNE | Metric: ',
                                                    'PW Euclidean | tSNE | Metric: '))
fig1 = make_subplots(rows=1, cols=4,
                     specs=[[{'type': 'scatter3d'}] * 4],
                     print_grid=False,
                     subplot_titles=panel_titles)

# Column k (1-based) receives the traces of the k-th data list
panels = (tsne_data_adj, tsne_data_spl, tsne_data_m, tsne_data_eucl)
for column, traces in enumerate(panels, start=1):
    for trace in traces:
        fig1.add_trace(trace, row=1, col=column)

# light alternative: fig1.update_layout(template=None, showlegend = False, width = 2000, height = 800)
fig1.update_layout(template='plotly_dark', showlegend=False, width=2000, height=800)
py.iplot(fig1)

# Also write a standalone HTML copy without opening a browser
plotly.offline.plot(fig1, filename='output_plots/3Dportrait_tsne_' + metric + '_' + organism + '.html', auto_open=False)

#### tSNE PORTRAIT EXPORT Coordinates

In [None]:
# Disabled export cell, kept as a string literal so it does not run.
# Fix relative to the previous text: the spring export is commented out like its
# frame definition (it referenced df_xyz_rgba_spring, which is never defined).
'''
organism = 'Human'


colours_rgba = []
for i in colours: 
    if i == ess_col:
        colours_rgba.append('rgba(66, 117, 154, 0.6)')
    elif i == no_ess_col:
        colours_rgba.append('rgba(162, 193, 216, 0.6)')
    else:
        colours_rgba.append('rgba(0,0,0,0.4)')
        
        
#df_xyz_rgba_3Dspring = pd.DataFrame(posG3d_spring, index=["x","y","z"]).T
#df_xyz_rgba_3Dspring['color'] = colours_rgba

df_xyz_rgba_3Dadj = pd.DataFrame(tsne_posG3d_adj, index=["x","y","z"]).T
df_xyz_rgba_3Dadj['color'] = colours_rgba

df_xyz_rgba_3Dspl = pd.DataFrame(tsne_posG3d_spl, index=["x","y","z"]).T
df_xyz_rgba_3Dspl['color'] = colours_rgba

df_xyz_rgba_3Dm = pd.DataFrame(tsne_posG3d_m, index=["x","y","z"]).T
df_xyz_rgba_3Dm['color'] = colours_rgba

df_xyz_rgba_3Deucl = pd.DataFrame(tsne_posG3d_eucl, index=["x","y","z"]).T
df_xyz_rgba_3Deucl['color'] = colours_rgba


# -----------------------------------------------

#df_xyz_rgba_3Dspring.to_csv(r'output_csv/3Dspring_XYZ_RGBA_'+organism+'.csv', index = True)

df_xyz_rgba_3Dadj.to_csv(r'output_csv/3Dportrait_tsne_XYZ_RGBA_Adjacency_'+organism+'.csv', index = True)
df_xyz_rgba_3Dspl.to_csv(r'output_csv/3Dportrait_tsne_XYZ_RGBA_SPL_'+organism+'.csv', index = True)
df_xyz_rgba_3Dm.to_csv(r'output_csv/3Dportrait_tsne_XYZ_RGBA_RWR_'+organism+'.csv', index = True)
df_xyz_rgba_3Deucl.to_csv(r'output_csv/3Dportrait_tsne_XYZ_RGBA_PW_eucl_'+organism+'.csv', index = True)
'''

#### UMAP

In [None]:
# Settings handed to embed_umap_3D in the cells below. These rebind the
# notebook-wide names that the 2D UMAP section set earlier (spread 2. -> 1., min_dist 0.5 -> 0.1).
n_neighbors = 10 # balance between local and global structure in the data
spread = 1.
min_dist = 0.1 # defines how dense points are stacked together 
# Also used in plot titles and output file names.
metric='euclidean'

In [None]:
%%time 
# 3D UMAP positions for the nodes of G, computed from DM_adj with the settings above.
umap_posG3d_adj = embed_umap_3D(G, DM_adj, n_neighbors, spread, min_dist, metric)

In [None]:
%%time 
# 3D UMAP positions for the nodes of G, computed from DM_spl with the settings above.
umap_posG3d_spl = embed_umap_3D(G, DM_spl, n_neighbors, spread, min_dist, metric)

In [None]:
%%time 
# 3D UMAP positions for the nodes of G, computed from DM_m with the settings above.
umap_posG3d_m = embed_umap_3D(G, DM_m, n_neighbors, spread, min_dist, metric)

In [None]:
%%time 
# 3D UMAP positions for the nodes of G, computed from DM_eucl with the settings above.
umap_posG3d_eucl = embed_umap_3D(G, DM_eucl, n_neighbors, spread, min_dist, metric)

In [None]:
%%time 
# Edge and node traces at the 3D UMAP positions from DM_adj.
# Rebinds umap_data_adj, which the landscape section earlier set to a single node trace.
umap_adj_edges = get_trace_edges(G, umap_posG3d_adj, edge_color) 
umap_adj_nodes = get_trace_nodes(G, umap_posG3d_adj,  l_features, colours, size3d)
umap_data_adj = [umap_adj_edges, umap_adj_nodes]

In [None]:
%%time 
# Edge and node traces at the 3D UMAP positions from DM_spl.
# Rebinds umap_data_spl, which the landscape section earlier set to a single node trace.
umap_spl_edges = get_trace_edges(G, umap_posG3d_spl, edge_color) 
umap_spl_nodes = get_trace_nodes(G, umap_posG3d_spl,  l_features, colours, size3d)
umap_data_spl = [umap_spl_edges, umap_spl_nodes]

In [None]:
%%time 
# Edge and node traces at the 3D UMAP positions from DM_m.
# Rebinds umap_data_m, which the landscape section earlier set to a single node trace.
umap_m_edges = get_trace_edges(G, umap_posG3d_m, edge_color) 
umap_m_nodes = get_trace_nodes(G, umap_posG3d_m,  l_features, colours, size3d)
umap_data_m = [umap_m_edges, umap_m_nodes]

In [None]:
%%time 
# Edge and node traces at the 3D UMAP positions from DM_eucl.
# Rebinds umap_data_eucl, which the landscape section earlier set to a single node trace.
umap_eucl_edges = get_trace_edges(G, umap_posG3d_eucl, edge_color) 
umap_eucl_nodes = get_trace_nodes(G, umap_posG3d_eucl,  l_features, colours, size3d)
umap_data_eucl = [umap_eucl_edges, umap_eucl_nodes]

In [None]:
# Echo the UMAP settings that belong to this figure
for setting, value in (('# Neighbors: ', n_neighbors),
                       ('Spread: ', spread),
                       ('Min. Distance: ', min_dist),
                       ('Metric: ', metric)):
    print(setting, value)

# One 3D scene per distance matrix, arranged in a single row
panel_titles = tuple(prefix + metric for prefix in ('Adjacency | UMAP | Metric: ',
                                                    'SPL | UMAP | Metric: ',
                                                    'RWR | UMAP | Metric: ',
                                                    'PW Euclidean | UMAP | Metric: '))
fig1 = make_subplots(rows=1, cols=4,
                     specs=[[{'type': 'scatter3d'}] * 4],
                     print_grid=False,
                     subplot_titles=panel_titles)

# Column k (1-based) receives the traces of the k-th data list
panels = (umap_data_adj, umap_data_spl, umap_data_m, umap_data_eucl)
for column, traces in enumerate(panels, start=1):
    for trace in traces:
        fig1.add_trace(trace, row=1, col=column)

# light alternative: fig1.update_layout(template=None, showlegend = False, width = 2000, height = 800)
fig1.update_layout(template='plotly_dark', showlegend=False, width=2000, height=800)
py.iplot(fig1)

# Also write a standalone HTML copy without opening a browser
plotly.offline.plot(fig1, filename='output_plots/3Dportrait_umap_' + metric + '_' + organism + '.html', auto_open=False)

#### UMAP PORTRAIT Coordinates EXPORT

In [None]:
# Disabled export cell, kept as a string literal so it does not run.
# Fixes relative to the previous text: the RWR frame is built from umap_posG3d_m
# (umap_posG3d_mlog is never defined), the spring export of the undefined
# df_xyz_rgba_spring is commented out, and all file names share the
# '3Dportrait_umap_' prefix.
'''
# NOTE(review): this overrides the notebook-wide `organism` - confirm before enabling.
organism = 'Yeast'


colours_rgba = []
for i in colours: 
    if i == ess_col:
        colours_rgba.append('rgba(66, 117, 154, 0.6)')
    elif i == no_ess_col:
        colours_rgba.append('rgba(162, 193, 216, 0.6)')
    else:
        colours_rgba.append('rgba(0,0,0,0.4)')
        

df_xyz_rgba_3Dadj = pd.DataFrame(umap_posG3d_adj, index=["x","y","z"]).T
df_xyz_rgba_3Dadj['color'] = colours_rgba

df_xyz_rgba_3Dspl = pd.DataFrame(umap_posG3d_spl, index=["x","y","z"]).T
df_xyz_rgba_3Dspl['color'] = colours_rgba

df_xyz_rgba_3Dmlog = pd.DataFrame(umap_posG3d_m, index=["x","y","z"]).T
df_xyz_rgba_3Dmlog['color'] = colours_rgba

df_xyz_rgba_3Deucl = pd.DataFrame(umap_posG3d_eucl, index=["x","y","z"]).T
df_xyz_rgba_3Deucl['color'] = colours_rgba


# -----------------------------------------------

#df_xyz_rgba_3Dspring.to_csv(r'output_csv/3Dspring_XYZ_RGBA_'+organism+'.csv', index = True)

df_xyz_rgba_3Dadj.to_csv(r'output_csv/3Dportrait_umap_XYZ_RGBA_Adjacency_'+organism+'.csv', index = True)
df_xyz_rgba_3Dspl.to_csv(r'output_csv/3Dportrait_umap_XYZ_RGBA_SPL_'+organism+'.csv', index = True)
df_xyz_rgba_3Dmlog.to_csv(r'output_csv/3Dportrait_umap_XYZ_RGBA_RWRlog_'+organism+'.csv', index = True)
df_xyz_rgba_3Deucl.to_csv(r'output_csv/3Dportrait_umap_XYZ_RGBA_PW_eucl_'+organism+'.csv', index = True)
'''

## SPHERE

#### UMAP

In [None]:
# Rebinds the notebook-wide `metric` for the sphere embeddings; it is also used
# in the sphere figure's titles and output file name.
metric = 'cosine'

# Radius parameter
# Passed to get_posG_with_sphere_radius below.
# NOTE(review): d_essentiality_scores_sorted is not defined in the visible cells - confirm it exists for this organism.
d_param = d_essentiality_scores_sorted

In [None]:
%%time 
# Sphere embedding of DM_adj, then the same positions combined with the radius parameter d_param.
umap_sphere_adj = embed_umap_sphere(G, DM_adj, metric)
umap_sphere_adj_withrad = get_posG_with_sphere_radius(G, umap_sphere_adj, d_param)

In [None]:
%%time 
# Sphere embedding of DM_spl, then the same positions combined with the radius parameter d_param.
umap_sphere_spl = embed_umap_sphere(G, DM_spl, metric)
umap_sphere_spl_withrad = get_posG_with_sphere_radius(G, umap_sphere_spl, d_param)

In [None]:
%%time 
# Sphere embedding of DM_m, then the same positions combined with the radius parameter d_param.
umap_sphere_m = embed_umap_sphere(G, DM_m, metric)
umap_sphere_m_withrad = get_posG_with_sphere_radius(G, umap_sphere_m, d_param)

In [None]:
%%time 
# Sphere embedding of DM_eucl, then the same positions combined with the radius parameter d_param.
umap_sphere_eucl = embed_umap_sphere(G, DM_eucl, metric)
umap_sphere_eucl_withrad = get_posG_with_sphere_radius(G, umap_sphere_eucl, d_param)

In [None]:
%%time 
# Edge and node traces for the DM_adj sphere, both built from the radius-adjusted positions.
umap_sphere_trace_adj_edges = get_trace_edges(G, umap_sphere_adj_withrad, edge_color)
umap_sphere_trace_adj_nodes = get_trace_umap_sphere(umap_sphere_adj_withrad, l_features, colours, size3d)
umap_sphere_data_adj = [umap_sphere_trace_adj_edges, umap_sphere_trace_adj_nodes]

In [None]:
%%time
# Edge and node traces for the DM_spl sphere, both built from the radius-adjusted positions.
umap_sphere_trace_spl_edges = get_trace_edges(G, umap_sphere_spl_withrad, edge_color)
umap_sphere_trace_spl_nodes = get_trace_umap_sphere(umap_sphere_spl_withrad, l_features, colours, size3d)
umap_sphere_data_spl = [umap_sphere_trace_spl_edges, umap_sphere_trace_spl_nodes]

In [None]:
%%time
# Edge and node traces for the DM_m sphere, both built from the radius-adjusted positions.
umap_sphere_trace_m_edges = get_trace_edges(G, umap_sphere_m_withrad, edge_color)
umap_sphere_trace_m_nodes = get_trace_umap_sphere(umap_sphere_m_withrad, l_features, colours, size3d)
umap_sphere_data_m = [umap_sphere_trace_m_edges, umap_sphere_trace_m_nodes]

In [None]:
%%time
# Edge and node traces for the DM_eucl sphere, both built from the radius-adjusted positions.
umap_sphere_trace_eucl_edges = get_trace_edges(G, umap_sphere_eucl_withrad, edge_color)
umap_sphere_trace_eucl_nodes = get_trace_umap_sphere(umap_sphere_eucl_withrad, l_features, colours, size3d)
umap_sphere_data_eucl = [umap_sphere_trace_eucl_edges, umap_sphere_trace_eucl_nodes]

In [None]:
print('Metric: ', metric)

# One 3D scene per distance matrix, arranged in a single row
panel_titles = tuple(prefix + metric for prefix in ('Adjacency | UMAP Sphere | Metric: ',
                                                    'SPL | UMAP Sphere | Metric: ',
                                                    'RWR -log | UMAP Sphere | Metric: ',
                                                    'PW Euclidean | UMAP Sphere | Metric: '))
fig = make_subplots(rows=1, cols=4,
                    specs=[[{'type': 'scatter3d'}] * 4],
                    print_grid=False,
                    subplot_titles=panel_titles)

# Column k (1-based) receives the traces of the k-th data list
panels = (umap_sphere_data_adj, umap_sphere_data_spl, umap_sphere_data_m, umap_sphere_data_eucl)
for column, traces in enumerate(panels, start=1):
    for trace in traces:
        fig.add_trace(trace, row=1, col=column)

# Same axis settings for x, y and z of `scene`
axis_style = dict(nticks=0, color='black')

# light alternative: fig.update_layout(template=None, showlegend = False, width = 2000, height = 800)
fig.update_layout(template='plotly_dark', showlegend=False, width=2000, height=800,
                  scene=dict(xaxis=dict(axis_style),
                             yaxis=dict(axis_style),
                             zaxis=dict(axis_style)))
py.iplot(fig)

# Also write a standalone HTML copy without opening a browser
plotly.offline.plot(fig, filename='output_plots/3Dsphere_umap_' + metric + '_' + organism + '.html', auto_open=False)

# -----some additional wip stuff-----

### COLOURING SUBCATEGORIES e.g. Disease subcategory (i.e. functional property)

In [None]:
# input = dict
# Colour every node of G by the disease subcategory of its gene.
# Reads: sub_categories (keys are tuples whose second element is the DOID),
#        d_do_genes (DOID -> genes), G.
# Writes: colours (one entry per node, in G.nodes() order) and edge_color.

n = len(sub_categories)
colors = generate_colorlist_nodes(n)

# One colour per subcategory, keyed by DOID
doid_coloured = {key[1]: colour for key, colour in zip(sub_categories, colors)}

# Give every gene the colour of its subcategory. A direct dict lookup replaces the
# former scan over all of doid_coloured for each gene. If a gene belongs to several
# coloured subcategories, the one iterated last in d_do_genes wins, as before.
d_gene_colours = {}
for doid, genes in d_do_genes.items():
    if doid not in doid_coloured:
        continue
    col = doid_coloured[doid]
    for gene in genes:
        d_gene_colours[gene] = col

# SORT dict based on G.nodes
# Nodes without a coloured subcategory fall back to 'lightgrey' instead of raising KeyError.
d_gene_colours_sorted = {key: d_gene_colours.get(key, 'lightgrey') for key in G.nodes()}

l_col_subcat = list(d_gene_colours_sorted.values())


# NODE COLOURING based on Subcategory
colours = l_col_subcat 
edge_color = 'lightgrey'

#sns.palplot(colours)

### COLOURING DISEASE SCORES

In [None]:
# TO DO : MAKE FUNCTION FOR COLORING 
# Colour "major" nodes (keys of d_major_score), their neighbours (major_neigh) in a
# lighter shade of the same colour, and everything else light grey; colour edges
# touching a major node with that node's colour.
# Reads: gene_list_retisarc, d_major_score, major_neigh, G.
# Writes: colours (per node, G.nodes() order) and edge_color (per edge, G.edges() order).

gene_list = gene_list_retisarc



# COLOURS GENERATED / disease score and neighboring nodes

n = len(gene_list)
color_major = generate_colorlist_nodes(n)
 
factor = 1.7 # the higher the lighter
# color_minor[k] is the lightened version of color_major[k]
color_minor = []
for hex_col in color_major:
    r, g, b = hex_to_rgb(hex_col)
    color_minor.append(adjust_color_lightness(r, g, b, factor))
    
    
# ------------------------------------------------------
# NODES coloured 

# Major nodes coloured: the k-th key of d_major_score gets the k-th colour
# (wrapping around if there are fewer colours than major nodes). The former nested
# loop overwrote each entry until every major node carried the last colour.
# NOTE(review): assumes the order of d_major_score is the intended colour order - confirm.
d_col_major = {}
if color_major:
    for idx, major in enumerate(d_major_score):
        d_col_major[major] = color_major[idx % len(color_major)]

# Neighbours of a major node get that node's lighter shade; major nodes keep their
# own colour. A neighbour shared by several major nodes takes the shade of the one
# that comes last in d_col_major.
d_col_minor = {}
for idx, major in enumerate(d_col_major):
    light = color_minor[idx % len(color_minor)]
    for neighbour in major_neigh.get(major, ()):
        if neighbour not in d_col_major:
            d_col_minor[neighbour] = light
                    
d_col = {**d_col_major,**d_col_minor}

# Every remaining node of G is light grey
d_grey = {node: 'lightgrey' for node in G.nodes() if node not in d_col}
        
        
d_col_all = {**d_col_major, **d_col_minor, **d_grey}
d_col_all_sorted = {key:d_col_all[key] for key in G.nodes()}

l_col_all = list(d_col_all_sorted.values())

colours = l_col_all

# ------------------------------------------------------
# EDGES coloured 

# Position of each major node in d_col_major; used to keep the previous tie-break
# (when both endpoints are major, the one later in d_col_major decides the colour).
major_rank = {major: pos for pos, major in enumerate(d_col_major)}

# Single pass over the edges instead of scanning all major nodes per edge
edge_lst = []
d_col_edges = {}
for edge in G.edges():
    major_ends = [end for end in edge if end in major_rank]
    if major_ends:
        edge_lst.append(edge)
        d_col_edges[edge] = d_col_major[max(major_ends, key=major_rank.get)]

d_grey_edges = {edge: 'lightgrey' for edge in G.edges() if edge not in d_col_edges}
        
d_edges_all = {**d_col_edges, **d_grey_edges}

# Sort according to G.edges()
d_edges_all_sorted = {key:d_edges_all[key] for key in G.edges()}

edge_color = list(d_edges_all_sorted.values())

In [None]:
# Disabled cell, kept as a string literal so it does not run. The text inside
# collects the genes of every disease whose name contains `disease_category` and
# builds `sub_categories`, keyed by (disease name, DOID) with the gene count as value.
# NOTE(review): the subcategory colouring cell reads `sub_categories`; confirm it is
# defined elsewhere before running that cell.
'''# CHOOSE DISEASE CATEGORY 
disease_category = 'cancer'

l_disease_genes = []
for d_name in d_names_do.keys():
    if d_name.find(disease_category) != -1:
        #print(d_name,d_names_do[d_name])
        try:
            l_genes = d_do_genes[d_names_do[d_name]]
            for gene in l_genes:
                l_disease_genes.append(gene)
        except:
            #print(d_names_do[d_name],d_name)
                pass
set_disease_genes = set(l_disease_genes)
print('\nThere are %s genes found to be associated with "%s".' %(len(set_disease_genes),disease_category))

sub_categories = {}
for d_name in d_names_do.keys():
    if d_name.find(disease_category) != -1:
        try:
            sub_categories[d_name,d_names_do[d_name]]=len(d_do_genes[d_names_do[d_name]])
            #print('specific disease: %s (%s) ; # associated genes: %s' %(d_name,d_names_do[d_name],len(d_do_genes[d_names_do[d_name]])))
        except:
            pass'''