-----
# cartoGRAPHs
precalculated large networks , e.g. the Interactome

A Notebook to produce 2D and 3D network layouts from any Graph,
including interactive visualization (html files) and export functions 
to import into the VRNetzer analytics platform by Pirch et al.

-----

In [1]:
from cartoGRAPHs import * 

from func_load_data import *
from func_visual_properties import * 
from func_calculations import * 
from func_embed_plot import *
from func_exportVR import * 

_____
# HUMAN NETWORK
_____

In [2]:
organism = 'human'

G = load_graph(organism) 

d_gene_sym = load_genesymbols(G, organism)
l_features = list(d_gene_sym.values())

d_gene_do = pickle.load( open( "input/d_gene_do.pkl", "rb" ) )
d_do_genes = pickle.load( open( "input/d_do_genes.pkl", "rb" ) )
d_do_names = pickle.load( open( "input/DO_names.pkl", "rb" ) )
d_names_do = {y:x for x,y in d_do_names.items()}

In [3]:
d_centralities = load_centralities(G, organism)
df_centralities = pd.DataFrame(d_centralities).T
df_centralities.columns = ['degree','closeness', 'betweeness', 'eigenvector']
essential_genes,non_ess_genes,notdefined_genes = load_essentiality(G, organism)

_____ 
# FIGURE 2B - disease layout with ENHANCING FACTOR 
_____

In [4]:
# get Layout basis --> global layout matrix 
DF_structural = load_datamatrix(G,organism, 'global')
df_max = DF_structural.max()
l_max_visprob = max(list(df_max.values))


# get Disease Feature Matrix
FM_disease = pd.read_csv('input/Features_Disease_Dataframe_'+organism+'.csv', index_col=0)

# VECTOR : NF + DISEASE MODULE 
e.g.
+ DOID:0111253 / Neurofibromatosis1
+ DOID:8712 / Neurofibromatosis
+ DOID:962 / Neurofibroma
+ DOID:0050736 / Autosomal Dominant Disease

In [5]:
path_nf = 'input/neurofibromatosis/'

# gene of interest
nf_goi_ = str(int(np.loadtxt(path_nf+'nf_gene_of_interest.txt', unpack=False)))
nf_goi = list(nf_goi_.split(" "))
nf_goi_str = ' '.join([str(item) for item in nf_goi])

nf_neighbors = []
for edge in G.edges():
    if edge[0] == nf_goi_str:
        nf_neighbors.append(edge[1])
    elif edge[1] == nf_goi_str: 
        nf_neighbors.append(edge[0])
        
        
# get disease module genes 
num_doid = '8712' #'0050736'

# define specific DOID 
dismod = FM_disease.loc[:,'DOID:'+num_doid]
dismod.index = FM_disease.index
dismod_id = 'DOID'+num_doid

dismod_genes = [] 
for i,v in enumerate(dismod.values):
    if v == 1:
        dismod_genes.append(dismod.index[i])
    else:
        pass

print('Number of Genes associated to DOID', num_doid,':',len(dismod_genes))

l_genes_dismod = [str(i) for i in dismod_genes]# + nf_neighbors
print('Genes to be enhanced:', len(l_genes_dismod))

# create a dataframe (sorted by DF global) and fill 1 when disease-assoc. gene
DF_dismod = pd.DataFrame(dismod.reindex(DF_structural.index ,fill_value=0))
for i in DF_dismod.index:
    if i in l_genes_dismod:
        DF_dismod.loc[i] = 1
    
DF_dismod.columns = ['Neurofib 8712']

Number of Genes associated to DOID 8712 : 12
Genes to be enhanced: 12


In [6]:
# test if dataframe contains value 1 at respective position of genes in l_genes_.. 

#for i,val in enumerate(DF_dismod.values):
#    if val == 1: 
#        print(DF_dismod.index[i])

#### SPECIFIC NODE + EDGE COLORS

In [7]:
# color nodes
node_col_dismod = '#00D9FC' #'#008792' #'#00E3DF' #'#FF8E00' 

# color edges    
edge_color = node_col_dismod
d_edge_col_dismod = color_edges_from_nodelist_specific(G, l_genes_dismod, node_col_dismod)
len(d_edge_col_dismod)

3

# VECTOR : FIRST LEVEL DISEASE MODULE

In [8]:
l_genes_dismod_neigh = nf_neighbors
print('Genes to be enhanced:', len(l_genes_dismod_neigh))

# create a dataframe (sorted by DF global) and fill 1 when disease-assoc. gene
DF_dismod_neigh = pd.DataFrame(dismod.reindex(DF_structural.index ,fill_value=0))
for i in DF_dismod_neigh.index:
    if i in l_genes_dismod_neigh:
        DF_dismod_neigh.loc[i] = 1
        
DF_dismod_neigh.columns = ['NF1 neighbors'] # not including NF1 

Genes to be enhanced: 49


#### SPECIFIC NODE + EDGE COLORS

In [9]:
# color nodes
node_col_neigh = node_col_dismod

# color edges    
edge_color = node_col_neigh
d_edge_col_neigh = color_edges_from_nodelist_specific(G, l_genes_dismod_neigh, node_col_neigh)
len(d_edge_col_neigh)

68

# VECTORS : Disease Context 
+ DOID:3165 - Skin benign neoplasm
+ DOID:10534 - Stomach Cancer
+ DOID:3620 - Central Nervous System Cancer
+ DOID:767 - Muscular Atrophy
+ resources: https://disease-ontology.org/ https://en.wikipedia.org/wiki/Neurofibromatosis

### First Disease Vector | DOID:3165 - Skin benign neoplasm

In [10]:
# -----------------------------
num_doid_1 = '3165'
# -----------------------------

# define specific DOID 
dismod_1 = FM_disease.loc[:,'DOID:'+num_doid_1]
dismod_1.index = FM_disease.index
dismod_id_1 = 'DOID'+num_doid_1

# get all genes associated to disease
dismod_genes_1 = [] 
for i,v in enumerate(dismod_1.values):
    if v == 1:
        dismod_genes_1.append(dismod_1.index[i])
    else:
        pass

l_genes_dismod_1 = [str(i) for i in dismod_genes_1]
print('Number of Genes associated to DOID', num_doid_1,':',len(l_genes_dismod_1))

# create a dataframe (sorted by DF global) and fill 1 when disease-assoc. gene
DF_dismod_1 = pd.DataFrame(dismod_1.reindex(DF_structural.index ,fill_value=0))
for i in DF_dismod_1.index:
    if i in l_genes_dismod_1:
        DF_dismod_1.loc[i] = 1

Number of Genes associated to DOID 3165 : 163


#### SPECIFIC NODE + EDGE COLORS

In [11]:
# color nodes
node_col_1 = '#FF6919' # (orange) #'#B00160'

# color edges    
edge_color = node_col_1
d_edge_col_1 = color_edges_from_nodelist_specific(G, l_genes_dismod_1, node_col_1)
len(d_edge_col_1)

446

### Second Disease Vector | DOID:10534 - Stomach Cancer

In [12]:
# -----------------------------
num_doid_2 = '10534' #'1612' #'3883'
# -----------------------------

# define specific DOID 
dismod_2 = FM_disease.loc[:,'DOID:'+num_doid_2]
dismod_2.index = FM_disease.index
dismod_id_2 = 'DOID'+num_doid_2

# get all genes associated to disease
dismod_genes_2 = [] 
for i,v in enumerate(dismod_2.values):
    if v == 1:
        dismod_genes_2.append(dismod_2.index[i])
    else:
        pass
    
l_genes_dismod_2 = [str(i) for i in dismod_genes_2]
print('Number of Genes associated to DOID', num_doid_2,':',len(l_genes_dismod_2))

# create a dataframe (sorted by DF global) and fill 1 when disease-assoc. gene
DF_dismod_2 = pd.DataFrame(dismod_2.reindex(DF_structural.index ,fill_value=0))
for i in DF_dismod_2.index:
    if i in l_genes_dismod_2:
        DF_dismod_2.loc[i] = 1

Number of Genes associated to DOID 10534 : 303


#### SPECIFIC NODE + EDGE COLORS

In [13]:
# color nodes
node_col_2 = '#FFC433' #(orangeyellow)


# color edges    
edge_color = node_col_2
d_edge_col_2 = color_edges_from_nodelist_specific(G, l_genes_dismod_2, node_col_2)
len(d_edge_col_2)

910

### Third Disease Vector | DOID:3620 - Central Nervous System Cancer

In [14]:
# -----------------------------
num_doid_3 = '3620' #'0060115' 
# -----------------------------

# define specific DOID 
dismod_3 = FM_disease.loc[:,'DOID:'+num_doid_3]
dismod_3.index = FM_disease.index
dismod_id_3 = 'DOID'+num_doid_3

dismod_genes_3 = [] 
for i,v in enumerate(dismod_3.values):
    if v == 1:
        dismod_genes_3.append(dismod_3.index[i])
    else:
        pass

l_genes_dismod_3 = [str(i) for i in dismod_genes_3]
print('Number of Genes associated to DOID', num_doid_3,':',len(l_genes_dismod_3))

# create a dataframe (sorted by DF global) and fill 1 when disease-assoc. gene
DF_dismod_3 = pd.DataFrame(dismod_3.reindex(DF_structural.index ,fill_value=0))
for i in DF_dismod_3.index:
    if i in l_genes_dismod_3:
        DF_dismod_3.loc[i] = 1

Number of Genes associated to DOID 3620 : 173


#### SPECIFIC NODE + EDGE COLORS

In [15]:
# color nodes
node_col_3 = '#AA0707' # (red)

# color edges    
edge_color = node_col_3
d_edge_col_3 = color_edges_from_nodelist_specific(G, l_genes_dismod_3, node_col_3)
len(d_edge_col_3)

496

### Fourth Disease Vector | DOID:767 - Muscular Atrophy

In [16]:
# -----------------------------
num_doid_4 = '767'
# -----------------------------

# define specific DOID 
dismod_4 = FM_disease.loc[:,'DOID:'+num_doid_4]
dismod_4.index = FM_disease.index
dismod_id_4 = 'DOID'+num_doid_4

dismod_genes_4 = [] 
for i,v in enumerate(dismod_4.values):
    if v == 1:
        dismod_genes_4.append(dismod_4.index[i])
    else:
        pass
    
l_genes_dismod_4 = [str(i) for i in dismod_genes_4]
print('Number of Genes associated to DOID', num_doid_4,':',len(l_genes_dismod_4))

# create a dataframe (sorted by DF global) and fill 1 when disease-assoc. gene
DF_dismod_4 = pd.DataFrame(dismod_4.reindex(DF_structural.index ,fill_value=0))
for i in DF_dismod_4.index:
    if i in l_genes_dismod_4:
        DF_dismod_4.loc[i] = 1

Number of Genes associated to DOID 767 : 149


#### SPECIFIC NODE + EDGE COLORS

In [17]:
# color nodes
node_col_4 = '#9BEE00' # (yellowgreen) '#FFE000' #(yellow) F800F4' #(pink)

# color edges    
edge_color = node_col_4
d_edge_col_4 = color_edges_from_nodelist_specific(G, l_genes_dismod_4, node_col_4)
len(d_edge_col_4)

191

### All Disease Vectors > and multiply with enhancing factor

In [18]:
# --------------------------------------
#
scalar_val_dismod = 3
#
# --------------------------------------
enhance_factor_dismod = round((1-l_max_visprob)*scalar_val_dismod,10)
print('ENHANCING FACTOR Disease Module: ',enhance_factor_dismod)


# --------------------------------------
#
scalar_val_discontext = 3
#
# -------------------------------------- 
enhance_factor_discontext = round((1-l_max_visprob)*scalar_val_discontext,10)
print('ENHANCING FACTOR Disease Context : ', enhance_factor_discontext)

ENHANCING FACTOR Disease Module:  0.281271
ENHANCING FACTOR Disease Context :  0.281271


In [19]:
# multiply with enhancing factor 

DF_dismod_scal = DF_dismod * enhance_factor_dismod
DF_dismod_neigh_scal = DF_dismod_neigh * enhance_factor_dismod

DF_dismod_1_scal = DF_dismod_1 * enhance_factor_discontext
DF_dismod_2_scal = DF_dismod_2 * enhance_factor_discontext
DF_dismod_3_scal = DF_dismod_3 * enhance_factor_discontext
DF_dismod_4_scal = DF_dismod_4 * enhance_factor_discontext

# merge into one dataframe 
DF_merge_discontext = pd.concat([
                                DF_structural, # original global matrix 
                                DF_dismod_scal, # Disease Module 1 Vector 
                                #DF_dismod_neigh_scal, # Autosomal Dominant Disease (i.e. larger Context of NF)
                                DF_dismod_1_scal, # Disease Context Vectors 1-4
                                DF_dismod_2_scal,
                                DF_dismod_3_scal,
                                DF_dismod_4_scal],axis=1
                                )

DF_merge_discontext

Unnamed: 0,66008,8473,2561,3759,22906,4928,1994,8481,81610,51361,...,143903,10861,51471,221044,29965,Neurofib 8712,DOID:3165,DOID:10534,DOID:3620,DOID:767
66008,0.900242,0.000687,0.012865,0.006433,0.005347,0.000460,0.000051,0.000345,0.003464,0.002905,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
8473,0.002519,0.900245,0.000036,0.000018,0.005316,0.000005,0.000002,0.000003,0.000012,0.000008,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
2561,0.002502,0.000002,0.900367,0.000018,0.000015,0.000002,0.000000,0.000001,0.000010,0.000008,...,0.000001,0.000001,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
3759,0.002502,0.000002,0.000036,0.900389,0.000015,0.000002,0.000000,0.000001,0.000010,0.000008,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
22906,0.002525,0.000685,0.000036,0.000018,0.900120,0.000459,0.000000,0.000001,0.000010,0.000009,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143903,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.900070,0.000070,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
10861,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000070,0.900070,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
51471,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.900052,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0
221044,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.90003,0.000000,0.0,0.0,0.0,0.0,0.0


### 2D PORTRAIT POSITIONING 

In [20]:
n_n = 8  #6 #20
spr = 1.0
md = 0.0 #0.4 #0.4
metr = 'cosine'

In [None]:
%%time 
posG_umap2D = layout_portrait_umap(G,DF_merge_discontext, 2, 
                                   n_neighbors = n_n, 
                                   spread = spr, 
                                   min_dist = md, 
                                   metric = metr) 
posG = posG_umap2D

### VISUAL SETTINGS + PLOT PREPARATION

#### GENERAL NODE + EDGE COLORS 

In [None]:
opacity_nodes = 0.8
node_edge_col = '#696969' 
node_linewidth = 1

scale_factor = 0.5
size_plotly = list(draw_node_degree(G, scale_factor/100).values())

width_edges = 0.35
opacity_edges = 0.15

#### MERGE NODE COLORS > 4 diseases = 4 different colors 

In [None]:
# merge all node colors (and sort like G.nodes)

col_param = list(df_centralities['closeness'].values)
d_node_colors = dict(zip(list(G.nodes()), col_param))

c1='#E6E6E6' 
c2='#9B9B9B' 
n=len(set(col_param))

colors_for_palette = []
for x in range(n+1):
    colors_for_palette.append(colorFader(c1,c2,x/n))    

customPalette = sns.set_palette(sns.color_palette(colors_for_palette))
d_colors = color_nodes_from_dict(G, d_node_colors, palette = customPalette)

d_col_all = {}
for k,v in d_colors.items():
    
    if k in l_genes_dismod:
        d_col_all[k] = node_col_dismod
        
    elif k in l_genes_dismod_neigh:
        d_col_all[k] = node_col_dismod
        
    elif k in l_genes_dismod_1:
        d_col_all[k] = node_col_1
        
    elif k in l_genes_dismod_2:
        d_col_all[k] = node_col_2
        
    elif k in l_genes_dismod_3:
        d_col_all[k] = node_col_3
    
    elif k in l_genes_dismod_4:
        d_col_all[k] = node_col_4
        
    else:
        d_col_all[k]=v
        
colors = list(d_col_all.values())

l_genes_all = l_genes_dismod + l_genes_dismod_1 + l_genes_dismod_2 + l_genes_dismod_3 + l_genes_dismod_4 + l_genes_dismod_neigh

### PLOT NETWORK PORTRAIT - plotly 

In [None]:
# plot nodes based on focus > separate to background/foreground 

posG_foreground = {}
posG_background = {}
for k,v in posG.items():
    if k in l_genes_all:
        posG_foreground[k]=v
    else: 
        posG_background[k]=v
    
d_colors_foreground = {}
d_colors_background = {}
for i,c in d_col_all.items():
    if i in posG_foreground.keys():
        d_colors_foreground[i]=c
    else: 
        d_colors_background[i]=c

colors_foreground = list(d_colors_foreground.values())
colors_background = list(d_colors_background.values())

d_feat_foreground = {}
d_feat_background = {}
for k,v in d_gene_sym.items():
    if k in posG_foreground: 
        d_feat_foreground[k]=v
    else:
        d_feat_background[k]=v
        
feat_foreground = list(d_feat_foreground.values())
feat_background = list(d_feat_background.values())

d_size_plotly = draw_node_degree(G, scale_factor/100)
d_size_plotly_foreground = {}
d_size_plotly_background = {}
for k,v in d_size_plotly.items():
    if k in posG_foreground.keys():
        d_size_plotly_foreground[k]=v
    else:
        d_size_plotly_background[k]=v
        
size_plotly_foreground = list(d_size_plotly_foreground.values())
size_plotly_background = list(d_size_plotly_background.values())


umap_nodes_foreground = get_trace_nodes_2D(posG_foreground, feat_foreground, colors_foreground, size_plotly_foreground, 
                                           node_linewidth*0.25, 0.8)
umap_nodes_foreground_ = get_trace_nodes_2D(posG_foreground, feat_foreground, colors_foreground, 5,
                                           None, 0.2)
umap_nodes_background = get_trace_nodes_2D(posG_background, feat_background, colors_background, size_plotly_background, 
                                           None,0.5) #node_linewidth*0.05, 0.5)
umap_nodes_background_ = get_trace_nodes_2D(posG_background, feat_background, colors_background, 2, 
                                            None,0.5)

umap_edges_dismod = get_trace_edges_specific2D(d_edge_col_dismod, posG, linew=width_edges, opac=opacity_edges)
umap_edges_dismod_neigh = get_trace_edges_specific2D(d_edge_col_neigh, posG, linew=width_edges, opac=opacity_edges)

umap_edges_1 = get_trace_edges_specific2D(d_edge_col_1, posG, linew=width_edges, opac=opacity_edges)
umap_edges_2 = get_trace_edges_specific2D(d_edge_col_2, posG, linew=width_edges, opac=opacity_edges)
umap_edges_3 = get_trace_edges_specific2D(d_edge_col_3, posG, linew=width_edges, opac=opacity_edges)
umap_edges_4 = get_trace_edges_specific2D(d_edge_col_4, posG, linew=width_edges, opac=opacity_edges)

data = [umap_edges_1, 
        umap_edges_2,
        umap_edges_3,
        umap_edges_4,
        umap_edges_dismod,
        umap_edges_dismod_neigh,
        umap_nodes_background_, 
        umap_nodes_background, 
        umap_nodes_foreground_,
        umap_nodes_foreground]

path = 'output_plots/2Dlayouts/'
schema = 'light' #'dark' 
fname = '2Dportrait_NF8712_'+dismod_id_1+dismod_id_2+dismod_id_3+dismod_id_4+'_enhDismod'+str(scalar_val_dismod)+'_enhContext'+str(scalar_val_discontext)+'_nn'+str(n_n)+'_spr'+str(spr)+'_md'+str(md)+'_'+str(metr)+'_'+organism
plot_2D(data,path,fname)

# FIGURE PART 1
highlighting only Neurofibromatosis

In [194]:
col_param = list(df_centralities['closeness'].values)
d_node_colors = dict(zip(list(G.nodes()), col_param))

c1='#E6E6E6' 
c2='#9B9B9B' 
n=len(set(col_param))

colors_for_palette = []
for x in range(n+1):
    colors_for_palette.append(colorFader(c1,c2,x/n))    

customPalette = sns.set_palette(sns.color_palette(colors_for_palette))
d_colors = color_nodes_from_dict(G, d_node_colors, palette = customPalette)

d_col_all = {}
for k,v in d_colors.items():
    if k in l_genes_dismod:
        d_col_all[k] = node_col_dismod
    
    else:
        d_col_all[k]=v
        
colors = list(d_col_all.values())
l_genes_all = l_genes_dismod

In [195]:
# plot nodes based on focus > separate to background/foreground 

posG_foreground = {}
posG_background = {}
for k,v in posG.items():
    if k in l_genes_all:
        posG_foreground[k]=v
    else: 
        posG_background[k]=v
    
d_colors_foreground = {}
d_colors_background = {}
for i,c in d_col_all.items():
    if i in posG_foreground.keys():
        d_colors_foreground[i]=c
    else: 
        d_colors_background[i]=c

colors_foreground = list(d_colors_foreground.values())
colors_background = list(d_colors_background.values())

d_feat_foreground = {}
d_feat_background = {}
for k,v in d_gene_sym.items():
    if k in posG_foreground: 
        d_feat_foreground[k]=v
    else:
        d_feat_background[k]=v
        
feat_foreground = list(d_feat_foreground.values())
feat_background = list(d_feat_background.values())

d_size_plotly = draw_node_degree(G, scale_factor/100)
d_size_plotly_foreground = {}
d_size_plotly_background = {}
for k,v in d_size_plotly.items():
    if k in posG_foreground.keys():
        d_size_plotly_foreground[k]=v
    else:
        d_size_plotly_background[k]=v
        
size_plotly_foreground = list(d_size_plotly_foreground.values())
size_plotly_background = list(d_size_plotly_background.values())


umap_nodes_foreground = get_trace_nodes_2D(posG_foreground, feat_foreground, colors_foreground, size_plotly_foreground, 
                                           node_linewidth*0.25, 0.8)
umap_nodes_foreground_ = get_trace_nodes_2D(posG_foreground, feat_foreground, colors_foreground, 5,
                                           None, 0.2)
umap_nodes_background = get_trace_nodes_2D(posG_background, feat_background, colors_background, size_plotly_background, 
                                           None,0.5) #node_linewidth*0.05, 0.5)
umap_nodes_background_ = get_trace_nodes_2D(posG_background, feat_background, colors_background, 2, 
                                            None,0.5)

umap_edges_dismod = get_trace_edges_specific2D(d_edge_col_dismod, posG, linew=width_edges, opac=1) #opacity_edges)

data = [
        umap_edges_dismod,
        umap_nodes_background_, 
        umap_nodes_background, 
        umap_nodes_foreground_,
        umap_nodes_foreground]

path = 'output_plots/2Dlayouts/'
schema = 'light' #'dark' 
fname = '2Dportrait_NF8712_FirstPart_NF8712only_'+dismod_id_1+dismod_id_2+dismod_id_3+dismod_id_4+'_enhDismod'+str(scalar_val_dismod)+'_enhContext'+str(scalar_val_discontext)+'_nn'+str(n_n)+'_spr'+str(spr)+'_md'+str(md)+'_'+str(metr)+'_'+organism
plot_2D(data,path,fname)

'output_plots/2Dlayouts/2Dportrait_NF8712_FirstPart_NF8712only_DOID3165DOID10534DOID3620DOID767_enhDismod1.5_enhContext1.5_nn8_spr1.0_md0.0_cosine_human.html'

# FIGURE PART 2
highlighting Neurofibromatosis + neighbors of gene NF1

In [129]:
col_param = list(df_centralities['closeness'].values)
d_node_colors = dict(zip(list(G.nodes()), col_param))

c1='#E6E6E6' 
c2='#9B9B9B' 
n=len(set(col_param))

colors_for_palette = []
for x in range(n+1):
    colors_for_palette.append(colorFader(c1,c2,x/n))    

customPalette = sns.set_palette(sns.color_palette(colors_for_palette))
d_colors = color_nodes_from_dict(G, d_node_colors, palette = customPalette)

d_col_all = {}
for k,v in d_colors.items():
    if k in l_genes_dismod_neigh:
        d_col_all[k] = node_col_dismod
    
    else:
        d_col_all[k]=v
        
colors = list(d_col_all.values())

In [130]:
# plot nodes based on focus > separate to background/foreground 

posG_foreground = {}
posG_background = {}
for k,v in posG.items():
    if k in l_genes_dismod_neigh:
        posG_foreground[k]=v
    else: 
        posG_background[k]=v
    
d_colors_foreground = {}
d_colors_background = {}
for i,c in d_col_all.items():
    if i in posG_foreground.keys():
        d_colors_foreground[i]=c
    else: 
        d_colors_background[i]=c

colors_foreground = list(d_colors_foreground.values())
colors_background = list(d_colors_background.values())

d_feat_foreground = {}
d_feat_background = {}
for k,v in d_gene_sym.items():
    if k in posG_foreground: 
        d_feat_foreground[k]=v
    else:
        d_feat_background[k]=v
        
feat_foreground = list(d_feat_foreground.values())
feat_background = list(d_feat_background.values())

d_size_plotly = draw_node_degree(G, scale_factor/100)
d_size_plotly_foreground = {}
d_size_plotly_background = {}
for k,v in d_size_plotly.items():
    if k in posG_foreground.keys():
        d_size_plotly_foreground[k]=v
    else:
        d_size_plotly_background[k]=v
        
size_plotly_foreground = list(d_size_plotly_foreground.values())
size_plotly_background = list(d_size_plotly_background.values())


umap_nodes_foreground = get_trace_nodes_2D(posG_foreground, feat_foreground, colors_foreground, size_plotly_foreground, 
                                           node_linewidth*0.25, 0.8)
umap_nodes_foreground_ = get_trace_nodes_2D(posG_foreground, feat_foreground, colors_foreground, 5,
                                           None, 0.2)
umap_nodes_background = get_trace_nodes_2D(posG_background, feat_background, colors_background, size_plotly_background, 
                                           None,0.5) #node_linewidth*0.05, 0.5)
umap_nodes_background_ = get_trace_nodes_2D(posG_background, feat_background, colors_background, 2, 
                                            None,0.5)

umap_edges_dismod = get_trace_edges_specific2D(d_edge_col_auto, posG, linew=width_edges, opac=1) #opacity_edges)

data = [
        umap_edges_dismod,
        umap_nodes_background_, 
        umap_nodes_background, 
        umap_nodes_foreground_,
        umap_nodes_foreground]

path = 'output_plots/2Dlayouts/'
schema = 'light' #'dark' 
fname = '2Dportrait_NF8712_SecondPart_NFandNeigh'+dismod_id_1+dismod_id_2+dismod_id_3+dismod_id_4+'_enhDismod'+str(scalar_val_dismod)+'_enhContext'+str(scalar_val_discontext)+'_nn'+str(n_n)+'_spr'+str(spr)+'_md'+str(md)+'_'+str(metr)+'_'+organism
plot_2D(data,path,fname)

'output_plots/2Dlayouts/2Dportrait_NF8712_SecondPart_NFandAutosomalDOID3165DOID10534DOID3620DOID767_enhDismod8_enhContext3_nn8_spr1.0_md0.0_cosine_human.html'

# VERY MUCH WORK IN PROGRESS 

# 4. | STRUC + FUNC (complete disease matrix)

In [329]:
FM_disease = pd.read_csv('input/Features_Disease_Dataframe_'+organism+'.csv', index_col=0)

In [331]:
# --------------------------------------
#
scalar_val = 4
#
# --------------------------------------

enhance_factor = round((1-l_max_visprob)*scalar_val,10) # for emphasizing functional features > should be >= max of structural matrix values  # if significantly higher than max values in structural matrix > causes isolation of nodes with functional features enhanced
print('ENHANCING FACTOR functional features: ',enhance_factor)

n_n = 14
spr = 1
md = 0.0
metr = 'cosine'

ENHANCING FACTOR functional features:  0.375028


In [332]:
# create an empty matrix for zeros of rest genes (not associated to any disease)
rest = []
for i in G.nodes():
    if str(i) in FM_disease.index or int(i) in FM_disease.index:
        pass
    else: 
        rest.append(str(i))
        
empty = np.zeros(shape=(len(rest),len(FM_disease.columns)))
empty_stacked = np.vstack(empty)
df_empty = pd.DataFrame(empty_stacked)
df_empty.index = rest
df_empty.columns = FM_disease.columns

In [333]:
# combine disease vector with rest of genes in graph
df_diseases_complete = pd.concat([FM_disease, df_empty])
df_diseases_complete

Unnamed: 0,DOID:11054,DOID:3996,DOID:0050686,DOID:162,DOID:14566,DOID:4,DOID:615,DOID:9500,DOID:74,DOID:7,...,DOID:0060485,DOID:12577,DOID:732,DOID:0050816,DOID:0060589,DOID:0060202,DOID:14116,DOID:3137,DOID:100,DOID:0110648
389289,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4524,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4353,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27127,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5925,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
143903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
221044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [334]:
# reorder according to G.nodes
DF_diseases_reorder = df_diseases_complete.reindex(DF_structural.index)

# fill nan positions (due to reordering) with 1 
DF_diseases_new = DF_diseases_reorder.replace(np.nan,1)

# multiply with enhancing factor
DF_diseases_scal = DF_diseases_new*enhance_factor 

DF_merge_diseases = pd.concat([DF_structural.T, 
                             DF_diseases_scal],axis=1)

DF_merge_diseases

Unnamed: 0,66008,8473,2561,3759,22906,4928,1994,8481,81610,51361,...,DOID:0060485,DOID:12577,DOID:732,DOID:0050816,DOID:0060589,DOID:0060202,DOID:14116,DOID:3137,DOID:100,DOID:0110648
66008,0.900242,0.002519,0.002502,0.002502,0.002525,0.002517,0.002512,0.002511,0.002502,0.002501,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8473,0.000687,0.900245,0.000002,0.000002,0.000685,0.000008,0.000031,0.000006,0.000002,0.000002,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2561,0.012865,0.000036,0.900367,0.000036,0.000036,0.000065,0.000037,0.000036,0.000036,0.000036,...,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028
3759,0.006433,0.000018,0.000018,0.900389,0.000018,0.000027,0.000041,0.000018,0.000018,0.000018,...,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028
22906,0.005347,0.005316,0.000015,0.000015,0.900120,0.005314,0.000026,0.000015,0.000015,0.000016,...,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143903,0.000000,0.000000,0.000001,0.000000,0.000000,0.000000,0.000004,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10861,0.000000,0.000000,0.000001,0.000000,0.000000,0.000000,0.000004,0.000000,0.000000,0.000000,...,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028,0.375028
51471,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000054,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
221044,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000003,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


### 2D PORTRAIT

In [335]:
n_n = 12
spr = 1
md = 0.0
metr = 'cosine'

In [336]:
%%time 

posG_umap2D = layout_portrait_umap(G,DF_merge_diseases, 2,
                                   n_neighbors = n_n,
                                   spread = spr,
                                   min_dist = md,
                                   metric = metr) 
posG = posG_umap2D

CPU times: user 3min 9s, sys: 15.3 s, total: 3min 25s
Wall time: 1min 8s


## VISUAL SETTINGS + PLOT PREPARATION

#### NODES - GENERAL

In [337]:
opacity_nodes = 0.8
node_edge_col = '#696969' 
node_linewidth = 1

scale_factor = 0.5
size_plotly = list(draw_node_degree(G, scale_factor/100).values())

In [338]:
color_method = 'NF1'
l_genes = enhance_genes

node_col = '#00E3DF' #'#FF8E00' 
undefined_col = '#DEDEDE'
rest_col_nodes = '#DEDEDE'

d_col_all = color_nodes_from_list(G, l_genes, node_col)
colors = list(d_col_all.values())
    
edge_color = node_col 
d_edge_col = color_edges_from_nodelist_specific(G, l_genes, node_col)
width_edges = 0.25
opacity_edges = 0.5

## PLOT NETWORK PORTRAIT - plotly 

In [339]:
# plot nodes based on focus > separate to background/foreground 

posG_foreground = {}
posG_background = {}
for k,v in posG.items():
    if k in l_genes:
        posG_foreground[k]=v
    else: 
        posG_background[k]=v
    
d_colors_foreground = {}
d_colors_background = {}
for i,c in d_col_all.items():
    if i in posG_foreground.keys():
        d_colors_foreground[i]=c
    else: 
        d_colors_background[i]=c

colors_foreground = list(d_colors_foreground.values())
colors_background = list(d_colors_background.values())

d_feat_foreground = {}
d_feat_background = {}
for k,v in d_gene_sym.items():
    if k in posG_foreground: 
        d_feat_foreground[k]=v
    else:
        d_feat_background[k]=v
        
feat_foreground = list(d_feat_foreground.values())
feat_background = list(d_feat_background.values())

d_size_plotly = draw_node_degree(G, scale_factor/100)
d_size_plotly_foreground = {}
d_size_plotly_background = {}
for k,v in d_size_plotly.items():
    if k in posG_foreground.keys():
        d_size_plotly_foreground[k]=v
    else:
        d_size_plotly_background[k]=v
        
size_plotly_foreground = list(d_size_plotly_foreground.values())
size_plotly_background = list(d_size_plotly_background.values())

In [340]:
umap_nodes_foreground = get_trace_nodes_2D(posG_foreground, feat_foreground, colors_foreground, size_plotly_foreground, node_linewidth)
umap_nodes_background = get_trace_nodes_2D(posG_background, feat_background, colors_background, size_plotly_background, node_linewidth*0.25)
umap_nodes_background_ = get_trace_nodes_2D(posG_background, feat_background, colors_background, 0.5, node_linewidth*0.5)

umap_edges = get_trace_edges_specific2D(d_edge_col, posG, linew=width_edges, opac=opacity_edges)
data = [umap_edges, umap_nodes_background_, umap_nodes_background, umap_nodes_foreground]

path = 'output_plots/2Dlayouts/'
schema = 'light' #'dark' 
fname = '2Dportrait_NetlayoutDisease_DiseaseFeatureMatrix_enhance'+str(enhance_factor)+'_'+color_method+'_nn'+str(n_n)+'_spr'+str(spr)+'_md'+str(md)+'_'+str(metr)+'_'+organism

plot_2D(data,path,fname)

'output_plots/2Dlayouts/2Dportrait_NetlayoutDisease_DiseaseFeatureMatrix_enhance0.375028_NF1_nn12_spr1_md0.0_cosine_human.html'

### TOPOGRAPHIC MAP 

In [58]:
# ---------------------------------------------------------------------------
#
# Choose a z-parameter e.g. essentiality, centravg, disease 
# or choose any dictionary with nodeID: z-value
# by setting d_z = {nodeID: val, ...}
#
# ---------------------------------------------------------------------------
z_feat = 'disease'
# ---------------------------------------------------------------------------

if z_feat == 'essentiality':

    ''' Essentiality state of each node displayed on layers of "essential", "non-essential", "non-categorized" ''' 

    value_ess = 5
    value_noness = 3 
    value_notdef = 1

    d_ess = {}
    for i in essential_genes:
        d_ess[i] = value_ess
    d_noness = {}
    for i in non_ess_genes:
        d_noness[i] = value_noness
    d_notdef = {}
    for i in notdefined_genes:
        d_notdef[i] = value_notdef

    d_alless_unsort = {**d_ess,**d_noness,**d_notdef}
    d_z = {key:d_alless_unsort[key] for key in G.nodes()}
    
    l_genes = essential_genes
    
    node_col = '#00abff' 
    undefined_col = '#d3d3d3'
    rest_col_nodes = '#d3d3d3'

    d_col_all = color_nodes_from_list(G, essential_genes, node_col)
    colors = list(d_col_all.values())
    
    edge_color = '#ACACAC' 
    d_edge_col = color_edges_from_nodelist_specific(G, l_genes, node_col)
    width_edges = 0.15
    opacity_edges = 0.1
    
    umap_nodes = get_trace_nodes_3D(posG, l_features, colors, size, opacity_nodes)
    umap_edges = get_trace_edges_specific3D(d_edge_col, posG, linew=width_edges, opac=opacity_edges)  
    data = [umap_edges, umap_nodes]
    
    
elif z_feat == 'centravg':
    
    ''' Average of Centrality Metrics displayed in z-height ''' 
    
    d_clos = {}
    for k,v in d_centralities.items():
        d_clos[k] = v[1]

    d_nodecolors = d_clos 
    col_pal = 'YlOrRd'

    d_colors = color_nodes_from_dict(G, d_nodecolors, palette = col_pal)
    colors = list(d_colors.values())
    
    edge_color = '#ACACAC' 
    width_edges = 0.15
    opacity_edges = 0.1
    
    umap_nodes = get_trace_nodes_3D(posG, l_features, colors, size, opacity_nodes)
    umap_edges = get_trace_edges_3D(G, posG, edge_color, opac = 0.05, linewidth = 0.5)
    data = [umap_edges, umap_nodes]
    
    
elif z_feat == 'disease':
    
    ''' Disease count associated to a node displayed in z-height ''' 

    DM_feature  = pd.read_csv('input/Features_Disease_Dataframe_Human.csv', index_col=0)

    # set gene list (= G.nodes())
    genes_assoc = []
    for i in DM_feature.index:
        genes_assoc.append(str(i))

    genes_notassoc = [] 
    for g in G.nodes():
        if g not in genes_assoc:
            genes_notassoc.append(g)

    features_counted = (DM_feature == 1).astype(int).sum(axis=1)
    d_features_counted = dict(zip(genes_assoc, list(features_counted)))
    
    d_rest = {}
    for i in genes_notassoc: 
        d_rest[i] = -20

    d_param_unsorted = {**d_features_counted, **d_rest}
    d_z = {key:d_param_unsorted[key] for key in G.nodes()}
    
    d_clos = {}
    for k,v in d_centralities.items():
        d_clos[k] = v[1]

    d_nodecolors = d_clos 
    col_pal = 'YlOrRd'

    d_colors = color_nodes_from_dict(G, d_nodecolors, palette = col_pal)
    colors = list(d_colors.values())
    edge_color = '#ACACAC'
    width_edges = 0.15
    opacity_edges = 0.1

    umap_nodes = get_trace_nodes_3D(posG, l_features, colors, size, opacity_nodes)
    umap_edges = get_trace_edges_3D(G, posG, edge_color, opac = 0.05, linewidth = 0.5)
    data = [umap_edges, umap_nodes]
    
else:
    pass

#### PLOT 

In [59]:
# ---------------------------------------------------------------------------
#
# Choose a 2D layout as basis
#
# 1. either calculate one e.g.: 
# posG_2Dportrait = layout_portrait_umap(G,DM,2,n_neighbors=20, spread=1, min_dist=0.0, metric='cosine') 
# ---------------------------------------------------------------------------
#
# 2. or use a precalculated 2D layout 
posG_2D = posG_umap2D
# ---------------------------------------------------------------------------

posG = layout_topographic(posG_2D, d_z)

path = 'output_plots/Topographic/'
schema = 'light' #'dark'
fname = 'topographic_z_'+z_feat+'_'+netlayout+'_'+organism+'_'+schema

plot_3D(data,path,fname, schema)

'output_plots/Topographic/topographic_z_disease_local_human_light.html'

### GEODESIC MAP

In [60]:
# ---------------------------------------------------------------------------
#
# Choose a radius-parameter e.g. autocore, essentiality
# or choose any dictionary with nodeID: z-value
# by setting d_radius = {nodeID: val, ...}
#
# ---------------------------------------------------------------------------
r_feat = 'autocore'
# ---------------------------------------------------------------------------


if r_feat == 'autocore':

    # Load Gene Information for Gene lists 
    variants = pd.read_csv('input/julias_variants_mod.txt')
    variant_list = [str(i) for i in list(variants['variants'])]
    variant_sym = pd.read_csv('input/julias_variants_sym.txt')
    genes_sym_list = [str(i) for i in list(variant_sym['variant_sym'])]
    d_gene_sym_variants = dict(zip(variant_list,genes_sym_list))

    df_seeds = pd.read_csv('input/seeds_from_genelists.txt')
    df_seeds.columns = ['seeds']
    df_seeds_z = pd.read_csv('input/seeds_from_zNumbers.txt', sep='\n')
    seeds_list = [str(i) for i in list(df_seeds['seeds'])]

    d_seeds_idsym = {}
    for k,v in d_gene_sym.items():
        for i in seeds_list:
            if v == i:
                d_seeds_idsym[k]=i
    seed_list = list(d_seeds_idsym.keys())

    FM_BP = pd.read_csv('input/Features_GO_BiolProc_Dataframe_human.csv', index_col=0)
    bioproc_list = list(FM_BP.index)

    # From obtained Gene lists select those to be mapped onto different shells 
    # select genes for shells 
    shell_one = variant_list
    shell_two = seed_list
    shell_three = [str(i) for i in bioproc_list]

    small_rad = 1
    mid_rad = 5
    large_rad = 30
    outershell_rad = 50

    d_param = {}
    for i in G.nodes():
        if str(i) in variant_list:
            d_param[str(i)] = small_rad
        elif i in seed_list:
            d_param[str(i)] = mid_rad 
        elif int(i) in bioproc_list:
            d_param[str(i)] = large_rad 
        else:
            d_param[str(i)] = outershell_rad

    d_radius = d_param
    genes_rad = variant_list + seed_list + bioproc_list

    # check how many shells of spheres there will be :
    print('Number of Spherical Shells:', len(set(d_param.values())))


    # Specifying coloring based on genes on different shells 
    d_nodecol = d_clos 
    d_colors = color_nodes_from_dict(G, d_nodecol, palette = col_pal)

    d_colors_spec_genes = {}
    for k,v in d_colors.items():
        if k in shell_one:
            d_colors_spec_genes[k]='#8b0000' 
        elif k in shell_two:
            d_colors_spec_genes[k]='#FF4500' 
        elif k in shell_three:
            d_colors_spec_genes[k]=v
        else: 
            d_colors_spec_genes[k]='#d3d3d3'

    d_colors_spec_genes_sort = {key:d_colors_spec_genes[key] for key in G.nodes()}
    colors = list(d_colors_spec_genes_sort.values())

    
elif r_feat == 'essentiality':

    small_rad = 1
    mid_rad = 10
    outershell_rad = 20

    d_ess_scores = {}
    for i in G.nodes():
        if i in essential_genes:
            d_ess_scores[i] = small_rad
        elif i in non_ess_genes:
            d_ess_scores[i] = mid_rad
        elif str(i) in notdefined_genes:
            d_ess_scores[i] = outershell_rad

    d_radius = d_ess_scores
    genes_rad = list(G.nodes())

    # check how many shells of spheres there will be :
    print('Number of Spherical Shells:', len(set(d_param.values())))

    # Specifying coloring based on genes on different shells 
    d_colors_spec_genes = {}
    for k in G.nodes():
        if k in essential_genes:
            d_colors_spec_genes[k]='#0080bf' 
        elif k in non_ess_genes:
            d_colors_spec_genes[k]='#95dcff' 
        else: 
            d_colors_spec_genes[k]='#d3d3d3'

    d_colors_spec_genes_sort = {key:d_colors_spec_genes[key] for key in G.nodes()}
    colors = list(d_colors_spec_genes_sort.values())
    
    
else: 
    pass

Number of Spherical Shells: 4


#### PLOT 

In [65]:
# ---------------------------------------------------------------------------
#
# SELECT a r - Parameter:
#
# d_radius > a dictionary with keys=G.nodes and values=any radius assigned to each node
# ---------------------------------------------------------------------------

posG_sphere = layout_geodesic(G, d_radius, n_neighbors=20, spread=1, min_dist=0.0)#, DM=DM)
posG = posG_sphere

umap_nodes = get_trace_nodes_3D(posG, l_features, colors, size3d, opacity_nodes)
umap_edges = get_trace_edges_3D(G, posG, edge_color, opac = 0.05, linewidth = 0.5)
data = [umap_nodes]#,umap_edges]

path = 'output_plots/Geodesic/'
schema = 'light' #'dark'
fname = 'geodesic_r_'+r_feat+'_'+netlayout+'_'+organism+'_'+schema

plot_3D(data,path,fname, 'dark')

DM precalc used


'output_plots/Geodesic/geodesic_r_r-autocore_local_human_light.html'