In [1]:
import pandas
import numpy as np
from matplotlib import pyplot as plt
import graph_tools_construction as gt

# With Biological Networks

In [2]:

def make_network(pathway_name, all_edge_dataframe, undirected):
    '''
    Make a network (adjacency matrix) from the known edges of one pathway.

    Inputs:
        pathway_name: a string for the name of a pathway; rows of
            all_edge_dataframe whose 'pathway_id' equals it are used
        all_edge_dataframe: a dataframe with all the edges. Must have the
            columns 'pathway_id', 'src' and 'dest'. The 'direction' column
            is only read when undirected is False.
        undirected: if True every edge is mirrored (A is symmetric). If False
            only the edges whose 'direction' is 'undirected' are mirrored.

    Outputs:
        A: a numpy array of the adjacency matrix, A[i, j] = 1 for an edge
            from node_eids[i] (src) to node_eids[j] (dest)
        node_eids: a numpy array of the node ids whose indices correspond to
            the rows/columns of A
    '''

    #rows of the edge table that belong to this pathway
    edge_dataframe = all_edge_dataframe[all_edge_dataframe['pathway_id'] == pathway_name]

    #nodes in the pathway (every id that appears as a source or a destination)
    node_list = list(set(edge_dataframe['src']).union(set(edge_dataframe['dest'])))
    node_eids = np.array(node_list)

    n_nodes = len(node_list)

    #adjacency matrix
    A = np.zeros((n_nodes, n_nodes))

    #node id -> row/column position; one dictionary lookup per edge end
    #instead of scanning node_eids with np.where for every edge
    position = {eid: k for k, eid in enumerate(node_list)}
    src_idx = np.array([position[eid] for eid in edge_dataframe['src']], dtype=int)
    dest_idx = np.array([position[eid] for eid in edge_dataframe['dest']], dtype=int)

    #which edges also get the reverse entry
    if undirected:
        mirrored = np.ones(len(src_idx), dtype=bool)
    else:
        mirrored = (edge_dataframe['direction'] == 'undirected').to_numpy()

    #fill the adjacency matrix (every written value is 1, so order is irrelevant)
    A[src_idx, dest_idx] = 1
    A[dest_idx[mirrored], src_idx[mirrored]] = 1

    return A, node_eids

In [3]:
# Run configuration, read by the cells below:
#   centrality_measure: 'degree' is computed in this notebook, any other
#                       value is passed on to gt.centrality_scores
#   undirected:         passed to make_network and used to pick the output file
centrality_measure = 'page_rank'
undirected = True

#load the data
# The column is dropped with the explicit `columns=` keyword: passing the axis
# positionally (`.drop('other_genes', 1)`) is deprecated in pandas and is no
# longer accepted by pandas 2.x.
pathway_edges = pandas.read_csv('/data3/darpa/omics_databases/ensembl2pathway/reactome_edges_overlap_fixed_isolated.csv').drop(columns='other_genes').dropna()
# after dropna() the ids can be converted to the smallest integer dtype that fits
pathway_edges['dest'] = pandas.to_numeric(pathway_edges['dest'], downcast='integer')
pathway_edges['src'] = pandas.to_numeric(pathway_edges['src'], downcast='integer')

#OLD:
# pathway_edges = pandas.read_csv('/data3/darpa/omics_databases/ensembl2pathway/reactome_human_pathway_edges.csv').dropna()
# vardata = pandas.read_csv('/data4/kehoe/GSE73072/GSE73072_vardata.csv')
# data = pandas.read_csv('/data4/kehoe/GSE73072/GSE73072_data.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)
  pathway_edges = pandas.read_csv('/data3/darpa/omics_databases/ensembl2pathway/reactome_edges_overlap_fixed_isolated.csv').drop('other_genes', 1).dropna()


In [4]:
# Build an empty dataframe with one column per eid found in pathway_edges

# every id that appears as either end of an edge
source_ids = set(pathway_edges['src'])
dest_ids = set(pathway_edges['dest'])
sorted_ids = np.sort(list(source_ids.union(dest_ids)))

# column labels are the ids as integer strings, in ascending order
eids = [str(int(node_id)) for node_id in sorted_ids]

# no rows yet; one row per pathway is added later
big_pathway_centralities = pandas.DataFrame(columns=eids)

In [5]:
# load names of the pathways and init pathway dataframe
pathway_names = np.unique(np.array(pathway_edges['pathway_id']))

n_pathways = len(pathway_names)

#all eids (the column layout does not change inside the loop, so read it once)
column_names = list(big_pathway_centralities.columns)

# One single-row dataframe per pathway, concatenated once after the loop.
# DataFrame.append is not available in pandas 2.x, and growing a dataframe
# row by row copies all earlier rows on every iteration.
pathway_rows = []

# go through every pathway name
for ii, pathway_name in enumerate(pathway_names):

    #make adjacency matrix
    A, n_eids = make_network(pathway_name, pathway_edges, undirected)

    #node eids as strings, same order as the rows/columns of A
    string_node_eids = [str(int(node)) for node in n_eids]

    if centrality_measure == 'degree':
        #column sums of the adjacency matrix
        degrees = np.sum(A,axis = 0)
        scores = degrees/np.max(degrees)/len(string_node_eids)
    else:
        #calculate pathway scores
        scores = gt.centrality_scores(A, centrality_measure)

    #rescale so the scores of one pathway sum to 1
    scores = scores/np.sum(scores)

    #row for this pathway: 0 for every eid, then the scores of its own nodes
    row_values = dict.fromkeys(column_names, 0)
    row_values.update(zip(string_node_eids, scores))
    pathway_rows.append(pandas.DataFrame([row_values], index=[pathway_name]))

    #count the number of pathways finished
    print('pathway '+str(ii) +' out of '+str(n_pathways)+' done')

# Build the result from this run's rows only, so re-running the cell does not
# stack a second copy of every pathway under the first.
if pathway_rows:
    big_pathway_centralities = pandas.concat(pathway_rows)
else:
    big_pathway_centralities = pandas.DataFrame(columns=column_names)



pathway 0 out of 1655 done
pathway 1 out of 1655 done
pathway 2 out of 1655 done
pathway 3 out of 1655 done
pathway 4 out of 1655 done
pathway 5 out of 1655 done
pathway 6 out of 1655 done
pathway 7 out of 1655 done
pathway 8 out of 1655 done
pathway 9 out of 1655 done
pathway 10 out of 1655 done
pathway 11 out of 1655 done
pathway 12 out of 1655 done
pathway 13 out of 1655 done
pathway 14 out of 1655 done
pathway 15 out of 1655 done
pathway 16 out of 1655 done
pathway 17 out of 1655 done
pathway 18 out of 1655 done
pathway 19 out of 1655 done
pathway 20 out of 1655 done
pathway 21 out of 1655 done
pathway 22 out of 1655 done
pathway 23 out of 1655 done
pathway 24 out of 1655 done
pathway 25 out of 1655 done
pathway 26 out of 1655 done
pathway 27 out of 1655 done
pathway 28 out of 1655 done
pathway 29 out of 1655 done
pathway 30 out of 1655 done
pathway 31 out of 1655 done
pathway 32 out of 1655 done
pathway 33 out of 1655 done
pathway 34 out of 1655 done
pathway 35 out of 1655 done
pa

In [6]:
# Pick the output file prefix from the graph type, then write the matrix once.
if undirected:
    csv_prefix = '/data4/mankovic/GSE73072/network_centrality/pathway_matrix/2-4hr/gse73072_undirected_'
else:
    csv_prefix = '/data4/mankovic/GSE73072/network_centrality/pathway_matrix/2-4hr/gse73072_directed_'

# index = True keeps the pathway names as the first column of the csv
big_pathway_centralities.to_csv(csv_prefix+centrality_measure+'_all_genes.csv', index = True)

In [7]:
# The two files have the same name and differ only in their directory.
csv_name_tail = centrality_measure+'_all_genes.csv'

# matrix from the 'sanity' directory
new = pandas.read_csv('/data4/mankovic/GSE73072/network_centrality/pathway_matrix/sanity/gse73072_directed_'+csv_name_tail)
# matrix from the parent 'pathway_matrix' directory
old = pandas.read_csv('/data4/mankovic/GSE73072/network_centrality/pathway_matrix/gse73072_directed_'+csv_name_tail)

FileNotFoundError: [Errno 2] No such file or directory: '/data4/mankovic/GSE73072/network_centrality/pathway_matrix/gse73072_directed_page_rank_all_genes.csv'

In [14]:
# Sanity check: compare the two loaded matrices; the result is displayed
# because it is the last expression of the cell.
# NOTE(review): the load cell above is recorded with a FileNotFoundError and
# this cell's execution count is out of sequence, so the displayed result
# presumably comes from earlier kernel state - re-run both cells to confirm.
new.equals(old)

True