In [1]:
import GLPE
#https://github.com/ekehoe32/orthrus
import sys
sys.path.append('/home/katrina/a/mankovic/ZOETIS/Fall2021/Orthrus/orthrus')
import orthrus
from orthrus import core
from orthrus.core import dataset
import numpy as np
# from NetworkDataAnalysis import graph_tools_construction as gt
from matplotlib import pyplot as plt
import pandas
# from orthrus.core.pipeline import *
from sklearn.preprocessing import FunctionTransformer
from orthrus.preprocessing.imputation import HalfMinimum
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from orthrus.core.helper import load_object

In [2]:
# Load the Reactome pathway edge list, discarding rows with any missing value.
pathway_edges = pandas.read_csv(
    '/data3/darpa/omics_databases/ensembl2pathway/reactome_human_pathway_edges.csv'
).dropna()

# Keep four columns selected by position. Judging by the displayed result they
# hold the two endpoint node ids, the edge type and the pathway id
# (NOTE(review): confirm against the CSV header).
incidence_data = np.array(pathway_edges)[:, [2, 4, 5, 7]]

# Relabel the node ids in the first two columns with consecutive integers
# 0..n-1, ordered as np.unique sorts the original ids.
node_ids = np.unique(incidence_data[:, :2])
translate_dict = {node_id: position for position, node_id in enumerate(node_ids)}
relabel = np.vectorize(translate_dict.get)
incidence_data[:, :2] = relabel(incidence_data[:, :2])

incidence_data

array([[4013, 1687, 'directed', 'R-HSA-1059683'],
       [4013, 1687, 'undirected', 'R-HSA-1059683'],
       [4013, 1594, 'directed', 'R-HSA-1059683'],
       ...,
       [7555, 1727, 'undirected', 'R-HSA-997272'],
       [7555, 1728, 'undirected', 'R-HSA-997272'],
       [7555, 1118, 'undirected', 'R-HSA-997272']], dtype=object)

In [3]:
# Build a CLPE estimator with degree centrality on a precomputed network
# described by the relabelled incidence data.
my_clpe = GLPE.CLPE(
    'degree',
    'precomputed',
    incidence_data,
    heat_kernel_param=2,
)
my_clpe

CLPE(centrality_measure='degree', heat_kernel_param=2.0,
     incidence_matrix=array([[4013, 1687, 'directed', 'R-HSA-1059683'],
       [4013, 1687, 'undirected', 'R-HSA-1059683'],
       [4013, 1594, 'directed', 'R-HSA-1059683'],
       ...,
       [7555, 1727, 'undirected', 'R-HSA-997272'],
       [7555, 1728, 'undirected', 'R-HSA-997272'],
       [7555, 1118, 'undirected', 'R-HSA-997272']], dtype=object),
     network_type='precomputed')

In [4]:
# Fit the estimator. No data matrix is passed; presumably the 'precomputed'
# network is taken from the incidence data given at construction (TODO confirm).
my_clpe.fit()

CLPE(centrality_measure='degree', heat_kernel_param=2.0,
     incidence_matrix=array([[4013, 1687, 'directed', 'R-HSA-1059683'],
       [4013, 1687, 'undirected', 'R-HSA-1059683'],
       [4013, 1594, 'directed', 'R-HSA-1059683'],
       ...,
       [7555, 1727, 'undirected', 'R-HSA-997272'],
       [7555, 1728, 'undirected', 'R-HSA-997272'],
       [7555, 1118, 'undirected', 'R-HSA-997272']], dtype=object),
     network_type='precomputed')

In [7]:
# Show the fitted pathway transition matrix alongside the pathway names.
my_clpe.pathway_transition_matrix_, my_clpe.pathway_names_

(array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00042105, 0.00042105, 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]]),
 array(['R-HSA-1059683', 'R-HSA-109581', 'R-HSA-109582', ...,
        'R-HSA-983712', 'R-HSA-991365', 'R-HSA-997272'], dtype=object))

In [8]:
# Load a previously saved pathway matrix from CSV (first column used as the
# row index) so it can be compared with the matrix fitted in this notebook.
other_PTM = pandas.read_csv('/data4/mankovic/GSE73072/network_centrality/pathway_matrix/gse73072_directed_degree_all_genes.csv', index_col = 0)

In [24]:
# Count the entries where the saved reference matrix and the freshly fitted
# pathway transition matrix disagree by more than 1e-8 in absolute value.
#
# An explicit absolute tolerance is used instead of rounding both sides to
# 8 decimals and testing `!=`: rounding can flag two almost identical floats
# as different when they fall on opposite sides of a rounding boundary.
# rtol=0 keeps the check purely absolute, matching the intent of the
# 8-decimal comparison. NaN entries still count as mismatches.
reference_ptm = np.asarray(other_PTM)
computed_ptm = np.asarray(my_clpe.pathway_transition_matrix_)
np.count_nonzero(~np.isclose(reference_ptm, computed_ptm, rtol=0.0, atol=1e-8))

0

In [25]:
# Load the dataset and keep only the rows whose metadata match
# Project 'Z40', Treatment 'High' and Timepoint 0.0.
Z40_dataset = dataset.load_dataset('/data4/mankovic/De-Identified_CZ/DeId_TPM_C1_Z40_Z34.ds')
selection_query = "Project == 'Z40' & Treatment == 'High' & Timepoint == 0.0"
Z40_dataset.metadata = Z40_dataset.metadata.query(selection_query)
sidx = list(Z40_dataset.metadata.index)
Z40_dataset.data = Z40_dataset.data.loc[sidx]

# Transform the data (according to Kartikay): HalfMinimum imputation treating
# 0 as the missing value, followed by an element-wise log2.
half_minimum_step = HalfMinimum(missing_values=0)
log2_step = FunctionTransformer(np.log2)
transform = make_pipeline(half_minimum_step, log2_step)
Z40_dataset.normalize(transform, norm_name='HalfMinimum + Log2')
Z40_data = Z40_dataset.data

# Table describing which genes are in which pathways.
pathway_data = pandas.read_csv('/data4/mankovic/De-Identified_CZ/deidentified_fcpw.csv')

# Restrict the pathway table to the pathway id column plus the genes that are
# actually present as columns of the expression data.
kept_columns = ['RandID'] + list(Z40_data.columns)
pathway_data = pathway_data[kept_columns]


In [7]:
# Convert the wide pathway table (one row per pathway, one column per gene)
# into a long table with one (feature_id, pathway_id) row per membership.
gene_names = pathway_data.columns
membership_records = []
for row in np.array(pathway_data):
    # Element-wise comparison on purpose: `row` is an array, so `row == True`
    # yields a boolean mask of the columns flagged for this pathway.
    member_positions = np.flatnonzero(row == True)
    # The first column ('RandID') carries the pathway identifier.
    pathway_id = row[0]
    membership_records.extend(
        {'feature_id': int(g), 'pathway_id': pathway_id}
        for g in gene_names[member_positions]
    )

# Build the frame once from the collected records. Appending row by row copies
# the whole frame on every iteration (quadratic cost), and DataFrame.append
# was removed in pandas 2.0.
better_pathway_data = pandas.DataFrame(membership_records, columns=['feature_id', 'pathway_id'])


In [8]:
# Switch to a plain array and relabel the feature ids in column 0 with
# consecutive integers 0..n-1, ordered as np.unique sorts the original ids.
# NOTE(review): this cell overwrites both its input and `node_ids`; running it
# a second time replaces `node_ids` with the compact indices instead of the
# original feature ids, which a later cell uses to select data columns.
better_pathway_data = np.array(better_pathway_data)
feature_column = better_pathway_data[:, 0]

node_ids = np.unique(feature_column)

translate_dict = {node_id: position for position, node_id in enumerate(node_ids)}
relabel = np.vectorize(translate_dict.get)
better_pathway_data[:, 0] = relabel(feature_column)

In [9]:
# Sanity check: list the distinct relabelled feature ids in column 0.
np.unique(better_pathway_data[:,0])

array([0, 1, 2, ..., 8797, 8798, 8799], dtype=object)

In [10]:
# Build a second CLPE estimator, this time with a 'correlation' network and
# the (feature index, pathway id) membership array as its incidence data.
my_other_clpe = GLPE.CLPE(
    'degree',
    'correlation',
    better_pathway_data,
    heat_kernel_param=2,
)
my_other_clpe

CLPE(centrality_measure='degree', heat_kernel_param=2.0,
     incidence_matrix=array([[81, 'p_0'],
       [342, 'p_0'],
       [432, 'p_0'],
       ...,
       [7890, 'p_999'],
       [8500, 'p_999'],
       [8695, 'p_999']], dtype=object),
     network_type='correlation')

In [11]:
# Select the expression columns named after the original feature ids (as
# strings), in the sorted order of `node_ids`, and convert to a plain array.
selected_columns = node_ids.astype(str)
small_dataset = np.array(Z40_data[selected_columns])
small_dataset.shape

(8, 8800)

In [12]:
# Fit the correlation-network estimator on the selected expression matrix.
# NOTE(review): the recorded run of this call printed long index/value arrays;
# that output appears to come from inside GLPE, not from this cell.
my_other_clpe.fit(small_dataset)

[  81  342  432  448  920 1196 1400 1690 2129 2542 3592 3709 4093 4461
 4766 4877 4963 6029 6044 6333 6429 6544 7204 7666 7735 8277]
[0. 0. 0. ... 0. 0. 0.]
[1269 3376 4497 8453]
[0. 0. 0. ... 0. 0. 0.]
[ 164  317  418  738 1387 1696 2166 2618 2941 3111 3189 3384 3848 3925
 4334 4953 4959 5074 5712 6314 6603 6668 6856 6947 7240 7424 8085 8101
 8469 8516 8560]
[0. 0. 0. ... 0. 0. 0.]
[  15   17   85  142  164  170  187  202  225  231  246  247  277  284
  302  319  382  414  424  451  452  484  497  505  530  558  574  584
  600  616  636  684  710  718  723  736  742  746  790  874  916  939
 1008 1032 1033 1079 1095 1104 1138 1160 1214 1216 1220 1285 1369 1387
 1390 1402 1465 1478 1489 1493 1532 1544 1563 1587 1617 1639 1656 1671
 1691 1696 1701 1723 1729 1759 1767 1786 1791 1838 1839 1874 1886 1916
 1926 1969 1975 1983 1987 2001 2015 2017 2058 2073 2080 2111 2126 2132
 2191 2221 2296 2330 2338 2347 2366 2385 2432 2439 2444 2456 2494 2503
 2522 2540 2580 2619 2642 2669 2764 2776 2808 

CLPE(centrality_measure='degree', heat_kernel_param=2.0,
     incidence_matrix=array([[81, 'p_0'],
       [342, 'p_0'],
       [432, 'p_0'],
       ...,
       [7890, 'p_999'],
       [8500, 'p_999'],
       [8695, 'p_999']], dtype=object),
     network_type='correlation')

In [11]:
# Re-check the dimensions of the matrix passed to fit (same check as in the
# cell that builds small_dataset).
small_dataset.shape

(8, 8800)

In [None]:
# Inspect the first column (relabelled feature ids) of the membership array.
np.array(better_pathway_data)[:,0]