# Validation Set 1.2: diffuPy + PathMe  

In [1]:
import os
dir_path = os.path.dirname(os.path.realpath('__file__'))

In [2]:
from openpyxl import load_workbook
from collections import defaultdict
import networkx as nx

In [3]:
import pybel
import pybel_tools as pbt
from pybel.dsl import Abundance, BiologicalProcess, CentralDogma, ListAbundance, Reaction

from pathme.constants import REACTOME_BEL, KEGG_BEL, WIKIPATHWAYS_BEL, PATHME_DIR

pybel.get_version()

'0.13.1'

In [4]:
from diffuPy.diffuse import diffuse
from diffuPy.matrix import Matrix, LaplacianMatrix

## 1. Load Data Set 1: Input Scores

In [5]:
def munge_labels(label):
    """Process ene"""
    remove_set = ['*', ' ', '|', '-', '"', "'"]
    
    label = str(label).lower()
    
    for symb in remove_set:
        if symb in label:
            label = label.replace(symb, '')
    
    if '/' in label:
        label = tuple(set(label.split('/')))
        if len(label) == 1:
            label = label[0]
    
    return label


def parse_set1(path):
    
    wb = load_workbook(filename = path)

    sheet_titles = []
    omics_data = defaultdict(lambda:defaultdict(lambda:set()))
    omics_labels = defaultdict(lambda:set())

    for sheet in wb:
        cell_value = sheet['A3'].value

    #     if "Expression data (FC) of the differentially expressed" in sheet['A1'].value:
    #         sheet_title = sheet['A1'].value.split("Expression data (FC) of the differentially expressed ",1)[1]
    #         sheet_title = sheet_title.split(" of HepG2 cells after treatment with ")
    #         sheet_title[1] = sheet_title[1].replace(". Statistical significance (p value < 0.05) is indicated.", "").replace(" CsA for", "")
    #         sheet_titles.append(sheet_title)

        if cell_value and ("Significant " in cell_value or "Metabolite" == cell_value):
            if  cell_value == "Metabolite":
                sheet_title = ("Metabolite", '3 µM', ' 24h or 72h')
                min_row = 3

            else:
                sheet_title = cell_value.split("Significant ",1)[1]
                sheet_title = sheet_title.split(" CsA ")
                sheet_title.append(sheet_title[1].split(" ")[0] + ' h')
                sheet_title[1] = sheet_title[0].split(" ")[1]+ ' µM'
                sheet_title[0] = sheet_title[0].split(" ")[0]
                min_row = 4

            for col in sheet.iter_cols(min_row=min_row):
                col_label = col[0].value
                sheet_omic = sheet_title[0]

                if col_label in ['MicroRNA', 'hgnc symbol', 'Metabolite']:
                    omics_labels[sheet_omic.lower()].update(munge_labels(cell.value) for cell in col[1:])

            sheet_titles.append(sheet_title)

    return omics_labels
    
dataset1_omics_labels = parse_set1(os.path.join(dir_path, 'validation', 'set1.xlsx'))
dataset1_omics_labels

defaultdict(<function __main__.parse_set1.<locals>.<lambda>()>,
            {'genes': {'',
              'stx5',
              'tyw3',
              'pnrc1',
              'asphd1',
              'frmd4b',
              'mblac2',
              'dio1',
              'ppp1r15b',
              'txnl1',
              'fermt2',
              'tstd2',
              'nfs1',
              'spata20',
              'eny2',
              'parp2',
              'hexim1',
              'c10orf10',
              'fkbp7',
              'nt5dc1',
              'foxa3',
              'asb9',
              'gpn1',
              'fam46a',
              'mttp',
              'znf300',
              'mak16',
              'mthfd1l',
              'itpkc',
              'hnrnpcl1',
              'dnajb6',
              'fnip2',
              'apol1',
              'aldh1a1',
              'impa2',
              'rbm28',
              'tbc1d15',
              'upk3a',
              'c11orf93',
              

In [6]:
print(f'Total number of genes: ({len(dataset1_omics_labels["genes"])})')

print(f'Total number of metabolites: ({len(dataset1_omics_labels["metabolite"])})')

print(f'Total number of miRNAs: ({len(dataset1_omics_labels["micrornas"])})')

total_entities = len(dataset1_omics_labels["metabolite"]) + len(dataset1_omics_labels["micrornas"]) + len(dataset1_omics_labels["genes"])

print(total_entities)

Total number of genes: (4942)
Total number of metabolites: (21)
Total number of miRNAs: (100)
5063


In [7]:
all_omics_labels_dataset1 = set()
for labels in dataset1_omics_labels.values():
    all_omics_labels_dataset1.update(set(labels))
                                         
all_omics_labels_dataset1

{'',
 'stx5',
 'dek',
 'b4galt6',
 'tyw3',
 'pnrc1',
 'dab2ip',
 'asphd1',
 'frmd4b',
 'paxip1',
 'mblac2',
 'mlec',
 'c9orf91',
 'dio1',
 'srsf5',
 'ppp1r15b',
 'txnl1',
 'fermt2',
 'socs4',
 'tstd2',
 'nfs1',
 'spata20',
 'ptprg',
 'wbp2',
 'eny2',
 'parp2',
 'hexim1',
 'c10orf10',
 'crtc2',
 'fkbp7',
 'nt5dc1',
 'foxa3',
 'eif4ebp1',
 'asb9',
 'gpn1',
 'pacsin3',
 'fam46a',
 'mttp',
 'nars2',
 'znf300',
 'erbb2',
 'mak16',
 'aspm',
 'srsf10',
 'tram2',
 'ezh1',
 'mthfd1l',
 'c2cd2l',
 'itpkc',
 'hnrnpcl1',
 'cnn3',
 'dnajb6',
 'hsamir1915',
 'mgst2',
 'pura',
 'kiaa0284',
 'fnip2',
 'apol1',
 'aldh1a1',
 'impa2',
 'rbm28',
 'ahsg',
 'sox5',
 'tbc1d15',
 'upk3a',
 'igfbp3',
 'utrn',
 'naa40',
 'iqgap1',
 'gcdh',
 'c11orf93',
 'myeov2',
 'pepd',
 'wdfy2',
 'faim',
 'srrm1',
 'tctn3',
 'tst',
 'brwd1',
 'wdr34',
 'cdc6',
 ('hsamir193b', 'mmumir193b'),
 'cpne3',
 's100p',
 'angptl1',
 'ttc32',
 'khk',
 'shank3',
 'cuedc1',
 'rcan1',
 'leucine',
 'mzt2b',
 'daam1',
 'palmd',
 'thrb',
 'n

## 2. Load Backrgound Graph Universe

### PathMeUniverse import

In [8]:
pathme_graph_universe_no_explode = pybel.from_pickle(os.path.join(PATHME_DIR, 'pathme_graph_universe_explode.bel.pickle'))
pathme_graph_universe_no_explode


<pybel.struct.graph.BELGraph at 0x12d5968d0>

In [9]:
pathme_graph_universe_no_explode.summarize()

PathMe Universe v1.0.0
Number of Nodes: 20768
Number of Edges: 84945
Network Density: 1.97E-04
Number of Components: 11815


In [10]:
pybel.to_graphml(pathme_graph_universe_no_explode, os.path.join(PATHME_DIR, 'pathme_graph_universe_no_explode.gml'))

### Background  Matrix

In [11]:
background_mat = LaplacianMatrix(pathme_graph_universe_no_explode)
print(background_mat)

Columns labels are assigned to rows since duplicate labels is true.



matrix  
  [[41  0  0 ...  0  0  0]
 [ 0 53  0 ...  0  0  0]
 [ 0  0 37 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  1  0]
 [ 0  0  0 ...  0  0  1]] 
 row labels: 
  ['acetate', 'd-glucopyranose', 'alpha-d-glucose 1-phosphate', '2-phospho-d-glyceric acid', 'alpha-d-glucose', 'beta-d-glucose', 'dihydroxyacetone phosphate', 'beta-d-glucose 6-phosphate', 'alpha-d-glucose 6-phosphate', 'beta-d-fructofuranose 6-phosphate', 'phosphoenolpyruvic acid', '3-phospho-d-glyceric acid', '3-phospho-d-glyceroyl dihydrogen phosphate', '(s)-lactic acid', 'enzyme n6-(lipoyl)lysine', 'ethanol', 'pyruvic acid', '2-(1-hydroxyethyl)thiamine diphosphate', 'acetyl-coa', 'acetaldehyde', '[dihydrolipoyllysine-residue acetyltransferase] s-acetyldihydrolipoyllysine', 'enzyme n6-(dihydrolipoyl)lysine', 'beta-d-fructofuranose 1,6-bisphosphate', 'hydroquinone o-beta-d-glucopyranoside', 'arbutin 6-phosphate', 'salicin 6-phosphate', 'salicin', 'oxaloacetic acid', '2,3-bisphospho-d-glyceric acid', 'd-

### Row labels

In [12]:
print(len(background_mat.rows_labels))

background_labels = set(background_mat.rows_labels)
print(len(background_labels))
print(len(background_mat.mat))

20764
20764
20768


## 3. Dataset label mapping to PathMe

In [13]:
def check_substrings(dataset_nodes, db_nodes):
    intersection_close = set()
    for entity in dataset_nodes:
        if isinstance(entity, tuple):
            for subentity in entity:
                for entity_db in db_nodes:
                    if entity_db in subentity or subentity in entity_db:
                        intersection_close.add(entity_db)
                        break
        else:
            for entity_db in db_nodes:
                if entity_db in entity or entity in entity_db:
                    intersection_close.add(entity_db)
                    break
    return intersection_close

In [14]:
labels_mapping_substring = check_substrings(background_labels, all_omics_labels_dataset1)

In [15]:
labels_mapping_intersection = background_labels.intersection(all_omics_labels_dataset1)

In [16]:
labels_mapping = labels_mapping_intersection.union(labels_mapping_substring)
len(labels_mapping)

2572

### Mapping percentage

In [17]:
print(len(labels_mapping)/len(background_labels))

0.12386823348102485


In [18]:
print(len(labels_mapping)/len(all_omics_labels_dataset1))

0.5079992099545724


### Input vector/matrix construction

In [19]:
input_mat = Matrix(rows_labels=labels_mapping, cols_labels=['Dataset 1'], init=1)

In [20]:
input_mat = input_mat.match_missing_rows(background_mat.rows_labels, 0)
input_mat = input_mat.match_rows(background_mat)

## 4. Score Diffusion with diffuPy: Dataset as input + PathMe as background graph

### Input elements: Dataset input + Background matrix

In [21]:
len(input_mat.mat)

20765

In [22]:
len(background_mat.mat)

20768

In [23]:
print(input_mat)


matrix  
  [[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]] 
 row labels: 
  ['acetate', 'd-glucopyranose', 'alpha-d-glucose 1-phosphate', '2-phospho-d-glyceric acid', 'alpha-d-glucose', 'beta-d-glucose', 'dihydroxyacetone phosphate', 'beta-d-glucose 6-phosphate', 'alpha-d-glucose 6-phosphate', 'beta-d-fructofuranose 6-phosphate', 'phosphoenolpyruvic acid', '3-phospho-d-glyceric acid', '3-phospho-d-glyceroyl dihydrogen phosphate', '(s)-lactic acid', 'enzyme n6-(lipoyl)lysine', 'ethanol', 'pyruvic acid', '2-(1-hydroxyethyl)thiamine diphosphate', 'acetyl-coa', 'acetaldehyde', '[dihydrolipoyllysine-residue acetyltransferase] s-acetyldihydrolipoyllysine', 'enzyme n6-(dihydrolipoyl)lysine', 'beta-d-fructofuranose 1,6-bisphosphate', 'hydroquinone o-beta-d-glucopyranoside', 'arbutin 6-phosphate', 'salicin 6-phosphate', 'salicin', 'oxaloacetic acid', '2,3-bisphospho-d-glyceric acid', 'd-glyceraldehyde 3-phosphate', 'thiamine(1+) diphosphate', 'pentose phosphate pathway', 'starch and sucrose metabolism', '

In [24]:
print(background_mat)


matrix  
  [[41  0  0 ...  0  0  0]
 [ 0 53  0 ...  0  0  0]
 [ 0  0 37 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  1  0]
 [ 0  0  0 ...  0  0  1]] 
 row labels: 
  ['acetate', 'd-glucopyranose', 'alpha-d-glucose 1-phosphate', '2-phospho-d-glyceric acid', 'alpha-d-glucose', 'beta-d-glucose', 'dihydroxyacetone phosphate', 'beta-d-glucose 6-phosphate', 'alpha-d-glucose 6-phosphate', 'beta-d-fructofuranose 6-phosphate', 'phosphoenolpyruvic acid', '3-phospho-d-glyceric acid', '3-phospho-d-glyceroyl dihydrogen phosphate', '(s)-lactic acid', 'enzyme n6-(lipoyl)lysine', 'ethanol', 'pyruvic acid', '2-(1-hydroxyethyl)thiamine diphosphate', 'acetyl-coa', 'acetaldehyde', '[dihydrolipoyllysine-residue acetyltransferase] s-acetyldihydrolipoyllysine', 'enzyme n6-(dihydrolipoyl)lysine', 'beta-d-fructofuranose 1,6-bisphosphate', 'hydroquinone o-beta-d-glucopyranoside', 'arbutin 6-phosphate', 'salicin 6-phosphate', 'salicin', 'oxaloacetic acid', '2,3-bisphospho-d-glyceric acid', 'd-

In [25]:
import time
import copy

In [26]:
input_mat_raw = copy.copy(input_mat)
background_mat_raw = copy.copy(background_mat)

input_mat_z = copy.copy(input_mat)
background_mat_z = copy.copy(background_mat)

Columns labels are assigned to rows since duplicate labels is true.
Columns labels are assigned to rows since duplicate labels is true.


### Compute diffusion scores

#### Raw scores

In [27]:
then = time.time()
raw_scores = diffuse(input_mat_raw, 'ml', K = background_mat_raw)
now = time.time()
print("It took: ", now-then, " seconds")

It took:  2.969588041305542  seconds


In [28]:
print(raw_scores)


matrix  
  [[ -4]
 [-22]
 [-26]
 ...
 [  0]
 [  1]
 [  1]] 
 row labels: 
  ['acetate', 'd-glucopyranose', 'alpha-d-glucose 1-phosphate', '2-phospho-d-glyceric acid', 'alpha-d-glucose', 'beta-d-glucose', 'dihydroxyacetone phosphate', 'beta-d-glucose 6-phosphate', 'alpha-d-glucose 6-phosphate', 'beta-d-fructofuranose 6-phosphate', 'phosphoenolpyruvic acid', '3-phospho-d-glyceric acid', '3-phospho-d-glyceroyl dihydrogen phosphate', '(s)-lactic acid', 'enzyme n6-(lipoyl)lysine', 'ethanol', 'pyruvic acid', '2-(1-hydroxyethyl)thiamine diphosphate', 'acetyl-coa', 'acetaldehyde', '[dihydrolipoyllysine-residue acetyltransferase] s-acetyldihydrolipoyllysine', 'enzyme n6-(dihydrolipoyl)lysine', 'beta-d-fructofuranose 1,6-bisphosphate', 'hydroquinone o-beta-d-glucopyranoside', 'arbutin 6-phosphate', 'salicin 6-phosphate', 'salicin', 'oxaloacetic acid', '2,3-bisphospho-d-glyceric acid', 'd-glyceraldehyde 3-phosphate', 'thiamine(1+) diphosphate', 'pentose phosphate pathway', 'starch and sucrose me

#### Normalized z-scores

In [29]:
then = time.time()
z_scores = diffuse(input_mat_z, 'ml', K = background_mat_z, z = True)
now = time.time()
print("It took: ", now-then, " seconds")

It took:  8.586457014083862  seconds


  return np.subtract(col_raw, score_means) / np.sqrt(score_vars)


In [30]:
print(z_scores)


matrix  
  [[-0.14375626]
 [-0.62013732]
 [-1.0350357 ]
 ...
 [        nan]
 [ 0.37597523]
 [ 0.37597523]] 
 row labels: 
  ['acetate', 'd-glucopyranose', 'alpha-d-glucose 1-phosphate', '2-phospho-d-glyceric acid', 'alpha-d-glucose', 'beta-d-glucose', 'dihydroxyacetone phosphate', 'beta-d-glucose 6-phosphate', 'alpha-d-glucose 6-phosphate', 'beta-d-fructofuranose 6-phosphate', 'phosphoenolpyruvic acid', '3-phospho-d-glyceric acid', '3-phospho-d-glyceroyl dihydrogen phosphate', '(s)-lactic acid', 'enzyme n6-(lipoyl)lysine', 'ethanol', 'pyruvic acid', '2-(1-hydroxyethyl)thiamine diphosphate', 'acetyl-coa', 'acetaldehyde', '[dihydrolipoyllysine-residue acetyltransferase] s-acetyldihydrolipoyllysine', 'enzyme n6-(dihydrolipoyl)lysine', 'beta-d-fructofuranose 1,6-bisphosphate', 'hydroquinone o-beta-d-glucopyranoside', 'arbutin 6-phosphate', 'salicin 6-phosphate', 'salicin', 'oxaloacetic acid', '2,3-bisphospho-d-glyceric acid', 'd-glyceraldehyde 3-phosphate', 'thiamine(1+) diphosphate', 'pe