# Validation Set 1.2: diffuPy + PathMe  

In [1]:
import os
import itertools

dir_path = os.path.dirname(os.path.realpath('__file__'))

In [2]:
from openpyxl import load_workbook
from collections import defaultdict
import networkx as nx

In [3]:
import pybel
import pybel_tools as pbt

from pybel import BELGraph
from pybel.constants import RELATION
from pybel.dsl import Abundance, BiologicalProcess, CentralDogma, ListAbundance, Reaction

pybel.get_version()

'0.13.1'

In [4]:
from pathme.constants import REACTOME_BEL, KEGG_BEL, WIKIPATHWAYS_BEL, PATHME_DIR
from pathme.export_utils import to_gml, get_labels_by_db_and_omic_from_pathme

In [5]:
from diffuPy.diffuse import diffuse
from diffuPy.matrix import Matrix, LaplacianMatrix
from diffuPy.input_mapping import generate_categoric_input_from_labels, get_mapping, get_mapping_subsets, get_mapping_two_dim_subsets

In [6]:
from diffuPy.validation_sets_utils import parse_set1
from diffuPy.utils import print_dict_dimentions, get_labels_set_from_dict
from diffuPy.views import show_heatmap

In [7]:
import nbimporter
from global_database_comparison import get_set_database, calculate_database_sets_as_dict

Importing Jupyter notebook from global_database_comparison.ipynb


## 1. Load Data Set 1: Input Scores

In [8]:
dataset1_labels_by_omics = parse_set1(os.path.join(dir_path, 'validation', 'set1.xlsx'))

In [9]:
print_dict_dimentions(dataset1_labels_by_omics, 'Dataset1 imported labels:')

Dataset1 imported labels:
Total number of genes: 4941  
Total number of micrornas: 100  
Total number of metabolite: 21  
Total: 5062 


In [10]:
dataset1_all_labels = get_labels_set_from_dict(dataset1_labels_by_omics)

## 2. Load Backrgound Graph Universe

### 2.1. PathMeUniverse import

##### No database argument in node data

In [11]:
pathme_graph_universe_no_explode_v0_1 = pybel.from_pickle(os.path.join(PATHME_DIR, 'universe', 'pathme_graph_universe_explode_v0.1.bel.pickle'))
pathme_graph_universe_no_explode_v0_1


<pybel.struct.graph.BELGraph at 0x12c9210b8>

In [12]:
pathme_graph_universe_no_explode_v0_1.summarize()

PathMe Universe v1.0.0
Number of Nodes: 20768
Number of Edges: 84945
Network Density: 1.97E-04
Number of Components: 11815


##### Database argument in node data

In [13]:
pathme_graph_universe_no_explode = pybel.from_pickle(os.path.join(PATHME_DIR, 'universe', 'pathme_universe_bel_graph.bel.pickle'))
pathme_graph_universe_no_explode

<pybel.struct.graph.BELGraph at 0x12c9207f0>

In [14]:
pathme_graph_universe_no_explode.summarize()

PathMe Universe v1.0.0
Number of Nodes: 16153
Number of Edges: 17725
Network Density: 6.79E-05
Number of Components: 11886


##### Export to gml

In [15]:
to_gml(pathme_graph_universe_no_explode, os.path.join(PATHME_DIR, 'pathme_graph_universe_no_explode.gml'))

In [16]:
g = nx.read_gml(os.path.join(PATHME_DIR, 'pathme_graph_universe_no_explode.gml'))

In [17]:
nx.info(g)

'Name: \nType: MultiDiGraph\nNumber of nodes: 14389\nNumber of edges: 17725\nAverage in degree:   1.2318\nAverage out degree:   1.2318'

##### Check isolates / Unconnected nodes

In [18]:
isolates = list(nx.isolates(g))

#import json
#with open(os.path.join(PATHME_DIR, 'isolates.json'), 'w') as outfile:
#    d = json.dump(isolates, outfile)

print(len(isolates))
#list(nx.isolates(pathme_graph_universe_no_explode))

9862


### 2.1. Get labels subsets by entity type (omic) and database

##### PathMe package subsets

In [19]:
def get_labels_by_db_and_omic_from_pathme(databases):
    db_entites = defaultdict(dict)
    entites_db = defaultdict(lambda: defaultdict(set))
    
    for db in databases:
        genes, mirna, metabolites, bps = get_set_database(db)
        db_entites[db] = {'genes': genes, 'mirna': mirna, 'metabolites': metabolites, 'bps': bps}
        
        for entity_type, entities in db_entites[db].items():
            entites_db[entity_type][db] = entities
            
    return db_entites, entites_db

In [20]:
bg_labels_from_pathme_by_db_and_omic, bg_labels_from_pathme_by_omic_and_db = get_labels_by_db_and_omic_from_pathme(['reactome', 'kegg', 'wikipathways'])

print_dict_dimentions(bg_labels_from_pathme_by_db_and_omic, 'Entities in PathMe package :')



Entities in PathMe package :
Total number of reactome: genes(6328), mirna(11), metabolites(2559), bps(2101),  
Total number of kegg: genes(2086), mirna(0), metabolites(572), bps(147),  
Total number of wikipathways: genes(3016), mirna(83), metabolites(453), bps(107),  
Total: 17463 


#####  PathMeUniverse Graph Subsets

In [21]:
def get_labels_by_db_and_omic_from_graph(graph):
    
    db_subsets_pmu = defaultdict(set)
    db_entites_pmu = defaultdict(dict)

    #entity_type_map = {'Gene':'genes', 'mirna_nodes':'mirna', 'Abundance':'metabolites', 'BiologicalProcess':'bps'}

    for node, data in graph.nodes(data=True):
        for database in data['database']:
            db_subsets_pmu[database].add(node)

    for database, nodes in db_subsets_pmu.items():
        db_entites_pmu[database] = calculate_database_sets_as_dict(nodes, database)
        
    return db_entites_pmu, db_subsets_pmu

In [22]:
bg_labels_from_pathmeuniverse_by_db_and_omic, bg_labels_from_pathmeuniverse_by_omic_and_db =  get_labels_by_db_and_omic_from_graph(pathme_graph_universe_no_explode)

print_dict_dimentions(bg_labels_from_pathmeuniverse_by_db_and_omic, 'Entities in PathMeUniverse :')


Entities in PathMeUniverse :
Total number of kegg: gene_nodes(2086), mirna_nodes(0), metabolite_nodes(572), bp_nodes(147),  
Total number of reactome: gene_nodes(5401), mirna_nodes(11), metabolite_nodes(2547), bp_nodes(2101),  
Total number of wikipathways: gene_nodes(2577), mirna_nodes(83), metabolite_nodes(453), bp_nodes(107),  
Total: 16085 


###  Subgraphs 

In [23]:
# TODO
# def get_subgraphs_by_entity(graph: BELGraph) -> dict:
#     entities_types = ['genes', 'mirna', 'metabolites', 'bps']
#     subgraphs = {}
    
#     for entities_type in entities_types:
#         subgraphs[entities_type] = pbt.selection.get_subgraph_by_node_filter(graph, function=entities_type, value=f'PathME {entities_type}' )

#     return subgraphs

# get_subgraphs_by_entity(pathme_graph_universe_no_explode)

# TODO
#def get_subgraphs_by_resource(graph: BELGraph) -> None:

### Background  Matrix

In [24]:
len(list(pathme_graph_universe_no_explode_v0_1.nodes))

20768

In [25]:
background_mat = LaplacianMatrix(pathme_graph_universe_no_explode_v0_1)

Node name nor id not labeled. bp(KEGG:"path:map04688")
Node name nor id not labeled. bp(KEGG:"path:map4670")
Node name nor id not labeled. bp(KEGG:"path:map00517")
Node name nor id not labeled. bp(KEGG:"path:map09020")
Columns labels are assigned to rows since duplicate labels is true.


### Row labels

In [26]:
background_labels = set(background_mat.rows_labels)

## 3. Dataset label mapping to PathMeEntities

### General mapping

In [27]:
all_labels_mapping = get_mapping(dataset1_all_labels, background_labels, mirnas=dataset1_labels_by_omics, title = 'Global mapping: ', print_percentage = True)


Global mapping:  (2588) 51.12603713947057%


### Mapping by subsets

#### Mapping by entity type/omic

In [28]:
mapping_by_entity, _ = get_mapping_subsets(dataset1_labels_by_omics, background_labels, 'entity type/omic', submapping=all_labels_mapping)


Mapping by entity type/omic:
genes (2555) 51.710180125480676%
micrornas (17) 17.0%
metabolite (16) 76.19047619047619%
Total (2588) 51.12603713947057% 



#### Mapping by database

In [29]:
background_entites_by_db = {db : set(itertools.chain.from_iterable(entities.values())) for db, entities in bg_labels_from_pathme_by_db_and_omic.items()}
mapping_by_db, _ = get_mapping_subsets(background_entites_by_db, dataset1_all_labels, 'database', percentage_reference_labels = True, submapping=all_labels_mapping)


Mapping by database:
reactome (1365) 26.965626234689843%
kegg (692) 13.670485973923348%
wikipathways (948) 18.727775582773607%
Total (3005) 59.3638877913868% 



#### Mapping by entity type and database

In [30]:
mapping_by_database_and_entity = get_mapping_two_dim_subsets(bg_labels_from_pathmeuniverse_by_db_and_omic, dataset1_all_labels, all_labels_mapping)



Mapping by Kegg:
gene_nodes (689) 13.611220861319637%
mirna_nodes (0) 0%
metabolite_nodes (3) 0.05926511260371395%
bp_nodes (0) 0.0%
Total (692) 13.670485973923348% 

Mapping by Reactome:
gene_nodes (1321) 26.09640458316871%
mirna_nodes (0) 0.0%
metabolite_nodes (9) 0.17779533781114185%
bp_nodes (0) 0.0%
Total (1330) 26.27419992097985% 

Mapping by Wikipathways:
gene_nodes (762) 15.053338601343341%
mirna_nodes (0) 0.0%
metabolite_nodes (13) 0.2568154879494271%
bp_nodes (0) 0.0%
Total (775) 15.310154089292768% 



### Mapping descriptive view

In [31]:
all_count = []
all_percentage = []

for db_name, entities_by_type in mapping_by_database_and_entity.items():
    db_count = []
    db_percentage = []

    for entity_type, entities_tupple in entities_by_type.items():
        db_count.append(len(entities_tupple[0]))
        db_percentage.append(entities_tupple[1])
        
    all_count.append(db_count)
    all_percentage.append(db_percentage)

AttributeError: 'tuple' object has no attribute 'items'

In [None]:
databases = mapping_by_database_and_entity.keys()
entity_types = mapping_by_database_and_entity['kegg'].keys()

entity_number = np.array(all_percentage)
entity_count = np.array(all_count)

In [None]:
databases = mapping_by_database_and_entity.keys()
entity_types = mapping_by_database_and_entity['kegg'].keys()

entity_number = np.array(all_percentage)
entity_count = np.array(all_count)

In [None]:
show_heatmap(entity_number, databases, entity_types)

#### Input vector/matrix construction

In [None]:
input_mats_by_entity_type = {entity_type: generate_categoric_input(all_labels_mapping, ['Dataset 1']) for entity_type, map_value in mapping_by_entity.items()}

#### Mapping by resource/database

In [None]:
# TODO: Check further.
# def get_mapping_by_entity_type(dataset_entites_dict, background_entites_dict):
#     mapping = {}

#     background_entities_by_type = {entity_type: set(itertools.chain.from_iterable(v.values())) for entity_type, v in entites_db.items()}
    
#     print(len(list(itertools.chain.from_iterable(background_entities_by_type.values()))))
#     mapping = defaultdict(set)
#     for type_name, background_entites in background_entities_by_type.items():
#         mapping[type_name] = get_mapping(dataset_entites_dict[type_name], entites)

#    for type_name, entites in dataset_entites_dict.items():
#        mapping[type_name] = get_mapping(entites, background_labels)
#        print(type_name)
#        print(len(mapping[type_name]))
#    return mapping

In [None]:
#mapping_by_entity = get_mapping_by_entity_type(dataset1_omics_labels, background_labels)

#### Input vector/matrix construction

In [None]:
input_mat = generate_categoric_input_from_labels(all_labels_mapping, ['Dataset 1'], background_mat)

## 4. Score Diffusion with diffuPy: Dataset as input + PathMe as background graph

### Input elements: Dataset input + Background matrix

In [None]:
len(input_mat.mat)

In [None]:
len(background_mat.mat)

In [None]:
import time
import copy

In [None]:
input_mat_raw = copy.copy(input_mat)
background_mat_raw = copy.copy(background_mat)

input_mat_z = copy.copy(input_mat)
background_mat_z = copy.copy(background_mat)

### Compute diffusion scores

In [None]:
print(len(input_mat_raw.mat))
print(len(input_mat_raw.rows_labels))

#### Raw scores

In [None]:
then = time.time()
raw_scores = diffuse(input_mat_raw, 'ml', K = background_mat_raw)
now = time.time()
print("It took: ", now-then, " seconds")

In [None]:
print(raw_scores)

###### By entity

In [None]:
mapping_by_entity

#### Normalized z-scores

In [None]:
then = time.time()
z_scores = diffuse(input_mat_z, 'ml', K = background_mat_z, z = True)
now = time.time()
print("It took: ", now-then, " seconds")

In [None]:
print(z_scores)