# Validation Set 1 general validation

In [1]:
import os
import time

import itertools
from collections import defaultdict

from openpyxl import load_workbook
import networkx as nx

##### PyBEL imports

In [2]:
import pybel

from pybel import BELGraph
from pybel.constants import RELATION, ANNOTATIONS
from pybel.dsl import Abundance, BiologicalProcess, CentralDogma, ListAbundance, Reaction
from pybel.struct import get_subgraph_by_annotation_value
from pybel.struct.summary import count_functions

import matplotlib.pyplot as plt

# Display the installed PyBEL version as the cell output
# (the output recorded with this notebook is '0.13.2').
pybel.get_version()

'0.13.2'

##### PathMe import 

In [None]:
from pathme.constants import REACTOME_BEL, KEGG_BEL, WIKIPATHWAYS_BEL, PATHME_DIR
from pathme.export_utils import to_gml

##### DiffuPath import 

In [None]:
from diffupath.constants import DEFAULT_DIFFUPY_DIR
from diffupath.validation_datasets_parsers import parse_set1

from diffupath.pathme_processing import get_labels_by_db_and_omic_from_pathme, get_labels_by_db_and_omic_from_graph
from diffupath.input_mapping import get_mapping, get_mapping_subsets, get_mapping_two_dim_subsets
from diffupath.utils import print_dict_dimensions, get_labels_set_from_dict, reduce_dict_dimension, get_count_and_labels_from_two_dim_dict, from_pickle, to_pickle, get_three_venn_intersections, random_disjoint_intersection_three_subsets
from diffupath.views import show_heatmap, show_venn_diagram

from diffupy.kernels import regularised_laplacian_kernel

## 1. Load Data Set 1: Input Scores

Either place the dataset where the next cell expects it (`data/validation_datasets/set1.xlsx` under the default `~/.diffupath` directory) or replace the path below with the full path to Dataset 1.

In [None]:
# Parse the Dataset 1 spreadsheet. The result is indexed by omic type
# (a later cell reads its 'micrornas' entry).
set1_path = os.path.join(DEFAULT_DIFFUPY_DIR, 'data', 'validation_datasets', 'set1.xlsx')
dataset1_labels_by_omics = parse_set1(set1_path)

# Pool the labels into a single collection and report the size of each group.
dataset1_all_labels = get_labels_set_from_dict(dataset1_labels_by_omics)
print_dict_dimensions(dataset1_labels_by_omics, 'Dataset1 imported labels:')

In [None]:
# Keep the microRNA labels aside; they are passed as `mirnas=` to the mapping calls below.
mirnas_dataset = dataset1_labels_by_omics['micrornas']

## 2. Load Background Graph Universe

### 2.1. PathMeUniverse import

In [None]:
# Load the pickled PathMe universe graph.
# The location is derived from the current user's home directory instead of a
# hard-coded '/home/<user>/...' prefix, so the notebook is not tied to a single
# machine; for the original author it resolves to the same file as before.
# NOTE(review): unpickling can execute arbitrary code - only load pickles that
# come from a trusted source.
pathme_universe_graph_no_flatten = pybel.from_pickle(
    os.path.join(
        os.path.expanduser('~'),
        '.diffupath',
        'data',
        'pickles',
        'pathme_universe_non_flatten_collapsed_names_13_03_2020.pickle',
    )
)


In [None]:
# Tally the universe graph's nodes with PyBEL's `count_functions`;
# the result is shown as the cell output.
count_functions(pathme_universe_graph_no_flatten)

##### Check isolates / Unconnected nodes

In [None]:
# Number of isolates (nodes without any edge) before the cleanup below.
nx.number_of_isolates(pathme_universe_graph_no_flatten)

In [None]:
# Materialise the isolates first: `nx.isolates` yields them lazily, and the
# graph must not be modified while that iterator is still being consumed.
isolated_nodes = set(nx.isolates(pathme_universe_graph_no_flatten))
pathme_universe_graph_no_flatten.remove_nodes_from(isolated_nodes)

In [None]:
# Re-count the isolates after the removal above; this should now be 0.
nx.number_of_isolates(pathme_universe_graph_no_flatten)

In [None]:
# Print the graph's textual summary now that the isolates have been removed.
print(pathme_universe_graph_no_flatten.summary_str())

### 2.2. Get labels subsets by entity type (omic) and database

#####  From PathMeUniverse Graph Subsets

In [None]:
# A single call returns two label collections for the universe graph.
# NOTE(review): judging by the names, the first is nested database -> omic and
# the second omic -> database; confirm against diffupath.pathme_processing.
(
    bg_labels_from_pathmeuniverse_by_db_and_omic,
    bg_labels_from_pathmeuniverse_by_omic_and_db,
) = get_labels_by_db_and_omic_from_graph(pathme_universe_graph_no_flatten)

print_dict_dimensions(
    bg_labels_from_pathmeuniverse_by_db_and_omic,
    'Entities in PathMeUniverse :',
)

In [None]:
# Reduce the two-level omic/database collection with `reduce_dict_dimension`.
# NOTE(review): presumably this merges the per-database groups so that only the
# omic level remains - confirm against diffupath.utils.
bg_labels_from_pathmeuniverse_by_omic = reduce_dict_dimension(bg_labels_from_pathmeuniverse_by_omic_and_db)

### 2.3. Get Background  Matrix Kernel

##### Generate kernel OR load from pickle

Use DiffuPy to generate the regularised Laplacian kernel.

In [None]:
# '__file__' is a string literal here, not the module attribute, so realpath
# resolves it as a relative file name against the current working directory.
# Its parent directory is therefore the (symlink-resolved) working directory.
dir_path = os.path.split(os.path.realpath('__file__'))[0]

# Location of the pickled kernel relative to that directory.
kernel_path = os.path.join(
    dir_path,
    'diffuPy',
    'data',
    'kernels',
    'regularized_kernel_pathme_universe.pickle',
)

In [None]:
# Start timestamp; a later cell subtracts it from `now` to report how long
# the cells in between took to run.
then = time.time()

In [None]:
# Compute the regularised Laplacian kernel of the universe graph with DiffuPy.
# NOTE(review): an earlier TODO asked to switch this call from
# kegg_pathme_graph_no_flatten to pathme_universe_graph_no_flatten; the call
# below already uses the latter, so that TODO is resolved.
background_mat = regularised_laplacian_kernel(pathme_universe_graph_no_flatten)

Alternatively, import a precomputed kernel from a pickle (the cell below is commented out by default).

In [None]:
#import pickle
#input_path = os.path.join(DEFAULT_DIFFUPY_DIR, 'kernels', 'regularized_kernel_pathme_universe.pickle')

#with open(input_path, 'rb') as f:
#    unpickler = pickle.Unpickler(f)
#    background_mat = unpickler.load()

In [None]:
# Stop the clock and report the wall-clock time elapsed since `then` was taken.
now = time.time()
elapsed_seconds = now - then
print("It took: ", elapsed_seconds, " seconds")

## 3. Dataset label mapping to PathMeEntities

##### All kernel matrix (background network) row labels for the mapping

In [None]:
# Row labels of the kernel matrix; used below as the background label universe
# that the dataset labels are mapped onto.
background_labels = background_mat.rows_labels

### 3.1. General mapping

In [None]:
# Map every Dataset 1 label onto the background (kernel) labels in one pass.
# The microRNA labels are supplied separately via `mirnas`.
all_labels_mapping = get_mapping(
    dataset1_all_labels,
    background_labels,
    title='Global mapping: ',
    mirnas=mirnas_dataset,
    print_percentage=True,
)

### 3.2. Mapping by subsets

#### 3.2.1. Mapping by entity type/omic

In [None]:
# Map the dataset labels onto the background labels separately for each
# entity type/omic. The call returns three values; only the first is kept.
mapping_by_entity, _, _ = get_mapping_subsets(
    dataset1_labels_by_omics,
    background_labels,
    'entity type/omic',
    mirnas=mirnas_dataset
)


In [None]:
# Take element 0 of the 'micrornas' entry (reused below as `mirnas_mapping`)
# and display it as the cell output.
# NOTE(review): element 0 is presumably the set of mapped labels - confirm
# against get_mapping_subsets.
micrornas_mapping = mapping_by_entity['micrornas'][0]
micrornas_mapping

#### 3.2.2. Mapping by database

In [None]:
# Collapse the per-omic groups of each database into one flat set of entities.
background_entites_by_db = {
    db: set().union(*entities.values())
    for db, entities in bg_labels_from_pathmeuniverse_by_db_and_omic.items()
}

# Map the dataset labels against each database's entities; only the first of
# the three returned values is kept.
mapping_by_db, _, _ = get_mapping_subsets(
    background_entites_by_db,
    dataset1_all_labels,
    'database',
    mirnas=mirnas_dataset,
    submapping=background_labels,
    mirnas_mapping=micrornas_mapping,
    percentage_reference_labels=True,
)


#### 3.2.3. Mapping by entity type and database (two dimensions)

In [None]:
# Two-dimensional mapping: dataset labels against the background labels that
# are grouped by database and by omic.
(
    mapping_by_database_and_entity,
    total_percentage,
    total_dimention,
) = get_mapping_two_dim_subsets(
    bg_labels_from_pathmeuniverse_by_db_and_omic,
    dataset1_all_labels,
    relative_statistics=dataset1_labels_by_omics,
    mirnas_mapping=micrornas_mapping,
)


### 3.3. Mapping descriptive view

In [None]:
# Unpack the two-dimensional mapping into the four inputs the heatmap needs,
# then render it.
count, percentage, db_labels, entity_labels = get_count_and_labels_from_two_dim_dict(
    mapping_by_database_and_entity
)
show_heatmap(count, percentage, db_labels, entity_labels)


### 3.4. Overlap view

In [None]:
# Intersections between the mappings (element 0 of each entry) of the three
# databases, passed in the fixed order kegg, reactome, wikipathways.
db_intersections = get_three_venn_intersections(
    *(mapping_by_db[db][0] for db in ('kegg', 'reactome', 'wikipathways'))
)
show_venn_diagram(db_intersections)


### 3.5. Database disjoint

In [None]:
# Derive the per-database subsets with `random_disjoint_intersection_three_subsets`
# and draw the Venn diagram of their intersections.
disjoint_db_mapping = random_disjoint_intersection_three_subsets(mapping_by_db)
disjoint_intersections = get_three_venn_intersections(*disjoint_db_mapping.values())
show_venn_diagram(disjoint_intersections)

## 4. Cross-validation

In [None]:
from diffupath.views import box_plot_from_dict
from diffupath.cross_validation import cross_validation_by_method

In [None]:
# Output directory for the validation results.
# NOTE(review): not referenced by any cell visible in this notebook - confirm
# whether it is still needed.
results_path = os.path.join(DEFAULT_DIFFUPY_DIR, 'outputs', 'by_method_and_db_validation')

In [None]:
# Display how many row labels the kernel matrix has.
len(background_mat.rows_labels)

### 4.1. Random Scheme

In [None]:
# Run the cross-validation with k=100; it returns one metrics collection for
# AUROC and one for AUPRC.
# NOTE(review): `k` is presumably the number of repetitions - confirm against
# diffupath.cross_validation.
auroc_metrics_by_method, auprc_metrics_by_method = cross_validation_by_method(
    all_labels_mapping,
    pathme_universe_graph_no_flatten,
    background_mat,
    k=100,
)

In [None]:
# Plot the AUROC metrics of the random cross-validation (title, x-axis label
# and y-axis label are passed in that order, judging by the argument values).
box_plot_from_dict(auroc_metrics_by_method, 'Random Cross Validation', 'Method type', 'AUROC')

In [None]:
# Plot the AUPRC metrics of the random cross-validation (title, x-axis label
# and y-axis label are passed in that order, judging by the argument values).
box_plot_from_dict(auprc_metrics_by_method, 'Random Cross Validation', 'Method type', 'AUPRC')

In [None]:
import json

# Persist both metric collections in a single JSON file in the working directory.
with open('metrics_set1_universe.json', 'w') as outfile:
    metrics_payload = {
        'auroc_metrics_by_method': auroc_metrics_by_method,
        'auprc_metrics_by_method': auprc_metrics_by_method,
    }
    json.dump(metrics_payload, outfile)

    