In [3]:
# This script runs the whole sequence of steps needed
# to get our results.

import networkx as nx

from read_graph import read_graph
from common.pipeline import Pipeline
from common.feature_generators import ExpectedDegree
from common.feature_generators import ClusteringCoefficient
from common.feature_generators import Degree
from common.feature_generators import ClosenessCentrality
from common.feature_generators import BetweennessCentrality
from common.feature_generators import HITS
from common.feature_generators import PageRank
from common.feature_generators import Log10Wrapper
from common.feature_generators import NormalizeWrapper
from validation import compute_correlations

# Load the PPI graph as an undirected graph, then report its size
print("\n######### Loading Graph #########")
Graph, node_names = read_graph(directed=False)
node_count = Graph.number_of_nodes()
edge_count = Graph.number_of_edges()
print("Loaded graph:\n\t{} nodes\n\t{} edges".format(node_count, edge_count))

#########################
# Node features
#########################

print("\n######### Computing/retrieving node features #########")

# Feature generators, listed in the order in which they are handed to the
# pipeline. The wrapper entries (Log10Wrapper / NormalizeWrapper) are built
# from a base generator and then called with no arguments.
feature_generators = [
    Degree(default_dump=True, default_recomputing=False),
    ExpectedDegree(default_dump=True, default_recomputing=False),
    ClusteringCoefficient(),
    ClosenessCentrality(),
    BetweennessCentrality(),
    HITS(),
    PageRank(),
    Log10Wrapper(Degree())(),
    NormalizeWrapper(Degree())(),
    NormalizeWrapper(HITS())(),
]

# The pipeline object takes the sequence of wanted features as positional arguments
pipeline = Pipeline(*feature_generators)
features = pipeline.apply(Graph, verbose=True)

#########################
# Class prediction
#########################

# (no code in this section yet)

#########################
# Features Correlation
#########################

print("\n######### Features Correlation #########")

pvalues = compute_correlations(features)

# Gene set directories intended for gene set enrichment analysis (GSEA).
# NOTE(review): this list is only referenced by the disabled calls below.
gene_sets_directories = [
    u'Cancer_Cell_Line_Encyclopedia', u'ChEA_2016', u'DrugMatrix',
    u'GeneSigDB', u'KEGG_2016',
    u'LINCS_L1000_Chem_Pert_down', u'LINCS_L1000_Chem_Pert_up',
    u'MSigDB_Computational', u'MSigDB_Oncogenic_Signatures',
    u'OMIM_Disease', u'OMIM_Expanded',
    u'PPI_Hub_Proteins', u'Panther_2016', u'Reactome_2016',
]

# Enrichment validation calls, currently disabled:
# enrichr = enrichr_validation(gene_query, gene_rank=None, outdir="validation_results", gene_sets='KEGG_2016')
# prerank = prerank_validation(gene_query, gene_rank, outdir="validation_results", gene_sets='KEGG_2016')



100%|██████████| 19576/19576 [00:00<00:00, 163064.16it/s]


######### Loading Graph #########
Reading Nodes list





Reading Edges list
Loaded graph:
	19576 nodes
	5676528 edges

######### Computing/retrieving node features #########
degree_undirected
expecteddegree_undirected
clusteringcoefficient
closeness
betweenness
hits
pagerank
log10-degree_undirected
normalized-degree_undirected
normalized-hits

######### Features Correlation #########
Computing correlations/pvalues for all features for different sources

Source = cancer
degree_undirected: pvalue Mann-Whitney = 0.00 	 pvalue hypergeometric = 0.00
expecteddegree_undirected: pvalue Mann-Whitney = 0.00 	 pvalue hypergeometric = 0.00
clusteringcoefficient: pvalue Mann-Whitney = 0.01 	 pvalue hypergeometric = 0.86
closeness: pvalue Mann-Whitney = 0.00 	 pvalue hypergeometric = 0.00
betweenness: pvalue Mann-Whitney = 0.00 	 pvalue hypergeometric = 0.00
hits_hubs: pvalue Mann-Whitney = 0.00 	 pvalue hypergeometric = 0.00
hits_authorities: pvalue Mann-Whitney = 0.00 	 pvalue hypergeometric = 0.00
pagerank: pvalue Mann-Whitney = 0.00 	 pvalue hypergeom

In [2]:
# Scratch cell: prints a fixed string (presumably a quick check that the kernel responds)
print("tets")


tets


In [None]:
import pandas as pd
import numpy as np
from validation_import import get_ref_genes
from validation import compare_feature_distribution_mannwhitney, compare_feature_distribution_hypergeom

# Build a table of p-values comparing, for every feature, the reference genes
# of each source against the feature distribution, using two tests
# (Mann-Whitney and hypergeometric). The table has one row per (source, test)
# pair and one column per feature, and is pickled to output/pvalues at the end.
print ("Computing correlations/pvalues for all features for different sources\n")
feature_names = list(features.columns)
sources = ['cancer', 'drugbank', 'mendelian']

# Row labels are generated from the same templates used for assignment below,
# so a label typo cannot silently create an extra row via .loc enlargement.
# Order is unchanged: all Mann–Whitney rows first, then all hypergeometric rows.
row_labels = (["%s_Mann–Whitney" % source for source in sources] +
              ["%s_hypergeom" % source for source in sources])
pvalues = pd.DataFrame(data=np.zeros((len(row_labels), len(feature_names))),
                       index=row_labels, columns=feature_names)

for source in sources:
    print("Source = %s"%source)
    ref_genes = get_ref_genes(source=source)
    for feature_name in feature_names:
        # Fourth positional argument of the Mann-Whitney helper; presumably the
        # path where the comparison figure is saved -- confirm in validation.py.
        plot_path = 'output/' + feature_name + '_distribution_comparison_{}.png'.format(source)
        pvalue_MW = compare_feature_distribution_mannwhitney(
            features, feature_name, ref_genes, plot_path,
            title="{},{}".format(feature_name, source))
        pvalue_hypergeom = compare_feature_distribution_hypergeom(features, feature_name, ref_genes)
        # Index with the scalar row label. The previous "label:" form was a
        # label slice that wrote every row from `label` to the end of the
        # frame, so later sources clobbered the hypergeometric rows of
        # earlier ones.
        pvalues.loc["%s_Mann–Whitney" % source, feature_name] = pvalue_MW
        pvalues.loc["%s_hypergeom" % source, feature_name] = pvalue_hypergeom
        print ("pvalue Mann-Whitney = %.2g \t pvalue hypergeometric = %.2g \t(%s)"%(pvalue_MW, pvalue_hypergeom, feature_name))
    print("############\n")
print ("Saving correlations/palues to output/pvalues")
pvalues.to_pickle("output/pvalues")

Computing correlations/pvalues for all features for different sources

Source = cancer
pvalue Mann-Whitney = 1.2e-70 	 pvalue hypergeometric = 9.6e-29 	(degree_undirected)
pvalue Mann-Whitney = 8.1e-77 	 pvalue hypergeometric = 9.5e-23 	(expecteddegree_undirected)
pvalue Mann-Whitney = 0.012 	 pvalue hypergeometric = 0.86 	(clusteringcoefficient)
pvalue Mann-Whitney = 1.8e-77 	 pvalue hypergeometric = 1.7e-27 	(closeness)
pvalue Mann-Whitney = 2.3e-69 	 pvalue hypergeometric = 1.7e-27 	(betweenness)
pvalue Mann-Whitney = 6e-84 	 pvalue hypergeometric = 2.8e-26 	(hits_hubs)
pvalue Mann-Whitney = 6e-84 	 pvalue hypergeometric = 2.8e-26 	(hits_authorities)
pvalue Mann-Whitney = 1.6e-75 	 pvalue hypergeometric = 1.3e-21 	(pagerank)
pvalue Mann-Whitney = 1.2e-70 	 pvalue hypergeometric = 9.6e-29 	(log10-degree_undirected)
pvalue Mann-Whitney = 1.2e-70 	 pvalue hypergeometric = 9.6e-29 	(normalized-degree_undirected)
pvalue Mann-Whitney = 6e-84 	 pvalue hypergeometric = 2.8e-26 	(normalized-