In [3]:
# This script contains the whole sequence of steps needed
# to get our results.

import networkx as nx

from read_graph import read_graph
from common.pipeline import Pipeline
from common.feature_generators import ExpectedDegree
from common.feature_generators import ClusteringCoefficient
from common.feature_generators import Degree
from common.feature_generators import ClosenessCentrality
from common.feature_generators import BetweennessCentrality
from common.feature_generators import HITS
from common.feature_generators import PageRank
from common.feature_generators import Log10Wrapper
from common.feature_generators import NormalizeWrapper
from validation import compare_feature_distribution_mannwhitney
from validation import compare_feature_distribution_hypergeom
from validation_import import get_gene_ref

# Load the PPI graph (undirected) and report its size.
print("\n######### Loading Graph #########")
Graph, node_names = read_graph(directed=False)
graph_summary = "Loaded graph:\n\t{} nodes\n\t{} edges".format(
    Graph.number_of_nodes(), Graph.number_of_edges())
print(graph_summary)

#########################
# Computing node features
#########################

print("\n######### Computing/retrieving node features #########")

# Feature generators, listed in the order they are handed to the pipeline.
feature_generators = [
    Degree(default_dump=True, default_recomputing=False),
    ExpectedDegree(default_dump=True, default_recomputing=False),
    ClusteringCoefficient(),
    ClosenessCentrality(),
    BetweennessCentrality(),
    HITS(),
    PageRank(),
    # Each wrapper is applied to a generator instance, and the value it
    # returns is then called with no arguments to obtain the pipeline entry.
    Log10Wrapper(Degree())(),
    NormalizeWrapper(Degree())(),
    NormalizeWrapper(HITS())(),
]

# The pipeline object takes the sequence of features as positional arguments.
pipeline = Pipeline(*feature_generators)
features = pipeline.apply(Graph, verbose=True)

#########################
# Class prediction
#########################


100%|██████████| 19576/19576 [00:00<00:00, 167479.23it/s]


######### Loading Graph #########
Reading Nodes list
Reading Edges list





Loaded graph:
	19576 nodes
	5676528 edges

######### Computing/retrieving node features #########
degree_undirected
expecteddegree_undirected
clusteringcoefficient
closeness
betweenness
hits
pagerank
log10-degree_undirected
normalized-degree_undirected
normalized-hits


In [4]:
# For every reference gene set, run both validation tests on each feature
# produced by the pipeline and print the results.
for source in ['cancer', 'drugbank', 'mendelian']:
    print(source)
    ref_genes = get_gene_ref(source=source)
    # The file-name suffix only depends on the source, so build it once.
    plot_suffix = '_distribution_comparison_{}.png'.format(source)

    for feature_name in pipeline.get_generator_names():
        print("Comparing reference and whole sample on: [{}]".format(feature_name))
        plot_path = 'output/' + feature_name + plot_suffix
        plot_title = "{}, {}".format(feature_name, source)
        mannwhitney_result = compare_feature_distribution_mannwhitney(
            features, feature_name, ref_genes, plot_path, title=plot_title)
        print('\t', mannwhitney_result)

        print("Hypergeom test on: [{}]".format(feature_name))
        hypergeom_result = compare_feature_distribution_hypergeom(
            features, feature_name, ref_genes)
        print('\t', hypergeom_result)

    print("############\n")

cancer
Comparing reference and whole sample on: [degree_undirected]
debug
feature_name
[6740.0, 6099.0, 5935.0, 5848.0, 5626.0, 5520.0, 5211.0, 5180.0, 5171.0, 5090.0, 4978.0, 4969.0, 4964.0, 4867.0, 4852.0, 4822.0, 4816.0, 4712.0, 4697.0, 4683.0, 4673.0, 4661.0, 4657.0, 4584.0, 4572.0, 4552.0, 4521.0, 4509.0, 4450.0, 4424.0, 4388.0, 4380.0, 4334.0, 4325.0, 4287.0, 4269.0, 4250.0, 4233.0, 4186.0, 4184.0, 4170.0, 4158.0, 4117.0, 4113.0, 4078.0, 4056.0, 4019.0, 4018.0, 4009.0, 4000.0, 3999.0, 3991.0, 3989.0, 3980.0, 3971.0, 3953.0, 3950.0, 3895.0, 3870.0, 3854.0, 3850.0, 3815.0, 3803.0, 3792.0, 3788.0, 3783.0, 3730.0, 3720.0, 3711.0, 3704.0, 3704.0, 3701.0, 3699.0, 3698.0, 3694.0, 3693.0, 3690.0, 3684.0, 3683.0, 3673.0, 3668.0, 3661.0, 3651.0, 3642.0, 3641.0, 3640.0, 3635.0, 3633.0, 3614.0, 3604.0, 3587.0, 3582.0, 3577.0, 3576.0, 3574.0, 3567.0, 3566.0, 3559.0, 3552.0, 3549.0, 3547.0, 3544.0, 3541.0, 3529.0, 3529.0, 3524.0, 3523.0, 3501.0, 3492.0, 3491.0, 3491.0, 3490.0, 3490.0, 3481.0, 

	 MannwhitneyuResult(statistic=3845941.0, pvalue=3.9581004536072154e-28)
Hypergeom test on: [degree_undirected]
	 0.969765531953
Comparing reference and whole sample on: [expecteddegree_undirected]
debug
feature_name
[2056.102000000014, 2025.8480000000163, 2013.5850000000482, 1913.1420000000048, 1857.3600000000181, 1844.2389999999928, 1759.4440000000111, 1721.3489999999956, 1714.4660000000101, 1684.9950000000101, 1673.8169999999818, 1665.0990000000083, 1659.5950000000182, 1638.9190000000212, 1637.9830000000143, 1629.2129999999834, 1625.5139999999831, 1623.4469999999837, 1618.9320000000071, 1504.9000000000015, 1498.4270000000206, 1481.0319999999954, 1479.0190000000005, 1451.4930000000072, 1447.5620000000258, 1412.3090000000034, 1402.4859999999969, 1395.4819999999847, 1392.7199999999955, 1388.0360000000032, 1359.4009999999937, 1355.3739999999955, 1354.8760000000127, 1354.2170000000017, 1349.2050000000095, 1347.2210000000057, 1343.3709999999821, 1340.2470000000117, 1332.7670000000003, 132

	 MannwhitneyuResult(statistic=3845789.5, pvalue=3.9062022565083782e-28)
Hypergeom test on: [clusteringcoefficient]
	 0.969765531953
Comparing reference and whole sample on: [closeness]
debug
feature_name
[0.60211051576243768, 0.59073348036046702, 0.58789426867955163, 0.58536221224927543, 0.58117298305396503, 0.58008774095507365, 0.57440127390063023, 0.57392962476077047, 0.57381183347530629, 0.57256956216442678, 0.57104925899691505, 0.57049993308095825, 0.57031705922433484, 0.56867645516031595, 0.56836267074235558, 0.56818116430083532, 0.56804923241349881, 0.56633967660220141, 0.5661594597414451, 0.5657830130479683, 0.56566854165969915, 0.5655868047378052, 0.56441240675824866, 0.56411956891425574, 0.56385952359164115, 0.56366464688062767, 0.56324287576890875, 0.56285410843353001, 0.56183614297359696, 0.56133658982768597, 0.56061301072306713, 0.56016372443592288, 0.55913948358214594, 0.55869255507685622, 0.55859687762136867, 0.55850123293026299, 0.55842155403721494, 0.55838968884539564,

	 MannwhitneyuResult(statistic=3845827.0, pvalue=3.9191174525268802e-28)
Hypergeom test on: [closeness]
	 0.969765531953
Comparing reference and whole sample on: [betweenness]
debug
feature_name
[0.0063680316477608342, 0.0058536633487776477, 0.0054475806309854553, 0.0052481511818570298, 0.0048838969191965379, 0.0044315039664970428, 0.0042568085414586678, 0.0041989654595353679, 0.0040069717011753598, 0.0035485697094464965, 0.0033083928714932407, 0.0030163834385609906, 0.0028403308030003345, 0.002348577750102443, 0.0022416732119670481, 0.0022358504622387362, 0.0021263415386304652, 0.0020627501607114913, 0.0019795993144659956, 0.0019602372704610057, 0.0019421780764354423, 0.001922345050534657, 0.0018738078129944622, 0.0017341779880628446, 0.0016900370069114984, 0.0016691571551835499, 0.0016685117140424774, 0.0016320200008964831, 0.0016266050067254923, 0.0016121713921893653, 0.0016060132294744938, 0.0015740429766455098, 0.0015663708671922202, 0.0015528858261640154, 0.001537676675688898, 0.

	 MannwhitneyuResult(statistic=3845802.0, pvalue=3.9105050867904829e-28)
Hypergeom test on: [betweenness]
	 0.969765531953
Comparing reference and whole sample on: [hits_hubs]
debug
feature_name
[0.00069125810335061305, 0.00067633226819379289, 0.00062394043358907016, 0.00062309380209031975, 0.00062209097936715655, 0.00061610076376146614, 0.00060832919140496307, 0.00058967596773643071, 0.00057976790977441609, 0.00057454546110140682, 0.00057061372561942336, 0.00056890187702757211, 0.00056413261776891758, 0.00056163576723301507, 0.00055983927106031278, 0.00055524014462314433, 0.00054551907549977218, 0.0005394244484034417, 0.00053839910778786355, 0.00053767411491535536, 0.0005060942526125046, 0.00050365799447015598, 0.00049086128158082342, 0.00048820705129982013, 0.00048761580887887322, 0.00048278044719218317, 0.00047900667635161146, 0.00047730611611710035, 0.0004747136288355234, 0.00047441353849665869, 0.00046959464639972513, 0.00046822781028621523, 0.00046570228305811798, 0.0004650198833

	 MannwhitneyuResult(statistic=3845818.5, pvalue=3.9161909288511175e-28)
Hypergeom test on: [hits_hubs]
	 0.969765531953
Comparing reference and whole sample on: [hits_authorities]
debug
feature_name
[0.0006912581033500806, 0.00067633226819407369, 0.00062394043358850214, 0.0006230938020897552, 0.00062209097936659027, 0.00061610076376100481, 0.0006083291914044613, 0.0005896759677365967, 0.00057976790977394034, 0.00057454546110110531, 0.00057061372561891064, 0.00056890187702729282, 0.00056413261776869889, 0.00056163576723256231, 0.00055983927105978802, 0.00055524014462320808, 0.00054551907549943641, 0.00053942444840353126, 0.000538399107787496, 0.00053767411491542052, 0.00050609425261207602, 0.00050365799446976459, 0.00049086128158052104, 0.00048820705129958605, 0.00048761580887937385, 0.00048278044719191933, 0.00047900667635150809, 0.00047730611611694667, 0.0004747136288352279, 0.00047441353849649671, 0.00046959464639943478, 0.00046822781028601769, 0.00046570228305787919, 0.000465019883

	 MannwhitneyuResult(statistic=3845818.5, pvalue=3.9161908062926655e-28)
Hypergeom test on: [hits_authorities]
	 0.969765531953
Comparing reference and whole sample on: [pagerank]
debug
feature_name
[0.0005857469834909053, 0.00056619404110808602, 0.0005290319755296214, 0.00052761678375683305, 0.00052450821383704901, 0.00052081145892706653, 0.00051721085446698929, 0.00050617649881736296, 0.00049948871996250322, 0.00049770727406190372, 0.00049422669821646962, 0.00048183650562230192, 0.000465456374470514, 0.00045417547912650477, 0.00043808674496252452, 0.0004350261474857347, 0.00042843647324097508, 0.00041822421562370815, 0.00041315034389907153, 0.00041233222828337767, 0.00041101340244838007, 0.00040831149094292863, 0.00040420201237827954, 0.00040257057243354565, 0.00040037950032493873, 0.0003952231552619437, 0.00039203499469072674, 0.00038850700820392551, 0.00038844831782688081, 0.00038438997228652252, 0.00038273929793970101, 0.00038209820837032516, 0.00038095379721783191, 0.000380428020

	 MannwhitneyuResult(statistic=3845819.0, pvalue=3.9163632105545179e-28)
Hypergeom test on: [pagerank]


KeyboardInterrupt: 

In [2]:
# Scratch cell: prints a fixed marker string.
print("test")

test
