# Manual inspection of enrichment analysis results for the first 5 components

- One needs to place the data files in `<repo root>/enrichment/private_data/GREAT`
    - the data file is on google drive:
        - https://drive.google.com/file/d/1j2F0w_r9Y-cRdAh0UrDy-QZl4qWndK8W/view?usp=sharing
- Here, we investigate the results of the enrichment analysis. Please see `<repo root>/enrichment/README.md` for a description of the method.



In [1]:
% matplotlib inline

import numpy as np
import pandas as pd
import matplotlib, collections, itertools, os, re, textwrap, logging, sys
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
from functools import reduce

from logging.config import dictConfig
from logging import getLogger

# Send every record (DEBUG and above) reaching the root logger to a single
# stream handler, formatted as: timestamp, logger name, level, message.
dictConfig({
    'version': 1,
    'formatters': {
        'f': {'format': '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'},
    },
    'handlers': {
        'h': {
            'class': 'logging.StreamHandler',
            'formatter': 'f',
            'level': logging.DEBUG,
        },
    },
    'root': {'handlers': ['h'], 'level': logging.DEBUG},
})

# Default figure font: size 16, sans-serif family, with HelveticaNeue and
# Helvetica listed as the sans-serif fonts.
matplotlib.rc(
    'font',
    size=16,
    family='sans-serif',
    **{'sans-serif': ['HelveticaNeue', 'Helvetica']}
)

# Logger for messages emitted by this notebook.
logger = getLogger('notebook')


In [2]:
# Repository root: two directory levels above the current working directory,
# with symlinks resolved.
_cwd_parent = os.path.dirname(os.getcwd())
repo_dir = os.path.realpath(os.path.dirname(_cwd_parent))


In [3]:
# Data directory: `private_data` inside the parent of the current working
# directory, with symlinks resolved.
_data_parent = os.path.dirname(os.getcwd())
data_dir = os.path.realpath(os.path.join(_data_parent, 'private_data'))

In [4]:
# Make the project's `great` helper module importable, then pull in the reader
# used by every cell below.
# NOTE(review): the directory is spelled 'enrichement' here but 'enrichment'
# in the notebook's introduction — confirm which matches the repository layout.
great_src_dir = os.path.join(repo_dir, 'enrichement', 'src')
sys.path.append(great_src_dir)
from great import read_great_res_wrapper


## Gene Ontology (GO) Biological process

In [5]:
# Preview (head) of the 'GOBiologicalProcess' results for index 0 — presumably the first of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 0, 'GOBiologicalProcess').head()

Unnamed: 0,# ID,Desc,BPval,BFold
19,GO:0080111,DNA demethylation,3.780717e-08,3.904828
20,GO:0046133,pyrimidine ribonucleoside catabolic process,4.080519e-08,6.249585
26,GO:0006703,estrogen biosynthetic process,9.438592e-08,3.843655
29,GO:0045916,negative regulation of complement activation,1.792544e-07,4.370029
35,GO:0033081,regulation of T cell differentiation in thymus,5.234933e-07,2.308571


In [6]:
# Preview (head) of the 'GOBiologicalProcess' results for index 1 — presumably the second of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 1, 'GOBiologicalProcess').head()

Unnamed: 0,# ID,Desc,BPval,BFold
5,GO:0043901,negative regulation of multi-organism process,1.31894e-07,2.168513
8,GO:0050830,defense response to Gram-positive bacterium,5.854487e-07,2.436218
9,GO:0043374,"CD8-positive, alpha-beta T cell differentiation",7.974458e-07,2.954704
14,GO:0006572,tyrosine catabolic process,1.401399e-06,7.532955
17,GO:0002286,T cell activation involved in immune response,3.720987e-06,2.339229


In [7]:
# Preview (head) of the 'GOBiologicalProcess' results for index 2 — presumably the third of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 2, 'GOBiologicalProcess').head()

Unnamed: 0,# ID,Desc,BPval,BFold
30,GO:0060333,interferon-gamma-mediated signaling pathway,9.010372e-11,2.559231
39,GO:0045047,protein targeting to ER,3.183959e-10,2.280882
44,GO:0071346,cellular response to interferon-gamma,5.468346e-10,2.15966
45,GO:0038096,Fc-gamma receptor signaling pathway involved i...,5.472179e-10,2.147846
47,GO:0002431,Fc receptor mediated stimulatory signaling pat...,5.672283e-10,2.146109


In [8]:
# Preview (head) of the 'GOBiologicalProcess' results for index 3 — presumably the fourth of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 3, 'GOBiologicalProcess').head()

Unnamed: 0,# ID,Desc,BPval,BFold
92,GO:0075733,intracellular transport of virus,1.5e-05,2.074477
99,GO:0071624,positive regulation of granulocyte chemotaxis,1.7e-05,2.064859
106,GO:0090023,positive regulation of neutrophil chemotaxis,2.1e-05,2.068764
110,GO:0070127,tRNA aminoacylation for mitochondrial protein ...,2.1e-05,4.215114
139,GO:0018401,peptidyl-proline hydroxylation to 4-hydroxy-L-...,3.8e-05,2.430515


In [9]:
# Preview (head) of the 'GOBiologicalProcess' results for index 4 — presumably the fifth of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 4, 'GOBiologicalProcess').head()

Unnamed: 0,# ID,Desc,BPval,BFold
39,GO:0042219,cellular modified amino acid catabolic process,1e-05,2.204023
66,GO:0042447,hormone catabolic process,7.6e-05,2.530917
76,GO:0006590,thyroid hormone generation,0.000106,2.011829
79,GO:0048714,positive regulation of oligodendrocyte differe...,0.000115,2.025767
94,GO:0001542,ovulation from ovarian follicle,0.000138,2.243419


## Human Phenotype ontology

The Human Phenotype Ontology is an ontology of human phenotypes (often derived from diseases) with gene <--> disease mappings.

In [10]:
# Preview (head) of the 'HumanPhenotypeOntology' results for index 0 — presumably the first of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 0, 'HumanPhenotypeOntology').head()

Unnamed: 0,# ID,Desc,BPval,BFold
3,HP:0012140,Abnormality of cells of the lymphoid lineage,3.637427e-07,2.315639
5,HP:0001888,Lymphopenia,5.56784e-07,2.382784
11,HP:0001878,Hemolytic anemia,1.152956e-06,2.008804
13,HP:0002917,Hypomagnesemia,3.079205e-06,3.452539
20,HP:0004921,Abnormality of magnesium homeostasis,5.371737e-05,2.814905


In [11]:
# Preview (head) of the 'HumanPhenotypeOntology' results for index 1 — presumably the second of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 1, 'HumanPhenotypeOntology').head()

Unnamed: 0,# ID,Desc,BPval,BFold
0,HP:0004395,Malnutrition,5e-06,5.321346
1,HP:0001718,Mitral stenosis,6.3e-05,3.783898
2,HP:0001413,Micronodular cirrhosis,0.00012,2.821075
5,HP:0004333,Bone-marrow foam cells,0.000345,6.657703
6,HP:0003548,Subsarcolemmal accumulations of abnormally sha...,0.000527,6.137533


In [12]:
# Preview (head) of the 'HumanPhenotypeOntology' results for index 2 — presumably the third of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 2, 'HumanPhenotypeOntology').head()

Unnamed: 0,# ID,Desc,BPval,BFold
8,HP:0012103,Abnormality of the mitochondrion,1.981297e-08,2.430178
11,HP:0003287,Abnormality of mitochondrial metabolism,5.547277e-08,2.397021
16,HP:0010972,Anemia of inadequate production,1.467984e-07,2.084019
20,HP:0200042,Skin ulcer,2.149489e-07,2.02101
22,HP:0001581,Recurrent skin infections,3.26783e-07,2.644626


In [13]:
# Preview (head) of the 'HumanPhenotypeOntology' results for index 3 — presumably the fourth of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 3, 'HumanPhenotypeOntology').head()

Unnamed: 0,# ID,Desc,BPval,BFold
0,HP:0002697,Parietal foramina,5.21107e-08,2.381946
2,HP:0004425,Flat forehead,9.815947e-07,2.801818
3,HP:0002365,Hypoplasia of the brainstem,1.226172e-06,2.080713
4,HP:0004442,Sagittal craniosynostosis,1.499563e-06,2.798936
5,HP:0010054,Abnormality of the first metatarsal,1.545575e-06,2.636819


In [14]:
# Preview (head) of the 'HumanPhenotypeOntology' results for index 4 — presumably the fifth of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 4, 'HumanPhenotypeOntology').head()

Unnamed: 0,# ID,Desc,BPval,BFold
0,HP:0001029,Poikiloderma,0.000207,2.412633
1,HP:0003079,Defective DNA repair after ultraviolet radiati...,0.000212,2.784793
54,HP:0010758,Abnormality of the premaxilla,0.008194,2.271427
83,HP:0004326,Cachexia,0.013697,2.114202
108,HP:0003216,Generalized amyloid deposition,0.019079,2.903179


## MGI Phenotype ontology (mouse phenotype)

The mouse has a comprehensive catalog of gene-phenotype mappings. (We can't perform such experiments on humans.)


In [15]:
# Preview (head) of the 'MGIPhenotype' results for index 0 — presumably the first of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 0, 'MGIPhenotype').head()

Unnamed: 0,# ID,Desc,BPval,BFold
26,MP:0001856,myocarditis,3e-06,2.899189
34,MP:0001870,salivary gland inflammation,7e-06,2.334535
36,MP:0004041,increased susceptibility to kidney reperfusion...,8e-06,4.074445
44,MP:0002389,abnormal Peyer's patch follicle morphology,1.8e-05,2.146357
60,MP:0002392,abnormal Peyer's patch T cell area morphology,6.7e-05,3.54165


In [16]:
# Preview (head) of the 'MGIPhenotype' results for index 1 — presumably the second of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 1, 'MGIPhenotype').head()

Unnamed: 0,# ID,Desc,BPval,BFold
5,MP:0008552,abnormal circulating tumor necrosis factor level,5.025154e-11,2.13741
25,MP:0006309,decreased retinal ganglion cell number,1.466071e-07,2.196054
38,MP:0008553,increased circulating tumor necrosis factor level,9.253086e-07,2.065763
40,MP:0008392,decreased primordial germ cell number,1.07923e-06,2.054944
54,MP:0008554,decreased circulating tumor necrosis factor level,2.946674e-06,2.397094


In [17]:
# Preview (head) of the 'MGIPhenotype' results for index 2 — presumably the third of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 2, 'MGIPhenotype').head()

Unnamed: 0,# ID,Desc,BPval,BFold
17,MP:0005153,abnormal B cell proliferation,3.181409e-20,2.120136
21,MP:0008217,abnormal B cell activation,3.4360360000000002e-18,2.012082
37,MP:0005093,decreased B cell proliferation,7.882232e-15,2.142833
41,MP:0008180,abnormal marginal zone B cell morphology,2.404651e-14,2.22924
45,MP:0008495,decreased IgG1 level,9.733961e-14,2.015448


In [18]:
# Preview (head) of the 'MGIPhenotype' results for index 3 — presumably the fourth of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 3, 'MGIPhenotype').head()

Unnamed: 0,# ID,Desc,BPval,BFold
14,MP:0001771,abnormal circulating magnesium level,2.25823e-07,2.333532
18,MP:0003954,abnormal Reichert's membrane morphology,3.972634e-07,2.052724
48,MP:0010092,increased circulating magnesium level,6.732819e-06,2.691314
52,MP:0009545,abnormal dermis papillary layer morphology,9.978618e-06,2.329184
55,MP:0001669,abnormal glucose absorption,1.529968e-05,2.681345


In [19]:
# Preview (head) of the 'MGIPhenotype' results for index 4 — presumably the fifth of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 4, 'MGIPhenotype').head()

Unnamed: 0,# ID,Desc,BPval,BFold
0,MP:0004494,abnormal synaptic glutamate release,5.094085e-08,2.254817
2,MP:0004495,decreased synaptic glutamate release,1.044523e-07,2.613689
15,MP:0003276,esophageal atresia,2.694408e-05,2.157871
53,MP:0003282,gastric ulcer,0.0002993688,2.347075
57,MP:0010779,abnormal stomach muscularis externa morphology,0.000325307,2.088876


## MGI Phenotype ontology (knockout only) (mouse phenotype, derived from KO analysis)

The KO subset of the MGI Phenotype ontology is sometimes useful (to narrow down to causal relationships).


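## GO Molecular function

(Note: no GO Molecular function results are shown in this notebook. The tables in cells In [20]–In [24] below are the `MGIPhenoSingleKO` results, i.e. the knockout-only MGI Phenotype subset.)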

In [20]:
# Preview (head) of the 'MGIPhenoSingleKO' results for index 0 — presumably the first of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 0, 'MGIPhenoSingleKO').head()

Unnamed: 0,# ID,Desc,BPval,BFold
24,MP:0001856,myocarditis,3e-06,3.121352
29,MP:0004041,increased susceptibility to kidney reperfusion...,8e-06,4.074445
35,MP:0008126,increased dendritic cell number,1.7e-05,2.8822
59,MP:0008862,asymmetric snout,7e-05,2.759692
63,MP:0003452,abnormal parotid gland morphology,8.7e-05,3.455479


In [21]:
# Preview (head) of the 'MGIPhenoSingleKO' results for index 1 — presumably the second of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 1, 'MGIPhenoSingleKO').head()

Unnamed: 0,# ID,Desc,BPval,BFold
16,MP:0008552,abnormal circulating tumor necrosis factor level,6.418123e-08,2.067143
47,MP:0009788,increased susceptibility to bacterial infectio...,3.363712e-06,2.019205
55,MP:0008554,decreased circulating tumor necrosis factor level,7.000738e-06,2.36779
61,MP:0009321,increased histiocytic sarcoma incidence,1.319572e-05,2.534863
75,MP:0006309,decreased retinal ganglion cell number,3.783763e-05,2.174031


In [22]:
# Preview (head) of the 'MGIPhenoSingleKO' results for index 2 — presumably the third of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 2, 'MGIPhenoSingleKO').head()

Unnamed: 0,# ID,Desc,BPval,BFold
29,MP:0005153,abnormal B cell proliferation,2.973987e-14,2.06846
40,MP:0005461,abnormal dendritic cell morphology,1.445606e-12,2.379902
42,MP:0001806,decreased IgM level,3.291375e-12,2.209544
53,MP:0005093,decreased B cell proliferation,1.718479e-10,2.145825
70,MP:0002418,increased susceptibility to viral infection,8.788483e-09,2.034574


In [23]:
# Preview (head) of the 'MGIPhenoSingleKO' results for index 3 — presumably the fourth of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 3, 'MGIPhenoSingleKO').head()

Unnamed: 0,# ID,Desc,BPval,BFold
5,MP:0012129,failure of blastocyst formation,7.850076e-10,2.053477
6,MP:0012128,abnormal blastocyst formation,9.198091e-10,2.046508
7,MP:0002663,failure to form blastocele,1.228421e-09,2.052544
9,MP:0004963,abnormal blastocoele morphology,2.117273e-09,2.009879
19,MP:0003954,abnormal Reichert's membrane morphology,3.972634e-07,2.052724


In [24]:
# Preview (head) of the 'MGIPhenoSingleKO' results for index 4 — presumably the fifth of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 4, 'MGIPhenoSingleKO').head()

Unnamed: 0,# ID,Desc,BPval,BFold
2,MP:0004494,abnormal synaptic glutamate release,8.093472e-07,2.224995
3,MP:0001661,extended life span,1.476395e-06,2.641572
5,MP:0004495,decreased synaptic glutamate release,1.703522e-06,2.536114
7,MP:0003990,decreased neurotransmitter release,5.253076e-06,2.196289
18,MP:0005109,abnormal talus morphology,5.978991e-05,2.180961


## GO Cellular component

In [25]:
# Preview (head) of the 'GOCellularComponent' results for index 0 — presumably the first of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 0, 'GOCellularComponent').head()

Unnamed: 0,# ID,Desc,BPval,BFold
4,GO:0042611,MHC protein complex,1.528314e-09,4.026781
17,GO:0071556,integral to lumenal side of endoplasmic reticu...,5.172804e-06,3.113123
20,GO:0042612,MHC class I protein complex,9.299293e-06,3.61047
21,GO:0042613,MHC class II protein complex,2.079056e-05,3.733819
26,GO:0005761,mitochondrial ribosome,4.195153e-05,2.000761


In [26]:
# Preview (head) of the 'GOCellularComponent' results for index 1 — presumably the second of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 1, 'GOCellularComponent').head()

Unnamed: 0,# ID,Desc,BPval,BFold
1,GO:0030125,clathrin vesicle coat,6.7e-05,2.55238
2,GO:0030934,anchoring collagen,0.000224,2.456392
5,GO:0005861,troponin complex,0.000632,7.499313
10,GO:0002199,zona pellucida receptor complex,0.00114,4.497283
12,GO:0031932,TORC2 complex,0.001389,3.227893


In [27]:
# Preview (head) of the 'GOCellularComponent' results for index 2 — presumably the third of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 2, 'GOCellularComponent').head()

Unnamed: 0,# ID,Desc,BPval,BFold
9,GO:0005758,mitochondrial intermembrane space,4.658954e-08,2.924884
15,GO:0000307,cyclin-dependent protein kinase holoenzyme com...,1.66169e-06,2.979384
20,GO:0000788,nuclear nucleosome,3.649093e-06,6.005715
27,GO:0044798,nuclear transcription factor complex,1.326775e-05,2.009501
31,GO:0031095,platelet dense tubular network membrane,2.117509e-05,3.726373


In [28]:
# Preview (head) of the 'GOCellularComponent' results for index 3 — presumably the fourth of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 3, 'GOCellularComponent').head()

Unnamed: 0,# ID,Desc,BPval,BFold
8,GO:0005606,laminin-1 complex,8e-06,2.211307
9,GO:0043256,laminin complex,3.2e-05,2.04688
22,GO:0019031,viral envelope,0.000486,2.907735
46,GO:0031080,nuclear pore outer ring,0.002169,2.309853
54,GO:0005666,DNA-directed RNA polymerase III complex,0.003262,2.282204


In [29]:
# Preview (head) of the 'GOCellularComponent' results for index 4 — presumably the fifth of the 5 components; confirm in great.py.
read_great_res_wrapper(data_dir, 4, 'GOCellularComponent').head()

Unnamed: 0,# ID,Desc,BPval,BFold
2,GO:0005750,mitochondrial respiratory chain complex III,6.2e-05,3.094238
9,GO:0032591,dendritic spine membrane,0.001123,2.926314
16,GO:0031094,platelet dense tubular network,0.005397,2.095931
24,GO:0031095,platelet dense tubular network membrane,0.011675,2.023305
52,GO:0005655,nucleolar ribonuclease P complex,0.056398,2.225361
