In [1]:
% matplotlib inline

import numpy as np
import pandas as pd
import matplotlib, collections, itertools, os, re, textwrap, logging
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
from functools import reduce

from logging.config import dictConfig
from logging import getLogger

# Configure the root logger: every record at DEBUG level or above is sent to a
# single stream handler that prints timestamp, logger name, level and message.
dictConfig({
    'version': 1,
    'formatters': {
        'f': {'format': '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'},
    },
    'handlers': {
        'h': {
            'class': 'logging.StreamHandler',
            'formatter': 'f',
            'level': logging.DEBUG,
        },
    },
    'root': {'handlers': ['h'], 'level': logging.DEBUG},
})

# Default figure font: size 16, sans-serif family, with HelveticaNeue tried
# before Helvetica. The 'sans-serif' key contains a hyphen, so it has to be
# passed through dict unpacking rather than as a plain keyword.
matplotlib.rc(
    'font',
    size=16,
    family='sans-serif',
    **{'sans-serif': ['HelveticaNeue', 'Helvetica']}
)

# Named logger used by the rest of this notebook.
logger = logging.getLogger('notebook')


In [4]:
# Directory two levels above the current working directory, with symlinks
# resolved. NOTE(review): this is the repository root only when the notebook
# is run from its own directory -- confirm before relying on it.
_cwd_parent = os.path.dirname(os.getcwd())
repo_dir = os.path.realpath(os.path.dirname(_cwd_parent))


In [6]:
# `private_data` directory located next to the current working directory
# (i.e. inside its parent), with symlinks resolved.
_private_data_parent = os.path.dirname(os.getcwd())
data_dir = os.path.realpath(os.path.join(_private_data_parent, 'private_data'))

In [63]:
def read_great_res_all(filename, nrows=None, BFold=None, BPval=None, topk=None, sort_by='BPval', skiprows=7):
    '''Read a GREAT results table (TSV) and return its filtered, sorted rows.

    Parameters
    ----------
    filename : str or path-like
        Path of the tab-separated GREAT results file.
    nrows : int, optional
        Maximum number of data rows to read; ``None`` reads the whole file.
    BFold : float, optional
        Keep only rows whose ``BFold`` column is >= this value.
    BPval : float, optional
        Keep only rows whose ``BPval`` column is <= this value.
    topk : int, optional
        Number of rows to return after sorting; ``None`` returns all rows.
    sort_by : str
        Column to sort on. ``'BFold'`` is sorted in descending order; any
        other column is sorted in ascending order.
    skiprows : int
        Number of lines to skip before the column-header line (default 7).

    Returns
    -------
    pandas.DataFrame
        All columns of the file, restricted to the rows that pass the
        filters, sorted by ``sort_by`` and truncated to ``topk`` rows.
    '''
    df = pd.read_table(filename, sep='\t', skiprows=skiprows, nrows=nrows)

    # Build the mask on the frame's own index so it stays aligned with the
    # rows even when the parsed frame does not carry a default RangeIndex,
    # and force a bool dtype so an empty frame is still indexed by mask.
    keep = pd.Series(True, index=df.index, dtype=bool)
    if BFold is not None:
        keep &= df['BFold'] >= BFold
    if BPval is not None:
        keep &= df['BPval'] <= BPval

    # Larger fold values rank first; every other column ranks smallest first.
    ranked = df[keep].sort_values(by=sort_by, ascending=(sort_by != 'BFold'))
    return ranked if topk is None else ranked.head(topk)

def read_great_res(filename, nrows=None, BFold=None, BPval=None, topk=None, sort_by='BPval'):
    '''Read a GREAT results file and keep only its summary columns.

    All arguments are forwarded unchanged to `read_great_res_all`; the
    returned frame is restricted to the `# ID`, `Desc`, `BPval` and `BFold`
    columns, in that order.
    '''
    summary_columns = ['# ID', 'Desc', 'BPval', 'BFold']
    full_table = read_great_res_all(filename, nrows, BFold, BPval, topk, sort_by)
    return full_table[summary_columns]

In [71]:
def read_great_res_wrapper(pc, ontology):
    '''Load the GREAT summary table for one component and one ontology.

    The file is read from `<data_dir>/GREAT/PCA<pc>/<ontology>.tsv` through
    `read_great_res`, keeping only rows with `BFold >= 2.0`.
    '''
    results_path = os.path.join(
        data_dir, 'GREAT', 'PCA{}'.format(pc), '{}.tsv'.format(ontology)
    )
    return read_great_res(results_path, BFold=2.0)

# Manual inspection of enrichment analysis for the first 5 components

- The data files need to be placed in `<repo root>/enrichment/private_data/GREAT`.
    - The data file is available on Google Drive:
        - https://drive.google.com/file/d/1VgjnKZ-TLPDPRB830YvYZSm8Luo7os9k/view?usp=sharing
- Here, we investigate the results of the enrichment analysis. Please see `<repo root>/enrichment/README.md` for a description of the method.



## Human Phenotype ontology

In [83]:
# First five rows of the HumanPhenotypeOntology results for component 0.
read_great_res_wrapper(pc=0, ontology='HumanPhenotypeOntology').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,HP:0001525,Severe failure to thrive,4.3021059999999997e-44,29.68758
1,HP:0001833,Long foot,3.429376e-42,26.50561
2,HP:0000651,Diplopia,1.120952e-40,13.99734
3,HP:0011993,Impaired neutrophil bactericidal activity,4.846849e-37,26.36237
4,HP:0001930,Nonspherocytic hemolytic anemia,9.470414e-36,24.13428


In [84]:
# First five rows of the HumanPhenotypeOntology results for component 1.
read_great_res_wrapper(pc=1, ontology='HumanPhenotypeOntology').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,HP:0001525,Severe failure to thrive,2.07289e-59,37.10948
1,HP:0000651,Diplopia,7.312066e-59,18.03504
2,HP:0001833,Long foot,5.131735e-57,33.13201
3,HP:0011993,Impaired neutrophil bactericidal activity,1.433868e-56,36.15411
4,HP:0001930,Nonspherocytic hemolytic anemia,8.824545e-55,33.09843


In [85]:
# First five rows of the HumanPhenotypeOntology results for component 2.
read_great_res_wrapper(pc=2, ontology='HumanPhenotypeOntology').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,HP:0002329,Drowsiness,4.025085e-46,29.52977
1,HP:0002357,Dysphasia,4.68659e-46,29.41936
2,HP:0004570,Increased vertebral height,1.383946e-45,27.02947
3,HP:0000651,Diplopia,2.314844e-45,15.07406
4,HP:0002073,Progressive cerebellar ataxia,2.660039e-40,8.83547


In [86]:
# First five rows of the HumanPhenotypeOntology results for component 3.
read_great_res_wrapper(pc=3, ontology='HumanPhenotypeOntology').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,HP:0011993,Impaired neutrophil bactericidal activity,7.387910000000001e-43,29.37522
1,HP:0001930,Nonspherocytic hemolytic anemia,2.054548e-41,26.89248
2,HP:0004570,Increased vertebral height,6.627623e-40,24.5151
3,HP:0011990,Abnormality of neutrophil physiology,1.568841e-39,23.95739
4,HP:0003413,Atlantoaxial abnormality,2.911973e-37,18.69677


In [87]:
# First five rows of the HumanPhenotypeOntology results for component 4.
read_great_res_wrapper(pc=4, ontology='HumanPhenotypeOntology').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,HP:0011993,Impaired neutrophil bactericidal activity,1.433868e-56,36.15411
1,HP:0001930,Nonspherocytic hemolytic anemia,8.824545e-55,33.09843
2,HP:0011990,Abnormality of neutrophil physiology,1.905621e-52,29.48602
3,HP:0001525,Severe failure to thrive,4.2009999999999996e-50,32.65634
4,HP:0001833,Long foot,5.269005e-48,29.15617


## GO Biological process

In [73]:
# First five rows of the GOBiologicalProcess results for component 0.
read_great_res_wrapper(pc=0, ontology='GOBiologicalProcess').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,GO:0000185,activation of MAPKKK activity,2.1706900000000002e-67,15.49817
1,GO:0043249,erythrocyte maturation,4.447322e-60,27.24031
2,GO:0071499,cellular response to laminar fluid shear stress,2.34992e-55,32.25595
3,GO:0007168,receptor guanylyl cyclase signaling pathway,6.591094e-53,18.70011
4,GO:0036003,positive regulation of transcription from RNA ...,2.003572e-43,17.97852


In [74]:
# First five rows of the GOBiologicalProcess results for component 1.
read_great_res_wrapper(pc=1, ontology='GOBiologicalProcess').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,GO:0000185,activation of MAPKKK activity,1.383495e-97,19.97957
1,GO:0007168,receptor guanylyl cyclase signaling pathway,2.872543e-81,25.35608
2,GO:0043249,erythrocyte maturation,2.290335e-70,30.58561
3,GO:0071499,cellular response to laminar fluid shear stress,2.3754e-69,38.18051
4,GO:2001242,regulation of intrinsic apoptotic signaling pa...,4.5459169999999997e-63,5.364207


In [75]:
# First five rows of the GOBiologicalProcess results for component 2.
read_great_res_wrapper(pc=2, ontology='GOBiologicalProcess').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,GO:0050881,musculoskeletal movement,4.779335e-49,7.800325
1,GO:0046928,regulation of neurotransmitter secretion,2.3493449999999998e-48,6.290903
2,GO:0000185,activation of MAPKKK activity,1.26863e-45,11.95039
3,GO:0014056,"regulation of acetylcholine secretion, neurotr...",1.089451e-39,20.45459
4,GO:0039528,cytoplasmic pattern recognition receptor signa...,1.7694750000000002e-39,23.88048


In [76]:
# First five rows of the GOBiologicalProcess results for component 3.
read_great_res_wrapper(pc=3, ontology='GOBiologicalProcess').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,GO:0000185,activation of MAPKKK activity,1.86997e-73,16.43179
1,GO:0007168,receptor guanylyl cyclase signaling pathway,1.017146e-64,21.55266
2,GO:0039528,cytoplasmic pattern recognition receptor signa...,1.5091340000000001e-46,26.94208
3,GO:0043249,erythrocyte maturation,1.093361e-44,21.98341
4,GO:0010923,negative regulation of phosphatase activity,2.5840730000000003e-39,3.979831


In [77]:
# First five rows of the GOBiologicalProcess results for component 4.
read_great_res_wrapper(pc=4, ontology='GOBiologicalProcess').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,GO:0000185,activation of MAPKKK activity,3.738095e-77,16.99197
1,GO:0007168,receptor guanylyl cyclase signaling pathway,1.644075e-71,23.13742
2,GO:0039528,cytoplasmic pattern recognition receptor signa...,7.480457999999999e-57,31.22832
3,GO:0043249,erythrocyte maturation,2.080543e-47,22.93921
4,GO:0051156,glucose 6-phosphate metabolic process,1.259892e-46,22.05714


## MGI Phenotype ontology (mouse phenotype)

In [78]:
# First five rows of the MGIPhenotype results for component 0.
read_great_res_wrapper(pc=0, ontology='MGIPhenotype').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,MP:0004877,abnormal systemic vascular resistance,1.1757449999999999e-64,38.1387
1,MP:0003394,increased cardiac output,2.993274e-61,34.51117
2,MP:0008539,decreased susceptibility to induced colitis,1.875374e-55,10.22791
3,MP:0011951,increased cardiac stroke volume,5.387074e-54,26.06802
4,MP:0005348,increased T cell proliferation,8.861567e-54,3.680493


In [79]:
# First five rows of the MGIPhenotype results for component 1.
read_great_res_wrapper(pc=1, ontology='MGIPhenotype').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,MP:0001800,abnormal humoral immune response,9.552576999999999e-87,2.416677
1,MP:0002490,abnormal immunoglobulin level,2.302866e-83,2.459878
2,MP:0008210,increased mature B cell number,2.886921e-83,4.035475
3,MP:0011951,increased cardiac stroke volume,2.586089e-79,34.59026
4,MP:0008539,decreased susceptibility to induced colitis,5.00718e-79,12.84436


In [80]:
# First five rows of the MGIPhenotype results for component 2.
read_great_res_wrapper(pc=2, ontology='MGIPhenotype').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,MP:0009310,large intestine adenocarcinoma,2.993071e-42,14.54456
1,MP:0008210,increased mature B cell number,7.798192e-40,2.923224
2,MP:0003394,increased cardiac output,6.477863e-35,22.79039
3,MP:0006038,increased mitochondrial proliferation,8.018869e-35,10.5599
4,MP:0008840,abnormal spike wave discharge,3.616096e-33,5.614086


In [81]:
# First five rows of the MGIPhenotype results for component 3.
read_great_res_wrapper(pc=3, ontology='MGIPhenotype').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,MP:0008210,increased mature B cell number,1.013412e-55,3.365273
1,MP:0004877,abnormal systemic vascular resistance,1.086916e-50,31.78225
2,MP:0004796,increased anti-histone antibody level,2.5666189999999997e-50,10.62057
3,MP:0003394,increased cardiac output,3.747434e-49,29.30193
4,MP:0009310,large intestine adenocarcinoma,1.3082339999999998e-44,15.09341


In [82]:
# First five rows of the MGIPhenotype results for component 4.
read_great_res_wrapper(pc=4, ontology='MGIPhenotype').head(n=5)

Unnamed: 0,# ID,Desc,BPval,BFold
0,MP:0004796,increased anti-histone antibody level,3.875817e-63,12.29751
1,MP:0008539,decreased susceptibility to induced colitis,1.033266e-62,11.06042
2,MP:0008210,increased mature B cell number,1.089594e-60,3.493609
3,MP:0005348,increased T cell proliferation,1.070219e-59,3.861798
4,MP:0003394,increased cardiac output,1.198807e-56,32.5577
