# Fisher's Exact Test to determine GO Term Enrichment of Probes

In [1]:
#Boilerplate Import 
import sys
sys.path.append('..')
import os
from itertools import chain
from collections import defaultdict
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scripts import utils
from scripts.analysis import build_loci

In [2]:
BASE = '../data'

grouped_loci = utils.load(os.path.join(BASE,"grouped_loci.pkl.gz"))

In [3]:
import gseapy as gp
import goatools

obodag = goatools.obo_parser.GODag('go-basic.obo')

load obo file go-basic.obo
go-basic.obo: fmt(1.2) rel(2016-07-16) 46,953 GO Terms


In [4]:
def make_go2Gene_map(grouped_loci, ontology='MF'):
    ontology_map = {'MF': 'molecular_function', 'BP': 'biological_process', 'CC': 'cellular_component'}
    
    out = defaultdict(list)    
    for l in grouped_loci:
        if 'go' in l.annotations:
            for go in l.annotations['go']:
                if obodag[go].namespace == ontology_map[ontology]:
                    out[go].append(l.cluster_id)
                    for parent in obodag[go].get_all_parents():
                        if obodag[parent].namespace == ontology_map[ontology]:
                            out[parent].append(l.cluster_id)
                
    return out

In [5]:
def filter_go2gene_map(go_locus):
    
    # Remove "very broad" gene sets. Arbitrary definition: gene sets that emcompass >50% of all IDs
    all_ids = set(chain(*go_locus.values()))
    go_locus = {key: value for (key, value) in go_locus.items() if len(value) / len(all_ids) <= 0.5}

    # Remove terms with less than 5 members: changed from 10 to 5 becasue small #s of proteins compared
    # to what you would find wiht genes
    go_locus = {key: value for (key, value) in go_locus.items() if len(value) >= 10}

    # Remove child terms with identical gene sets as their parents
    to_remove = set()
    for parent in go_locus.keys():
        # If child term has exact same members as parent, remove
        child_ids = [x.id for x in obodag[parent].children if x.id in go_locus.keys()]
        for child in child_ids:
            if go_locus[child] == go_locus[parent]:
                to_remove.add(child)
    go_locus = {key: value for (key, value) in go_locus.items() if key not in to_remove}

    # Remove sibling terms with identical gene sets
    to_remove = set()
    for brother in go_locus.keys():
        for parent in obodag[brother].parents:
            siblings = set([y.id for y in parent.children])
            siblings.remove(brother)
            for sibling in siblings:
                if sibling in go_locus.keys() and go_locus[brother] == go_locus[sibling]:
                    to_remove.add(sibling)
    go_locus = {key: value for (key, value) in go_locus.items() if key not in to_remove}

    return go_locus

In [6]:
# pop is group1 + group2
def enrichment_fish(go_in_group, nongo_in_group, go_in_other, nongo_in_other):
    import scipy.stats as stats
        
    if go_in_group != 0:
        go_in_group -= 1
       
    oddsr, pval = stats.fisher_exact([[go_in_group, go_in_other], [nongo_in_group, nongo_in_other]], alternative='two-sided')
    
    samplings = go_in_group + nongo_in_group
    total_go = go_in_group + go_in_other
    total = go_in_group + nongo_in_group + go_in_other + nongo_in_other

    lower, expected = stats.hypergeom.interval(.99, M=total+1, n=total_go+1, N=samplings+1)
    
    return oddsr, pval, expected

In [7]:
group_df = pd.read_csv(os.path.join(BASE,'groups.csv'), index_col=0)
group_df.head()

Unnamed: 0,RAG_Enriched,RAG_Unenriched,RT_Enriched,RT_Unenriched
115324,True,False,True,False
132035,False,True,False,False
318873,True,True,True,True
376417,False,False,True,True
407882,True,False,True,False


In [8]:
mf_map = filter_go2gene_map(make_go2Gene_map(grouped_loci))
bp_map = filter_go2gene_map(make_go2Gene_map(grouped_loci, ontology='BP'))

In [9]:
def enrichment_df(group1, group2, go_map):
    pv = dict()

    all_annotations = set(go_map.keys())
    
    group_loci = set(group_df.query('{} and ~{}'.format(group1, group2)).index)
    other_loci = set(group_df.query('~({} and ~{})'.format(group1, group2)).index)

    for annotation in all_annotations:
        go_in_group = len(set(go_map[annotation]) & group_loci)
        nongo_in_group = len(group_loci) - go_in_group
        go_in_other =  len(set(go_map[annotation]) & other_loci)
        nongo_in_other = len(other_loci) - go_in_other

        odds_r, p_val, expected = enrichment_fish(go_in_group, nongo_in_group, go_in_other, nongo_in_other)

        pv[annotation] = [odds_r, p_val, go_in_group+go_in_other, go_in_group, expected]

    res = (pd.DataFrame({obodag[k].name:v for k,v in pv.items()})
             .T
             .rename(columns={0:'odds_ratio', 1:'p_value', 2: 'total_annot', 3:'in_group', 4:'expected'})
             .sort_values(by=['p_value']))
    return res

## RT - Enriched vs Uneneriched - MF

In [10]:
(enrichment_df('RT_Enriched', 'RT_Unenriched', mf_map)
 .query('odds_ratio > 1 and p_value < 0.01')
 .sort_values('odds_ratio', ascending=False))

Unnamed: 0,odds_ratio,p_value,total_annot,in_group,expected
"oxidoreductase activity, acting on other nitrogenous compounds as donors",22.729767,6.395769e-09,23.0,21.0,13.0
asparagine-tRNA ligase activity,14.381311,9.927729e-08,23.0,20.0,13.0
cysteine-type peptidase activity,11.358025,1.481807e-07,25.0,21.0,14.0
urocanate hydratase activity,11.287466,0.0002609547,13.0,11.0,8.0
alcohol dehydrogenase (NAD) activity,7.34744,0.0001373076,18.0,14.0,11.0
cysteine-type endopeptidase activity,6.777626,0.0003457332,17.0,13.0,10.0
acetaldehyde dehydrogenase (acetylating) activity,6.777626,0.0003457332,17.0,13.0,10.0
methionine-tRNA ligase activity,6.009977,0.005133476,12.0,9.0,8.0
FMN binding,5.225979,4.61922e-06,34.0,24.0,18.0
"ligase activity, forming carbon-nitrogen bonds",4.388796,3.980853e-06,42.0,28.0,21.0


## RT - Enriched vs Uneneriched - MF

In [11]:
(enrichment_df('RT_Enriched', 'RT_Unenriched', bp_map)
 .query('odds_ratio > 1 and p_value < 0.01')
 .sort_values('odds_ratio', ascending=False))

Unnamed: 0,odds_ratio,p_value,total_annot,in_group,expected
serine family amino acid catabolic process,15.774983,0.001533337,9.0,8.0,7.0
asparaginyl-tRNA aminoacylation,14.381311,9.927729e-08,23.0,20.0,13.0
carbon utilization,6.777626,0.0003457332,17.0,13.0,10.0
methionyl-tRNA aminoacylation,6.009977,0.005133476,12.0,9.0,8.0
ncRNA metabolic process,4.332141,1.458621e-07,56.0,37.0,26.0
tRNA aminoacylation for protein translation,4.208885,2.899121e-07,55.0,36.0,26.0
amino acid activation,4.208885,2.899121e-07,55.0,36.0,26.0
branched-chain amino acid metabolic process,3.585561,0.000582136,32.0,20.0,17.0
organic acid catabolic process,3.161475,0.006346572,25.0,15.0,14.0
protein refolding,2.768468,0.001619826,41.0,23.0,20.0


## RT - Unenriched vs Enriched - MF

In [12]:
(enrichment_df('RT_Unenriched', 'RT_Enriched', mf_map)
 .query('odds_ratio > 1 and p_value < 0.01')
 .sort_values('odds_ratio', ascending=False))

Unnamed: 0,odds_ratio,p_value,total_annot,in_group,expected
phosphoglycerate mutase activity,13.085006,2.973009e-12,45.0,40.0,26.0
protein-N(PI)-phosphohistidine-sugar phosphotransferase activity,11.59144,5.99832e-05,17.0,15.0,12.0
"intramolecular transferase activity, phosphotransferases",8.252778,1.452016e-12,60.0,50.0,32.0
methionine adenosyltransferase activity,7.111075,1.122956e-15,84.0,68.0,43.0
structural molecule activity,6.837471,1.845553e-54,327.0,257.0,146.0
RNA polymerase activity,6.766247,8.142e-25,143.0,114.0,69.0
"transferase activity, transferring alkyl or aryl (other than methyl) groups",6.69052,2.883961e-15,85.0,68.0,44.0
DNA binding,6.504838,9.505842e-25,148.0,117.0,71.0
C-acetyltransferase activity,6.059563,0.003557904,15.0,12.0,11.0
structural constituent of ribosome,5.815262,6.828591e-27,178.0,137.0,84.0


## RT - Unenriched vs Enriched - BP

In [None]:
(enrichment_df('RT_Unenriched', 'RT_Enriched', bp_map)
 .query('odds_ratio > 1 and p_value < 0.01')
 .sort_values('odds_ratio', ascending=False))

Unnamed: 0,odds_ratio,p_value,total_annot,in_group,expected
movement of cell or subcellular component,99.007941,5.477514e-46,116.0,114.0,57.0
locomotion,99.007941,5.477514e-46,116.0,114.0,57.0
S-adenosylmethionine metabolic process,7.111075,1.122956e-15,84.0,68.0,43.0
RNA biosynthetic process,6.894115,2.009358e-25,145.0,116.0,70.0
nucleic acid-templated transcription,6.766247,8.142000e-25,143.0,114.0,69.0
sulfur compound biosynthetic process,6.690520,2.883961e-15,85.0,68.0,44.0
amide biosynthetic process,6.037490,3.428364e-29,187.0,145.0,88.0
peptide metabolic process,5.895077,1.094698e-28,188.0,145.0,88.0
cellular amide metabolic process,5.846131,5.978251e-29,191.0,147.0,89.0
macromolecule biosynthetic process,4.639342,1.728650e-48,432.0,307.0,188.0


## RAG - Enriched vs Unenriched - MF

In [None]:
(enrichment_df('RAG_Enriched', 'RAG_Unenriched', mf_map)
 .query('odds_ratio > 1 and p_value < 0.01')
 .sort_values('odds_ratio', ascending=False))

## RAG - Enriched vs Unenriched - BP

In [None]:
(enrichment_df('RAG_Enriched', 'RAG_Unenriched', bp_map)
 .query('odds_ratio > 1 and p_value < 0.01')
 .sort_values('odds_ratio', ascending=False))

## RAG - Unenriched vs Enriched - MF

In [None]:
(enrichment_df('RAG_Unenriched', 'RAG_Enriched', mf_map)
 .query('odds_ratio > 1 and p_value < 0.01')
 .sort_values('odds_ratio', ascending=False))

## RAG - Unenriched vs Enriched - BP

In [None]:
(enrichment_df('RAG_Unenriched', 'RAG_Enriched', bp_map)
 .query('odds_ratio > 1 and p_value < 0.01')
 .sort_values('odds_ratio', ascending=False))