In [None]:
# --- functional annotation tables (parsed further down into `m` and `n`) ---
ref = '../out/associations/IAI39.tsv'
associated = '../out/associations/associated.eggnogg.tsv'

# --- association summary: its row labels are used as the set of hit OGs ---
summary = '../out/associations/summary_cont_lmm_kmer.tsv'

# --- Roary presence/absence matrix (only the 'IAI39' column is read) ---
roary = '../out/roary/gene_presence_absence.csv'

# --- gene name lookups: one OG table plus two extra name lists ---
hpi = '../data/hpi.tsv'
others = '../data/others.tsv'
names = '../out/associations/associated_ogs.final.tsv'

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('white')

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from goatools.obo_parser import GODag
from goatools.goea.go_enrichment_ns import GOEnrichmentStudy

In [None]:
# Identifiers found in the first column of the summary table; used later
# to restrict the study set to these entries
summary_table = pd.read_csv(summary, sep='\t', index_col=0)
ogs = set(summary_table.index)

In [None]:
# One-letter COG categories to test for enrichment. 'V' (Defense mechanisms)
# and 'W' (Extracellular structures) are included so that every category
# described in `categs` is tested; 'X' (not annotated) is appended separately
# where the categories are iterated.
cogs = 'ABCDEFGHIJKLMNOPQRSTUVWYZ'

In [None]:
# Roary presence/absence matrix indexed by its first column; only the
# column belonging to strain IAI39 is kept
pan = pd.read_csv(
    roary, sep=',', index_col=0, low_memory=False
).loc[:, 'IAI39']

In [None]:
# NOTE: `names` is rebound here from the file path to the loaded table, so
# this cell cannot be re-run without first re-running the cell defining paths
names = pd.read_csv(filepath_or_buffer=names, sep='\t')

In [None]:
# Collapse the table into a plain lookup: 'query' -> 'preferred_og_name'
# (for repeated queries the last row wins)
names = dict(zip(names['query'], names['preferred_og_name']))

In [None]:
# Extend the name lookup with the entries of two additional files.
# Both files are opened through context managers so the handles are closed
# deterministically instead of being left to the garbage collector.

# hpi: whitespace-separated; first field is the key, second one the name
with open(hpi) as handle:
    for line in handle:
        fields = line.split()
        if not fields:
            # tolerate blank lines (e.g. an empty line at the end of the file)
            continue
        names[fields[0]] = fields[1]

# others: tab-separated; every field after the first is a "key,name" pair
with open(others) as handle:
    for line in handle:
        for pair in line.rstrip().split('\t')[1:]:
            parts = pair.split(',')
            names[parts[0]] = parts[1]

COG functional category enrichment
---

CELLULAR PROCESSES AND SIGNALING
- D Cell cycle control, cell division, chromosome partitioning
- M Cell wall/membrane/envelope biogenesis
- N Cell motility
- O Post-translational modification, protein turnover, and chaperones
- T Signal transduction mechanisms
- U Intracellular trafficking, secretion, and vesicular transport
- V Defense mechanisms
- W Extracellular structures
- Y Nuclear structure
- Z Cytoskeleton

INFORMATION STORAGE AND PROCESSING
- A RNA processing and modification
- B Chromatin structure and dynamics
- J Translation, ribosomal structure and biogenesis
- K Transcription
- L Replication, recombination and repair

METABOLISM
- C Energy production and conversion
- E Amino acid transport and metabolism
- F Nucleotide transport and metabolism
- G Carbohydrate transport and metabolism
- H Coenzyme transport and metabolism
- I Lipid transport and metabolism
- P Inorganic ion transport and metabolism
- Q Secondary metabolites biosynthesis, transport, and catabolism

POORLY CHARACTERIZED
- R General function prediction only
- S Function unknown

In [None]:
# One-letter COG category -> human readable description.
# 'X' is not an official COG letter: it labels genes without any category.
categs = {
    # cellular processes and signaling
    'D': 'Cell cycle control, cell division, chromosome partitioning',
    'M': 'Cell wall/membrane/envelope biogenesis',
    'N': 'Cell motility',
    'O': 'Post-translational modification, protein turnover, and chaperones',
    'T': 'Signal transduction mechanisms',
    'U': 'Intracellular trafficking, secretion, and vesicular transport',
    'V': 'Defense mechanisms',
    'W': 'Extracellular structures',
    'Y': 'Nuclear structure',
    'Z': 'Cytoskeleton',
    # information storage and processing
    'A': 'RNA processing and modification',
    'B': 'Chromatin structure and dynamics',
    'J': 'Translation, ribosomal structure and biogenesis',
    'K': 'Transcription',
    'L': 'Replication, recombination and repair',
    # metabolism
    'C': 'Energy production and conversion',
    'E': 'Amino acid transport and metabolism',
    'F': 'Nucleotide transport and metabolism',
    'G': 'Carbohydrate transport and metabolism',
    'H': 'Coenzyme transport and metabolism',
    'I': 'Lipid transport and metabolism',
    'P': 'Inorganic ion transport and metabolism',
    'Q': 'Secondary metabolites biosynthesis, transport, and catabolism',
    # poorly characterized
    'R': 'General function prediction only',
    'S': 'Function unknown',
    # placeholder used in this notebook for genes lacking a COG category
    'X': 'Not annotated',
}

In [None]:
# Background annotation table: the first four lines of the file are skipped
# and the columns are named explicitly instead of relying on a header.
annotation_columns = [
    'query_name', 'seed_eggNOG_ortholog', 'seed_ortholog_evalue',
    'seed_ortholog_score', 'best_tax_level', 'Preferred_name',
    'GOs', 'EC', 'KEGG_ko', 'KEGG_Pathway', 'KEGG_Module',
    'KEGG_Reaction', 'KEGG_rclass', 'BRITE', 'KEGG_TC',
    'CAZy', 'BiGG_Reaction', 'Domain', '?', '??', 'COG_categs',
    'COG_annotations',
]
m = pd.read_csv(ref, sep='\t', skiprows=range(4), header=None)
m.columns = annotation_columns
# the last three rows are discarded (presumably trailer lines written by the
# annotation tool — TODO confirm against the input file)
trailer_rows = m.tail(3).index
m = m.drop(trailer_rows)
# genes without any COG category get the placeholder letter 'X'
m['COG_categs'] = m['COG_categs'].fillna('X')

In [None]:
# Study-set annotation table, parsed with the same layout as the background
# table: four leading lines skipped, no header row, explicit column names.
n = pd.read_csv(associated, sep='\t', header=None, skiprows=range(4))
n.columns = [
    'query_name', 'seed_eggNOG_ortholog', 'seed_ortholog_evalue',
    'seed_ortholog_score', 'best_tax_level', 'Preferred_name',
    'GOs', 'EC', 'KEGG_ko', 'KEGG_Pathway', 'KEGG_Module',
    'KEGG_Reaction', 'KEGG_rclass', 'BRITE', 'KEGG_TC',
    'CAZy', 'BiGG_Reaction', 'Domain', '?', '??', 'COG_categs',
    'COG_annotations',
]
# discard the last three rows of the file
n = n.drop(n.tail(3).index)
# entries lacking a COG category are flagged with the placeholder 'X'
missing_cog = n['COG_categs'].isnull()
n.loc[missing_cog, 'COG_categs'] = 'X'

In [None]:
# Keep only the annotated entries that also appear in the summary table
in_summary = n['query_name'].isin(ogs)
n = n[in_summary]

In [None]:
# COG category enrichment of the study set (`n`) against the background (`m`).
#
# For every category two values are reported:
# - `pvalue`: one-sided Fisher's exact test on the study/background counts
# - `empirical-qvalue`: the observed odds ratio is compared with the odds
#   ratios of random gene sets of the same size drawn from the background

# Number of random gene sets drawn per category
N_RANDOM_SETS = 100
# A single seeded generator is shared by all draws: every draw differs from
# the previous one, yet the whole analysis is reproducible on re-run
rng = np.random.RandomState(42)

res = []
for cog in cogs + 'X':
    # A gene belongs to a category when its (possibly multi-letter) COG
    # string contains the category letter
    in_pop = m['COG_categs'].str.contains(cog)
    in_study = n['COG_categs'].str.contains(cog)

    # Background counts do not depend on the random draws: computed once
    pop_c = int(in_pop.sum())
    pop_n = int((~in_pop).sum())
    study_c = int(in_study.sum())
    study_n = int((~in_study).sum())

    table = [[study_c, pop_c],
             [study_n, pop_n]]
    odds_ratio, pvalue = stats.fisher_exact(table, alternative='greater')

    # empirical null distribution of the odds ratio
    ratios = []
    for _ in range(N_RANDOM_SETS):
        # drawing from the membership mask is equivalent to drawing genes
        # and then checking their category
        drawn = in_pop.sample(n.shape[0], random_state=rng)
        study_r_c = int(drawn.sum())
        study_r_n = int((~drawn).sum())

        table = [[study_r_c, pop_c],
                 [study_r_n, pop_n]]
        ratios.append(stats.fisher_exact(table, alternative='greater')[0])

    # The observed odds ratio is appended last, so index -1 below refers to it.
    # NOTE(review): abs() makes this comparison two-sided while the Fisher
    # test above is one-sided; categories absent from the background give
    # undefined odds ratios and therefore NaN here — confirm this is intended
    zscores = stats.zscore(ratios + [odds_ratio])
    pvalues = stats.norm.sf(abs(zscores))
    qvalues = sm.stats.multipletests(pvalues, alpha=0.05, method='fdr_bh')[1]

    res.append((cog, categs[cog], pvalue, qvalues[-1]))

r = pd.DataFrame(res,
                 columns=['cog', 'category', 'pvalue', 'empirical-qvalue'])

In [None]:
# Benjamini-Hochberg correction of the Fisher p-values across categories
fisher_correction = sm.stats.multipletests(r['pvalue'],
                                           alpha=0.05,
                                           method='fdr_bh')
r = r.assign(qvalue=fisher_correction[1])
# put the corrected value right next to the raw p-value
r = r[['cog', 'category', 'pvalue', 'qvalue', 'empirical-qvalue']]

In [None]:
# Categories passing the 5% threshold on the empirical q-value
r.loc[r['empirical-qvalue'] < 0.05]

In [None]:
# Save the full COG table (all categories) as tab-separated text
r.to_csv('cog.tsv', index=False, sep='\t')

GO term enrichment
---

In [None]:
# Download the Gene Ontology file used below, overwriting any local copy.
# NOTE(review): the URL is not pinned to a release, so the downloaded
# ontology (and therefore the results) may differ between runs.
!wget --quiet -O go-basic.obo "http://purl.obolibrary.org/obo/go/go-basic.obo"

In [None]:
# Parse the downloaded ontology file; the resulting object is handed to the
# enrichment study further down
obodag = GODag("go-basic.obo")

In [None]:
# Map every gene listed in the IAI39 column to the row label (first column
# of the Roary table) it belongs to; a cell may list several tab-separated
# genes. `Series.items()` is used because `iteritems()` was removed in
# pandas 2.0, while `items()` is available in old and new versions alike.
og = {}
for k, v in pan.dropna().items():
    for g in v.split('\t'):
        og[g] = k

In [None]:
# identifier -> set of GO ids, as expected by the enrichment study.
# Background entries are renamed through `og` when a mapping exists,
# otherwise their own identifier is kept.
background_gos = m.set_index('query_name')['GOs'].dropna().to_dict()
assoc = {}
for gene, terms in background_gos.items():
    assoc[og.get(gene, gene)] = set(terms.split(','))

# Study entries are added only when the background did not provide them
study_gos = n.set_index('query_name')['GOs'].dropna().to_dict()
for gene, terms in study_gos.items():
    if gene not in assoc:
        assoc[gene] = set(terms.split(','))

In [None]:
# Population: every identifier with at least one GO annotation.
# Study set: all identifiers of the filtered study table.
population = assoc.keys()
study_set = set(n['query_name'])
go = GOEnrichmentStudy(population, assoc, obodag, methods=['fdr_bh'])
res = go.run_study(study_set)

In [None]:
# Keep the records whose p-value is below 5%.
# NOTE(review): confirm whether get_pvalue() returns the FDR-corrected or
# the uncorrected value for this study configuration.
passing = []
for record in res:
    if record.get_pvalue() < 0.05:
        passing.append(record)

In [None]:
# Tabulate the retained GO records, showing names from the `names` lookup
# instead of raw identifiers where a name is known.
# The loop variable is deliberately not called `go`, so the enrichment study
# object bound to `go` is not overwritten by a result record.
res = []
for record in passing:
    record.study_items = ['%s' % names.get(x, x) for x in record.study_items]
    # the string form of a record is a tab-separated row
    res.append(str(record).split('\t'))
# Column names are asked from a record; with no retained record there is
# nothing to ask, so an empty table is built instead of failing
columns = passing[-1].get_prtflds_default() if passing else None
r = pd.DataFrame(res,
                 columns=columns)

In [None]:
# Show the table of retained GO terms built in the previous cell
r

In [None]:
# Save the GO term table as tab-separated text
r.to_csv('go_terms.tsv', index=False, sep='\t')