# Prep for other enrichment Algorithms (in Bioconductor [R]): topGO (and maybe deseq2)

In [1]:
import sys
sys.path.append('..')
import os
from itertools import chain
from collections import defaultdict
import shelve
from tqdm import tqdm
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from adjustText import adjust_text

from scripts import utils
from scripts.analysis import build_loci
from scripts.analysis.DBInfo import DBInfo

# Directory that the input files are read from and the exported files are written to.
BASE = '../data'


# NOTE(review): db_info is not referenced again in the cells visible here.
db_info = DBInfo("compil_mgm")
# Metadata table; its column labels are used as the sample names below.
metadata = build_loci.read_metadata(os.path.join(BASE,"metadata.csv"))

In [2]:
# Sample names are the column labels of the metadata table.
samp_names = [*metadata.columns]
samp_names

['UL_Rg_1016_N_1',
 'UL_Rg_1019_N_1',
 'UL_Rg_1021_N_1',
 'UL_Mix_1021_1',
 'UL_Mix_1021_N_1',
 'UL_Mix_1019_1',
 'UL_Mix_1019_N_1',
 'UL_Mix_1016_1',
 'UL_Mix_1016_N_1',
 'US_Mix_1021_1',
 'US_Mix_1021_N_1',
 'US_Mix_1019_1',
 'US_Mix_1019_N_1',
 'US_Mix_1016_1',
 'US_Mix_1016_N_1',
 'UL_Mix_1121_1',
 'UL_Mix_1121_N_1',
 'US_Mix_1121_1',
 'US_Mix_1121_N_1',
 'UL_Mix_1121_2',
 'UL_Mix_1121_N_2',
 'AL_Mix_1120_1',
 'AL_Mix_1120_N_1',
 'CL_Mix_1120_1',
 'CL_Mix_1120_N_1',
 'UL_Mix_1121_3',
 'UL_Mix_1121_N_3',
 'UL_Mix_1121_4',
 'UL_Mix_1121_N_4',
 'UL_Mix_1021_2',
 'UL_Mix_1021_N_2',
 'UL_Mix_1111_1',
 'UL_Mix_1111_N_1',
 'UL_Rg_1111_N_1',
 'UL_Mix_1111_2',
 'UL_Mix_1111_N_2',
 'UL_Mix_1111_3',
 'UL_Mix_1111_N_3',
 'CL_Mix_1111_1',
 'CL_Mix_1111_N_1',
 'AL_Tc_N14_1',
 'UL_Mix_Pool_1',
 'UL_Mix_Pool_N_1',
 'UL_Tc_Pool_1',
 'CL_Mix_Pool_1',
 'CL_Mix_Pool_N_1',
 'DL_Mix_Pool_1',
 'DL_Mix_Pool_N_1',
 'AL_Mix_Pool_1',
 'AL_Mix_Pool_N_1',
 'UL_Tc_Pool_2',
 'CL_Mix_Pool_2',
 'CL_Mix_Pool_N_2',


In [3]:
# Load the three saved collections of grouped loci from the data directory.
unenr_grouped_loci = utils.load(os.path.join(BASE, "unenriched_grouped_loci.pkl.gz"))
enr_grouped_loci = utils.load(os.path.join(BASE, "enriched_grouped_loci_filt1.pkl.gz"))
grouped_loci = utils.load(os.path.join(BASE, "grouped_loci_filt1.pkl.gz"))

# Map each cluster id to its GO annotation; loci without a 'go' entry are left out.
annotations = dict(
    (locus.cluster_id, locus.annotations['go'])
    for locus in grouped_loci
    if 'go' in locus.annotations
)

In [4]:
# Every sample name that has a quantification entry in at least one unenriched locus.
unenr_keys = {
    samp
    for pc in unenr_grouped_loci
    for samp in pc.quantification.keys()
}
unenr_keys

{'UL_Mix_1111_1',
 'UL_Mix_1111_2',
 'UL_Mix_1111_3',
 'UL_Mix_1111_N_1',
 'UL_Mix_1111_N_2',
 'UL_Mix_1111_N_3',
 'UL_Mix_1121_1',
 'UL_Mix_1121_2',
 'UL_Mix_1121_3',
 'UL_Mix_1121_4',
 'UL_Mix_1121_N_1',
 'UL_Mix_1121_N_2',
 'UL_Mix_1121_N_3',
 'UL_Mix_1121_N_4',
 'UL_Mix_Pool_1',
 'UL_Mix_Pool_N_1'}

In [5]:
# Every sample name that has a quantification entry in at least one enriched locus.
enr_keys = {
    samp
    for pc in enr_grouped_loci
    for samp in pc.quantification.keys()
}
enr_keys

{'CL_Mix_Pool_1',
 'CL_Mix_Pool_2',
 'CL_Mix_Pool_3',
 'CL_Mix_Pool_4',
 'CL_Mix_Pool_5',
 'CL_Mix_Pool_N_1',
 'CL_Mix_Pool_N_2',
 'CL_Mix_Pool_N_3',
 'CL_Mix_Pool_N_4',
 'CL_Mix_Pool_N_5'}

In [6]:
def _sample_sort_key(name):
    """Sort key for a sample name split on '_': (third field, last field, whether the next-to-last field is 'N')."""
    fields = name.split('_')
    return (fields[2], fields[-1], fields[-2] == 'N')


# Turn the key sets into lists with a stable, reproducible order.
unenr_keys = sorted(unenr_keys, key=_sample_sort_key)
enr_keys = sorted(enr_keys, key=_sample_sort_key)

## topGO needs a gene2go mapping

The mapping file has one line per gene: the gene ID, a tab, then a comma-separated list of GO terms:  
    Gene1\tGO:1, GO:2, GO:3  
    Gene2\tGO:3, GO:5, GO:7, GO:8, GO:9  
etc.

Will use locus cluster_id numbers for gene names as those are unique identifiers for our protein clusters

In [7]:
def make_gene2goMap(grouped_loci, name):
    """Write a gene-to-GO mapping file called ``<name>.map`` inside ``BASE``.

    Every locus that carries a 'go' annotation produces one line: its
    ``cluster_id``, a tab, then its GO terms joined by commas (no spaces).
    Loci without a 'go' annotation are skipped.
    """
    go_by_cluster = {}
    for locus in grouped_loci:
        if 'go' in locus.annotations:
            go_by_cluster[locus.cluster_id] = locus.annotations['go']

    map_path = os.path.join(BASE, name+'.map')
    with open(map_path, 'w') as fout:
        for cluster_id, terms in go_by_cluster.items():
            joined_terms = ','.join('{}'.format(term) for term in terms)
            fout.write("{}\t{}\n".format(cluster_id, joined_terms))

In [8]:
# Write the cluster-id -> GO-term mapping for all grouped loci to 'clusterID2GO.map'.
make_gene2goMap(grouped_loci, name='clusterID2GO')

### Filter Metadata

Make an uncluttered version of the metadata, removing certain no-longer-needed bits of info (like paths to files that have already been processed).  Also, add a new column numbering the technical replicates.  This will make the info easier to work with in R.

In [9]:
# Keep only the selected samples and flip the table so each sample is a row.
samp_names = unenr_keys + enr_keys
met1 = metadata[samp_names].T

# Number the samples 1..n inside every (enriched, n15) group and store the
# number in the 'technical' column.
grpd = met1.groupby(['enriched', 'n15'])
for grp, data in grpd:
    for replicate, samp in enumerate(data.index, start=1):
        met1.loc[samp, 'technical'] = replicate

# Remove the columns that are no longer needed, order the rows, and export.
unneeded_columns = ['census', 'comb_dta', 'h_dta', 'l_dta', 'path']
met1 = met1.drop(unneeded_columns, axis=1)
met1 = met1.sort_values(['enriched', 'n15', 'technical'])
met1.to_csv(os.path.join(BASE, 'filt_metadata.csv'))

In [10]:
# Group the samples by (enriched, technical) so the members of each pair sit together,
# then pull out the lists of sample-name keys used further down.
by_pair = met1.reset_index().set_index(['enriched', 'technical']).sort_index()
sample_pairs = by_pair.groupby(level=[0,1])

pairs = [list(members['index'].values) for _, members in sample_pairs]
sample_names = list(chain.from_iterable(pairs))

# NOTE(review): assumes every pair lists its N14 sample first and its N15 sample second — confirm.
n14_samps = [pair[0] for pair in pairs]
n15_samps = [pair[1] for pair in pairs]

# 'UL_' names go to the unenriched lists and 'CL_' names to the enriched lists.
n14_un_samps = [name for name in n14_samps if name.startswith('UL_')]
n15_un_samps = [name for name in n15_samps if name.startswith('UL_')]

n14_enr_samps = [name for name in n14_samps if name.startswith('CL_')]
n15_enr_samps = [name for name in n15_samps if name.startswith('CL_')]

In [11]:
# Put the counts into a dataframe in case we decide to use DESeq2.  Uses 'back_calc', in which N15 counts are
# calculated via the MS1 ratio (N14_counts * N15/N14_ratio) when a ratio is available.
loci = defaultdict(dict)
for cluster in grouped_loci:
    for samp, values in cluster.quantification.items():
        # Round to the nearest whole number and store as an int.
        loci[cluster.cluster_id][samp] = int(np.round(values['back_calc']))

# One row per cluster, one column per sample; a cluster missing from a sample gets 0.
count_df = pd.DataFrame(loci).transpose().fillna(0)
# Put the sample columns in the same order as the rows of the filtered metadata.
count_df = count_df.transpose().reindex(met1.index).transpose()

count_df.to_csv(os.path.join(BASE,'counts.csv'))

## Retrieve the p-value for each cluster

topGO can use this for certain tests, such as Fisher's exact test.

In [12]:
def get_cluster_pvals(grouped_loci):
    """Tabulate the average ratio and p-value of each locus cluster.

    Returns a DataFrame indexed by ``cluster_id`` with 'ratio' and 'p_value'
    columns; rows that contain a missing value are dropped.
    """
    stats_by_cluster = {}
    for locus in grouped_loci:
        entry = stats_by_cluster.setdefault(locus.cluster_id, {})
        entry['ratio'] = locus.avg_ratio
        entry['p_value'] = locus.p_value
    table = pd.DataFrame(stats_by_cluster).T
    return table.dropna()

In [13]:
# Export the per-cluster ratio / p-value tables for the unenriched and the enriched loci.
for loci_subset, csv_name in (
    (unenr_grouped_loci, 'unenriched_pvals.csv'),
    (enr_grouped_loci, 'enriched_pvals.csv'),
):
    get_cluster_pvals(loci_subset).to_csv(os.path.join(BASE, csv_name))

In [14]:
# Export the annotation table of each loci collection to its own CSV file.
for loci_subset, csv_name in (
    (unenr_grouped_loci, 'unenriched_annot.csv'),
    (enr_grouped_loci, 'enriched_annot.csv'),
    (grouped_loci, 'loci_annot.csv'),
):
    build_loci.get_annotation_df(loci_subset).to_csv(os.path.join(BASE, csv_name))

### Info to determine which loci show up in which sample groups

This will allow for Fisher test comparisons of enriched vs unenriched samples in topGO

In [15]:
# Assign groups to each locus: a locus belongs to a group when at least one of
# the group's samples is present in it.
groups = {'RT-Enriched': n14_enr_samps, 'RAG-Enriched': n15_enr_samps, 'RT-Unenriched': n14_un_samps, 'RAG-Unenriched': n15_un_samps}
# Build each group's membership set once instead of once per locus.
group_members = {label: set(members) for label, members in groups.items()}

for locus in grouped_loci:
    # A sample counts as present when its ratio is non-zero or it has at least 5 counts.
    samples_in_locus = {
        sample
        for sample, quant in locus.quantification.items()
        if quant['ratio'] != 0 or quant['counts'] >= 5
    }
    locus.group = [
        label
        for label, members in group_members.items()
        if members & samples_in_locus
    ]

In [16]:
# Replace the group -> samples dict with a cluster-id -> membership-flags dict.
groups = {
    locus.cluster_id: {
        "RT_Enriched": "RT-Enriched" in locus.group,
        "RAG_Enriched": "RAG-Enriched" in locus.group,
        "RT_Unenriched": "RT-Unenriched" in locus.group,
        "RAG_Unenriched": "RAG-Unenriched" in locus.group,
    }
    for locus in grouped_loci
}

In [17]:
# One row per cluster id, one boolean column per group.
group_df = pd.DataFrame(groups).transpose()

In [18]:
# Write the group-membership table next to the other exported files.
groups_csv_path = os.path.join(BASE, 'groups.csv')
group_df.to_csv(groups_csv_path)