# Prep for other enrichment Algorithms (in Bioconductor [R])

In [2]:
import sys
sys.path.append('..')
import os
from itertools import chain
import shelve
from tqdm import tqdm
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from adjustText import adjust_text

from scripts import utils
from scripts.analysis import build_loci
from scripts.analysis.DBInfo import DBInfo

BASE = '../data'


db_info = DBInfo("compil_mgm")
metadata = build_loci.read_metadata(os.path.join(BASE,"metadata.csv"))

In [3]:
samples = shelve.open(os.path.join(BASE,"Samples.shelve"))

unenr_keys = []
for key, sample in samples.items():
    if sample['quant'] and not sample['enriched']:
        unenr_keys.append(key)
unenr_keys = sorted(unenr_keys, key=lambda x: (x.split('_')[2], x.split('_')[-1], x.split('_')[-2]=='N'))

enr_keys = []
for key, sample in samples.items():
    if sample['quant'] and sample['enriched'] and sample['probe'] == 'CMK':
        enr_keys.append(key)
enr_keys = sorted(enr_keys, key=lambda x: (x.split('_')[2], x.split('_')[-1], x.split('_')[-2]=='N'))

In [None]:
unenr_grouped_loci = utils.load(os.path.join(BASE,"unenriched_grouped_loci.pkl.gz"))
enr_grouped_loci = utils.load(os.path.join(BASE,"enriched_grouped_loci.pkl.gz"))
grouped_loci = utils.load(os.path.join(BASE,"grouped_loci.pkl.gz"))

annotations = {locus.cluster_id:locus.annotations['go'] for locus in grouped_loci if 'go' in locus.annotations}

In [9]:
from scripts.analysis import gsea
g = gsea.GSEA()

load obo file ../scripts/go-basic.obo
46933 nodes imported


In [5]:
def make_gmt(annotations, name):
    go_df = pd.DataFrame.from_dict(annotations, orient='index')
    go_df[-1] = go_df.index.map(lambda i: g.go_ontology[i].name)
    go_df = go_df.sort_index(axis=1)
    go_df.to_csv(name+'.gmt', sep='\t', header=False)

In [6]:
def make_gene2goMap(grouped_loci, name):
    annotations = {locus.cluster_id:locus.annotations['go'] for locus in grouped_loci if 'go' in locus.annotations}
    with open(os.path.join(BASE, name+'.map'), 'w') as fout:
        for locus, terms in annotations.items():
            fout.write("{}\t".format(locus))
            for i, term in enumerate(terms):
                if i == 0:
                    fout.write('{}'.format(term))
                else:
                    fout.write(',{}'.format(term))
            fout.write('\n')

In [7]:
make_gene2goMap(grouped_loci, 'clusterID2GO')

In [10]:
make_gmt(annotations, 'test')

KeyError: 62032555

In [11]:
samp_names = list(unenr_keys+enr_keys)

met1 = metadata[samp_names].T

grpd = met1.groupby(['enriched', 'n15'])

for grp, data in grpd:
    for i, samp in enumerate( data.T ):
        met1.loc[samp, 'technical'] = i+1

met1 = met1.drop(['census', 'comb_dta', 'h_dta', 'l_dta', 'path'], axis=1)
met1 = met1.sort_values(['enriched', 'n15', 'technical'])
met1.to_csv(os.path.join(BASE, 'filt_metadata.csv'))

In [12]:
loci = defaultdict(dict)
for cluster in grouped_loci:
    for samp, values in cluster.quantification.items():
        loci[cluster.cluster_id].update({samp: int(np.round(values['back_calc']))})

count_df = pd.DataFrame(loci).T.fillna(0)
count_df = count_df.T.reindex(met1.index).T

count_df.to_csv(os.path.join(BASE,'counts.csv'))

In [13]:
def get_cluster_pvals(grouped_loci):
    loci = defaultdict(dict)
    for locus in grouped_loci:
        loci[locus.cluster_id].update({'ratio': locus.avg_ratio, 'p_value': locus.p_value})
    return(pd.DataFrame(loci).T.dropna())

In [14]:
get_cluster_pvals(unenr_grouped_loci).to_csv(os.path.join(BASE, 'unenriched_pvals.csv'))
get_cluster_pvals(enr_grouped_loci).to_csv(os.path.join(BASE, 'enriched_pvals.csv'))

In [15]:
build_loci.get_annotation_df(unenr_grouped_loci).to_csv(os.path.join(BASE, 'unenriched_annot.csv'))
build_loci.get_annotation_df(enr_grouped_loci).to_csv(os.path.join(BASE, 'enriched_annot.csv'))
build_loci.get_annotation_df(grouped_loci).to_csv(os.path.join(BASE, 'loci_annot.csv'))

### Info to determine what sample groups show up in which loci

This will allow for fihser test comparisons of enriched vs unenriched samples in topGO

In [16]:
# Assign groups to each locus
groups = {'RT-Enriched': rt_enr_keys, 'RAG-Enriched': rag_enr_keys, 'RT-Unenriched': rt_un_keys, 'RAG-Unenriched': rag_un_keys}

for locus in grouped_loci:
    locus.group = []
    samples_in_locus = {sample for sample, quant in locus.quantification.items() if (quant['ratio'] > 0 or quant['counts'] >= 5)}
    for group, members in groups.items():
        if set(members) & samples_in_locus:
            locus.group.append(group)

NameError: name 'rt_enr_keys' is not defined

In [56]:
groups = dict()
for locus in grouped_loci:
    groups.update({locus.cluster_id: {"RT_Enriched": "RT-Enriched" in locus.group,
                   "RAG_Enriched": "RAG-Enriched" in locus.group, "RT_Unenriched": "RT-Unenriched" in locus.group,
                   "RAG_Unenriched": "RAG-Unenriched" in locus.group}})

In [57]:
group_df = pd.DataFrame.from_dict(groups).T

In [58]:
group_df.query('~RT_Unenriched and RT_Enriched')

Unnamed: 0,RAG_Enriched,RAG_Unenriched,RT_Enriched,RT_Unenriched
115324,True,False,True,False
407882,True,False,True,False
1967039,True,False,True,False
2006234,False,False,True,False
2026093,False,False,True,False
2118852,False,False,True,False
3525060,False,False,True,False
3590912,False,False,True,False
3604714,False,False,True,False
3713116,True,False,True,False


In [59]:
group_df.to_csv(os.path.join(BASE,'groups.csv'))