# Taxonomy Analysis

This runs the backend of the taxonomy analysis and generates Dataframes that have tax data at Phylum and Family levels.

For each peptide in a sample, it searches the database to find the LCA for that peptide.  If it can be assigned at the Phylum or Family level, then it is assigned for the level and given a weight which is equal to the number of spectral counts for that peptide.

In [1]:
import sys
sys.path.append('..')
import os
from itertools import chain
from collections import defaultdict
import shelve
import numpy as np
import pandas as pd

from tqdm import tqdm
from pymongo import MongoClient

from scripts import utils
from scripts.analysis import build_loci
from scripts.analysis import taxonomy
from scripts.analysis import DBInfo

In [2]:
# Start and test DB connections
dbinfo = DBInfo.DBInfo('compil_mgm')
t = taxonomy.Taxonomy(host="wl-cmadmin", port=27017)
protDB = dbinfo.protDB
seqDB = dbinfo.seqDB
taxDB = MongoClient('wl-cmadmin', 27017)["TaxDB_20151009"]["TaxDB_20151009"]
taxDB.find_one()

{'_id': 2,
 'exact_match': True,
 'matched_organism': 'Paenibacillus phage phiIBB_Pl23',
 'organism': 'Paenibacillus phage phiIBB_Pl23',
 'taxid': 1337877}

In [3]:
# Load data
BASE = '../data'
OUT = BASE + '/tax_data'

samples = shelve.open(os.path.join(BASE, 'samples.shelve'))
protein_clusters = shelve.open(os.path.join(BASE, 'protein_clusters.shelve'))

metadata = pd.read_csv(os.path.join(BASE,"filt_metadata.csv"), index_col=0)
samp_names = list(metadata.index)

met1 = pd.read_csv(os.path.join(BASE, 'filt_metadata.csv'), index_col=0)

unenr_grouped_loci = utils.load(os.path.join(BASE,"unenriched_grouped_loci.pkl.gz"))
enr_grouped_loci = utils.load(os.path.join(BASE,"enriched_grouped_loci.pkl.gz"))
grouped_loci = utils.load(os.path.join(BASE,"grouped_loci.pkl.gz"))

In [4]:
# Filter out unused samples
samps = {k:v for k, v in samples.items() if k in samp_names}

In [5]:
sample_pairs = metadata.reset_index().set_index(['enriched', 'technical']).sort_index().groupby(level=[0,1])

pairs = []
for x, y in sample_pairs:
    pairs.append(list(y['index'].values))

n14_samps = [x[0] for x in pairs]
n15_samps = [x[1] for x in pairs]

n14_un_samps = [x for x in n14_samps if x.startswith('UL_')]
n15_un_samps = [x for x in n15_samps if x.startswith('UL_')]

n14_enr_samps = [x for x in n14_samps if x.startswith('CL_')]
n15_enr_samps = [x for x in n15_samps if x.startswith('CL_')]

In [6]:
ctl_samps = []
for samp_name, samp in samples.items():
    if samp['probe'] == 'DMSO':
        ctl_samps.append(samp_name)

In [7]:
peptides_to_filter = []
for sample in ctl_samps:
    for locus in protein_clusters[sample]:
        if locus.quantification['counts'] >= 10:
            peptides_to_filter += list(locus.peptide_quant.keys())
peptides_to_filter = set(peptides_to_filter)

In [8]:
dfs = dict()
dfs["phylum"] = dict()
dfs["family"] = dict()

In [9]:
sample_pep_quant = defaultdict(dict)
sample_sum = dict()
for sample_name, sample in samps.items():
    # Probably a better way to do this than looping, but I'm pressed for time, Hopefully won't have to re-run lots of times, otherwise a better algortihm should be used
    for pep in tqdm(sample.pep_quant.columns):
        # Calculate the back_calculated counts for a peptide.
        if sample['n15']:
            if not np.isnan(sample.pep_quant.loc['ratio', pep]) and sample.pep_quant.loc['l_spec', pep] != 0:
                back_calc = sample.pep_quant.loc['l_spec', pep] / sample.pep_quant.loc['ratio', pep]
            else:
                back_calc = sample.pep_quant.loc['h_spec', pep]
        else:
            if sample.pep_quant.loc['l_spec', pep] == 0 and not np.isnan(sample.pep_quant.loc['ratio', pep]):
                back_calc = sample.pep_quant.loc['h_spec', pep] * sample.pep_quant.loc['ratio', pep]
            else:
                back_calc = sample.pep_quant.loc['l_spec', pep]
        
        # Store these back counts as quant for the pep
        sample_pep_quant[sample_name].update({pep: back_calc})
    
    sample_sum[sample_name] = sum(sample_pep_quant[sample_name].values())
nf = {k:v/np.median(list(sample_sum.values())) for k,v in sample_sum.items()}

100%|██████████| 1071/1071 [00:00<00:00, 2149.56it/s]
100%|██████████| 1417/1417 [00:00<00:00, 4095.10it/s]
100%|██████████| 1360/1360 [00:00<00:00, 3962.44it/s]
100%|██████████| 1165/1165 [00:00<00:00, 2413.40it/s]
100%|██████████| 2682/2682 [00:00<00:00, 3731.16it/s]
100%|██████████| 1591/1591 [00:00<00:00, 3635.17it/s]
100%|██████████| 1609/1609 [00:00<00:00, 3778.98it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2244.72it/s]
100%|██████████| 1677/1677 [00:00<00:00, 2409.37it/s]
100%|██████████| 2700/2700 [00:00<00:00, 4009.22it/s]
100%|██████████| 2236/2236 [00:00<00:00, 4173.23it/s]
100%|██████████| 1044/1044 [00:00<00:00, 2561.82it/s]
100%|██████████| 2079/2079 [00:00<00:00, 3657.24it/s]
100%|██████████| 1570/1570 [00:00<00:00, 3012.20it/s]
100%|██████████| 2194/2194 [00:00<00:00, 3793.10it/s]
100%|██████████| 1628/1628 [00:00<00:00, 2498.78it/s]
100%|██████████| 1697/1697 [00:00<00:00, 3682.81it/s]
100%|██████████| 1831/1831 [00:00<00:00, 4090.80it/s]
100%|██████████| 948/948 [00

In [10]:
for sample_name, sample in samps.items():
    
    pep_tax = dict()
    peptides = sample.peptides
    
    #Get rid of control peptides if the sample is enriched.
    if sample['enriched']:
        peptides = set(pep for pep in peptides if not pep in peptides_to_filter)
    
    #Do any peptides, by themselves (but restricted to the proteins ID'd in this sample), point to one particular species?
    for peptide in tqdm(peptides):
        p = set(x['i'] for x in seqDB.find_one(peptide)['p']) & sample.prot_ids
        x = list(taxDB.aggregate([{'$match':{'_id':{'$in':list(p)}}},{'$group':{'_id':None,'taxid':{'$addToSet':"$taxid"}}}]))
        if x:
            taxIDs=[y for y in x[0]['taxid'] if y]
            pep_tax[peptide] = taxIDs
    
    all_taxids = set(chain(*pep_tax.values()))
    pep_lca = dict()
    for pep, taxIDs in tqdm(pep_tax.items()):
        pep_lca[pep] = t.LCA(taxIDs)
    
    pep_lcatax = dict()
    all_taxids = set(pep_lca.values())
    taxdict = {x['taxid']:x for x in t.taxonomy_coll.find({'taxid': {'$in': list(all_taxids)}})}
    for pep, lca in tqdm(pep_lca.items()):
        if lca:
            tax = taxdict[lca]
            if tax['rank#'] and tax['rank#']>=16:
                pep_lcatax[pep] = tax
    #
    species_pep = defaultdict(set)
    pep_species = {pep:tax['scientific_name'] for pep,tax in pep_lcatax.items()}
    for pep,species in pep_species.items():
        species_pep[species].add(pep)
            
    species_pep_list = sorted(list(species_pep.items()), key=lambda x:len(x[1]), reverse=True)
    
    with open(os.path.join(OUT,"organims_specific_peptides_{}.csv".format(sample_name)), 'w') as f:
        f.writelines('\n'.join([x[0] + ";" + ",".join(x[1]) for x in species_pep_list]))
    
    # Take every peptide and bring it up to family
    # Quantify family counts
    for rank in ["phylum", "family"]:
        organism_pep = defaultdict(set)
        for pep, lca in tqdm(pep_lca.items()):
            family = t.get_rank(lca, rank, "taxid")
            if family:
                organism_pep[family].add(pep)
        
        organism_quant = {k:sum([sample_pep_quant[sample_name][pep] for pep in peps]) for k,peps in organism_pep.items()}
        
        df = pd.DataFrame(organism_quant, index = ['count']).T
        df['organism_name'] = [t.taxid_to_taxonomy(int(x))['scientific_name'] for x in df.index]
        df=df.sort_values("count",ascending=False)
        
        df.to_csv(os.path.join(OUT,"{}_count_{}.csv".format(rank, sample_name)))
        dfs[rank][sample_name] = df

100%|██████████| 967/967 [00:03<00:00, 316.67it/s]
100%|██████████| 788/788 [00:00<00:00, 1703.79it/s]
100%|██████████| 788/788 [00:00<00:00, 1317827.57it/s]
100%|██████████| 788/788 [00:06<00:00, 113.35it/s]
100%|██████████| 788/788 [00:03<00:00, 243.92it/s]
100%|██████████| 1174/1174 [00:02<00:00, 403.45it/s]
100%|██████████| 1119/1119 [00:00<00:00, 1792.48it/s]
100%|██████████| 1119/1119 [00:00<00:00, 1319861.13it/s]
100%|██████████| 1119/1119 [00:10<00:00, 102.68it/s]
100%|██████████| 1119/1119 [00:04<00:00, 249.17it/s]
100%|██████████| 1360/1360 [00:03<00:00, 431.77it/s]
100%|██████████| 1256/1256 [00:00<00:00, 2803.16it/s]
100%|██████████| 1256/1256 [00:00<00:00, 1292137.80it/s]
100%|██████████| 1256/1256 [00:16<00:00, 74.50it/s]
100%|██████████| 1256/1256 [00:05<00:00, 239.77it/s]
100%|██████████| 1031/1031 [00:02<00:00, 424.54it/s]
100%|██████████| 831/831 [00:00<00:00, 2314.15it/s]
100%|██████████| 831/831 [00:00<00:00, 1415705.37it/s]
100%|██████████| 831/831 [00:07<00:00, 11

In [11]:
for rank in ["phylum", "family"]:
    for sample_name, df in dfs[rank].items():
        df['taxid'] = df.index
        df['sample'] = sample_name
    all_df = pd.concat(list(dfs[rank].values()))
    all_df.to_csv(os.path.join(OUT,"{}_count.csv".format(rank)))