In [1]:
from elliot_utils import *
from scipy import stats

In [2]:
# Subfolders (relative to the current working directory) for this notebook's figures and analysis files
figPath = Path.cwd() / 'figures/analyzing_hybrid/'
analysisPath = Path.cwd() / 'analysis_files/analyzing_hybrid/'

In [3]:
# '.tsv' result files for the tailored and hybrid database searches.
# NOTE(review): getOrderedFiles and the *_RESULTS constants are not defined in the visible cells — confirm the ordering they produce.
tailoredResults = getOrderedFiles(Path.cwd() / TAILORED_RESULTS, '.tsv')
hybridResults = getOrderedFiles(Path.cwd() / HYBRID_RESULTS, '.tsv')

In [51]:
# One ProtRef per '.fasta' database file. countAllPsmsTaxa pairs these with the result files by index,
# so the order here is assumed to match tailoredResults / hybridResults — TODO confirm.
tailoredRefs = [ProtRef(dbFile) for dbFile in getOrderedFiles(TAILORED_DB, '.fasta')]
hybridRefs = [ProtRef(dbFile) for dbFile in getOrderedFiles(HYBRID_DB, '.fasta')]

In [4]:
# Allowed-peptide collections for each database type (tailored, hybrid), split by protein origin.
# NOTE(review): getFilteredPeptides is not defined in the visible cells — confirm its filtering criteria at its definition.
tAllowedHumanPeps, hAllowedHumanPeps = (
    getFilteredPeptides(results, 'human') for results in (tailoredResults, hybridResults)
)
tAllowedBacteriaPeps, hAllowedBacteriaPeps = (
    getFilteredPeptides(results, 'bacteria') for results in (tailoredResults, hybridResults)
)

In [5]:
# Counts, for every results file in resultsList, the significant PSMs whose ID type equals lookFor
# and whose peptide is contained in allowedPeps. Returns one count per file, in input order.
def countSpectra(resultsList, lookFor, allowedPeps):
    def _countOne(resultFile):
        tally = 0
        with resultFile.open(mode='r') as handle:
            for record in csv.reader(handle, delimiter='\t'):
                kind = determineIDType(record)
                # Rows typed 'first' are skipped without being tested for significance
                if kind == 'first':
                    continue
                # Reading stops at the first non-significant row, so later rows are never counted
                # (presumably the files are sorted by significance — verify against the search output)
                if not isSignificant(record):
                    break
                if kind != lookFor or record[PEPTIDE] not in allowedPeps:
                    continue
                tally += 1
        return tally

    return [_countOne(resultFile) for resultFile in resultsList]

In [7]:
# Significant PSM counts (one entry per results file) for each database type and protein origin,
# each restricted to the matching allowed-peptide collection
tHumanCount = countSpectra(tailoredResults, 'human', tAllowedHumanPeps)
hHumanCount = countSpectra(hybridResults, 'human', hAllowedHumanPeps)
tBacteriaCount = countSpectra(tailoredResults, 'bacteria', tAllowedBacteriaPeps)
hBacteriaCount = countSpectra(hybridResults, 'bacteria', hAllowedBacteriaPeps)

What is the correlation between the number of genomes available for an organism and the number of new tryptic peptides gained for that organism by performing metagenomic sequencing?

In [14]:
# NOTE(review): this whole cell is commented out. In the visible cells, `trypticSets` is only built here, yet the
# cell that writes public_tryptic_peptides.csv reads it, and `peptideDataDir` is not defined in any visible cell.
# Re-enable this cell (after identifyTrypticPeptides is defined) before re-generating that csv on a fresh kernel.
# Get the set of unique tryptic peptides I have for each organism, collapsing all Gardnerellas into G. vaginalis
#trypticSets = {'Gardnerella vaginalis':set()} #key=species name, value=set of all tryptic peptides for the organism
#for file in peptideDataDir.iterdir():
#    if not file.suffix == '.fasta':
#        continue
#    speciesNameFasta = file.name.replace('_', ' ')
#    speciesName = speciesNameFasta.replace('.fasta', '')
#    if speciesName.find('Gardnerella') != -1:
#        identifyTrypticPeptides(file, trypticSets['Gardnerella vaginalis'])
#    else:
#        trypticSets[speciesName] = set()
#        identifyTrypticPeptides(file, trypticSets[speciesName])

In [20]:
# csv file that holds one row per species with its number of tryptic peptides
publicTrypticCountsFile = analysisPath / 'public_tryptic_peptides.csv'

In [22]:
# It took a while to get the number of tryptic peptides for all species, so I'll write that out to a csv file to make it easier to access.
# The rows are built BEFORE the file is opened: in the visible cells `trypticSets` is only created by the commented-out
# digestion cell, and opening with mode='w' first would truncate the existing csv even though the loop then fails
# with a NameError. Building the rows first makes a missing `trypticSets` fail without touching the file.
publicTrypticRows = [
    # Each row is [species name, number of distinct tryptic peptides]; any '.fasta' left in the name is stripped
    [species.replace('.fasta', ''), str(len(trypticSet))]
    for species, trypticSet in trypticSets.items()
]
with publicTrypticCountsFile.open(mode='w', newline='') as outfile:
    csv.writer(outfile).writerows(publicTrypticRows)

In [15]:
# Reads a deduplicated FASTA file of bacterial proteins and digests every protein in it.
# Every tryptic peptide of length ≥6 amino acids (Swaney, Wenger, & Coon [2010]) is added to the supplied set;
# the set is modified in place and nothing is returned.
def identifyTrypticPeptides(fastaFile, trypticPeptides):
    with open(fastaFile, 'r') as infile:
        rawText = infile.read()
        # Splitting on '\n>' removes the '>' from every entry except the first, so strip that one by hand
        entries = rawText.split('\n>')
        entries[0] = entries[0][1:]
        del rawText
        for entry in entries:
            seq = Protein(entry).sequence
            lastIndex = len(seq) - 1
            fragment = []
            for pos, residue in enumerate(seq):
                fragment.append(residue)
                # Cut at the end of the protein, or after K/R unless the next residue is P.
                # The end-of-sequence test comes first so seq[pos + 1] is never read past the end.
                if pos == lastIndex or (residue in ('K', 'R') and seq[pos + 1] != 'P'):
                    if len(fragment) >= 6:
                        trypticPeptides.add(''.join(fragment))
                    fragment = []

In [25]:
# Annotated gene FASTA from the shotgun metagenomics folder (one level above the working directory), loaded as a ProtRef
shotgunFile = Path.cwd() / '../ShotgunMetagenomics/genes_annotated.fasta'
shotgunReference = ProtRef(shotgunFile)

In [26]:
# Returns a set of all tryptic peptides with length >5 amino acids (by default) in the supplied sequence
def trypticDigest(seq, minLength=6):
    """In-silico tryptic digest of a single protein sequence.

    Cleaves after every K or R that is not immediately followed by P, and
    treats the end of the sequence as a final cleavage point.

    Args:
        seq: Amino-acid sequence as a string of one-letter codes.
        minLength: Shortest peptide (in residues) to keep. Defaults to 6,
            which reproduces the original hard-coded "> 5" cutoff.

    Returns:
        Set of the distinct peptide strings with at least `minLength`
        residues. An empty sequence yields an empty set.
    """
    trypticPeptides = set()
    peptideStart = 0
    lastIndex = len(seq) - 1
    for i, residue in enumerate(seq):
        # The final residue always closes the current peptide; the end-of-sequence
        # test comes first so seq[i + 1] is never read past the end.
        cleaveHere = i == lastIndex or (residue in ('K', 'R') and seq[i + 1] != 'P')
        if not cleaveHere:
            continue
        if i + 1 - peptideStart >= minLength:
            trypticPeptides.add(''.join(seq[peptideStart:i + 1]))
        # The next peptide starts right after the cleavage site, whether or not this one was kept
        peptideStart = i + 1
    return trypticPeptides

In [12]:
# Translation table from the taxa names used in the metagenomic (shotgun) calls to the names used in this analysis.
# Each pair is (shotgun sequencing taxa name, updated taxa name).
shotgun2taxa = dict([
    ('Clostridiales bacterium KA00274', 'BVAB2'),
    ('Coriobacteriales bacterium DNF00809', 'Lachnocurva vaginae'),
    ('Candidatus Saccharibacteria', 'TM7 H1'),
    ('Megasphaera genomosp. type_1', 'Megasphaera lornae'),
    ('Megasphaera genomosp. type_2', 'Megasphaera hutchinsoni'),
])

Which bacteria had the largest increase in the percentage of spectra identified for them when using the hybrid databases?

In [53]:
# Adds per-taxon PSM counts to dictToModify (key=species name, value=number of PSMs attributable to that organism).
# results[i] is read together with protRefs[i]. Only significant bacterial PSMs whose peptide is in allowedPeps are used.
# A PSM adds 1 to every distinct taxon among its protein hits, but only for taxa that are already keys of dictToModify.
# The dictionary is modified in place; nothing is returned.
def countAllPsmsTaxa(dictToModify, results, allowedPeps, protRefs, taxaRef):
    def _canonicalName(rawName):
        # Names listed in taxaRef are translated; any other name containing 'Gardnerella' collapses to G. vaginalis
        if rawName in taxaRef.keys():
            return taxaRef[rawName]
        if rawName.find('Gardnerella') != -1:
            return 'Gardnerella vaginalis'
        return rawName

    for index, resultFile in enumerate(results):
        reference = protRefs[index]
        with resultFile.open(mode='r') as handle:
            for record in csv.reader(handle, delimiter='\t'):
                idType = determineIDType(record)
                if idType == 'first':
                    continue
                # Reading stops at the first non-significant row of the file
                if not isSignificant(record):
                    break
                if idType != 'bacteria' or record[PEPTIDE] not in allowedPeps:
                    continue
                # A set, so a taxon reached through several protein hits is counted once per PSM
                taxaForPsm = {
                    _canonicalName(taxon)
                    for hit in getProteinHitList(record, 'bacteria')
                    for taxon in reference.getProt(hit).taxa
                }
                for taxon in taxaForPsm:
                    if taxon in dictToModify.keys():
                        dictToModify[taxon] += 1

In [64]:
# Start every tracked taxon at zero PSMs for both database types (t = tailored, h = hybrid).
# Tracked taxa are the keys of differenceTryptics followed by the renamed taxa from shotgun2taxa.
tPsms = dict.fromkeys([*differenceTryptics.keys(), *shotgun2taxa.values()], 0)
hPsms = dict.fromkeys(tPsms, 0)

In [65]:
# Reset the tallies first so this cell is safe to re-run: countAllPsmsTaxa adds to the existing values in place,
# so running the cell twice without re-running the initialisation cell would otherwise double every count.
tPsms.update(dict.fromkeys(tPsms, 0))
hPsms.update(dict.fromkeys(hPsms, 0))
# Tally bacterial PSMs per taxon for the tailored and the hybrid database searches
countAllPsmsTaxa(tPsms, tailoredResults, tAllowedBacteriaPeps, tailoredRefs, shotgun2taxa)
countAllPsmsTaxa(hPsms, hybridResults, hAllowedBacteriaPeps, hybridRefs, shotgun2taxa)

In [66]:
# key=species name, value=% increase PSMs for the species 16S -> Hybrid.
# The denominator is (1 + tailored count), so it stays non-zero for taxa with no tailored PSMs;
# as a result the value is slightly smaller in magnitude than a true percent change.
percentChangePsms = {
    taxon: ((hPsms[taxon] - tailoredCount) / (1 + tailoredCount)) * 100
    for taxon, tailoredCount in tPsms.items()
}

In [68]:
# Write the per-taxon percent change in PSMs (16S -> Hybrid databases) to a csv file: a header row, then one row per taxon
taxaPsmChangeFile = analysisPath.joinpath('16s_to_hybrid_psm_change_taxa.csv')
with taxaPsmChangeFile.open(mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['Species', 'Percent Increase PSMs'])
    writer.writerows([taxon, str(change)] for taxon, change in percentChangePsms.items())

In [81]:
# key=species name, value=increase PSMs for the species 16S -> Hybrid (hybrid count minus tailored count)
rawChangePsms = {
    taxon: hPsms[taxon] - tailoredCount
    for taxon, tailoredCount in tPsms.items()
}

In [82]:
# Write the per-taxon raw change in PSM count (16S -> Hybrid databases) to a csv file: a header row, then one row per taxon
taxaPsmRawChangeFile = analysisPath.joinpath('16s_to_hybrid_psm_change_raw_taxa.csv')
with taxaPsmRawChangeFile.open(mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['Species', 'Number Increase PSMs'])
    writer.writerows([taxon, str(difference)] for taxon, difference in rawChangePsms.items())