**12/17/21**

The purpose of this notebook is to document construction of a hybrid public/metagenomic database. When looking at results of the different database types, I noticed the individual databases actually identified more bacterial peptides in a few of the samples. So are the bacterial strains present in those samples just not well represented by the sequences in publicly available databases? And would complementing the Tailored databases with sequences from metagenomic sequencing boost bacterial peptide identifications even more?

In [1]:
from elliot_utils import *

In [2]:
# Input FASTA databases: the Tailored databases from the 12-16-21 run and the
# individual databases from the ShotgunMetagenomics directory.
# NOTE(review): later cells index both lists by sample position, so getOrderedFiles is
# presumably returning the files in SAMPLE_NAMES order — confirm in elliot_utils.
tailoredDBs = getOrderedFiles(Path.cwd() / '../12-16-21_NextflowMSGF_Tailored4_Combined/databases/', '.fasta')
individualDBs = getOrderedFiles(Path.cwd() / '../ShotgunMetagenomics/individual_databases/', '.fasta')

In [3]:
def addProtToDict(dictionary, dbFile):
    """Read the proteins in dbFile and add them to dictionary in place.

    Each entry is keyed by its protein sequence. Contaminant proteins get a
    'contaminant_' prefix on the key, so a contaminant is never merged with a
    regular protein that has the identical sequence. A key that is already
    present is left untouched, i.e. the first protein stored for a key wins.

    dictionary -- dict of key (see above) -> Protein object; mutated in place
    dbFile     -- path of the database file to read
    """
    with open(dbFile, 'r') as database:
        # Entries are split on '\n>'; the first chunk then drops its first character
        # (the leading '>' when the file starts directly with an entry header).
        entries = database.read().split('\n>')
        entries[0] = entries[0][1:]
        for entryText in entries:
            prot = Protein(entryText)
            if prot.isContaminant:
                key = f'contaminant_{prot.sequence}'
            else:
                key = prot.sequence
            # Only inserts when the key is absent; existing entries are kept as-is
            dictionary.setdefault(key, prot)

In [4]:
# Build one protein dictionary per sample, reading the tailored database first and the
# individual database second.
# addProtToDict never overwrites an existing key, so when both databases contain the same
# sequence the entry (and annotation) from the tailored database is the one that is kept.
# Contaminants are keyed separately from regular proteins with identical sequences.
proteinHolders = []  # one dict per sample: key=protein sequence, value=protein object
for sampleIdx in range(len(SAMPLE_NAMES)):
    sampleProts = {}
    proteinHolders.append(sampleProts)
    addProtToDict(sampleProts, tailoredDBs[sampleIdx])
    addProtToDict(sampleProts, individualDBs[sampleIdx])

In [5]:
# Directory that holds the initial hybrid databases (one FASTA per sample)
hybridDir = Path.cwd() / '../12-17-21_NextflowMSGF_Combined_Hybrid2/databases/'

In [6]:
# Write the initial Hybrid databases to file, one FASTA per sample.
# Create the output directory first: open(..., 'w') does not create missing directories,
# so without this the cell fails on a fresh checkout / Restart & Run All.
hybridDir.mkdir(parents=True, exist_ok=True)
for i in range(len(SAMPLE_NAMES)):
    # Entries are concatenated with no separator, so each getEntry() result is presumably
    # already newline-terminated — TODO confirm against Protein.getEntry.
    toWrite = [prot.getEntry() for prot in proteinHolders[i].values()]
    # newline='' turns off newline translation, so the entries are written exactly as returned
    with open(hybridDir.joinpath(f'{SAMPLE_NAMES[i]}_Hybrid2.fasta'), 'w', newline='') as output:
        output.write(''.join(toWrite))

In [7]:
# Make the refined hybrid databases
# Inputs for the refinement step: the raw .tsv search results from the hybrid run, and the
# directory handed to collapseRepeatRows for their processed versions.
unprocessed = getOrderedFiles(Path.cwd() / '../12-17-21_NextflowMSGF_Combined_Hybrid2/output/', '.tsv')
processedPath = Path.cwd() / '../12-17-21_NextflowMSGF_Combined_Hybrid2/output_processed/'

In [9]:
# Run collapseRepeatRows on the raw hybrid search results, passing the output_processed
# directory as the second argument; a later cell reads .tsv files back from that directory.
# NOTE(review): presumably merges repeated rows and writes one processed .tsv per input
# file — confirm in elliot_utils.
collapseRepeatRows(unprocessed, processedPath)

In [10]:
# Gather the inputs for building the refined databases: the processed search results, the
# initial hybrid databases, and the directory the refined databases are written to.
processed = getOrderedFiles(processedPath, '.tsv')
hybridDBs = getOrderedFiles(hybridDir, '.fasta')
refinedDBPath = Path.cwd() / '../12-17-21_NextflowMSGF_Combined_Hybrid2/databases_refined/'

In [11]:
# For each result/db pair, pull out all the hit proteins in the result file, identify the
# proteins in the DB, then write them out to a new DB.
# Create the output directory first: open(..., 'w') does not create missing directories,
# so without this the cell fails on a fresh checkout / Restart & Run All.
refinedDBPath.mkdir(parents=True, exist_ok=True)
for i, resultFile in enumerate(processed):
    # Step 1: collect the IDs of every non-decoy protein hit in this result file.
    protsToInclude = set()
    # newline='' is what the csv module documentation prescribes for files given to
    # csv.reader, so the reader does its own newline handling.
    with resultFile.open(mode='r', newline='') as infile:
        reader = csv.reader(infile, delimiter='\t')
        for row in reader:
            # Rows classified as 'first' are skipped (presumably the header row —
            # TODO confirm against determineHitType).
            if determineHitType(row) == 'first':
                continue
            for hit in getProteinHitList(row):
                if 'XXX_' not in hit:  # Ignore decoy hits
                    protsToInclude.add(hit)

    # Step 2: pull the matching proteins out of this sample's hybrid database.
    protObjs = {}  # key=protID, value=protein object
    with open(hybridDBs[i], 'r') as database:
        # Entries are split on '\n>'; the file is closed as soon as it has been read.
        dataList = database.read().split('\n>')
    # Drop the first character of the first chunk (the leading '>' of the first entry)
    dataList[0] = dataList[0][1:]
    for sequence in dataList:
        newProt = Protein(sequence)
        if newProt.id not in protsToInclude:
            continue
        if newProt.id in protObjs:
            # Same ID seen again: keep the first protein object and fold in the taxa
            # of the duplicate entry.
            for taxa in newProt.taxa:
                protObjs[newProt.id].addTaxa(taxa)
        else:
            protObjs[newProt.id] = newProt

    # Step 3: write the refined database for this sample.
    toWrite = [prot.getEntry() for prot in protObjs.values()]
    # newline='' turns off newline translation, so the entries are written exactly as returned
    with open(refinedDBPath.joinpath(f'{SAMPLE_NAMES[i]}_Hybrid2_Refined.fasta'), 'w', newline='') as output:
        output.write(''.join(toWrite))

In [12]:
# Raw .tsv search results from the refined-database run, and the directory handed to
# collapseRepeatRows for their processed versions.
refinedOutput = getOrderedFiles(Path.cwd() / '../12-17-21_NextflowMSGF_Combined_Hybrid2/output_refined/', '.tsv')
refinedOutputProcessedDir = Path.cwd() / '../12-17-21_NextflowMSGF_Combined_Hybrid2/output_refined_processed/'

In [13]:
# Run collapseRepeatRows on the .tsv results of the refined-database search, passing the
# output_refined_processed directory as the second argument.
# NOTE(review): presumably merges repeated rows and writes one processed .tsv per input
# file — confirm in elliot_utils.
collapseRepeatRows(refinedOutput, refinedOutputProcessedDir)