**12/16/21**

The purpose of this notebook is to build Tailored databases that include contaminant sequences, then refine them.

In [2]:
from elliot_utils import *

In [3]:
# Input locations, all expressed relative to the notebook's current working directory
analysisPath = Path.cwd() / 'analysis_files' / 'tailored_db_building'
bacteriaDir = Path.cwd() / '..' / 'PublicSequences' / 'Combined_AllNCBI_12-21'
humanFile = Path.cwd() / '..' / 'PublicSequences' / 'Human9606_2-6-2019_TrypPigBov.fasta'
# CSV consumed later to map each sample to its list of species
sample2SpeciesFile = analysisPath / 'Tailored0_1.csv'

In [3]:
# Read the whole human + contaminant fasta file into memory as a single string
humanData = humanFile.read_text()

In [4]:
# Build the species -> proteins lookup, one entry per file found in bacteriaDir
protDict = {} # key=species name (file stem with '_' replaced by ' '), value=list of Protein objects built from that file
for fastaPath in bacteriaDir.iterdir():
    with fastaPath.open(mode='r') as handle:
        # Splitting on '\n>' leaves every record except the first without its leading '>'
        records = handle.read().split('\n>')
    # Drop the first character of the first record (presumably its '>' — assumes the file starts with a header)
    records[0] = records[0][1:]
    protDict[fastaPath.stem.replace('_', ' ')] = [Protein(record) for record in records]

In [5]:
# Get sample -> species associations
sample2species = {} # key=sample name, value=list of species that are present in that sample at >0.1% abundance
# newline='' lets the csv module do its own line-ending handling (as its documentation requires);
# utf-8-sig drops a leading byte-order mark if the file has one.
with sample2SpeciesFile.open(mode='r', encoding='utf-8-sig', newline='') as infile:
    reader = csv.reader(infile)
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; row[0] would raise IndexError
            continue
        # The sample name is whatever precedes the first '_' in the first column
        sampleName = row[0].split('_')[0]
        speciesList = []
        for cell in row[1:]:
            if cell == '':
                # First empty cell marks the end of this row's species
                break
            if cell == 'Null':
                # Placeholder entries are skipped but do not end the row
                continue
            speciesList.append(cell)
        sample2species[sampleName] = speciesList

In [4]:
# Directory that holds the per-sample tailored databases for this search run
dbPath = Path.cwd() / '..' / '12-16-21_NextflowMSGF_Tailored4_Combined' / 'databases'

In [45]:
# Write out the initial databases for each sample
import copy  # stdlib; needed so shared Protein objects are not mutated (see below)

for sample, speciesList in sample2species.items():
    dbProteins = {} #key=protein sequence, value=protein object
    # Sequences whose dbProteins entry is already a private copy for this sample
    privateCopies = set()
    for species in speciesList:
        # Raises KeyError if a species named in the CSV has no entry in protDict
        for prot in protDict[species]:
            seq = prot.sequence
            if seq not in dbProteins:
                # First occurrence of this sequence: reuse the object as-is (no mutation yet)
                dbProteins[seq] = prot
                continue
            if seq not in privateCopies:
                # The stored object is the same instance held in protDict, and addTaxa mutates
                # in place (its return value is never used). Without a copy, taxa added for this
                # sample would persist into every later sample's database and into re-runs of
                # this cell. Copy lazily, only for sequences that are actually shared.
                dbProteins[seq] = copy.deepcopy(dbProteins[seq])
                privateCopies.add(seq)
            dbProteins[seq].addTaxa(species)
    # Human/contaminant text goes first, followed by one entry per unique sequence
    toWrite = [humanData]
    toWrite.extend(prot.getEntry() for prot in dbProteins.values())
    with dbPath.joinpath(f'{sample}_TailoredDatabase.fasta').open(mode='w', newline='') as dbfile:
        dbfile.write(''.join(toWrite))

In [6]:
# Locate the inputs needed to refine the tailored dbs for the second round of searching
tailoredDBs = getOrderedFiles(dbPath, '.fasta')
# Raw first-round search results (.tsv) and the directory for their processed versions
unprocessedUnrefined = getOrderedFiles(
    Path.cwd() / '..' / '12-16-21_NextflowMSGF_Tailored4_Combined' / 'output',
    '.tsv',
)
processedUnrefinedDir = Path.cwd() / '..' / '12-16-21_NextflowMSGF_Tailored4_Combined' / 'output_processed'

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'E:\\Proteomics\\jupyter_notebooks\\..\\12-16-21_NextflowMSGF_Tailored4_Combined\\output'

In [11]:
# Post-process the raw first-round search results with the collapseRepeatRows helper.
# NOTE(review): presumably writes processed .tsv files into processedUnrefinedDir, since a later
# cell lists .tsv files from that directory — confirm against the helper's definition.
collapseRepeatRows(unprocessedUnrefined, processedUnrefinedDir)

In [7]:
# Output directory for the refined databases, plus the processed first-round results to refine from
refinedDBDir = Path.cwd() / '..' / '12-16-21_NextflowMSGF_Tailored4_Combined' / 'databases_refined'
processed = getOrderedFiles(processedUnrefinedDir, '.tsv')

NameError: name 'processedUnrefinedDir' is not defined

In [13]:
# Build one refined database per sample, keeping only proteins hit in the first-round search.
# processed, tailoredDBs and SAMPLE_NAMES are indexed in parallel, so they must share one ordering.
for idx, sampleName in enumerate(SAMPLE_NAMES):
    # Collect the IDs of every non-decoy protein that was hit, regardless of significance
    hitIDs = set()
    with processed[idx].open(mode='r') as resultsFile:
        for fields in csv.reader(resultsFile, delimiter='\t'):
            # Rows that determineIDType labels 'first' are skipped (presumably the header row — confirm)
            if determineIDType(fields) == 'first':
                continue
            # Hits containing 'XXX_' are decoys and are left out
            hitIDs.update(hitID for hitID in getProteinHitList(fields, 'all') if 'XXX_' not in hitID)
    # Re-parse this sample's initial database and keep only the proteins that were hit
    with open(tailoredDBs[idx], 'r') as database:
        entries = database.read().split('\n>')
    # The split removed the '>' from every record but the first; drop the first record's leading character too
    entries[0] = entries[0][1:]
    keptProts = {} # key=protID, value=protein object
    for entry in entries:
        candidate = Protein(entry)
        if candidate.id in hitIDs:
            keptProts[candidate.id] = candidate
    # Write the refined database for this sample
    with open(refinedDBDir.joinpath(f'{sampleName}_Tailored4_refined.fasta'), 'w', newline='') as output:
        output.write(''.join(kept.getEntry() for kept in keptProts.values()))

In [8]:
# Raw results of the second-round (refined database) search, and where their processed versions go
refinedUnprocessed = getOrderedFiles(
    Path.cwd() / '..' / '12-16-21_NextflowMSGF_Tailored4_Combined' / 'output_refined',
    '.tsv',
)
refinedProcessedDir = Path.cwd() / '..' / '12-16-21_NextflowMSGF_Tailored4_Combined' / 'output_refined_processed'

In [9]:
# Post-process the raw refined-search results with the collapseRepeatRows helper.
# NOTE(review): presumably writes its output into refinedProcessedDir, mirroring the
# first-round call — confirm against the helper's definition.
collapseRepeatRows(refinedUnprocessed, refinedProcessedDir)