**1/5/21**

The purpose of this notebook is to build the "Metagenomic Combined" sequence database that includes all non-redundant protein sequences found in metagenomic sequencing of the samples.

I'll use the gene ID from the "genes.fasta" file as the protein ID (i.e. 'gene_59a6e05c_523aa'), the 'eggNOG_desc' value as the protein's name, and the 'tax_name' value as the organism name. Some of these values can be NaN, which I will substitute with 'Unknown.'

In [1]:
import h5py
import tables
from elliot_utils import *

In [2]:
annotationFile = Path.cwd().joinpath('../ShotgunMetagenomics/annotations.hdf5')
geneFile = Path.cwd().joinpath('../ShotgunMetagenomics/genes.fasta')
combinedDB = Path.cwd().joinpath('../ShotgunMetagenomics/metagenomic_combined.fasta')

In [3]:
annotationDF = pd.read_hdf(annotationFile, '/annot/gene/all')

In [4]:
geneHolder = {} #key = Sequenced gene ID, value = Protein object

# Iterate over the genes file to pull out the gene ID and sequence for each
with open(geneFile, 'r') as dataFile:
    rawText = dataFile.read()
    dataArray = rawText.split('\n')
    del rawText
    for i in range(0, len(dataArray), 2):
        geneID = dataArray[i][1:]
        newProt = Protein('')
        newProt.id = geneID
        newProt.sequence = dataArray[i + 1]
        geneHolder[geneID] = newProt
    del dataArray

In [5]:
# Retrieve protein name, taxa name from hdf5 file for each protein
for i in range(len(annotationDF)):
    annotID = annotationDF.at[i, 'gene']
    geneHolder[annotID].name = annotationDF.at[i, 'eggNOG_desc']
    if type(geneHolder[annotID].name) != str and math.isnan(geneHolder[annotID].name):
        geneHolder[annotID].name = 'Hypothetical protein'
    geneHolder[annotID].taxa = [annotationDF.at[i, 'tax_name']]
    if type(geneHolder[annotID].taxa[0]) != str and math.isnan(geneHolder[annotID].taxa[0]):
        geneHolder[annotID].taxa = ['Unknown']

In [6]:
# Clean up the genes without annotations
for prot in geneHolder.values():
    if prot.taxa == None:
        prot.taxa = ['Unknown']
        prot.name = 'Hypothetical protein'

In [8]:
# Write the combined data out to the new database file
toWrite = []
for prot in geneHolder.values():
    toWrite.append(prot.getFormattedEntry())

In [11]:
with open(combinedDB, 'w', newline='') as output:
    output.write(''.join(toWrite))

In [24]:
### Create the refined Shotgun_Pooled database ###

In [8]:
# Process the initial result files with collapseRepeatRows() from elliot_utils
combinedUnprocessed = getOrderedFiles(Path.cwd().joinpath('../3-8-21_NextflowMSGF_Combined_CombinedMetagenomic/combinedmetagenomic_output/'), '.tsv')
combinedProcessedPath = Path.cwd().joinpath('../3-8-21_NextflowMSGF_combined_CombinedMetagenomic/combinedmetagenomic_output_processed/')

In [9]:
collapseRepeatRows(combinedUnprocessed, combinedProcessedPath)

In [6]:
combinedFolder = Path.cwd().joinpath('../ShotgunMetagenomics/combined/')
combinedHitProts = getHitsInResults(Path.cwd().joinpath('../3-8-21_NextflowMSGF_combined_CombinedMetagenomic/combinedmetagenomic_output_processed/'))
refinedCombinedPath = Path.cwd().joinpath('../4-9-21_NextflowMSGF_Combined_Pooled/database_refined/')
refineHugeDatabase(combinedHitProts, combinedFolder, refinedCombinedPath, 'metagenomicCombined_refined')

Done reading metagenomic_combined.fasta
289179 sequences written.


In [7]:
# Process results of the refined Shotgun_Pooled database
combinedRefinedUnprocessed = getOrderedFiles(Path.cwd().joinpath('../4-9-21_NextflowMSGF_Combined_Pooled/output_combined_refined/'), '.tsv')
refinedCombinedProcessedPath = Path.cwd().joinpath('../4-9-21_NextflowMSGF_Combined_Pooled/output_combined_refined_processed/')
collapseRepeatRows(combinedRefinedUnprocessed, refinedCombinedProcessedPath)

In [12]:
### Prepare individual sequencing databases###

In [3]:
humanFile = Path.cwd().joinpath('../ShotgunMetagenomics/Human9606_2-6-2019_TrypPigBov.fasta')
indDataFolder = Path.cwd().joinpath('../ShotgunMetagenomics/individual')
outputFolder = Path.cwd().joinpath('../ShotgunMetagenomics/individual_databases')

In [4]:
# Get the gene IDs from the individual gene files so they can be matched to the annotated genes
preindGeneFiles = getOrderedFiles(genesFolder, '.faa')

# Make a separate set for each sample to hold the amino acid sequences of each translated protein
preIndHolders = []
for i in range(len(SAMPLE_NAMES)):
    geneHolders.append([])
    with open(preindGeneFiles[i], 'r') as dataFile:
        rawText = dataFile.read()
        dataList = rawText.split('\n>')
        dataList[0] = dataList[0][1:]
        del rawText
        for entry in dataList:
            prelimSeq = entry[entry.find('\n') + 1:].replace('\n', '')
            geneHolders[i].append(prelimSeq.replace('*', ''))

NameError: name 'genesFolder' is not defined

In [42]:
# Get list of gene IDs for each sample
indCSVs = getOrderedFiles(genesFolder, '.csv')
genesInSamples = [] # key=individual gene ID, value=annotated gene ID (can be None)

for i in range(len(SAMPLE_NAMES)):
    genesInSamples.append([])
    with open(indCSVs[i], 'r') as infile:
        reader = csv.reader(infile, delimiter=',')
        header = True
        annotIndex = -1
        for row in reader:
            if header:
                annotIndex = row.index('catalog_gene')
                header = False
                continue
            annotVal = row[annotIndex]
            if not annotVal == '':
                genesInSamples[i].append(annotVal)

In [43]:
# Get annotation data from geneHolder, write that and human/contaminant sequences to database file
humanData = ''
with open(humanFile, 'r') as infile:
    humanData = infile.read()

In [53]:
for i in range(len(SAMPLE_NAMES)):
    toWrite = [humanData]
    for gene in genesInSamples[i]:
        toWrite.append(geneHolder[gene].getFormattedEntry())
    with open(outputFolder.joinpath(f'{SAMPLE_NAMES[i]}_metagenomicIndividual.fasta'), 'w', newline='') as output:
        output.write(''.join(toWrite))

In [5]:
# Process results for Shotgun_Sample-Matched databases
individualUnprocessed = getOrderedFiles(Path.cwd().joinpath('../3-10-21_NextflowMSGF_Combined_IndividualMetagenomic/output/'), '.tsv')
individualProcessedPath = Path.cwd().joinpath('../3-10-21_NextflowMSGF_Combined_IndividualMetagenomic/output_processed/')

In [None]:
collapseRepeatRows(individualUnprocessed, individualProcessedPath)

In [6]:
# Refine Shotgun_Sample-Matched databases
individualProcessed = getOrderedFiles(individualProcessedPath, '.tsv')
individualDBs = getOrderedFiles(outputFolder, '.fasta')

In [7]:
# For each result/db pair, pull out all the hit proteins in the result file, identify the proteins in the DB, then write them out to a new DB
individualRefinedPath = Path.cwd().joinpath('../4-2-21_NextflowMSGF_Combined_Individual/databases_refined/')
for i in range(len(individualProcessed)):
    protsToInclude = set()
    with individualProcessed[i].open(mode='r') as infile:
        reader = csv.reader(infile, delimiter='\t')
        for row in reader:
            protType = determineHitType(row)
            if protType == 'first':
                continue
            hitProts = getProteinHitList(row)
            for hit in hitProts:
                if hit.find('XXX_') == -1: # Ignore decoy hits
                    protsToInclude.add(hit)
    protObjs = {} # key=protID, value=protein object
    with open(individualDBs[i], 'r') as database:
        rawText = database.read()
        dataList = rawText.split('\n>')
        dataList[0] = dataList[0][1:]
        del rawText
        for sequence in dataList:
            newProt = Protein(sequence)
            if newProt.id in protsToInclude:
                if newProt.id in protObjs.keys():
                    for taxa in newProt.taxa:
                        protObjs[newProt.id].addTaxa(taxa)
                else:
                    protObjs[newProt.id] = newProt
    toWrite = []
    for prot in protObjs.values():
        toWrite.append(prot.getEntry())
    with open(individualRefinedPath.joinpath(f'{SAMPLE_NAMES[i]}_metagenomicIndividual_refined.fasta'), 'w', newline='') as output:
        output.write(''.join(toWrite))

In [13]:
# Process results for Shotgun_Sample-Matched Refined databases
individualRefinedUnprocessed = getOrderedFiles(Path.cwd().joinpath('../4-2-21_NextflowMSGF_Combined_Individual/output_refined/'), '.tsv')
individualRefinedProcessedPath = Path.cwd().joinpath('../4-2-21_NextflowMSGF_Combined_Individual/output_refined_processed/')
collapseRepeatRows(individualRefinedUnprocessed, individualRefinedProcessedPath)