**5/3/21**

The purpose of this notebook is to analyze the functions of the bacterial and human proteins that were identified and then annotated with eggNOG-mapper. I'm interested in two things: which functions are differentially abundant based on BV status, and how the set of identified functions differs between databases populated with public proteins and databases populated with sequencing-based proteins.

In [1]:
from elliot_utils import *
from scipy import stats

In [2]:
# Folder (relative to the current working directory) holding the functional-analysis inputs and outputs
analysisPath = Path.cwd() / 'analysis_files/functional_analysis/'

In [3]:
# '.tsv' result files for each database type, via getOrderedFiles.
# Prefixes: t = TAILORED, h = HYBRID, s = SINGLE, c = COMMUNITY, p = POOLED.
# Presumably each list is ordered by sample so it lines up with BV_STATUS -- confirm in elliot_utils.
tResults, hResults, sResults, cResults, pResults = (
    getOrderedFiles(resultsDir, '.tsv')
    for resultsDir in (
        TAILORED_RESULTS,
        HYBRID_RESULTS,
        SINGLE_RESULTS,
        COMMUNITY_RESULTS,
        POOLED_RESULTS,
    )
)

In [4]:
# Filtered peptides per database type, kept separately for bacterial and human proteins.
# The result lists are passed in the same order as the names on the left: t, h, s, c, p.
resultSets = (tResults, hResults, sResults, cResults, pResults)

# Bacterial peptides
tPeps, hPeps, sPeps, cPeps, pPeps = (
    getFilteredPeptides(resultFiles, 'bacteria') for resultFiles in resultSets
)

# Human peptides (the extra 'H' in the name marks human)
tHPeps, hHPeps, sHPeps, cHPeps, pHPeps = (
    getFilteredPeptides(resultFiles, 'human') for resultFiles in resultSets
)

In [5]:
def processGOList(listString):
    """Split a comma-separated GO string into bare GO numbers.

    Takes text such as 'GO:0000001,GO:0000002' and returns the pieces between
    commas with every 'GO:' prefix removed, e.g. ['0000001', '0000002'].
    Empty pieces (from an empty string or a trailing comma) are kept as ''.
    """
    return [goTerm.replace('GO:', '') for goTerm in listString.split(',')]

def extractFuncs(fastaFile, annotFile):
    """Map every protein ID to the GO numbers eggNOG assigned to it.

    fastaFile: FASTA file listing the proteins of interest.
    annotFile: tab-separated eggNOG annotation table with '#query' and 'GOs' columns.

    Every protein found in the FASTA starts out as ['unannotated'], so proteins
    that eggNOG did not annotate still appear in the output. Rows of the
    annotation table then overwrite that entry: a 'GOs' value of '-' stays
    ['unannotated'], anything else is parsed with processGOList.

    Returns a dictionary in the form key=protein ID, value=list of GO numbers.
    """
    with open(fastaFile, 'r') as database:
        records = database.read().split('\n>')
    # Splitting on '\n>' leaves the leading '>' on the very first record only.
    records[0] = records[0][1:]
    # Protein comes from the project utilities; only its .id attribute is used here.
    goByProtein = {Protein(record).id: ['unannotated'] for record in records}

    annotDF = pd.read_csv(annotFile, sep='\t')
    for _, annotRow in annotDF.iterrows():
        protID = annotRow['#query']
        rawGOs = annotRow['GOs']
        goByProtein[protID] = ['unannotated'] if rawGOs == '-' else processGOList(rawGOs)
    return goByProtein

In [6]:
# Protein ID -> GO numbers, built separately for the bacterial and the human hits
bfuncs = extractFuncs(
    fastaFile=analysisPath / 'all_hit_bacterial_proteins.fasta',
    annotFile=analysisPath / 'eggnog_bacterial_annotations.tsv',
)
hfuncs = extractFuncs(
    fastaFile=analysisPath / 'all_hit_human_proteins.fasta',
    annotFile=analysisPath / 'eggnog_human_annotations.tsv',
)

In [7]:
# Protein reference built from the bacterial hit FASTA.
# NOTE(review): allRef is not referenced by any later cell visible here -- confirm it is still needed.
allRef = ProtRef(analysisPath / 'all_hit_bacterial_proteins.fasta')

In [8]:
def getFunctionRelativeAbundances(results, allowedPeps, typeOfProt, funcRef, transform=True):
    """Get relative abundance values for all annotations across the samples.

    Each file in results is read as tab-delimited text. Every row whose
    peptide (row[PEPTIDE]) is in allowedPeps counts as one spectrum towards
    the sample total, and adds one count to each distinct annotation carried
    by any of that row's protein hits. An annotation's relative abundance in
    a sample is its count divided by the sample total.

    Parameters
    ----------
    results : iterable of objects with an .open(mode='r') method, one per sample.
    allowedPeps : container of peptides to count (tested with `in`).
    typeOfProt : passed through to getProteinHitList to select the protein hits.
    funcRef : dict, key=protein ID, value=list of annotations (e.g. GO numbers).
    transform : if True (default) each value is stored as log2(abundance + 1),
        otherwise the raw fraction is stored.

    Returns
    -------
    dict in the form {'annotation1': [sample_1_abundance, sample_2_abundance, ...], ...}
    with one value per entry of results, in the same order.

    Raises
    ------
    KeyError if a protein hit is not a key of funcRef.

    A sample in which no row is counted (total of 0) reports 0.0 for every
    annotation instead of failing with a ZeroDivisionError.
    """
    # One (initially empty) per-sample list for every annotation known to funcRef.
    abundances = {annot: [] for annotList in funcRef.values() for annot in annotList}
    for res in results:
        total = 0
        counts = dict.fromkeys(abundances, 0)  # spectral count per annotation in this sample
        with res.open(mode='r') as infile:
            for row in csv.reader(infile, delimiter='\t'):
                # Rows typed 'first' are skipped (presumably the header row -- confirm in elliot_utils).
                if determineIDType(row) == 'first':
                    continue
                # Reading stops at the first non-significant row; this assumes the
                # file is sorted so that no significant row follows it.
                if not isSignificant(row):
                    break
                if row[PEPTIDE] not in allowedPeps:
                    continue
                total += 1
                # A set, so an annotation shared by several hits is counted once per row.
                hitAnnots = set()
                for hit in getProteinHitList(row, typeOfProt):
                    hitAnnots.update(funcRef[hit])
                for annot in hitAnnots:
                    counts[annot] += 1
        for annot, sampleValues in abundances.items():
            # Guard against samples with no counted spectra.
            fraction = counts[annot] / total if total else 0.0
            sampleValues.append(math.log2(fraction + 1) if transform else fraction)
    return abundances

In [9]:
def allZeros(numList):
    """Return True if every number in numList equals 0 (also True for an empty list), False otherwise."""
    return all(num == 0 for num in numList)

# Tests whether a group of annotations are significantly differentially abundant using the mann-whitney U test.
# annotDict must be in the format {'annotation':[list of transformed values, corresponding to ordered samples]}
# Returns a list of tuples in the format ('annotation', p-value, BV- avg, BV+ avg)
def test4Significance(annotDict):
    toReturn = []
    for protName, data in annotDict.items():
        negData = []
        posData = []
        for i in range(len(BV_STATUS)):
            if BV_STATUS[i] == '-':
                negData.append(data[i])
            else:
                posData.append(data[i])
        pVal = 1
        if not allZeros(negData) or not allZeros(posData):
            res = stats.mannwhitneyu(negData, posData)
            pVal = res.pvalue
        toReturn.append((protName, pVal, sum(negData)/len(negData), sum(posData)/len(posData)))
    toReturn.sort(key=lambda x: x[1])
    sigCount = 0
    for x in toReturn:
        if x[1] < 0.01:
            sigCount += 1
    print(sigCount)
    return toReturn

def writeSigData2File(sigData, outputPath):
    """Write differential-abundance results for annotations to a csv file.

    sigData: iterable of rows in the order (annotation, p-value, BV- average, BV+ average).
    outputPath: object with an .open(mode, newline) method (e.g. a pathlib.Path);
        an existing file is overwritten.

    A header row is written first, followed by one line per entry of sigData.
    """
    header = ['Annotation', 'p-value', 'BV- Average', 'BV+ Average']
    with outputPath.open(mode='w', newline='') as outfile:
        csvOut = csv.writer(outfile)
        csvOut.writerow(header)
        csvOut.writerows(sigData)

In [10]:
# 16S_Sample-Matched database, bacterial proteins:
# per-sample annotation abundances, then BV- vs BV+ testing (prints the count of significant annotations)
tBacteriaAbundance = getFunctionRelativeAbundances(
    results=tResults,
    allowedPeps=tPeps,
    typeOfProt='bacteria',
    funcRef=bfuncs,
)
tbAnnotData = test4Significance(annotDict=tBacteriaAbundance)

460


In [11]:
# 16S_Sample-Matched database, human proteins:
# per-sample annotation abundances, then BV- vs BV+ testing (prints the count of significant annotations)
tHumanAbundance = getFunctionRelativeAbundances(
    results=tResults,
    allowedPeps=tHPeps,
    typeOfProt='human',
    funcRef=hfuncs,
)
thAnnotData = test4Significance(annotDict=tHumanAbundance)

1881


In [12]:
# 16S_Pooled database (the COMMUNITY results), bacterial proteins:
# per-sample annotation abundances, then BV- vs BV+ testing (prints the count of significant annotations)
cBacteriaAbundance = getFunctionRelativeAbundances(
    results=cResults,
    allowedPeps=cPeps,
    typeOfProt='bacteria',
    funcRef=bfuncs,
)
cbAnnotData = test4Significance(annotDict=cBacteriaAbundance)

386


In [13]:
# 16S_Pooled database (the COMMUNITY results), human proteins:
# per-sample annotation abundances, then BV- vs BV+ testing (prints the count of significant annotations)
cHumanAbundance = getFunctionRelativeAbundances(
    results=cResults,
    allowedPeps=cHPeps,
    typeOfProt='human',
    funcRef=hfuncs,
)
chAnnotData = test4Significance(annotDict=cHumanAbundance)

1713


In [14]:
# Shotgun_Sample-Matched database (the SINGLE results), bacterial proteins:
# per-sample annotation abundances, BV- vs BV+ testing, then export of the full result table
sBacteriaAbundance = getFunctionRelativeAbundances(
    results=sResults,
    allowedPeps=sPeps,
    typeOfProt='bacteria',
    funcRef=bfuncs,
)
sbAnnotData = test4Significance(annotDict=sBacteriaAbundance)
# NOTE(review): this filename capitalises 'Annotations', unlike the hybrid outputs
# ('differential_annotations_...') -- confirm which spelling downstream readers expect.
writeSigData2File(
    sigData=sbAnnotData,
    outputPath=analysisPath / 'differential_Annotations_single_bacteria.csv',
)

411


In [15]:
# Shotgun_Sample-Matched database (the SINGLE results), human proteins:
# per-sample annotation abundances, then BV- vs BV+ testing (prints the count of significant annotations)
sHumanAbundance = getFunctionRelativeAbundances(
    results=sResults,
    allowedPeps=sHPeps,
    typeOfProt='human',
    funcRef=hfuncs,
)
shAnnotData = test4Significance(annotDict=sHumanAbundance)

1935


In [16]:
# Shotgun_Pooled database (the POOLED results), bacterial proteins:
# per-sample annotation abundances, then BV- vs BV+ testing (prints the count of significant annotations)
pBacteriaAbundance = getFunctionRelativeAbundances(
    results=pResults,
    allowedPeps=pPeps,
    typeOfProt='bacteria',
    funcRef=bfuncs,
)
pbAnnotData = test4Significance(annotDict=pBacteriaAbundance)

377


In [17]:
# Shotgun_Pooled database (the POOLED results), human proteins:
# per-sample annotation abundances, then BV- vs BV+ testing (prints the count of significant annotations)
pHumanAbundance = getFunctionRelativeAbundances(
    results=pResults,
    allowedPeps=pHPeps,
    typeOfProt='human',
    funcRef=hfuncs,
)
phAnnotData = test4Significance(annotDict=pHumanAbundance)

1710


In [18]:
# Hybrid_Sample-Matched database, bacterial proteins:
# per-sample annotation abundances, BV- vs BV+ testing, then export of the full result table
hBacteriaAbundance = getFunctionRelativeAbundances(
    results=hResults,
    allowedPeps=hPeps,
    typeOfProt='bacteria',
    funcRef=bfuncs,
)
hbAnnotData = test4Significance(annotDict=hBacteriaAbundance)
writeSigData2File(
    sigData=hbAnnotData,
    outputPath=analysisPath / 'differential_annotations_hybrid_bacteria.csv',
)

479


In [19]:
# Hybrid_Sample-Matched database, human proteins:
# per-sample annotation abundances, BV- vs BV+ testing, then export of the full result table
hHumanAbundance = getFunctionRelativeAbundances(
    results=hResults,
    allowedPeps=hHPeps,
    typeOfProt='human',
    funcRef=hfuncs,
)
hhAnnotData = test4Significance(annotDict=hHumanAbundance)
writeSigData2File(
    sigData=hhAnnotData,
    outputPath=analysisPath / 'differential_annotations_hybrid_human.csv',
)

1903
