**5/26/21**

The purpose of this notebook is to look for differences in relative abundance between BV+ and BV- samples for specific human proteins of interest, using the results of the Hybrid_Sample-Matched database searches.

In [1]:
from elliot_utils import *
from scipy import stats

In [2]:
# Search-result tables (.tsv) and one ProtRef per database file (.fasta).
# NOTE(review): getOrderedFiles presumably returns the files in a fixed, sample-matched order — confirm in elliot_utils.
results = getOrderedFiles(HYBRID_RESULTS, '.tsv')
protRefs = [ProtRef(db) for db in getOrderedFiles(HYBRID_DB, '.fasta')]

In [3]:
# Output locations for this notebook, both resolved against the current working directory.
figPath = Path.cwd() / 'figures/specific_protein_analysis/'
analysisPath = Path.cwd() / 'analysis_files/specific_protein_analysis/'

In [4]:
# Filtered peptide collections for each ID type, human first, then bacteria.
# NOTE(review): getAbundances only needs these to support `peptide in collection` — confirm the exact type in elliot_utils.
humanPeps, bacteriaPeps = (
    getFilteredPeptides(results, idType) for idType in ('human', 'bacteria')
)

In [5]:
# Human proteins of interest.
# key=protein ID corresponding to the protein of interest, value=name of protein of interest
# Every ID appears exactly once and every name is unique: getAbundances keys its per-sample lists by *name*,
# so two IDs sharing one name would be pooled into a single output row.
hInterestProts = {
    'sp|P00441|SODC_HUMAN':'Superoxide dismutase',
    'sp|P23297|S10A1_HUMAN':'S100-A1',
    'sp|Q92817|EVPL_HUMAN':'Envoplakin',
    'sp|O60437|PEPL_HUMAN':'Periplakin',
    'sp|P20930|FILA_HUMAN':'Filaggrin',
    'sp|P22735|TGM1_HUMAN':'Transglutaminase-1',
    'sp|P21980|TGM2_HUMAN':'Transglutaminase-2',
    'sp|Q08188|TGM3_HUMAN':'Transglutaminase-3',
    'sp|Q07283|TRHY_HUMAN':'Trichohyalin',
    'sp|P04035|HMDH_HUMAN':'HMGCR cholesterol synthase',
    'sp|O95477|ABCA1_HUMAN':'Cholesterol transporter',
    'sp|Q9UBM7|DHCR7_HUMAN':'7-dehydrocholesterol reductase',
    'sp|P54840|GYS2_HUMAN':'Glycogen synthase, liver',  # disambiguated from GYS1 below
    'sp|P04746|AMYP_HUMAN':'Alpha-amylase',
    'sp|Q14697|GANAB_HUMAN':'Glucosidase',
    'sp|Q12794|HYAL1_HUMAN':'Hyaluronidase-1',
    'sp|P22894|MMP8_HUMAN':'Matrix metalloproteinase-8',
    'sp|P01833|PIGR_HUMAN':'IgA secretory component',
    'sp|P01009|A1AT_HUMAN':'Serpin A1',
    'sp|P01011|AACT_HUMAN':'Serpin A3',
    'sp|P01040|CYTA_HUMAN':'Cystatin A',
    'sp|P04080|CYTB_HUMAN':'Cystatin B',
    'sp|A8K2U0|A2ML1_HUMAN':'A2ML1',
    'sp|P03973|SLPI_HUMAN':'Antileukoproteinase',
    'sp|P30740|ILEU_HUMAN':'Leukocyte elastase inhibitor',
    'sp|P08246|ELNE_HUMAN':'Neutrophil elastase',
    'sp|P10147|CCL3_HUMAN':'MIP-alpha',
    'sp|P13236|CCL4_HUMAN':'MIP-beta',
    'sp|P04083|ANXA1_HUMAN':'Annexin A1',
    'sp|P07355|ANXA2_HUMAN':'Annexin A2',
    'sp|P12429|ANXA3_HUMAN':'Annexin A3',
    'sp|P09525|ANXA4_HUMAN':'Annexin A4',
    'sp|P08758|ANXA5_HUMAN':'Annexin A5',
    'sp|P08133|ANXA6_HUMAN':'Annexin A6',
    'sp|P20073|ANXA7_HUMAN':'Annexin A7',
    'sp|P13928|ANXA8_HUMAN':'Annexin A8',
    'sp|O76027|ANXA9_HUMAN':'Annexin A9',
    'sp|Q9UJ72|ANX10_HUMAN':'Annexin A10',
    'sp|P50995|ANX11_HUMAN':'Annexin A11',
    'sp|P27216|ANX13_HUMAN':'Annexin A13',
    'sp|P01584|IL1B_HUMAN':'IL-1 beta',
    'sp|P60568|IL2_HUMAN':'IL-2',
    'sp|P08700|IL3_HUMAN':'IL-3',
    'sp|P05112|IL4_HUMAN':'IL-4',
    'sp|P05113|IL5_HUMAN':'IL-5',
    'sp|P05231|IL6_HUMAN':'IL-6',
    'sp|P13232|IL7_HUMAN':'IL-7',
    'sp|P10145|IL8_HUMAN':'IL-8',
    'sp|P15248|IL9_HUMAN':'IL-9',
    'sp|P22301|IL10_HUMAN':'IL-10',
    'sp|P29459|IL12A_HUMAN':'IL-12',
    'sp|P35225|IL13_HUMAN':'IL-13',
    'sp|Q14005|IL16_HUMAN':'IL-16',
    'sp|Q14116|IL18_HUMAN':'IL-18',
    'sp|Q9NYY1|IL20_HUMAN':'IL-20',
    'sp|Q9HBE4|IL21_HUMAN':'IL-21',
    'sp|Q9GZX6|IL22_HUMAN':'IL-22',
    'sp|Q13007|IL24_HUMAN':'IL-24',
    'sp|Q9H293|IL25_HUMAN':'IL-25',
    'sp|P13807|GYS1_HUMAN':'Glycogen synthase, muscle',  # disambiguated from GYS2 above
    'sp|Q99527|GPER1_HUMAN':'Estrogen receptor',
    'sp|O15118|NPC1_HUMAN':'Intracellular cholesterol transporter',
    'sp|P98088|MUC5A_HUMAN':'Mucin-5AC',
    'sp|Q9HC84|MUC5B_HUMAN':'Mucin-5B',
    'sp|P05109|S10A8_HUMAN':'Calprotectin',
    'sp|P00751|CFAB_HUMAN':'Complement factor B',  # accession P00751 / CFAB is factor B
    'sp|P14174|MIF_HUMAN':'Migration inhibitory factor',
    'sp|P61626|LYSC_HUMAN':'Lysozyme C',
    'sp|P02788|TRFL_HUMAN':'Lactotransferrin',
    'sp|Q9NQ38|ISK5_HUMAN':'Serine protease inhibitor 5',
    'sp|P07476|INVO_HUMAN':'Involucrin',
    'sp|P23490|LORI_HUMAN':'Loricrin',
    'sp|Q6XPR3|RPTN_HUMAN':'Repetin',
    'sp|P35321|SPR1A_HUMAN':'Cornifin-A',
    'sp|P22528|SPR1B_HUMAN':'Cornifin-B',
    'sp|P35326|SPR2A_HUMAN':'Small proline-rich protein 2A',
    'sp|Q9UBC9|SPRR3_HUMAN':'Small proline-rich protein 3',
    'sp|P19957|ELAF_HUMAN':'Elafin',
    'sp|P02538|K2C6A_HUMAN':'Keratin 6A',
    'sp|P08779|K1C16_HUMAN':'Keratin 16',
    'sp|P13647|K2C5_HUMAN':'Keratin 5',
    'sp|P02533|K1C14_HUMAN':'Keratin 14',
    'sp|P04264|K2C1_HUMAN':'Keratin 1',
    'sp|P13645|K1C10_HUMAN':'Keratin 10',
    'sp|Q6UWP8|SBSN_HUMAN':'Suprabasin',
    'sp|P08311|CATG_HUMAN':'Cathepsin G',
    'sp|P07339|CATD_HUMAN':'Cathepsin D',
    'sp|P59665|DEF1_HUMAN':'Neutrophil defensin 1',
    'sp|P62805|H4_HUMAN':'Histone H4',
    'sp|Q02413|DSG1_HUMAN':'Desmoglein-1',
    'sp|Q08554|DSC1_HUMAN':'Desmocollin-1',
    'sp|P14923|PLAK_HUMAN':'Plakoglobin',
    'sp|Q15517|CDSN_HUMAN':'Corneodesmosin',
    'sp|P49862|KLK7_HUMAN':'SCCE protease',
    'sp|Q9Y337|KLK5_HUMAN':'SCTE protease',
    'sp|O60911|CATL2_HUMAN':'SCCL protease',
    'sp|P35228|NOS2_HUMAN':'Nitric oxide synthase, inducible',
    'sp|P29475|NOS1_HUMAN':'Nitric oxide synthase, brain',
    'sp|P29474|NOS3_HUMAN':'Nitric oxide synthase, endothelial',
    'sp|P05089|ARGI1_HUMAN':'Arginase',
    'sp|P10451|OSTP_HUMAN':'Osteopontin',
    'sp|P02751|FINC_HUMAN':'Fibronectin',
    'sp|P02461|CO3A1_HUMAN':'Collagen III',
    'sp|P00533|EGFR_HUMAN':'Epidermal Growth Factor Receptor',
    'sp|P01133|EGF_HUMAN':'Pro-epidermal Growth Factor',
    'sp|Q15910|EZH2_HUMAN':'Histone Methyltransferase EZH2',
    'sp|Q9Y6K1|DNM3A_HUMAN':'DNA Methyltransferase 3A',
    'sp|Q9UBC3|DNM3B_HUMAN':'DNA Methyltransferase 3B',
    'sp|P26358|DNMT1_HUMAN':'DNA Methyltransferase 1'
}

In [6]:
# Takes a dictionary in the format key=protein ID corresponding to the protein of interest, value=name of protein of interest
# Returns a dictionary in the format key=protein name, value=list of relative abundance data corresponding to sample
def getAbundances(protDict, allowedPeps, lookFor):
    """Compute, per sample, the relative spectral abundance of each protein of interest.

    protDict:    {protein ID: protein name}. Several IDs may share one name; their counts are pooled under it.
    allowedPeps: collection of peptides; a row is only counted if row[PEPTIDE] is in it.
    lookFor:     ID type a row must have (as reported by determineIDType) to be counted.

    Reads every file in the module-level `results`, in order. Within a file, rows whose ID type is 'first'
    are skipped and reading stops at the first row that is not significant.

    Returns {protein name: [relative abundance for each file in `results`, in the same order]}, where relative
    abundance = (counted rows hitting the protein) / (all counted rows in that file). Every list has exactly
    one entry per file; a file with no counted rows contributes 0 for every protein.
    """
    # De-duplicate names (order preserved) so a name shared by several IDs gets exactly one value per
    # sample; otherwise its list would grow twice as fast and fall out of step with the sample order.
    protNames = list(dict.fromkeys(protDict.values()))
    toReturn = {name: [] for name in protNames}  # key=protein name, value=list of relative abundances, one per sample
    for res in results:
        total = 0  # number of counted rows in this sample (the normalisation denominator)
        counts = {name: 0 for name in protNames}  # key=protein name, value=spectral count for that protein in this sample
        with res.open(mode='r') as infile:
            reader = csv.reader(infile, delimiter='\t')
            for row in reader:
                protType = determineIDType(row)
                if protType == 'first':
                    continue
                if not isSignificant(row):
                    # NOTE(review): stopping here assumes the rows are ordered by significance — confirm.
                    break
                if row[PEPTIDE] in allowedPeps and protType == lookFor:
                    total += 1
                    for hit in getProteinHitList(row, lookFor):
                        if hit in protDict:
                            counts[protDict[hit]] += 1
        for name in protNames:
            # A sample without any counted rows has no denominator; report 0 instead of dividing by zero.
            toReturn[name].append(counts[name] / total if total else 0)
    return toReturn

# Takes in a dictionary in the format key=protein name, value=list of relative abundance data corresponding to sample
# Returns a dictionary in the format key=protein name, value=list of log-transformed relative abundance data
def logTransformDict(inDict):
    """Return a new dict with every value x replaced by log2(x + 1); keys and list order are kept."""
    return {
        protName: [math.log2(abundance + 1) for abundance in abundances]
        for protName, abundances in inDict.items()
    }

In [7]:
# Per-sample relative abundances of each protein of interest (hInterestAbund),
# and the same values after the log2(x + 1) transform (hInterestData).
hInterestAbund = getAbundances(protDict=hInterestProts, allowedPeps=humanPeps, lookFor='human')
hInterestData = logTransformDict(inDict=hInterestAbund)

In [8]:
# Returns true if all the numbers in the list are 0, false otherwise
def allZeros(numList):
    """Return True when every element of numList equals 0 (an empty list counts as all zeros)."""
    return all(value == 0 for value in numList)

# Tests whether a group of proteins are significantly differentially abundant using the mann-whitney U test.
# protDict must be in the format {'protein name':[list of transformed values, corresponding to ordered samples]}
# Returns a list of tuples in the format ('protein name', p-value, BV- transformed avg, BV+ transformed avg, BV- avg, BV+ avg)
def test4Significance(protDict, abundDict):
    toReturn = []
    for protName, data in protDict.items():
        negData = []
        posData = []
        negAbund = []
        posAbund = []
        for i in range(len(BV_STATUS)):
            if BV_STATUS[i] == '-':
                negData.append(data[i])
                negAbund.append(abundDict[protName][i])
            else:
                posData.append(data[i])
                posAbund.append(abundDict[protName][i])
        pVal = 1
        if not allZeros(negData) or not allZeros(posData):
            res = stats.mannwhitneyu(negData, posData)
            pVal = res.pvalue
        toReturn.append((protName, pVal, sum(negData)/len(negData), sum(posData)/len(posData), sum(negAbund)/len(negData), sum(posAbund)/len(posData)))
    toReturn.sort(key=lambda x: x[1])
    return toReturn

In [9]:
# One result tuple per protein of interest, sorted by p-value (see test4Significance).
hInterestStats = test4Significance(protDict=hInterestData, abundDict=hInterestAbund)

In [10]:
# Write one CSV row per protein of interest: p-value plus the BV-/BV+ group averages (transformed and untransformed).
statsCsvPath = analysisPath.joinpath('hybrid_human_differential_proteins_untransformed.csv')
# The analysis directory may not exist yet (e.g. on a fresh checkout); create it so the write cannot fail on that.
statsCsvPath.parent.mkdir(parents=True, exist_ok=True)
with statsCsvPath.open(mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['Protein', 'p-value', 'BV- Transformed Average', 'BV+ Transformed Average', 'BV- Average', 'BV+ Average'])
    writer.writerows(hInterestStats)