In [1]:
import pandas as pd
import glob
import re

In [2]:
# Never truncate the column list when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = None

In [3]:
# Collect every PSM table in the output folder.
# glob makes no promise about ordering, so sort to keep the concatenation
# order (and therefore the merged row order) the same on every run/machine.
psm_files = sorted(glob.glob("MSFragger_Output/*_psm.tsv"))
psm_files

['MSFragger_Output\\acidovorans_psm.tsv',
 'MSFragger_Output\\baltica_psm.tsv',
 'MSFragger_Output\\cereus_psm.tsv',
 'MSFragger_Output\\cryptum_psm.tsv',
 'MSFragger_Output\\denitrifans_psm.tsv',
 'MSFragger_Output\\gnavus_psm.tsv',
 'MSFragger_Output\\griseorubens_psm.tsv',
 'MSFragger_Output\\indologenes_psm.tsv',
 'MSFragger_Output\\ljungdahlii_psm.tsv',
 'MSFragger_Output\\necator_psm.tsv',
 'MSFragger_Output\\palustris_psm.tsv',
 'MSFragger_Output\\smegmatis_psm.tsv',
 'MSFragger_Output\\thermosulf_psm.tsv',
 'MSFragger_Output\\tumefaciens_psm.tsv']

In [4]:
# Collect every run table in the output folder.
# glob makes no promise about ordering, so sort to keep the concatenation
# order the same on every run/machine.
run_files = sorted(glob.glob("MSFragger_Output/*_run.tsv"))
run_files

['MSFragger_Output\\tumefaciens_run.tsv']

In [5]:
def extract_scannum(Spectrum):
    """Pull a scan number out of each spectrum identifier.

    For every entry, the first run of digits enclosed by two dots
    (e.g. ``.01342.`` in ``run_A.01342.01342.2``) is converted to ``int``.

    Parameters
    ----------
    Spectrum : iterable
        Spectrum identifiers, e.g. the ``Spectrum`` column of a PSM table.

    Returns
    -------
    list
        One entry per input element, in input order: the scan number as
        ``int``, or ``None`` when the element is not a string or contains no
        ``.<digits>.`` section.
    """
    # "+" rather than "*": an empty ".." must not count as a scan number
    # (it used to match and then fail in int('') with ValueError).
    pattern = re.compile(r'\.(\d+)\.')
    scans = []
    for s in Spectrum:
        # Non-string entries (e.g. NaN for a missing value) would make
        # re.search raise TypeError, so they are reported as None instead.
        m = pattern.search(s) if isinstance(s, str) else None
        scans.append(int(m.group(1)) if m else None)
    return scans

In [6]:
def combine_run_files(files):
    """Load ``*_run.tsv`` tables and stack them into one DataFrame.

    From each tab-separated file, only rows with a non-null ``best_locs`` are
    kept, and only the columns ``scannum``, ``best_locs`` and ``peptide``.
    An ``Organism`` column is added, holding the file name without its
    ``_run.tsv`` suffix.

    Parameters
    ----------
    files : iterable of str
        Paths to the run tables (e.g. the result of ``glob.glob``).

    Returns
    -------
    pandas.DataFrame
        Columns ``scannum``, ``best_locs``, ``peptide``, ``Organism``. Each
        file's original row labels are preserved, so the index may repeat.

    Raises
    ------
    ValueError
        If ``files`` yields no paths.
    """
    suffix = '_run.tsv'
    keeps = ['scannum', 'best_locs', 'peptide']
    dfs = []
    for file in files:
        # glob returns "\"-separated paths on Windows and "/"-separated paths
        # elsewhere; take the last component so both (and nested folders) work.
        base = re.split(r'[\\/]', str(file))[-1]
        name = base[:-len(suffix)] if base.endswith(suffix) else base
        df = pd.read_csv(file, sep='\t')
        # .copy() so adding a column below writes to an independent frame
        # instead of a slice of the frame read from disk.
        df = df.loc[df['best_locs'].notnull(), keeps].copy()
        df['Organism'] = name
        dfs.append(df)
    if not dfs:
        raise ValueError("combine_run_files: no run files were given")
    return pd.concat(dfs)


In [7]:
def combine_psm_files(files, min_probability=0.99, exclude_gene='Daci'):
    """Load ``*_psm.tsv`` tables, filter them, and stack them into one DataFrame.

    A row is kept when all of the following hold:

    * ``PeptideProphet Probability`` is at least ``min_probability``;
    * ``Observed Modifications`` is not null;
    * ``Gene`` is not null and does not match ``exclude_gene``.

    A ``scannum`` column is derived from ``Spectrum`` with ``extract_scannum``
    and an ``Organism`` column holds the file name without its ``_psm.tsv``
    suffix.

    Parameters
    ----------
    files : iterable of str
        Paths to the PSM tables (e.g. the result of ``glob.glob``).
    min_probability : float, default 0.99
        Lower bound (inclusive) on ``PeptideProphet Probability``.
    exclude_gene : str or None, default 'Daci'
        Pattern handed to ``Series.str.contains``; rows whose ``Gene`` matches
        are dropped. ``None`` disables this filter.
        NOTE(review): 'Daci' is presumably a locus-tag prefix that should not
        be treated as a gene name - confirm.

    Returns
    -------
    pandas.DataFrame
        Columns ``scannum``, ``Peptide``, ``Observed Modifications``,
        ``Protein ID``, ``Gene``, ``Spectrum``, ``Organism``. Each file's
        original row labels are preserved, so the index may repeat.

    Raises
    ------
    ValueError
        If ``files`` yields no paths.
    """
    suffix = '_psm.tsv'
    keeps = ['scannum', 'Peptide', 'Observed Modifications', 'Protein ID', 'Gene', 'Spectrum']
    dfs = []
    for file in files:
        # glob returns "\"-separated paths on Windows and "/"-separated paths
        # elsewhere; take the last component so both (and nested folders) work.
        base = re.split(r'[\\/]', str(file))[-1]
        name = base[:-len(suffix)] if base.endswith(suffix) else base
        df = pd.read_csv(file, sep='\t')
        mask = (
            (df['PeptideProphet Probability'] >= min_probability)
            & df['Observed Modifications'].notnull()
            & df['Gene'].notnull()
        )
        df = df.loc[mask]
        # Skip the string filter on an empty frame: a Gene column without any
        # strings has no .str accessor.
        if exclude_gene is not None and not df.empty:
            df = df.loc[~df['Gene'].str.contains(exclude_gene)]
        # Independent copy so the column assignments below do not write into
        # a slice of the frame read from disk.
        df = df.copy()
        df['scannum'] = extract_scannum(df['Spectrum'])
        df['Organism'] = name
        dfs.append(df[keeps + ['Organism']])
    if not dfs:
        raise ValueError("combine_psm_files: no PSM files were given")
    return pd.concat(dfs)

In [8]:
# Build the two tables that the merge cell joins: filtered PSMs from every
# *_psm.tsv file and the non-null best_locs rows from every *_run.tsv file.
# NOTE(review): "combinded" is a typo, but the merge cell reads this exact
# name, so both cells must be renamed together.
combinded_psm_df = combine_psm_files(psm_files)
combined_run_df = combine_run_files(run_files)

  if (await self.run_code(code, result,  async_=asy)):


In [9]:
# Inner join of the PSM table with the run table on the two columns they
# share (spelled out here instead of relying on pandas to infer them).
# NOTE(review): scannum is parsed from Spectrum, and the Spectrum values shown
# in the output carry different run labels (..._aerobic_1_..., ..._aerobic_3_...)
# under the same Organism. If scan numbers restart for each raw file, this key
# can pair rows from unrelated spectra - a possible cause of Peptide/peptide
# mismatches. Confirm against the run table's schema.
df = combinded_psm_df.merge(
    combined_run_df,
    how="inner",
    on=["scannum", "Organism"],
)
df

Unnamed: 0,scannum,Peptide,Observed Modifications,Protein ID,Gene,Spectrum,Organism,best_locs,peptide
0,1342.0,HVVPHGDR,Lys(128.094963),A0A176XCR9,valS,Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov...,tumefaciens,HVVPHGDR,HVVPHGDR
1,1392.0,ARDPDEAHAQAR,"Lys(128.094963), TMAB(128.107539)",A0A176XGQ7,ribB,Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov...,tumefaciens,ardPDEAHAQAR,ARDPDEAHAQAR
2,1411.0,DGDRPDRGPR,Arg(156.101111),A0A0X8J2W6,rpsF,Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov...,tumefaciens,dGDRPDRGPR,DGDRPDRGPR
3,1439.0,SHEEDYTHKK,"Amidine(41.026549), Ser->Gln(41.026549)",A0A176XDU2,fusA,Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov...,tumefaciens,sHEEDYTHKK,SHEEDYTHKK
4,1476.0,REYGPGQHGQR,Lys(128.094963),A0A0X8IZF9,rpsD,Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov...,tumefaciens,ReYGPGQHGQR,REYGPGQHGQR
...,...,...,...,...,...,...,...,...,...
11581,50567.0,LSNLQAMLPVLEAVVQTGKPLVIIAEDVEGEALATLVVNK,Val->Thr(1.979265),A0A176XGK2,groEL,Biodiversity_A_tumefaciens_R2A_aerobic_3_23Nov...,tumefaciens,SDLLpilyatatgtldkvqAEWRDDAALTVVLASK,SDLLPILYATATGTLDKVQAEWRDDAALTVVLASK
11582,50603.0,DVVEILAAEGFNIGR,"Ala->Gly(-14.015650), Gln->Asn(-14.015650), Gl...",A0A176WZL0,rplI,Biodiversity_A_tumefaciens_R2A_aerobic_3_23Nov...,tumefaciens,EVPLLANLFGTRQRIEWGLGLETGGLPALgQK,EVPLLANLFGTRQRIEWGLGLETGGLPALGQK
11583,50635.0,GFGFIQPDNGGTDVFVHISAVER,"Asn->Asp(0.984016), Deamidated(0.984016), Gln-...",A0A037XNX0,cspA,Biodiversity_A_tumefaciens_R2A_aerobic_3_23Nov...,tumefaciens,AIAGALTAIMTGVSYATSAEMAGELGPFPGFAPnrDNMLR,AIAGALTAIMTGVSYATSAEMAGELGPFPGFAPNRDNMLR
11584,50717.0,KLPVTSLLMALGMDGEDILSTFYTK,Asp->Met(16.013542),A0A176XCU3,rpoB,Biodiversity_A_tumefaciens_R2A_aerobic_3_23Nov...,tumefaciens,VLSVVGSVASEKAISMIAIHAHGIPHIFPQNVLAEADaAK,VLSVVGSVASEKAISMIAIHAHGIPHIFPQNVLAEADAAK


Questions for Dr. Payne
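1) Peptide vs. peptide discrepancies: in some merged rows, the `Peptide` column (from the PSM files) and the `peptide` column (from the run files) hold different sequences for the same scannum. Why?
2) Missing scannums: some scan numbers appear in only one of the two tables (PSM or run), so they are absent from the merged table. Is this expected?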