In [1]:
import glob
import os
import re

import pandas as pd

In [2]:
# Remove the column limit so wide DataFrames are displayed with every column.
pd.options.display.max_columns = None

In [3]:
# The *_psm.tsv tables must be uncompressed from their .gz archives before
# this cell is run, otherwise the pattern below matches nothing.
# glob returns matches in arbitrary (filesystem-dependent) order, so the list
# is sorted to keep the row order of the combined table reproducible.
psm_files = sorted(glob.glob("Data/MSFragger_Output/*_psm.tsv"))

In [4]:
# glob returns matches in arbitrary (filesystem-dependent) order, so the list
# is sorted to keep the row order of the combined table reproducible.
run_files = sorted(glob.glob("Data/MSFragger_Output/*_run.tsv"))

In [5]:
def extract_scannum(Spectrum):
    """Pull the scan number out of each spectrum identifier.

    The scan number is taken to be the first run of one or more digits that
    is enclosed between two dots (for example ``.02397.`` gives ``2397``).

    Parameters
    ----------
    Spectrum : iterable
        Spectrum identifiers (a list or a pandas Series both work).

    Returns
    -------
    list
        One entry per input item, in order: the scan number as ``int``, or
        ``None`` when the item is not a string or has no dot-enclosed digits.
    """
    # At least one digit is required: with `\d*` a bare ".." would match and
    # int('') would raise ValueError.
    scan_re = re.compile(r'\.(\d+)\.')
    scans = []
    for s in Spectrum:
        # Missing values (e.g. NaN) are not strings and cannot be searched.
        m = scan_re.search(s) if isinstance(s, str) else None
        scans.append(int(m.group(1)) if m else None)
    return scans

In [6]:
def combine_run_files(files):
    """Read tab-separated run tables and stack the rows that have a ``best_locs`` value.

    For every file the rows with a null ``best_locs`` are dropped, only the
    ``scannum``, ``best_locs`` and ``peptide`` columns are kept, ``peptide``
    is renamed to ``Peptide`` and an ``Organism`` column is added.

    Parameters
    ----------
    files : iterable of str
        Paths of the run tables. The organism label is the first
        ``[A-Za-z]+_[a-z]+`` match in the file name (for example
        ``Acidiphilium_cryptum`` from ``Acidiphilium_cryptum_run.tsv``).

    Returns
    -------
    pandas.DataFrame
        Columns ``scannum``, ``best_locs``, ``Peptide``, ``Organism``; the
        per-file row indices are preserved by the concatenation.

    Raises
    ------
    ValueError
        If ``files`` is empty or a file name yields no organism label.
    """
    keeps = ['scannum', 'best_locs', 'peptide']
    frames = []
    for file in files:
        # Match on the file name only: on the full path a directory such as
        # "my_data/" would be picked up as the organism instead.
        m = re.search('[A-Za-z]+_[a-z]+', os.path.basename(file))
        if m is None:
            raise ValueError(
                "Cannot derive an organism name from file name: {!r}".format(file)
            )
        df = pd.read_csv(file, sep='\t')
        # Build a new frame in one chain rather than assigning a column into
        # a filtered slice, which triggers pandas' SettingWithCopyWarning.
        df = (
            df.loc[df['best_locs'].notnull(), keeps]
            .rename(columns={'peptide': 'Peptide'})
            .assign(Organism=m[0])
        )
        frames.append(df)
    if not frames:
        # pd.concat([]) fails with an opaque message; say what is wrong.
        raise ValueError("No run files to combine: the list of files is empty")
    return pd.concat(frames)

In [7]:
def combine_psm_files(files, min_probability=0.99):
    """Read tab-separated PSM tables, filter them and stack the surviving rows.

    A row is kept when its ``PeptideProphet Probability`` is at least
    ``min_probability``, its ``Observed Modifications`` and ``Gene`` values
    are not null, and its ``Gene`` does not contain ``'Daci'``.
    NOTE(review): why 'Daci' genes are excluded is not documented here —
    TODO confirm.

    A ``scannum`` column is derived from ``Spectrum`` via ``extract_scannum``
    and an ``Organism`` column is added.

    Parameters
    ----------
    files : iterable of str
        Paths of the PSM tables. The organism label is the first
        ``[A-Za-z]+_[a-z]+`` match in the file name.
    min_probability : float, optional
        Inclusive lower bound on ``PeptideProphet Probability`` (default 0.99).

    Returns
    -------
    pandas.DataFrame
        Columns ``scannum``, ``Peptide``, ``Observed Modifications``,
        ``Protein ID``, ``Gene``, ``Delta Mass``, ``Organism``; the per-file
        row indices are preserved by the concatenation.

    Raises
    ------
    ValueError
        If ``files`` is empty or a file name yields no organism label.
    """
    keeps = ['scannum', 'Peptide', 'Observed Modifications', 'Protein ID', 'Gene', 'Delta Mass']
    frames = []
    for file in files:
        # Match on the file name only: on the full path a directory such as
        # "my_data/" would be picked up as the organism instead.
        m = re.search('[A-Za-z]+_[a-z]+', os.path.basename(file))
        if m is None:
            raise ValueError(
                "Cannot derive an organism name from file name: {!r}".format(file)
            )
        df = pd.read_csv(file, sep='\t')
        confident_and_annotated = (
            (df['PeptideProphet Probability'] >= min_probability)
            & df['Observed Modifications'].notnull()
            & df['Gene'].notnull()
        )
        df = df[confident_and_annotated]
        # Nulls are already gone; astype(str) keeps the .str accessor usable
        # even when the Gene column was not read as text.
        df = df[~df['Gene'].astype(str).str.contains('Daci')]
        # assign() returns a new frame, avoiding the SettingWithCopyWarning
        # raised when a column is written into a filtered slice.
        df = df.assign(
            scannum=extract_scannum(df['Spectrum']),
            Organism=m[0],
        )[keeps + ['Organism']]
        frames.append(df)
    if not frames:
        # pd.concat([]) fails with an opaque message; say what is wrong.
        raise ValueError("No PSM files to combine: the list of files is empty")
    return pd.concat(frames)

In [8]:
# Build the two combined tables; the loads are independent of each other.
# The spelling `combinded_psm_df` is what the later merge cell reads.
combined_run_df = combine_run_files(run_files)
combinded_psm_df = combine_psm_files(psm_files)

  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


In [9]:
# Keep only the spectra present in both tables. The join keys are spelled out
# (they are the columns the two tables share) so that a column added to either
# table later cannot silently become part of the join.
df = combinded_psm_df.merge(
    combined_run_df,
    how='inner',
    on=['scannum', 'Peptide', 'Organism'],
)
# The row index is written as the first, unnamed column of the TSV.
df.to_csv('Data/Modified_peptides.tsv', sep='\t')

In [10]:
# Display the merged table as this cell's output.
df

Unnamed: 0,scannum,Peptide,Observed Modifications,Protein ID,Gene,Delta Mass,Organism,best_locs
0,2397,AEATHPAPAESGNGAEGGK,Xlink:EGS[115](115.026943),A5FZF9,secB,115.0261,Acidiphilium_cryptum,AEATHPAPAESGNGAEGGk
1,2415,ASGAGGQHVNKTESAVR,"Asn->Gln(14.015650), Asp->Glu(14.015650), Gly-...",A5FX99,prfA,14.0112,Acidiphilium_cryptum,ASGAggqHVNKTESAVR
2,2457,KASAAKSTTAAAPK,"Ala->Asn(43.005814), Carbamyl(43.005814)",A5FUV4,Acry_0158,43.0001,Acidiphilium_cryptum,KASAAkSTTAAAPK
3,2571,KQPNKATADAVK,"Ala->Asn(43.005814), Carbamyl(43.005814)",A5FZT4,Acry_1915,43.0047,Acidiphilium_cryptum,KqPNKATADAVK
4,2582,PAAQHVGAAPK,Bacillosamine(228.111007),A5FYI2,Acry_1456,228.1204,Acidiphilium_cryptum,PAAQHVGAAPK
...,...,...,...,...,...,...,...,...
102718,38808,NMITGAAQMDGAILVVSAADGPMPQTR,Pro->HAVA(18.010565),A0A1R0IKZ6,tuf,18.0100,Sulfobacillus_thermosulfidooxidans,NMITGAAQMDGAIlVVSAADGPMPQTR
102719,38815,DNQLVDKIQTDLEFVIQTIEGSK,Xle->Ser(-26.052036),A0A2T2WQR6,atpH,-26.0469,Sulfobacillus_thermosulfidooxidans,DNQLVdkiqtdlEFVIQTIEGSK
102720,38830,FLAPLNTPLDMLDEALNILEK,"Met->Phe(16.027929), Methyl:2H(2)(16.028204), ...",A0A2T2X5Q4,gabT,16.0358,Sulfobacillus_thermosulfidooxidans,FLAPLNTPLDmldEALNILEK
102721,38836,FLAPLNTPLDMLDEALNILEK,"Asn->Gln(14.015650), Asp->Glu(14.015650), Gly-...",A0A2T2X5Q4,gabT,14.0097,Sulfobacillus_thermosulfidooxidans,FLAPLNTPLDMLdEALNILEK
