In [7]:
import pandas as pd
import numpy as np
from glob import glob
import re
import datetime

In [8]:
# import data and select columns
MSF_raw = pd.read_csv('data_old/peptide.tsv', sep='\t')
MM_raw = pd.read_csv('data_old/AllQuantifiedPeptides.tsv', sep='\t')
MQ_raw = pd.read_csv('data_old/evidence.txt', sep='\t')
AP_raw = pd.read_csv("data_old/results_peptides.csv")
MM_all_raw = pd.read_csv("data_old/AllPeptides.psmtsv", sep='\t')

In [9]:
# define modifications dictionary
mods_dict = {
    "cC": "C_carbamidomethylation",
    "C(57.0214)": "C_carbamidomethylation",
    "[Common Fixed:Carbamidomethyl on C]": "C_carbamidomethylation",
    "(Oxidation (M))": "M_oxidation",
    "M(15.9949)": "M_oxidation",
    "oxM": "M_oxidation",
    "[Common Variable:Oxidation on M]": "M_oxidation",
}

In [10]:
# define function for removing duplicates and sorting
def _sort_and_remove_duplicates(protein):
    proteins = protein.split(", ")
    unique_proteins = sorted(set(proteins))
    return ", ".join(unique_proteins)

MSF parser

In [25]:
# define MSF parser
def MSF_parser(df, mapper=mods_dict):
    
    # select and rename cols
    df = df[['Peptide', 'Probability', 'Assigned Modifications', 'Protein']]
    df = df.rename(columns={'Peptide': 'peptide', "Probability": "MSF_probability", 'Assigned Modifications': 'mods_type', 'Protein': 'proteinID'})

    # add mods col 
    df['mods'] = df['mods_type'].apply(lambda x: 'modified' if isinstance(x, str) else 'unmodified')
    
    # remove '-p1', remove duplicates and sort proteinID col
    df['proteinID'] = df['proteinID'].str.replace('-p1', '').apply(_sort_and_remove_duplicates)

    # use mapper to get wished output
    for key, val in mapper.items():
        df['mods_type'] = df['mods_type'].str.replace(key, val)
    df['mods_type'] = df['mods_type'].astype(str)

    # order mods_type
    def _mods_type_sorter(val: str):
        return ', '.join(sorted(val.split(', '), key=lambda el: None if el == 'nan' else int(re.search('^\d+', el).group(0))))
    
    df['mods_type'] = df['mods_type'].apply(_mods_type_sorter)

    # select max values for those with matching keys
    df = df.groupby(['peptide', 'proteinID', 'mods', 'mods_type'], as_index=False)['MSF_probability'].max()

    return df.reset_index(drop=True)

In [26]:
MSF = MSF_parser(MSF_raw)
MSF.head()

Unnamed: 0,peptide,proteinID,mods,mods_type,MSF_probability
0,AEMLYSGESGPDDKYYVGIK,TCSYLVIO_009936-t26_1,modified,3M_oxidation,1.0
1,FLADKFDWDVAEAR,TCSYLVIO_009936-t26_1,unmodified,,0.9999
2,GGIVGGGGGGGGFSR,TcCLB.503575.27:mRNA,unmodified,,0.991
3,GLKPDIPPLDTFLDK,TCSYLVIO_009936-t26_1,unmodified,,0.9969
4,GVIIGEENRPGTPIYNVR,TCSYLVIO_009936-t26_1,unmodified,,1.0


MM parser

In [27]:
# define MM parser
def MM_parser(df, df_all, mapper=mods_dict):

    # add new col from df_all, select and rename cols
    df['MM_score'] = df['Sequence'].map(df_all.set_index('Full Sequence')['Score'])
    df = df[['MM_score', 'Base Sequence', 'Sequence', 'Protein Groups']]
    df = df.rename(columns={'Base Sequence': 'peptide', 'Sequence': 'mods_type', 'Protein Groups': 'proteinID'})

    # add mods col
    df['mods'] = df.apply(lambda row: 'modified' if row['mods_type'] != row['peptide'] else 'unmodified', axis=1)

    # remove '-p1', remove duplicates and sort proteinID col
    df['proteinID'] = df['proteinID'].str.replace('-p1', '').str.replace(' | ', ',').str.replace('|', ',').apply(_sort_and_remove_duplicates)
    
    # order mods type
    def _mods_type_extract(val):
        res = []
        while re.search('(\[[^\[\]]+\])', val):
            m = re.search('(\[[^\[\]]+\])', val)
            res.append(f'{m.start()}{mapper[m.group(1)]}')
            val = val[:m.start()] + val[m.end():]

        if not res:
            return 'nan'
            
        return ', '.join(res)
    
    df['mods_type'] = df['mods_type'].apply(_mods_type_extract)
    
    # select max values for those with matching keys
    df = df.groupby(['peptide', 'proteinID', 'mods_type', 'mods'], as_index=False)['MM_score'].max()

    return df

In [28]:
MM = MM_parser(MM_raw, MM_all_raw)
MM.head()

Unnamed: 0,peptide,proteinID,mods_type,mods,MM_score
0,AEMLYSGESGPDDK,"TcCLB.508169.20:mRNA,TcCLB.508169.20:mRNA,TcCL...",3M_oxidation,modified,10.509
1,AEMLYSGESGPDDK,"TcCLB.508169.20:mRNA,TcCLB.508169.20:mRNA,TcCL...",,unmodified,11.598
2,AEMLYSGESGPDDKYYVGIK,"TcCLB.508169.20:mRNA,TcCLB.508169.20:mRNA,TcCL...",3M_oxidation,modified,13.495
3,AYLPVAESFGFTADLR,"TcCLB.508169.20:mRNA,TcCLB.508169.20:mRNA,TcCL...",,unmodified,11.561
4,DLQEDFMNGAPLK,"TcCLB.508169.20:mRNA,TcCLB.508169.20:mRNA,TcCL...",7M_oxidation,modified,14.344


MQ parser

In [11]:
def MQ_parser(df, mapper=mods_dict):

    # select and rename cols
    df = df[["Sequence", "Modified sequence", "Score", "Proteins"]]
    df = df.rename(
        columns={"Sequence": "peptide", "Score": "MQ_score", "Modified sequence": "mods_type", "Proteins": "proteinID"}
    )

    # remove contaminants
    df = df.drop(df[df["proteinID"].str.contains("CON__")].index)

    # add mods col
    df["mods_type"] = df["mods_type"].str.replace("_", "")
    df["mods"] = df.apply(lambda row: "unmodified" if row["mods_type"] == row["peptide"] else "modified", axis=1)

    # remove '-p1', remove duplicates and sort proteinID col
    df["proteinID"] = df["proteinID"].str.replace("-p1", "").str.replace(";", ", ").apply(_sort_and_remove_duplicates)

    # order mods type
    def _mods_type_extract(val):
        res = []
        while re.search("(\(\w+\s+\([A-Z]\)\))", val):
            m = re.search("(\(\w+\s+\([A-Z]\)\))", val)
            res.append(f"{m.start()}{mapper[m.group(0)]}")
            val = val[: m.start()] + val[m.end() :]

        if not res:
            return "nan"

        return ", ".join(res)

    df["mods_type"] = df["mods_type"].apply(_mods_type_extract)

    # select max values for those with matching keys
    df = df.groupby(["peptide", "proteinID", "mods", "mods_type"], as_index=False)["MQ_score"].max()
    return df

In [12]:
MQ = MQ_parser(MQ_raw)
MQ.head()

Unnamed: 0,peptide,proteinID,mods,mods_type,MQ_score
0,AEMLYSGESGPDDK,"TCSYLVIO_009936-t26_1, TcCLB.508169.20:mRNA, T...",unmodified,,22.486
1,AEMLYSGESGPDDKYYVGIK,"TCSYLVIO_009936-t26_1, TcCLB.508169.20:mRNA, T...",unmodified,,4.7873
2,ALLSLSPR,"TCSYLVIO_010625-t26_1, TcCLB.510407.40:mRNA",unmodified,,44.407
3,ALMDYPEQIR,"TCSYLVIO_009936-t26_1, TCSYLVIO_009940-t26_1, ...",modified,3M_oxidation,49.554
4,ALMDYPEQIR,"TCSYLVIO_009936-t26_1, TCSYLVIO_009940-t26_1, ...",unmodified,,40.239


AA parser

In [33]:
# define AP parser
def AP_parser(df, mapper=mods_dict):

    # select and rename cols
    df = df[['sequence', 'sequence_naked', 'score', 'protein']]
    df = df.rename(columns={'sequence_naked': 'peptide', 'score': 'AP_score', 'sequence': 'mods_type', 'protein': 'proteinID'})

    # add mods col
    df['mods'] = df.apply(lambda row: 'modified' if row['mods_type'] != row['peptide'] else 'unmodified', axis=1)

    # remove '-p1', remove duplicates and sort proteinID col
    df['proteinID'] = df['proteinID'].str.replace('-p1', '').str.replace(',', ', ').apply(_sort_and_remove_duplicates)

    # order mods type
    def _mods_type_extract(val):
        res = []
        while re.search('([a-z]+[A-Z])', val):
            m = re.search('([a-z]+[A-Z])', val)
            res.append(f'{m.start()+1}{mapper[m.group(1)]}')
            val = val[:m.start()] + val[m.end()-1:]

        if not res:
            return 'nan'
            
        return ', '.join(res)
    
    df['mods_type'] = df.mods_type.apply(_mods_type_extract)

    # select max values for those with matching keys
    df = df.groupby(['peptide', 'proteinID', 'mods', 'mods_type'], as_index=False)['AP_score'].max()
    return df 

In [34]:
AP = AP_parser(AP_raw)
AP.head()

Unnamed: 0,peptide,proteinID,mods,mods_type,AP_score
0,AEMLYSGESGPDDK,"TCSYLVIO_009936-t26_1, TcCLB.508169.20:mRNA, T...",modified,3M_oxidation,0.98966
1,AEMLYSGESGPDDKYYVGIK,"TCSYLVIO_009936-t26_1, TcCLB.508169.20:mRNA, T...",modified,3M_oxidation,1.0
2,AEMLYSGESGPDDKYYVGIK,"TCSYLVIO_009936-t26_1, TcCLB.508169.20:mRNA, T...",unmodified,,0.91436
3,ALMDYPEQIR,"TCSYLVIO_009936-t26_1, TcCLB.508169.20:mRNA, T...",modified,3M_oxidation,0.943901
4,ALMDYPEQIR,"TCSYLVIO_009936-t26_1, TcCLB.508169.20:mRNA, T...",unmodified,,0.919905


In [38]:
# merge dataframes
merged = MSF.merge(MM, how='outer', left_on=['peptide', 'mods', 'proteinID', 'mods_type'], right_on=['peptide', 'mods', 'proteinID', 'mods_type']).merge(MQ, how='outer', left_on=['peptide', 'mods', 'proteinID', 'mods_type'], right_on=['peptide', 'mods', 'proteinID', 'mods_type']).merge(AP, how='outer', left_on=['peptide', 'mods', 'proteinID', 'mods_type'], right_on=['peptide', 'mods', 'proteinID', 'mods_type'])

In [39]:
current_date = datetime.datetime.now().strftime("%Y-%m-%d")
merged.to_csv(f'{current_date}_merged.csv', index=False)