In [31]:
import pandas as pd

# Numeric values attached to the two modifications this notebook handles.
# NOTE(review): presumably mass deltas in Dalton -- confirm against the search settings.
CARBAMIDOMETHYL = 57.021464
OXIDATION = 15.994915

# Short aliases for the two constants above.
cm = CARBAMIDOMETHYL
ox = OXIDATION

# Residue letter -> modification value rendered as text, which is the form
# that gets embedded in the bracketed peptidoform suffix.
MOD_MAPPING = {
    "C": str(CARBAMIDOMETHYL),
    "M": str(OXIDATION),
}
mm = MOD_MAPPING  # short alias; refers to the very same dict object

In [41]:
def get_athena_peptidoform(
    amanda_row: pd.Series,
    mod_mapping: "dict | None" = None,
) -> str:
    """Build the bracketed peptidoform string for one PSM row.

    The row's ``"Modifications"`` cell is expected to hold ``;``-separated
    entries such as ``"M7(Oxidation); C11(Carbamidomethyl)"``: a residue
    letter, a 1-based position, and a parenthesised name. Each entry becomes
    ``"<0-based position>:<mapped value>"``; the entries are joined with
    ``+`` and appended in square brackets to the upper-cased sequence, e.g.
    ``"IYPYLVMNDACLTESR[6:15.994915+10:57.021464]"``.

    Parameters
    ----------
    amanda_row
        A row providing the ``"Annotated Sequence"`` and ``"Modifications"``
        fields.
    mod_mapping
        Residue letter -> string placed after the colon. Defaults to the
        module-level ``MOD_MAPPING``.

    Returns
    -------
    str
        The peptidoform string. If the row has no modifications (missing
        value, or only blank entries) the annotated sequence is returned
        unchanged: no upper-casing and no brackets.

    Raises
    ------
    ValueError
        If the text after the residue letter is not an integer position.
    KeyError
        If the residue letter is not a key of the mapping.

    NOTE(review): the Athena tables shown in this notebook render unmodified
    peptidoforms with a trailing ``[]`` (and sometimes a leading ``_``);
    confirm that downstream matching accounts for that difference.
    """
    seq = amanda_row["Annotated Sequence"]
    mods = amanda_row["Modifications"]
    if pd.isna(mods):
        return seq

    if mod_mapping is None:
        mod_mapping = MOD_MAPPING

    sites = []
    for entry in mods.split(";"):
        # Keep only the "<residue><position>" part in front of "(Name)".
        site = str(entry).split("(")[0].strip()
        if not site:
            # Blank entry (empty cell text or a stray ';') carries no modification.
            continue
        position = int(site[1:]) - 1  # source positions are 1-based, output is 0-based
        sites.append(str(position) + ":" + mod_mapping[site[0]])

    if not sites:
        return seq

    # Join outside the f-string: reusing the same quote character inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    joined = "+".join(sites)
    return f"{seq.upper()}[{joined}]"

In [42]:
# Read the PSM spreadsheet and derive one peptidoform string per row
# (row-wise apply, hence axis=1).
ground_truth = pd.read_excel("amanda/PSMs_rep2_deconv.xlsx")
ground_truth["Athena Sequence"] = ground_truth.apply(get_athena_peptidoform, axis=1)

# The two result tables are semicolon-delimited text files.
athena_candidates = pd.read_csv("results/rep2_candidates.csv", sep=";")
athena_psms = pd.read_csv("results/rep2_psms.csv", sep=";")

In [43]:
# Show the loaded PSM table, including the derived "Athena Sequence" column.
ground_truth

Unnamed: 0,Checked,Confidence,Identifying Node,PSM Ambiguity,Annotated Sequence,Modifications,# Proteins,Protein Accessions,# Missed Cleavages,Charge,...,File ID,Amanda Score,CharmeRT Combined Score,Search Space,MS Amanda Rank,Search Depth,q-Value,PEP,SVM Score,Athena Sequence
0,False,High,MS Amanda 3.0 (A4),Unambiguous,NLDIERPTYTNLNR,,7,Q71U36; Q9BQE3; P68363; P68366; Q9NY65; Q6PEY2...,0,2,...,F2,608.61,608.61,280,1,1,0.000092,6.305120e-16,5.125190,NLDIERPTYTNLNR
1,False,High,MS Amanda 3.0 (A4),Unambiguous,MHDLNTDQENLVGTHDAPIR,,1,O43684,0,3,...,F2,816.40,816.40,109,1,1,0.000092,6.305120e-16,5.046630,MHDLNTDQENLVGTHDAPIR
2,False,High,MS Amanda 3.0 (A4),Unambiguous,HLYTLDGGDIINALcFSPNR,C15(Carbamidomethyl),1,P63244,0,2,...,F2,763.19,763.19,177,1,1,0.000092,6.305120e-16,7.322650,HLYTLDGGDIINALCFSPNR[14:57.021464]
3,False,High,MS Amanda 3.0 (A4),Unambiguous,SHTSEGAHLDITPNSGAAGNSAGPK,,1,Q92597,0,2,...,F2,1039.47,1039.47,126,1,1,0.000092,6.305120e-16,11.261000,SHTSEGAHLDITPNSGAAGNSAGPK
4,False,High,MS Amanda 3.0 (A4),Unambiguous,LPAELQELPGLSHQYWSAPSDK,,1,P27695,0,3,...,F2,664.40,664.40,170,1,1,0.000092,6.305120e-16,5.461450,LPAELQELPGLSHQYWSAPSDK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14350,False,High,MS Amanda 3.0 (A4),Unambiguous,ELSNAKEELELmAK,M12(Oxidation),1,Q5VU43,1,2,...,F2,128.37,128.37,217,1,1,0.009886,2.494870e-01,-0.135877,ELSNAKEELELMAK[11:15.994915]
14351,False,High,MS Amanda 3.0 (A4),Unambiguous,LIcDVSR,C3(Carbamidomethyl),1,P54136,0,2,...,F2,92.34,92.34,265,1,1,0.009886,2.496590e-01,-0.136029,LICDVSR[2:57.021464]
14352,False,High,MS Amanda 3.0 (A4),Unambiguous,IYPYLVmNDAcLTESR,M7(Oxidation); C11(Carbamidomethyl),1,Q9Y4A5,0,4,...,F2,53.37,53.37,87,1,1,0.009886,2.496840e-01,-0.136050,IYPYLVMNDACLTESR[6:15.994915+10:57.021464]
14353,False,High,MS Amanda 3.0 (A4),Unambiguous,SLITSIR,,1,Q9Y4G6,0,2,...,F2,148.73,148.73,117,1,1,0.009886,2.500320e-01,-0.136355,SLITSIR


In [3]:
# Print per-column non-null counts and dtypes of the PSM table.
ground_truth.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14355 entries, 0 to 14354
Data columns (total 35 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Checked                     14355 non-null  bool   
 1   Confidence                  14355 non-null  object 
 2   Identifying Node            14355 non-null  object 
 3   PSM Ambiguity               14355 non-null  object 
 4   Annotated Sequence          14355 non-null  object 
 5   Modifications               1869 non-null   object 
 6   # Proteins                  14355 non-null  int64  
 7   Protein Accessions          14355 non-null  object 
 8   # Missed Cleavages          14355 non-null  int64  
 9   Charge                      14355 non-null  int64  
 10  DeltaScore                  14318 non-null  float64
 11  DeltaCn                     14355 non-null  float64
 12  Rank                        14355 non-null  int64  
 13  Search Engine Rank          143

In [4]:
# Show the candidates table (one "Peptides" string per ScanNumber).
athena_candidates

Unnamed: 0,ScanNumber,Peptides
0,845,"_SPGAR[],_SPGARESASVSDSR[],_SPGARESASVSDSRR[],..."
1,890,"GAPGLLMAVREDLYCFSYG[14:57.021464],THLNDKYGY[],..."
2,1491,"KHTPNFFSENSSMSITSEDSK[],_KHTGGTEAECQIEAGEEQKK[..."
3,1501,"APSIIFIDELDAIGTK[],_APGTK[],APGTK[],_GPTAVMMHV..."
4,1504,"KGPGRPTGSK[],GPGRPTGSK[],SEAGKKGPGRPTGSK[],GPR..."
...,...,...
21103,32110,"EGGLGPLNIPLLADVTR[],KEGGLGPLNIPLLADVTR[],ENHSV..."
21104,32119,"_LSGSHAMEMSQLLSELK[],DICNDVLSLLEK[2:57.021464]..."
21105,32120,"QLSIVVTALEGAAATLRPVADL[],_ARKPPVGPGPGPR[],SPIF..."
21106,32121,"_DLHSILAESESTGSAAPADPSHTDK[],_VTTDK[],_LSNTPGA..."


In [5]:
# Print per-column non-null counts and dtypes of the candidates table.
athena_candidates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21108 entries, 0 to 21107
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ScanNumber  21108 non-null  int64 
 1   Peptides    21108 non-null  object
dtypes: int64(1), object(1)
memory usage: 329.9+ KB


In [6]:
# Show the PSM result table read from results/rep2_psms.csv.
athena_psms

Unnamed: 0,ScanNumber,PrecursorMZ,PrecursorMass,PrecursorCharge,RT,Peptidoform,Peptide,PeptidoformMass,Proteins,AmandaScore,AdjustedAmandaScore,Score,BinomScore,AvgMassDiffPPM,MatchedIons,ExplainedIonCurrent,Rank,IsDecoy,Label
0,845,438.24884,874.483127,2,222,_SGPASTIYGLFIVQNSSSKLLKSVAIAQLK[],SGPASTIYGLFIVQNSSSKLLKSVAIAQLK,3119.759428,sp|Q9H720,1.000000,2.113283,10000.000000,0.000000,1000.000000,0,0.000000,1,True,false-decoy
1,845,438.24884,874.483127,2,222,_SGPAACMDLIKHHPRVNLLLPPIMSVAEQR[5:57.021464],SGPAACMDLIKHHPRVNLLLPPIMSVAEQR,3349.757364,sp|Q08J23,1.000000,2.113283,10000.000000,0.000000,1000.000000,0,0.000000,2,True,false-decoy
2,845,438.24884,874.483127,2,222,_SGPAACMDLIKHHPRVNLLLPPIMSVAEQR[5:57.021464+6:...,SGPAACMDLIKHHPRVNLLLPPIMSVAEQR,3365.752279,sp|Q08J23,1.000000,2.113283,10000.000000,0.000000,1000.000000,0,0.000000,3,True,false-decoy
3,845,438.24884,874.483127,2,222,_SGPAACMDLIKHHPRVNLLLPPIMSVAEQR[5:57.021464+23...,SGPAACMDLIKHHPRVNLLLPPIMSVAEQR,3365.752279,sp|Q08J23,1.000000,2.113283,10000.000000,0.000000,1000.000000,0,0.000000,4,True,false-decoy
4,845,438.24884,874.483127,2,222,_SGPAACMDLIKHHPRVNLLLPPIMSVAEQR[5:57.021464+6:...,SGPAACMDLIKHHPRVNLLLPPIMSVAEQR,3381.747194,sp|Q08J23,1.000000,2.113283,10000.000000,0.000000,1000.000000,0,0.000000,5,True,false-decoy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125899,32122,925.45587,1848.897187,2,4747,_ETWDSGPGQGPMEAAGLGRATHPLLQGLM[],ETWDSGPGQGPMEAAGLGRATHPLLQGLM,2976.422216,sp|Q8WUB2,23.361091,48.876507,0.004612,0.001419,2.027139,1,0.307658,3,True,false-decoy
125900,32122,925.45587,1848.897187,2,4747,_ETWDSGPGQGPMEAAGLGRATHPLLQGLM[11:15.994915],ETWDSGPGQGPMEAAGLGRATHPLLQGLM,2992.417131,sp|Q8WUB2,23.361091,48.876507,0.004612,0.001419,2.027139,1,0.307658,4,True,false-decoy
125901,32122,925.45587,1848.897187,2,4747,_ETWDSGPGQGPMEAAGLGRATHPLLQGLM[28:15.994915],ETWDSGPGQGPMEAAGLGRATHPLLQGLM,2992.417131,sp|Q8WUB2,23.003343,48.128021,0.005008,0.001541,2.027139,1,0.307658,5,True,false-decoy
125902,32122,925.45587,1848.897187,2,4747,_ETWDSGPGQGPMEAAGLGRATHPLLQGLM[11:15.994915+28...,ETWDSGPGQGPMEAAGLGRATHPLLQGLM,3008.412046,sp|Q8WUB2,23.003343,48.128021,0.005008,0.001541,2.027139,1,0.307658,6,True,false-decoy


In [7]:
# Print per-column non-null counts and dtypes of the PSM result table.
athena_psms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125904 entries, 0 to 125903
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ScanNumber           125904 non-null  int64  
 1   PrecursorMZ          125904 non-null  float64
 2   PrecursorMass        125904 non-null  float64
 3   PrecursorCharge      125904 non-null  int64  
 4   RT                   125904 non-null  int64  
 5   Peptidoform          125904 non-null  object 
 6   Peptide              125904 non-null  object 
 7   PeptidoformMass      125904 non-null  float64
 8   Proteins             125904 non-null  object 
 9   AmandaScore          125904 non-null  float64
 10  AdjustedAmandaScore  125904 non-null  float64
 11  Score                125904 non-null  float64
 12  BinomScore           125904 non-null  float64
 13  AvgMassDiffPPM       125904 non-null  float64
 14  MatchedIons          125904 non-null  int64  
 15  ExplainedIonCurre

In [8]:
# Number of rows in athena_psms whose "Label" equals "true-target".
len(athena_psms.loc[athena_psms["Label"] == "true-target"])

11602