In [1]:
# import libraries
from pyopenms import *
import os

In [2]:
# change directory to find file of interest
os.chdir(r'C:\Users\miar\Desktop\data')

In [3]:
#------------------------------------------------------------------------------------------------------------------------------

# ONLY NEED TO RUN THE NEXT TWO BLOCKS ONCE

#-------------------------------------------------------------------------------------------------------------------------------

In [4]:
# load the content of the mzML file into the exp variable of type MSExperiment
#exp = MSExperiment()
#MzMLFile().load("HEK293T_De_Novo_053122_Glu-C_B_correctRTSenzyme_BP.mzML", exp)

# access the raw data and spectra
#spectrum_data = exp.getSpectrum(0).get_peaks()
#spectrum_data

In [5]:
# loop through the spectra to gather MS2 scans
#specM2 = []
#for s in exp.getSpectra():
#    if s.getMSLevel() == 2:
#        specM2.append(s)
        
#print("Number of MS2 scans: " + str(len(specM2)))

#exp.setSpectra(specM2) # keep only MS2

# store the modified data structure on disk
#MzMLFile().store("filtered_MS2.mzML", exp)

In [6]:
# parse function
def parseScanLine(input):
    x = input.split(" For: ")
    [scan_number, mzs] = x[1].split(", ")
    [precursor_mz, fragment_mz] = mzs.split(";")
    trimmed_fragment_mz = fragment_mz.strip() # trim fragment strings to remove \n
    return [scan_number, precursor_mz, trimmed_fragment_mz]

In [7]:
# checking lines of log file and creating dictionary of scan numbers and fragment mzs
try:
  
    # words to search for
    search = ' Submitted Custom Scan For:'
  
    # reading file content line by line
    search = ' Submitted Custom Scan For:'   # words to search for
    
    # dict for scan numbers and corresponding fragments 
    scan2frag = dict()
    with open('App-2022-05-31_20-49-35.log') as f:
        for line in f:
            if search in line:
                scan_number, precursor_mz, trimmed_fragment_mz = parseScanLine(line)
                scan2frag[scan_number] = trimmed_fragment_mz
            
    # if the input string doesn't exist in the text file
    if len(scan2frag)==0:
        print("\n\"" +search+ "\" is not found in \"" +'App-2022-05-31_20-49-35.log'+ "\"!")
    else:
        pass

except FileNotFoundError:
    print("The file does not exist!")

In [8]:
# load in MS2 scans
exp1 = MSExperiment()
MzMLFile().load("filtered_MS2.mzML", exp1)

In [9]:
# read in peptide sequence from tsv
import pandas as pd
tsv = pd.read_csv('HEK293T_De_Novo_053122_Glu-C_B_correctRTSenzyme_BP_realtimesearch1.tsv', sep='\t')

# create dictionary with scan # as key and sequence/charge as values
scan2PeptideCharge = dict([(i, [x,y]) for i, x,y, in zip(tsv['Scan Number'], tsv['Peptide'], tsv['Charge State'])])

# removing all NaN sequences (not useful)
scan2PeptideCharge_modified = {k:v for k,v in scan2PeptideCharge.items() if str(v[0]) != 'nan'}

In [10]:
def findFragments(peptide_object, charge):
    # loop through each prefix and suffix (b and y ions, respectively)
    try:
        # y and b ions
        y_ions = []
        b_ions = []
        for i in range(1, (len(trimmed_sequence) - 1)): # start at index of 1, end at peptide length - 1
            y_ions.append(peptide_object.getSuffix(i))
            b_ions.append(peptide_object.getPrefix(i))

    except RuntimeError: # range above may be too large for indexing when considering modifications (ex. [15.9949])
        print('Modifications resulted in abnormal indexing for sequence: ' + str(peptide_object))
        # will have to remove "fragment" that is now the entire peptide length instead of peptide length - 1
        y_ions.pop()
        b_ions.pop()
        
    def loopChargeStates():
        # computing fragment mzs
        # compute all y_ion mzs
        y_ion_mzs = []
        for i in y_ions:
            for x in range(1, charge):
                mz_y = i.getMonoWeight(Residue.ResidueType.YIon, x) / x
                y_ion_mzs.append(mz_y)   
        # compute all b_ion mzs
        b_ion_mzs = []
        for i in b_ions:
            for x in range(1, charge):
                mz_b = i.getMonoWeight(Residue.ResidueType.BIon, x) / x
                b_ion_mzs.append(mz_b)
        return y_ion_mzs, b_ion_mzs
    
    # call nested function
    y_ion_mzs, b_ion_mzs = loopChargeStates()

    y_indices = []
    for i in y_ion_mzs:
        y_indices.append(s.findNearest(i, 0.4))
    b_indices = []
    for i in b_ion_mzs:
        b_indices.append(s.findNearest(i, 0.4))

    return y_indices, b_indices

In [18]:
y_indices = []
b_indices = []
for s in exp1:
    s_number = s.getNativeID().split(' ')[-1]
    _, scan_number = s_number.split('=')
    
    if scan_number in scan2frag and int(scan_number) in scan2PeptideCharge_modified:
        # isolate peptide sequence from dict
        sequence = scan2PeptideCharge_modified[int(scan_number)][0]     
        trimmed_sequence = sequence[2:-2] # remove first two and last two characters 
        
        # isolate charge from dict
        charge = scan2PeptideCharge_modified[int(scan_number)][1]
        
        # create peptide object 
        peptide_object = AASequence.fromString(trimmed_sequence)
        
        # call findFragments function
        y, b = findFragments(peptide_object, charge)
        y_indices.append(y)
        b_indices.append(b)
    
    else: 
        pass 

Modifications resulted in abnormal indexing for sequence: ASPDPMLEPM[15.9949]SWRGNIHE
Modifications resulted in abnormal indexing for sequence: RQVPLASPSSM[15.9949]SAALRGISCYLKE
Modifications resulted in abnormal indexing for sequence: KSSM[15.9949]KRKLPFTISPSRNE
Modifications resulted in abnormal indexing for sequence: GRGPPGNPPRRM[15.9949]GRINHLRGPSPPPMAGGUGR
Modifications resulted in abnormal indexing for sequence: KLM[15.9949]QLNLCSNRLE
Modifications resulted in abnormal indexing for sequence: SVPPISQLPCTALSPVAQTHGSM[15.9949]LSPGAQPHD
Modifications resulted in abnormal indexing for sequence: SKVALSTCNGLHGM[15.9949]FE
Modifications resulted in abnormal indexing for sequence: MPRAQPSSASYQPVPADPFAIVSRAQQM[15.9949]VE
Modifications resulted in abnormal indexing for sequence: GQAAARGAGPRLLLLLLLPLPLPLPLLQLVGRRPGAAGRSGSAWRRAQLGM[15.9949]
Modifications resulted in abnormal indexing for sequence: SPSSIALFYKACLCSVDPVAM[15.9949]MKQTVIND
Modifications resulted in abnormal indexing for sequence:

Modifications resulted in abnormal indexing for sequence: TIRMM[15.9949]ITQGNMQLKE
Modifications resulted in abnormal indexing for sequence: QSPANRKVSQM[15.9949]NSLE
Modifications resulted in abnormal indexing for sequence: M[15.9949]RGRLCVGRAAAAAAAVAVPLAGGQE
Modifications resulted in abnormal indexing for sequence: SINVALVQNAM[15.9949]TLIRGKSILKE
Modifications resulted in abnormal indexing for sequence: VVM[15.9949]QDPLYE
Modifications resulted in abnormal indexing for sequence: QIVQIFGNKM[15.9949]E
Modifications resulted in abnormal indexing for sequence: M[15.9949]DPKQTTLLCLGLYGKPFLSAD
Modifications resulted in abnormal indexing for sequence: MAGTALKRLM[15.9949]AE
Modifications resulted in abnormal indexing for sequence: RSHIGANLWVAPRRVGKGTSFKLVYM[15.9949]PRNE
Modifications resulted in abnormal indexing for sequence: WREPPTPALTRM[15.9949]E
Modifications resulted in abnormal indexing for sequence: LVM[15.9949]AQANVSRAKAVRALRD
Modifications resulted in abnormal indexing for sequence: 

Modifications resulted in abnormal indexing for sequence: TNM[15.9949]GIIAGVAFGIAFSQLIGMLLACCLSRFITANQYE
Modifications resulted in abnormal indexing for sequence: M[15.9949]RYVASYLLAALGGNSSPSAKD
Modifications resulted in abnormal indexing for sequence: YLRVASGPM[15.9949]PVHTTSKRPRVDPVLSD
Modifications resulted in abnormal indexing for sequence: QALFHSLNSSLSQACASPSMEPLGVMPTHM[15.9949]GQGRYPVGVSNMVLRILGFLVD
Modifications resulted in abnormal indexing for sequence: LKLLPCTSKAIM[15.9949]PYCLHLMLACFKLRAFTD
Modifications resulted in abnormal indexing for sequence: GLLRLRGGGIFSNLRVQGCM[15.9949]PQPGCNLLNGTQE
Modifications resulted in abnormal indexing for sequence: STAM[15.9949]TLPMSDPTAWATAMNNLGMAPLGIAGQPILPD
Modifications resulted in abnormal indexing for sequence: NRTSSSM[15.9949]QLAHQSPLQPLTAAALAGSLGVWVQD
Modifications resulted in abnormal indexing for sequence: VLGFNTRQRKAFLNAVMRWGM[15.9949]PPQD
Modifications resulted in abnormal indexing for sequence: LLNQSFPWGKIPLNALTM[15.9949]CLARLLFFK

Modifications resulted in abnormal indexing for sequence: NQRLKVPSPKRRVVCVM[15.9949]IVLAFIILNYGPMSMLE
Modifications resulted in abnormal indexing for sequence: GNRPTNSIVFTKLTPFMLGALVAM[15.9949]YE
Modifications resulted in abnormal indexing for sequence: NPYFGAGFGLVGVGTALALARKGVQLGLVAFRRHYM[15.9949]ITLE
Modifications resulted in abnormal indexing for sequence: IQVKKYKCGLIKPCPANYFAFKICSGAANVVGPTM[15.9949]CFE
Modifications resulted in abnormal indexing for sequence: DPFPSRDPRSLGPMVPSLLTGITLGPPRM[15.9949]E
Modifications resulted in abnormal indexing for sequence: SFFTIM[15.9949]CQVVVPILLSGLCMMTAGLVMNTIQHWPVFVE
Modifications resulted in abnormal indexing for sequence: MLGM[15.9949]NMLLITLFLLLPLSMLKGEPWE
Modifications resulted in abnormal indexing for sequence: QACM[15.9949]VPGLALCLLLGPLAGAKPVQE
Modifications resulted in abnormal indexing for sequence: MPGLPCPALPCPAPPPAPSCLIAM[15.9949]ALKNSKTGSLPVSE
Modifications resulted in abnormal indexing for sequence: VSPHGAPALSNGPQTQAQLLQPLQALQTQLLPQAL

In [21]:
y_indices[0]

[9,
 -1,
 -1,
 6,
 77,
 -1,
 -1,
 -1,
 129,
 -1,
 -1,
 -1,
 -1,
 92,
 -1,
 -1,
 -1,
 109,
 -1,
 123,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1]

In [None]:
# etc etc

In [None]:
# next....
# how many times we didn't see any peaks that correspond to the fragmentation at a specific peptide bond 