In [1]:
# import libraries
from pyopenms import *
import os

In [2]:
# change directory to find file of interest
# NOTE(review): absolute, machine-specific path. The file names used by the
# following cells are relative, so they are resolved against this directory.
os.chdir(r'C:\Users\miar\Desktop\data')

In [3]:
# load the content of the mzML file into the exp variable of type MSExperiment
# (the file name is relative to the current working directory)
exp = MSExperiment()
MzMLFile().load("HEK293T_De_Novo_053122_Glu-C_B_correctRTSenzyme_BP.mzML", exp)

# access the raw data and spectra
# (scratch code, left disabled: peeks at the peaks of the first spectrum)
#spectrum_data = exp.getSpectrum(0).get_peaks()
#spectrum_data

In [4]:
# gather every spectrum whose MS level is 2
specM2 = [spectrum for spectrum in exp.getSpectra() if spectrum.getMSLevel() == 2]

#print("Number of MS2 scans: " + str(len(specM2)))

# overwrite the spectra held by exp so that only the MS2 scans remain
exp.setSpectra(specM2)

# write the MS2-only experiment to disk
MzMLFile().store("filtered_MS2.mzML", exp)

In [5]:
# parse function
def parseScanLine(input):
    """Split one "... For: <scan>, <precursor>;<fragment>" log line into its fields.

    Parameters
    ----------
    input : str
        A log line containing " For: " followed by "<scan>, <precursor>;<fragment>".

    Returns
    -------
    list of str
        [scan_number, precursor_mz, fragment_mz]. All three are raw substrings;
        nothing is stripped, so a trailing line break on the input stays attached
        to fragment_mz.

    Raises
    ------
    IndexError
        If " For: " does not occur in the line.
    ValueError
        If the text after " For: " does not split into exactly two parts on ", ",
        or the second part does not split into exactly two parts on ";".
    """
    # text between the first " For: " and the next one (or the end of the line)
    payload = input.split(" For: ")[1]
    scan_number, mz_pair = payload.split(", ")
    precursor_mz, fragment_mz = mz_pair.split(";")
    return [scan_number, precursor_mz, fragment_mz]

In [6]:
# checking lines of log file and creating dictionary of scan numbers and fragment mzs

# log file to read (relative to the current working directory)
log_file = 'App-2022-05-31_20-49-35.log'

# words to search for: only lines containing this text are parsed
search = ' Submitted Custom Scan For:'

try:
    # reading file content line by line
    scans = []
    frag = []

    with open(log_file) as f:
        for line in f:
            if search in line:
                scan_number, precursor_mz, fragment_mz = parseScanLine(line)
                scans.append(scan_number)
                frag.append(fragment_mz)

    # The fragment is the last field of the line, so it still carries the line
    # break. Strip only line-break characters: slicing off the final character
    # unconditionally would drop a real digit when the last line of the file has
    # no trailing newline.
    fragments = [sub.rstrip('\r\n') for sub in frag]

    # convert lists to dictionary
    # keys - scans (strings, exactly as they appear in the log)
    # values - fragments
    # a scan number that occurs more than once keeps its last fragment
    dict1 = dict(zip(scans, fragments))

    # if the input string doesn't exist in the text file
    if len(scans)==0 or len(frag)==0:
        print("\n\"" +search+ "\" is not found in \"" +log_file+ "\"!")

except FileNotFoundError:
    # NOTE(review): dict1 is not (re)assigned on this path, so later cells either
    # fail with NameError or silently reuse a dict1 left over from an earlier run.
    print("The file does not exist!")

In [39]:
#dict1

In [8]:
# load in MS2 scans
# is this step necessary ???
# NOTE(review): exp was already reduced to its MS2 spectra (setSpectra) before it
# was stored as filtered_MS2.mzML, so this reads those spectra back from disk.
# Iterating exp directly is presumably equivalent -- confirm before removing.
exp1 = MSExperiment()
MzMLFile().load("filtered_MS2.mzML", exp1)

In [9]:
# convert scan numbers in dict to list
scan_nrs = list(dict1.keys())

# dict1 keys are strings; convert them to int once and keep them in a set so the
# membership test below is a constant-time lookup instead of rebuilding and
# scanning a list for every spectrum
wanted_scans = set(map(int, scan_nrs))

# filter spectra for scan numbers also found in dict
# NOTE(review): k is the 0-based position of the spectrum inside exp1 (the
# MS2-only file), not a scan number read from the spectrum itself. Confirm that
# positions in the filtered file really coincide with the scan numbers written
# in the log; otherwise the scan number should be taken from the spectrum.
filtered = MSExperiment()
spec_scans = []
for k, s in enumerate(exp1):
    if k in wanted_scans:
        filtered.addSpectrum(s)
        spec_scans.append(k)

# next, create peptide object ...
# https://pyopenms.readthedocs.io/en/latest/aasequences.html 

In [57]:
#spec_scans

In [10]:
# read in peptide sequence from tsv (tab-separated real-time-search results)
import pandas as pd
tsv = pd.read_csv('HEK293T_De_Novo_053122_Glu-C_B_correctRTSenzyme_BP_realtimesearch1.tsv', sep='\t')

# dictionary keyed by scan number; each value is the pair [peptide, charge state]
# (when a scan number occurs in several rows, the last row wins)
dict2 = {
    scan: [peptide, charge]
    for scan, peptide, charge in zip(tsv['Scan Number'], tsv['Peptide'], tsv['Charge State'])
}

In [58]:
#dict2

In [32]:
# peptide entry of every matched spectrum position that is also a key of dict2
seqs = [dict2[scan][0] for scan in spec_scans if scan in dict2]

# stringify every entry; a missing peptide (float NaN) becomes the text 'nan'
seqs_str = [str(entry) for entry in seqs]

# skip the 'nan' placeholders and cut two characters off each end of the rest
# (presumably the flanking residue and the period on either side -- verify
# against the tsv format)
trimmed_seqs = [text[2:-2] for text in seqs_str if text != 'nan']

In [59]:
#seqs

In [13]:
# number of scans in dict1 (log file), i.e. the number of distinct keys in dict1
len(scan_nrs)

3620

In [14]:
# number of spectra in the filtered MS2 file whose position index matched a scan number from dict1
len(spec_scans)

3457

In [33]:
# number of matched scans between spectrum data and dict2 (tsv file)
len(seqs)

3371

In [41]:
# number of sequences after NaNs removed (entries whose string form is 'nan' are dropped)
len(trimmed_seqs)

3339

In [36]:
# turn every trimmed sequence string into a pyopenms AASequence peptide object
pept_objects = [AASequence.fromString(sequence) for sequence in trimmed_seqs]

In [38]:
# check which fragments are present in the spectrum
# for first peptide object/sequence in list

# build each suffix and prefix of the first peptide (y and b ions, respectively)
first_peptide = pept_objects[0]

# Bound the loop by the residue count reported by the AASequence, not by the
# length of the sequence string. Modification annotations (ex. [15.9949]) make
# the string longer than the peptide, which pushed the old range past the last
# residue and relied on a silently swallowed RuntimeError to stop the loop.
y_ions = []
b_ions = []
for i in range(first_peptide.size()):
    # index i holds the fragment made of i residues (index 0 is the empty fragment)
    y_ions.append(first_peptide.getSuffix(i))
    b_ions.append(first_peptide.getPrefix(i))

In [None]:
# for each ion look through all possible charge states

# isolate charges from dict2
# dict2 is keyed by scan number and stores [peptide, charge], so walk the matched
# scan positions rather than the peptide strings (indexing dict2 with a peptide
# raises KeyError). Scans whose peptide is missing (NaN) are skipped, using the
# same str(...) != 'nan' test that builds trimmed_seqs, so charges lines up
# one-to-one with trimmed_seqs / pept_objects.
charges = []
for scan in spec_scans:
    if scan in dict2:
        peptide, charge = dict2[scan]
        if str(peptide) != 'nan':
            charges.append(charge)
        

In [None]:
# NOTE(review): suffix is only assigned in a later cell (suffix = seq.getSuffix(3)),
# so on a fresh kernel this cell raises NameError when run in notebook order.
# Prints the YIon mono weight requested with second argument 2, divided by 2.0.
print("y mz:", suffix.getMonoWeight(Residue.ResidueType.YIon, 2) / 2.0 )

In [None]:
# NOTE(review): seq is not assigned in any visible cell of this notebook --
# presumably an AASequence left over from an earlier session; confirm or define it.
suffix = seq.getSuffix(3) # y3 ion "GER"
print("="*35)
print("y3 ion sequence:", suffix)
y3_formula = suffix.getFormula(Residue.ResidueType.YIon, 2) # y3++ ion
# The next three lines are bare expressions: their values are discarded because
# none of them is the last expression of the cell. They only record which
# residue types the author considers right for a suffix fragment.
suffix.getMonoWeight(Residue.ResidueType.YIon, 2) / 2.0 # CORRECT
suffix.getMonoWeight(Residue.ResidueType.XIon, 2) / 2.0 # CORRECT
suffix.getMonoWeight(Residue.ResidueType.BIon, 2) / 2.0 # INCORRECT
 
# same value as the YIon line above, this time actually displayed
print("y3 mz:", suffix.getMonoWeight(Residue.ResidueType.YIon, 2) / 2.0 )
print("y3 molecular formula:", y3_formula)

In [21]:
# NOTE(review): test_seq is not assigned in any visible cell; the value shown
# below this cell comes from kernel state that a fresh run cannot reproduce.
test_seq.getMonoWeight()

2744.2727062339004