In [1]:
# import libraries
from pyopenms import *
import os

In [2]:
# change directory to find file of interest
os.chdir(r'C:\Users\miar\Desktop\data')

In [3]:
# load the content of the mzML file into the exp variable of type MSExperiment
exp = MSExperiment()
MzMLFile().load("HEK293T_De_Novo_053122_Glu-C_B_correctRTSenzyme_BP.mzML", exp)

# access the raw data and spectra
#spectrum_data = exp.getSpectrum(0).get_peaks()
#spectrum_data

In [4]:
# loop through the spectra to gather MS2 scans

specM2 = []
for s in exp.getSpectra():
    if s.getMSLevel() == 2:
        specM2.append(s)
        
#print("Number of MS2 scans: " + str(len(specM2)))

exp.setSpectra(specM2) # keep only MS2

# store the modified data structure on disk
MzMLFile().store("filtered_MS2.mzML", exp)

In [5]:
# parse function
def parseScanLine(input):
    x = input.split(" For: ")
    [scan_number, mzs] = x[1].split(", ")
    [precursor_mz, fragment_mz] = mzs.split(";")
    return [scan_number, precursor_mz, fragment_mz]

In [6]:
# checking lines of log file and creating dictionary of scan numbers and fragment mzs
try:
  
    # words to search for
    search = ' Submitted Custom Scan For:'
  
    # reading file content line by line
    scans = []
    frag = []
    search = ' Submitted Custom Scan For:'   # words to search for
    
    with open('App-2022-05-31_20-49-35.log') as f:
        for line in f:
            if search in line:
                scan_number, precursor_mz, fragment_mz = parseScanLine(line)
                scans.append(scan_number)
                frag.append(fragment_mz)
    
    # trim fragment strings to remove \n
    fragments = [sub[ : -1] for sub in frag]
    
    # convert lists to dictionary 
    # keys - scans 
    # values - fragments 
    dict1 = dict(zip(scans, fragments))
                  
    # if the input string doesn't exist in the text file
    if len(scans)==0 or len(frag)==0:
        print("\n\"" +search+ "\" is not found in \"" +'App-2022-05-31_20-49-35.log'+ "\"!")
    else:
        pass

except FileNotFoundError:
    print("The file does not exist!")

In [7]:
#dict1

In [8]:
# load in MS2 scans
exp1 = MSExperiment()
MzMLFile().load("filtered_MS2.mzML", exp1)

In [10]:
# convert scan numbers to list
scan_nrs = list(dict1.keys())

# filter spectra for scan numbers
filtered = MSExperiment()
for k, s in enumerate(exp1):
    if k in scan_nrs:
        filtered.addSpectrum(s)
        
        
# create peptide object???
# https://pyopenms.readthedocs.io/en/latest/aasequences.html 

In [13]:
# read in peptide sequence from tsv
import pandas as pd
tsv = pd.read_csv('HEK293T_De_Novo_053122_Glu-C_B_correctRTSenzyme_BP_realtimesearch1.tsv', sep='\t')

# create dictionary with scan # as key and sequence/charge as values
dict2 = dict([(i, [x,y]) for i, x,y, in zip(tsv['Scan Number'], tsv['Peptide'], tsv['Charge State'])])

In [14]:
dict2

{1695: ['E.QMHYGSISISNFKPMHKATYE.F', 3],
 1696: ['E.QYFAVLHKKKIME.L', 3],
 1697: ['E.IQNGPYHTSPM[15.9949]IGQFSGTD.L', 2],
 1698: ['E.LIPSIQSHHSCVVCMPSD.E', 2],
 1699: ['D.KAQLCPGM[15.9949]GYATFSFRSE.R', 3],
 1700: ['E.SSAGGSFTVRTD.T', 2],
 1701: ['E.CVQVYSM[15.9949]GGPNRFYFLE.A', 2],
 1702: ['D.GLWNGAQLYACQDPTIFE.E', 2],
 1703: ['E.VNGFVCLCLPSYGGSFCE.K', 2],
 1704: ['E.TLAPSLLGSLSSINFD.A', 4],
 1705: ['E.CFSGGYSPLCLCVFGNVE.D', 3],
 1706: ['E.SSSTKMQLVKQRLE.T', 3],
 1707: ['D.ASPDPMLEPM[15.9949]SWRGNIHE.F', 3],
 1708: ['D.VPEPPGQAQCSLNFPTNAAVTRD.E', 3],
 1709: ['D.ASPDPMLEPM[15.9949]SWRGNIHE.F', 2],
 1710: ['D.VCTGTLLLLE.R', 2],
 1711: ['E.KHTTISNQAE.D', 2],
 1712: ['-.PWRSGAQGKPYSKANWVSCCQE.G', 3],
 1713: ['E.LSFWGVCVSE.V', 2],
 1714: ['E.SSAGGSFTVRAD.H', 2],
 1716: ['D.FWMQICRGSGRSYNSVKFSHSWLGE.C', 4],
 1717: ['E.EPGAPAPAGPSHVIFCLM[15.9949]CPGRFFCD.R', 4],
 1718: ['D.VRAAVYQPQPHPQPPPYGHCVTD.S', 3],
 1719: ['D.LSNNLNGVCPPE.M', 2],
 1720: ['E.AAVAIKAMAK.-', 2],
 1721: ['E.GSVSSLTFIPTVNP

In [18]:
dict2[1695][0]

'E.QMHYGSISISNFKPMHKATYE.F'

In [19]:
test_seq = AASequence.fromString(dict2[1695][0])

In [20]:
test_seq

<pyopenms.pyopenms_1.AASequence at 0x14fe1779870>

In [21]:
test_seq.getMonoWeight()

2744.2727062339004