In [1]:
# import libraries
from pyopenms import * # main package used for handling MS data
import os # changing directories
import pandas as pd # creating and manipulating dataframe

In [2]:
# Change into the directory that holds the experiment files.
# The location can be overridden with the MS_DATA_DIR environment variable so the notebook
# is not tied to one user's machine; the previously hard-coded path stays as the default.
os.chdir(os.environ.get('MS_DATA_DIR', r'C:\Users\miar\Desktop\data'))

In [3]:
# Input files for this run. The mzML and real-time-search file names share the same
# acquisition prefix; the log file has its own timestamped name.
_acquisition = 'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD10'
mzML = _acquisition + '.mzML'
log = 'App-2022-06-12_14-16-26.log'
realtime = _acquisition + '_realtimesearch.tsv'

In [4]:
# Read the mzML file into exp, an MSExperiment container that the later cells iterate over
exp = MSExperiment()
mzml_reader = MzMLFile()
mzml_reader.load(mzML, exp)

In [5]:
# Collect every MS3 spectrum and, for each one, a record with its scan number and its
# peak list flattened into space-separated strings.
specM3 = []    # MS3 spectra, in the order returned by exp.getSpectra()
row_data = []  # one dict per MS3 spectrum (rows of the dataframe built afterwards)
for spectrum in exp.getSpectra():
    if spectrum.getMSLevel() != 3:
        continue
    specM3.append(spectrum)

    # the scan number is the value part of the last space-separated "key=value" token
    # of the native ID
    last_token = spectrum.getNativeID().split(' ')[-1]
    _, scan_number = last_token.split('=')

    # peak arrays -> space-separated strings
    mz_values, intensity_values = spectrum.get_peaks()
    row_data.append({
        'MS3_Scan': scan_number,
        'masses_raw': " ".join(map(str, mz_values)),
        'intensities_raw': " ".join(map(str, intensity_values)),
    })

In [6]:
# Build the dataframe from the collected rows and index it by MS3 scan number
df = pd.DataFrame(row_data).set_index('MS3_Scan')

In [7]:
# parse functions
def parseScanLine(input):
    """Split a 'Submitted Custom Scan' log line into its three fields.

    The text following " For: " is expected to have the form
    "<scan number>, <precursor m/z>;<fragment m/z>".

    Returns:
        [scan_number, precursor_mz, fragment_mz], all as strings. Surrounding whitespace
        (e.g. the trailing newline) is stripped from the fragment m/z only.

    Raises:
        IndexError: if " For: " does not occur in the line.
        ValueError: if the remainder does not split into exactly the expected parts.
    """
    payload = input.split(" For: ")[1]
    scan_number, mz_pair = payload.split(", ")
    precursor_mz, fragment_mz = mz_pair.split(";")
    return [scan_number, precursor_mz, fragment_mz.strip()]

def parseTargetIons(input):
    """Return the text between 'Target Fragment: ' and the first comma after it.

    If no comma follows, everything after the label is returned.
    Raises IndexError when the label is absent from the line.
    """
    after_label = input.split('Target Fragment: ')[1]
    return after_label.split(',')[0]

In [8]:
# Scan the log file for submitted custom scans and map each scan number (string) to its
# [precursor m/z, fragment m/z] pair (floats).
search = ' Submitted Custom Scan For:'  # marker identifying the lines of interest
scan2frag = {}
try:
    with open(log) as log_file:
        for entry in log_file:
            if search not in entry:
                continue
            scan_number, precursor_mz, trimmed_fragment_mz = parseScanLine(entry)
            scan2frag[scan_number] = [float(precursor_mz), float(trimmed_fragment_mz)]

    # the marker never appeared in the file
    if not scan2frag:
        print('\n"{}" is not found in "{}"!'.format(search, log))
except FileNotFoundError:
    print("The file does not exist!")

In [9]:
# Map each MS3 scan number (as int) to [precursor m/z, fragment m/z], both taken from the
# spectrum's own precursor list and rounded to 4 decimal places, similar to the log.
ms3scan2MZs = {}
for s in specM3:
    # scan number = value part of the last space-separated "key=value" token of the native ID
    native_id_tail = s.getNativeID().split(' ')[-1]
    _, scan_number = native_id_tail.split('=')

    # getPrecursors() is unpacked as (fragment, precursor): exactly two entries are expected
    fragment, precursor = s.getPrecursors()
    mz_pair = [round(float(precursor.getMZ()), 4), round(float(fragment.getMZ()), 4)]
    ms3scan2MZs[int(scan_number)] = mz_pair

In [10]:
def matchingMS3s(ms2_mzs, ms3_mzs, mz_tolerance=0.000101, max_scan_gap=100): # either fragment or precursor
    """Check that the logged MS2 targets and the acquired MS3 spectra line up pairwise.

    The pairing is positional: the i-th entry of the module-level dict ``scan2frag``
    (scan number -> [precursor m/z, fragment m/z] from the log) is compared with the i-th
    entry of the module-level dict ``ms3scan2MZs`` (same layout, read from the spectra).

    Args:
        ms2_mzs, ms3_mzs: the two lists of [precursor, fragment] pairs. They are only
            compared for exact equality; when they are equal the tolerance check is skipped.
        mz_tolerance: absolute m/z difference below which two values count as equal.
            The default is slightly above 0.0001 because float arithmetic on values
            rounded to 4 decimals can yield e.g. 0.0001000002.
        max_scan_gap: largest allowed absolute distance between paired scan numbers.

    Returns:
        The list of MS2 scan numbers (keys of ``scan2frag``) when every pair agrees and is
        within ``max_scan_gap`` scans; otherwise ``None`` after printing what went wrong.
    """
    ms2_entries = list(scan2frag.items())
    ms3_entries = list(ms3scan2MZs.items())

    # making sure paired scans are within max_scan_gap scans of each other (either direction)
    too_far = []
    for (ms2scan, _), (ms3scan, _) in zip(ms2_entries, ms3_entries):
        if abs(int(ms3scan) - int(ms2scan)) > max_scan_gap:
            # format() instead of '+' because MS3 scan numbers are ints, MS2 ones are strings
            too_far.append('Scans are not within {} scans of each other...MS2 = {} MS3 = {}'
                           .format(max_scan_gap, ms2scan, ms3scan))

    # taking into consideration rounding discrepancies between the log and the spectrum
    exact_match = ms2_mzs == ms3_mzs
    mismatch = []
    if not exact_match:
        for i, ((_, ms2_pair), (_, ms3_pair)) in enumerate(zip(ms2_entries, ms3_entries)):
            precursor_diff = float(ms3_pair[0]) - float(ms2_pair[0])
            fragment_diff = float(ms3_pair[1]) - float(ms2_pair[1])
            # abs() so that large negative differences are not accepted as matches
            if abs(precursor_diff) >= mz_tolerance or abs(fragment_diff) >= mz_tolerance:
                mismatch.append(i)
        # MS3 entries without an MS2 partner cannot match anything
        mismatch.extend(range(len(ms2_entries), len(ms3_entries)))

    if mismatch:
        print('There is mismatch at the following indicies:') # if this is the case, need to do more work...
        for i in mismatch:
            print(i)
    if too_far:
        print(too_far)
    if mismatch or too_far:
        return None

    if exact_match:
        print('Scans match up perfectly!')
    else:
        print('Scans match up after taking rounding discrepencies into consideration')
    return list(scan2frag)

In [11]:
# make sure that MS3 scans are in the same order as MS2 scans
# (arguments follow the parameter order: logged MS2 values first, then MS3 spectrum values)
ms2_scans = matchingMS3s(list(scan2frag.values()), list(ms3scan2MZs.values()))

Scans match up after taking rounding discrepencies into consideration


In [12]:
# Read the real-time-search results (tab-separated) to obtain peptide sequence and charge
tsv = pd.read_csv(realtime, sep='\t')

# scan number -> [peptide sequence, charge state]
scan2PeptideCharge = {
    scan: [peptide, charge_state]
    for scan, peptide, charge_state in zip(tsv['Scan Number'], tsv['Peptide'], tsv['Charge State'])
}

# keep only the scans whose peptide is not NaN (NaN sequences are not useful)
scan2PeptideCharge_modified = {
    scan: pair
    for scan, pair in scan2PeptideCharge.items()
    if str(pair[0]) != 'nan'
}

In [13]:
# Collect charge, trimmed peptide sequence, mass analyzer and collision energy for every
# MS2 scan that has a peptide in the real-time-search results.
seqs = []
charges = []
analyzer = []
collision = []

# collision energy = last two characters of the second-to-last '_'-separated token of the
# real-time-search file name
energy = int(realtime.split('_')[-2][-2:])

for scan in ms2_scans:
    # one dict lookup per scan instead of building a list of all keys on every iteration
    hit = scan2PeptideCharge_modified.get(int(scan))
    if hit is None:
        # NOTE: scans without a peptide are skipped, so the lists built here can end up
        # shorter than ms2_scans
        continue

    sequence, charge = hit
    charges.append(charge)

    trimmed_sequence = sequence[2:-2] # remove first two and last two characters
    seqs.append(trimmed_sequence)

    # all ms3 scans are recorded with the same analyzer label
    # to be added to dataframe with other MS3 info
    analyzer.append('FTMS')

    # all scans have same collision energy
    collision.append(energy)


In [14]:
# Attach the per-scan annotations to the dataframe as new columns
new_columns = {
    'charge': charges,
    'peptide': seqs,
    'mass_analyzer': analyzer,
    'collision_energy': collision,
}
df = df.assign(**new_columns)

In [15]:
# Remove all modified sequences, i.e. peptides containing '[' or ']'.
# A plain ('[' or ']') evaluates to just '[', so ']' would never be tested; the regex
# character class checks both brackets. Filtering with a boolean mask also avoids
# dropping rows in place while looping over the index.
is_modified = df['peptide'].str.contains(r'[\[\]]', regex=True, na=False)
df = df[~is_modified]

In [16]:
# show the resulting dataframe (indexed by MS3 scan) after modified sequences were removed
df

Unnamed: 0_level_0,masses_raw,intensities_raw,charge,peptide,mass_analyzer,collision_energy
MS3_Scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2315,50.171653747558594 70.50875854492188 108.44799...,803.9648 895.87146 999.27673 921.82806 1097.59...,4,ARHILTLVHKHFCTRLSGYIE,FTMS,10
2791,54.4091682434082 88.9527587890625 92.004905700...,1480.8969 971.96185 994.46344 915.70667 1061.5...,4,KRPPPPYPGPGKPAAAAAWIQGPPE,FTMS,10
4119,54.08456039428711 54.40793228149414 79.3719024...,901.14197 2220.223 956.72943 1074.1609 984.140...,4,LHQTYLKAPQHAQQSIRE,FTMS,10
4138,50.46807861328125 54.40923309326172 149.738021...,1006.93787 1044.2721 9281.319 1655.2512 870.18...,2,QSLPSKPSSVSSYE,FTMS,10
4165,77.07954406738281 79.37855529785156 117.531532...,1032.855 980.9786 945.3121 865.2765 953.76636 ...,3,AAVQVVGSWPSVQPRE,FTMS,10
...,...,...,...,...,...,...
84449,54.40785598754883 67.25708770751953 86.0960617...,1862.5776 957.45294 1108.9438 969.226 1119.701...,4,SPTIGKFWKSFTFTVNNVPSGHILVVAVVQPVTLE,FTMS,10
84623,54.4068603515625 54.409610748291016 86.1714172...,1486.2264 1031.2069 1134.4431 1055.9144 946.46...,4,NNHFIVPISQILIGMGSSTVLFCLLGYIGIHNE,FTMS,10
85006,54.406558990478516 54.40922164916992 64.626823...,1887.2615 1679.5621 881.72406 911.07367 969.77...,5,LHQPLPQKPQPLPNAQSSQAGPNQAQLVFCPHCSRIFTSD,FTMS,10
85258,52.34059143066406 54.408260345458984 83.345787...,923.42883 2598.8687 890.72424 1071.2543 873.99...,5,GSIGGKQYFRCNPGYGLLVRPSRVRRATGPVRRRSTGLRLGAPE,FTMS,10


In [17]:
from constants import ION_TYPES, DEFAULT_MAX_CHARGE

In [18]:
from match import augment

In [19]:
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'Series' object has no attribute 'modified_sequence'". Presumably
# augment() reads a 'modified_sequence' field that df does not provide; confirm against
# match.augment before relying on this call.
augment(df, ION_TYPES, DEFAULT_MAX_CHARGE)

AttributeError: 'Series' object has no attribute 'modified_sequence'