In [1]:
# import libraries
from pyopenms import * # main package used for handling MS data
import os # changing directories
import pandas as pd # creating and manipulating dataframe

 # importing variables and functions from other scripts
from constants import ION_TYPES, DEFAULT_MAX_CHARGE
import match as m
import tensorize as tens
from tensorize import csv, csv_training
from io_local import to_hdf5, from_hdf5

In [2]:
def formatData(mzML, log, realtime):
    """Assemble the MS3 spectra of one run into an annotated dataframe.

    The MS3 spectra are read from the mzML file, paired positionally with the
    custom-scan entries of the log file (verified by precursor/fragment m/z),
    labelled with the peptide listed for the logged scan in the realtime
    search table, reduced to the sequence of the targeted b/y fragment and
    finally passed to ``m.augment``.

    Parameters
    ----------
    mzML : str
        Path of the mzML file to load.
    log : str
        Path of the log file. Lines containing ' Submitted Custom Scan For:'
        and 'Target Fragment:' are parsed.
    realtime : str
        Path of the tab-separated realtime search table; its 'Scan Number'
        and 'Peptide' columns are read. The collision energy is taken from
        the last two characters of the second-to-last '_'-separated field of
        this name (e.g. '..._HCD10_realtimesearch.tsv' -> 10).

    Returns
    -------
    tuple
        ``(df_augmented, s_series)``: the dataframe returned by ``m.augment``
        without the rows whose six ``matches_charge*`` values are all equal,
        and a pandas Series of the MS3 spectra from which the same row labels
        were dropped.

    Raises
    ------
    FileNotFoundError
        If the log file does not exist.
    ValueError
        If the m/z values of log and spectra differ by more than the rounding
        tolerance, if an MS3 scan lies more than 100 scans after its logged
        scan, or if a target fragment is not a b or y ion with a numeric
        ordinal.
    """
    # largest |m/z difference| still attributed to rounding between the log and
    # the spectrum (slightly above 0.0001 because values such as 0.0001000002 occur)
    mz_tolerance = 0.000101
    # an MS3 scan more than this many scans after its logged scan is reported
    max_scan_gap = 100

    # load the content of the mzML file into the exp variable of type MSExperiment
    exp = MSExperiment()
    MzMLFile().load(mzML, exp)

    # keep only the MS3 spectra and store their peaks as space-separated strings
    specM3 = []  # list of MS3 spectra
    row_data = []  # one dict per MS3 spectrum (rows of the dataframe)
    for s in exp.getSpectra():
        if s.getMSLevel() == 3:
            specM3.append(s)
            mz, intensity = s.get_peaks()
            row_data.append({
                'masses_raw': " ".join(str(mass) for mass in mz),
                'intensities_raw': " ".join(str(value) for value in intensity),
            })

    # series of all MS3 spectra and dataframe of their peaks share the same index
    s_series = pd.Series(specM3)
    df = pd.DataFrame(row_data)

    # parse functions
    def parseScanLine(line):
        """Return [scan number, precursor m/z, fragment m/z] (strings) of a custom-scan line."""
        tail = line.split(" For: ")[1]
        scan_number, mzs = tail.split(", ")
        precursor_mz, fragment_mz = mzs.split(";")
        # strip the fragment string to remove the trailing \n
        return [scan_number, precursor_mz, fragment_mz.strip()]

    def parseTargetIons(line):
        """Return (ion label, charge character) of a target-fragment line."""
        fields = line.split('Target Fragment: ')[1].split(',')
        # the charge is the last character of the third comma-separated field
        return fields[0], fields[2][-1]

    # checking lines of log file and creating dictionary of scan numbers and fragment mzs
    search = ' Submitted Custom Scan For:'
    search_target = 'Target Fragment:'
    scan2frag = dict()  # logged scan number -> [precursor m/z, fragment m/z]
    target_values = []  # target ion labels, added to the final dataframe
    target_charge = []
    try:
        with open(log) as f:
            for line in f:
                if search in line:
                    scan_number, precursor_mz, trimmed_fragment_mz = parseScanLine(line)
                    scan2frag[scan_number] = [float(precursor_mz), float(trimmed_fragment_mz)]
                elif search_target in line:
                    target_ion, charge = parseTargetIons(line)
                    target_charge.append(int(charge))
                    target_values.append(target_ion)
    except FileNotFoundError:
        # nothing downstream can work without the log, so report and stop here
        print("The file does not exist!")
        raise
    # if the input string doesn't exist in the text file
    if len(scan2frag) == 0:
        print("\n\"" + search + "\" is not found in \"" + log + "\"!")

    # obtain MS3 scan numbers and the precursor/fragment mzs directly from the MS3 spectrum
    ms3scan2MZs = dict()
    for s in specM3:
        s_number = s.getNativeID().split(' ')[-1]
        _, scan_number = s_number.split('=')
        fragment, precursor = s.getPrecursors()
        # 4 decimal places, similar to log
        ms3scan2MZs[int(scan_number)] = [round(float(precursor.getMZ()), 4),
                                         round(float(fragment.getMZ()), 4)]

    def matchingMS3s(ms2_mzs, ms3_mzs):
        """Return the logged scan numbers once log and spectra are confirmed to line up.

        Entries are compared position by position. Raises ValueError (after
        printing the details) when they do not line up.
        """
        # making sure they are within max_scan_gap scans of each other
        too_far = []
        for ms2scan, ms3scan in zip(list(scan2frag), list(ms3scan2MZs)):
            scan_diff = int(ms3scan) - int(ms2scan)
            if scan_diff > max_scan_gap:
                too_far.append('Scans are not within 100 scans of each other...' + 'MS2 = ' + str(ms2scan) + ' MS3 = ' + str(ms3scan))

        if ms2_mzs != ms3_mzs:
            # taking into consideration rounding discrepencies between the log and the spectrum
            spectrum_mzs = list(ms3scan2MZs.values())
            logged_mzs = list(scan2frag.values())
            mismatch = []
            for i in range(len(spectrum_mzs)):
                precursor_diff = float(spectrum_mzs[i][0]) - float(logged_mzs[i][0])
                # absolute value: a large negative difference is a mismatch too
                if abs(precursor_diff) >= mz_tolerance:
                    mismatch.append(i)
                fragment_diff = float(spectrum_mzs[i][1]) - float(logged_mzs[i][1])
                if abs(fragment_diff) >= mz_tolerance:
                    mismatch.append(i)
            # no mismatch after rounding and within 100 scans
            if len(mismatch) == len(too_far) == 0:
                print('Scans match up after taking rounding discrepencies into consideration')
                return list(scan2frag)
            if len(mismatch) != 0:
                print('There is mismatch at the following indicies:')
                for i in mismatch:
                    print(i)
                raise ValueError('m/z values of log and MS3 spectra differ at indices ' + str(mismatch))
            print(too_far)
            raise ValueError('; '.join(too_far))

        # they match perfectly
        if len(too_far) == 0:
            print('Scans match up perfectly!')
            return list(scan2frag)
        print(too_far)
        raise ValueError('; '.join(too_far))

    # make sure that MS3 scans are in the same order as MS2 scans
    ms2_scans = matchingMS3s(list(ms3scan2MZs.values()), list(scan2frag.values()))

    # use realtime file to obtain the peptide sequence of each scan;
    # scans without a peptide (NaN) are not useful and are left out
    tsv = pd.read_csv(realtime, sep='\t')
    scan2Peptide = {
        scan: peptide
        for scan, peptide in zip(tsv['Scan Number'], tsv['Peptide'])
        if str(peptide) != 'nan'
    }

    # collect data for dataframe
    seqs = []
    analyzer = []
    collision = []
    energy = int(realtime.split('_')[-2][-2:])
    for scan in ms2_scans:
        if int(scan) in scan2Peptide:
            seqs.append(scan2Peptide[int(scan)])
            # every MS3 row is labelled with the same analyzer and collision energy
            analyzer.append('FTMS')
            collision.append(energy)

    # add all data columns to dataframe (pandas raises if a list length differs from the row count)
    df = df.assign(charge=target_charge, modified_sequence=seqs, mass_analyzer=analyzer, collision_energy=collision, target_fragment=target_values)

    # remove all modified sequences (any sequence containing a square bracket)
    modified_rows = [
        i for i in df.index
        if '[' in df.at[i, 'modified_sequence'] or ']' in df.at[i, 'modified_sequence']
    ]
    df = df.drop(modified_rows)
    s_series = s_series.drop(modified_rows)

    # obtain target fragment sequences
    fragment_seqs = []
    for i in df.index:
        # the first two and last two characters of the reported peptide are cut off
        trimmed_seq = df.at[i, 'modified_sequence'][2:-2]
        peptide_object = AASequence.fromString(trimmed_seq)
        target = df.at[i, 'target_fragment']
        if not target.startswith(('y', 'b')):
            raise ValueError('Unsupported target fragment: ' + repr(target))
        # everything after the ion letter is the ordinal, so 'y12' gives 12 (not 2)
        fragment_length = int(target[1:])
        if target.startswith('y'):
            # y ions keep the end of the sequence
            full_seq = peptide_object.getSuffix(fragment_length)
        else:
            # b ions keep the start of the sequence
            full_seq = peptide_object.getPrefix(fragment_length)
        fragment_seqs.append(str(full_seq))
    df = df.assign(modified_sequence=fragment_seqs)

    # remove all sequences of 30 or more characters
    long_rows = [i for i in df.index if len(df.at[i, 'modified_sequence']) >= 30]
    df = df.drop(long_rows)
    s_series = s_series.drop(long_rows)

    # running augment function
    df_augmented = m.augment(df, ION_TYPES, DEFAULT_MAX_CHARGE)
    # remove all rows that are completely empty, i.e. whose six matches_charge values are all equal
    empty_rows = []
    for i in df_augmented.index:
        if (df_augmented['matches_charge1'][i]
                == df_augmented['matches_charge2'][i]
                == df_augmented['matches_charge3'][i]
                == df_augmented['matches_charge4'][i]
                == df_augmented['matches_charge5'][i]
                == df_augmented['matches_charge6'][i]):
            empty_rows.append(i)
    df_augmented = df_augmented.drop(empty_rows)
    s_series = s_series.drop(empty_rows)

    return df_augmented, s_series

In [3]:
# Directory that holds the mzML, log and realtime-search files; later cells use
# file names relative to it. Defaults to the original workstation path; set the
# MS3_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
DATA_DIR = os.environ.get('MS3_DATA_DIR', r'C:\Users\miar\Desktop\data')
os.chdir(DATA_DIR)

In [4]:
# input files of the HCD10 run: spectra (mzML), log and realtime search table (tsv)
mzML, log, realtime = (
    'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD10.mzML',
    'App-2022-06-12_14-16-26.log',
    'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD10_realtimesearch.tsv',
)

In [5]:
# build the annotated dataframe and the MS3 spectra series from the files currently assigned to mzML, log and realtime
df1, series1 = formatData(mzML=mzML, log=log, realtime=realtime)

Scans match up after taking rounding discrepencies into consideration


In [6]:
# input files of the HCD15 run: spectra (mzML), log and realtime search table (tsv)
mzML, log, realtime = (
    'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD15.mzML',
    'App-2022-06-12_09-54-27.log',
    'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD15_realtimesearch.tsv',
)

In [7]:
# build the annotated dataframe and the MS3 spectra series from the files currently assigned to mzML, log and realtime
df2, series2 = formatData(mzML=mzML, log=log, realtime=realtime)

Scans match up after taking rounding discrepencies into consideration


In [8]:
# input files of the HCD20 run: spectra (mzML), log and realtime search table (tsv)
mzML, log, realtime = (
    'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD20.mzML',
    'App-2022-06-12_07-05-17.log',
    'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD20_realtimesearch.tsv',
)

In [9]:
# build the annotated dataframe and the MS3 spectra series from the files currently assigned to mzML, log and realtime
df3, series3 = formatData(mzML=mzML, log=log, realtime=realtime)

Scans match up after taking rounding discrepencies into consideration


In [10]:
# input files of the HCD30 run: spectra (mzML), log and realtime search table (tsv)
mzML, log, realtime = (
    'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD30.mzML',
    'App-2022-06-12_22-28-53.log',
    'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD30_realtimesearch.tsv',
)

In [11]:
# build the annotated dataframe and the MS3 spectra series from the files currently assigned to mzML, log and realtime
df4, series4 = formatData(mzML=mzML, log=log, realtime=realtime)

Scans match up after taking rounding discrepencies into consideration


In [12]:
# input files of the HCD35 run: spectra (mzML), log and realtime search table (tsv)
mzML, log, realtime = (
    'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD35.mzML',
    'App-2022-06-11_21-20-46.log',
    'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD35_realtimesearch.tsv',
)

In [13]:
# build the annotated dataframe and the MS3 spectra series from the files currently assigned to mzML, log and realtime
df5, series5 = formatData(mzML=mzML, log=log, realtime=realtime)

Scans match up after taking rounding discrepencies into consideration


In [14]:
# input files of the second HCD35 run ('anyLength2'): spectra (mzML), log and realtime search table (tsv)
mzML, log, realtime = (
    'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength2_HCD35.mzML',
    'App-2022-06-12_19-59-11.log',
    'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength2_HCD35_realtimesearch.tsv',
)

In [15]:
# build the annotated dataframe and the MS3 spectra series from the files currently assigned to mzML, log and realtime
df6, series6 = formatData(mzML=mzML, log=log, realtime=realtime)

Scans match up after taking rounding discrepencies into consideration


In [16]:
# input files of the HCD40 run: spectra (mzML), log and realtime search table (tsv)
mzML, log, realtime = (
    'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD40.mzML',
    'App-2022-06-12_16-55-56.log',
    'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD40_realtimesearch.tsv',
)

In [17]:
# build the annotated dataframe and the MS3 spectra series from the files currently assigned to mzML, log and realtime
df7, series7 = formatData(mzML=mzML, log=log, realtime=realtime)

Scans match up after taking rounding discrepencies into consideration


In [18]:
# stack the seven per-run dataframes into one table with a fresh 0..n-1 index
all_dfs = [df1, df2, df3, df4, df5, df6, df7]
concat_df = pd.concat(all_dfs, ignore_index=True)

In [19]:
# display the combined dataframe (one row per retained MS3 spectrum of all runs)
concat_df

Unnamed: 0,masses_raw,intensities_raw,charge,modified_sequence,mass_analyzer,collision_energy,target_fragment,matches_charge1,masses_the_charge1,masses_raw_charge1,...,masses_raw_charge4,intensities_raw_charge4,matches_charge5,masses_the_charge5,masses_raw_charge5,intensities_raw_charge5,matches_charge6,masses_the_charge6,masses_raw_charge6,intensities_raw_charge6
0,54.40834045410156 72.08840942382812 72.6154479...,1847.457 1044.1062 944.0088 927.47253 941.6198...,1,SE,FTMS,10,y2,y1,148.060434167,148.0601806640625,...,,,,,,,,,,
1,54.40696716308594 54.40972137451172 70.0648117...,1802.849 918.2612 4112.844 878.95984 1029.2969...,1,IP,FTMS,10,b2,y1,98.060041167,98.06021118164062,...,,,,,,,,,,
2,53.29237365722656 54.40982437133789 58.5242881...,848.94354 1105.8538 1257.8536 921.262 1054.482...,1,TK,FTMS,10,b2,y1,129.102240167,129.1013946533203,...,,,,,,,,,,
3,62.79367446899414 66.73774719238281 68.6735992...,910.83417 941.222 894.2407 840.0121 787.6718 8...,1,TK,FTMS,10,b2,y1,129.102240167,129.102294921875,...,,,,,,,,,,
4,50.54601287841797 54.40740966796875 54.4102096...,1046.0387 1604.429 1038.8164 902.5171 960.406 ...,1,SE,FTMS,10,y2,y1,148.060434167,148.0605010986328,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,54.408470153808594 70.06488800048828 70.590484...,3260.9204 6078.789 2339.1902 10882.506 2424.62...,1,PKKTE,FTMS,40,y5,b2,226.155003467,226.1552276611328,...,,,,,,,,,,
301,53.899383544921875 54.40927505493164 66.511924...,2461.784 3039.2222 2540.4285 8154.4355 2572.69...,1,PKKTE,FTMS,40,y5,b2,226.155003467,226.15432739257812,...,,,,,,,,,,
302,54.40829086303711 70.06476593017578 84.0803070...,3128.156 7825.7524 15576.507 2519.0374 2384.07...,1,PKKTE,FTMS,40,y5,b2;y3,226.155003467;377.203076167,226.15536499023438;377.2024841308594,...,,,,,,,,,,
303,54.40895080566406 59.92170333862305 70.0650405...,945.8575 1072.6782 2115.838 964.55164 2905.835...,1,PKKTE,FTMS,40,y5,b2,226.155003467,226.15496826171875,...,,,,,,,,,,


In [20]:
# stack the per-run spectra series in the same run order as the dataframes,
# again with a fresh 0..n-1 index
all_series = [series1, series2, series3, series4, series5, series6, series7]
concat_series = pd.concat(all_series, ignore_index=True)

0      <pyopenms.pyopenms_5.MSSpectrum object at 0x00...
1      <pyopenms.pyopenms_5.MSSpectrum object at 0x00...
2      <pyopenms.pyopenms_5.MSSpectrum object at 0x00...
3      <pyopenms.pyopenms_5.MSSpectrum object at 0x00...
4      <pyopenms.pyopenms_5.MSSpectrum object at 0x00...
                             ...                        
300    <pyopenms.pyopenms_5.MSSpectrum object at 0x00...
301    <pyopenms.pyopenms_5.MSSpectrum object at 0x00...
302    <pyopenms.pyopenms_5.MSSpectrum object at 0x00...
303    <pyopenms.pyopenms_5.MSSpectrum object at 0x00...
304    <pyopenms.pyopenms_5.MSSpectrum object at 0x00...
Length: 305, dtype: object

In [21]:
# the 'charge' column is handed to csv_training under the name 'precursor_charge'
# NOTE(review): the original author was unsure whether this rename is required — confirm against tensorize.csv_training
concat_df = concat_df.rename(columns={'charge': 'precursor_charge'})

# create dictionary of data from the combined dataframe and spectra
data = csv_training(concat_df, concat_series)

{'collision_energy_in': array([[0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.1 ],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
       [0.15],
 

  array = array / maxima[:, numpy.newaxis]


In [22]:
# convert dictionary data to hdf5
# the file name is relative — presumably resolved against the current working directory; confirm in io_local.to_hdf5
to_hdf5(data, 'hdf5_data.hdf5')