In [1]:
# import libraries
from pyopenms import * # main package used for handling MS data
import os # changing directories
import pandas as pd # creating and manipulating dataframe

# importing variables and functions from other scripts
from constants import ION_TYPES, DEFAULT_MAX_CHARGE
import match as m
import tensorize as tens
from tensorize import csv, csv_training
from io_local import to_hdf5, from_hdf5

In [2]:
# change directory to find data files of interest
os.chdir(r'C:\Users\miar\Desktop\data')

In [3]:
# experiment files 
mzML = 'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD30.mzML'
log = 'App-2022-06-12_22-28-53.log'
realtime = 'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD30_realtimesearch.tsv'

In [4]:
# load the content of the mzML file into the exp variable of type MSExperiment
exp = MSExperiment()
MzMLFile().load(mzML, exp)

In [5]:
# loop through the spectra to filter MS3 scans
specM3 = [] # list of MS3 spectra 
row_data = []
for s in exp.getSpectra():
    
    if s.getMSLevel() == 3:
        specM3.append(s)
        
        # get scan number
        s_number = s.getNativeID().split(' ')[-1]
        _, scan_number = s_number.split('=')
        
        # obtain mz and intensity values 
        mz, intensity = s.get_peaks()
        
        mz_mod = " ".join(str(m) for m in mz)
        intensity_mod = " ".join(str(i) for i in intensity)
        
        # create dict (rows of dataframe)
        data = {'MS3_Scan':scan_number,
       'masses_raw':mz_mod,
       'intensities_raw':intensity_mod}
        
        row_data.append(data)

In [6]:
# create series of all MS3 spectra
s_series = pd.Series(specM3)

In [7]:
# create dataframe
df = pd.DataFrame(row_data)
#df.set_index('MS3_Scan', inplace=True)

In [8]:
# parse functions
def parseScanLine(input):
    x = input.split(" For: ")
    [scan_number, mzs] = x[1].split(", ")
    [precursor_mz, fragment_mz] = mzs.split(";")
    trimmed_fragment_mz = fragment_mz.strip() # trim fragment strings to remove \n
    return [scan_number, precursor_mz, trimmed_fragment_mz]

def parseTargetIons(input):
    i = input.split('Target Fragment: ')
    ion = i[1].split(',')[0]
    charge = i[1].split(',')[2][-1]
    return ion, charge

In [9]:
# checking lines of log file and creating dictionary of scan numbers and fragment mzs
try:
  
    # words to search for
    search = ' Submitted Custom Scan For:'
    search_target = 'Target Fragment:'
    
    # dict for scan numbers and corresponding fragments 
    scan2frag = dict()
    target_values = []
    target_charge = []
    with open(log) as f:
        for line in f:
            if search in line:
                scan_number, precursor_mz, trimmed_fragment_mz = parseScanLine(line)
                scan2frag[scan_number] = [float(precursor_mz), float(trimmed_fragment_mz)]
            elif search_target in line:
                target_ion, charge = parseTargetIons(line)
                target_charge.append(int(charge))
                target_values.append(target_ion) #to add to final dataframe
            
    # if the input string doesn't exist in the text file
    if len(scan2frag)==0:
        print("\n\"" + search + "\" is not found in \"" + log + "\"!")

except FileNotFoundError:
    print("The file does not exist!")

In [10]:
# obtain MS3 scan numbers
# obtain precursor and fragment mzs directly from the MS3 spectrum

ms3scan2MZs = dict()
for s in specM3:
    s_number = s.getNativeID().split(' ')[-1]
    _, scan_number = s_number.split('=')
   
    fragment, precursor = s.getPrecursors()
    precursor_mz = precursor.getMZ()
    fragment_mz = fragment.getMZ()
    
    ms3scan2MZs[int(scan_number)] = [round(float(precursor_mz), 4), round(float(fragment_mz), 4)] # 4 decimal places, similar to log

In [11]:
def matchingMS3s(ms2_mzs, ms3_mzs): # either fragment or precursor
    
    # making sure they are within 100 scans of each other
    too_far = []
    for ms2scan, ms3scan in zip(list(scan2frag), list(ms3scan2MZs)):
        scan_diff = int(ms3scan) - int(ms2scan)
        if scan_diff > 100:
            too_far.append('Scans are not within 100 scans of each other...' + 'MS2 = ' + str(ms2scan) + ' MS3 = ' + str(ms3scan))
    
    # do they not match off the bat?
    if ms2_mzs != ms3_mzs:
        # taking into consideration rounding discrepencies between the log and the spectrum
        mismatch = []
        for i in range(0, len(list(ms3scan2MZs))):
            
            precursor_diff = float(list(ms3scan2MZs.values())[i][0]) - float(list(scan2frag.values())[i][0]) 
            if precursor_diff < 0.000101 or (precursor_diff < 0 and precursor_diff > -0.000101): # because sometimes max number will be 0.0001000002 for example
                pass
            else:
                mismatch.append(i)
            
            fragment_diff = float(list(ms3scan2MZs.values())[i][1]) - float(list(scan2frag.values())[i][1]) 
            if fragment_diff < 0.000101 or (fragment_diff < 0 and fragment_diff > -0.000101): # because sometimes max number will be 0.0001000002 for example
                pass
            else:
                mismatch.append(i)
                
        # no mismatch after rounding and within 100 scans
        if len(mismatch) == len(too_far) == 0:
            print('Scans match up after taking rounding discrepencies into consideration')
            ms2_scans = list(scan2frag)
            return ms2_scans
        
        elif len(mismatch) != 0:
            print('There is mismatch at the following indicies:') # if this is the case, need to do more work...
            for i in mismatch:
                print(i) 
                
        elif len(too_far) != 0:
            print(too_far)
        
    # they match perfectly
    elif ms2_mzs == ms3_mzs:
        # within 100 scans
        if len(too_far) == 0:
            print('Scans match up perfectly!')
            ms2_scans = list(scan2frag)
            return ms2_scans
        else:
            print(too_far)

In [12]:
# make sure that MS3 scans are in the same order as MS2 scans
ms2_scans = matchingMS3s(list(ms3scan2MZs.values()), list(scan2frag.values()))

Scans match up after taking rounding discrepencies into consideration


In [13]:
# use realtime file to obtain peptide sequence and charge
# read in peptide sequence from tsv
tsv = pd.read_csv(realtime, sep='\t')

# create dictionary with scan # as key and sequence/charge as values
scan2PeptideCharge = dict([(i, [x,y]) for i, x,y, in zip(tsv['Scan Number'], tsv['Peptide'], tsv['Charge State'])])

# removing all NaN sequences (not useful)
scan2PeptideCharge_modified = {k:v for k,v in scan2PeptideCharge.items() if str(v[0]) != 'nan'}

In [14]:
# collect data for dataframe
seqs = []
charges = []
analyzer = []
collision = []

energy = int(realtime.split('_')[-2][-2:])

for scan in ms2_scans:
    if int(scan) in list(scan2PeptideCharge_modified):
        charge = scan2PeptideCharge_modified[int(scan)][1]
        charges.append(charge)
        
        sequence = scan2PeptideCharge_modified[int(scan)][0]     
        #trimmed_sequence = sequence[2:-2] # remove first two and last two characters 
        seqs.append(sequence)
        
        # all ms3 scans are orbit trap 
        # to be added to dataframe with other MS3 info
        analyzer.append('FTMS')
        
        # all scans have same collision energy
        collision.append(energy)

In [15]:
# add all data columns to dataframe
df = df.assign(charge=charges, modified_sequence=seqs, mass_analyzer=analyzer, collision_energy=collision, target_fragment=target_values)

In [16]:
# remove all modified sequences
for i in df.index:
    if ('[' or ']') in df['modified_sequence'][i]:
        df.drop(i, axis=0, inplace=True)
        s_series.drop(i, inplace=True)

In [17]:
# obtain target fragment sequences
fragment_seqs = []
for i in df.index:
    trimmed_seq = df['modified_sequence'][i][2:-2]
    peptide_object = AASequence.fromString(trimmed_seq) 
    
    # targeted fragment is a y ion
    if df['target_fragment'][i].startswith('y'):
        y_num = df['target_fragment'][i][-1]
        # the full sequence of the fragment
        full_seq = peptide_object.getSuffix(int(y_num))
        
    # targeted fragment is a b ion
    elif df['target_fragment'][i].startswith('b'):
        b_num = df['target_fragment'][i][-1]
        # the full sequence of the fragment
        full_seq = peptide_object.getPrefix(int(b_num))
    
    fragment_seqs.append(str(full_seq))

In [18]:
df = df.assign(modified_sequence=fragment_seqs)

In [21]:
df

Unnamed: 0,MS3_Scan,masses_raw,intensities_raw,charge,modified_sequence,mass_analyzer,collision_energy,target_fragment
0,2020,54.40639877319336 54.408939361572266 59.669750...,1197.1626 1670.647 859.46136 985.96564 1016.46...,4,PHE,FTMS,30,y3
3,3051,54.408634185791016 62.668479919433594 149.7439...,2066.4746 970.49506 3651.7222 2979.791,5,SVAL,FTMS,30,b4
4,3163,54.406776428222656 65.59066772460938 89.303787...,920.14667 996.5237 1133.543 886.9034 9615.201 ...,4,IQSLIE,FTMS,30,y6
5,3227,52.101802825927734 60.138431549072266 72.78981...,966.6876 906.81854 901.90924 1036.7853 1721.08...,3,KE,FTMS,30,y2
6,3545,70.06473541259766 83.60308074951172 87.3251876...,3398.8206 990.2591 1048.4507 871.2369 1013.838...,4,PSLGD,FTMS,30,y5
...,...,...,...,...,...,...,...,...
674,54879,50.39166259765625 54.40570831298828 54.4082908...,908.00323 1306.0411 1157.5964 948.8884 932.170...,4,HAIPE,FTMS,30,y5
677,55074,51.16191482543945 54.40586471557617 54.4089279...,869.91626 1037.0397 1943.6324 875.4436 962.630...,5,HLRVIG,FTMS,30,b6
678,55157,54.40780258178711 71.67377471923828 84.0804061...,2137.793 1185.057 7538.464 1008.2633 1030.0724...,4,HPE,FTMS,30,y3
679,55347,52.03042984008789 54.406410217285156 57.952243...,942.35956 1386.4752 870.078 2464.002 1984.4955...,2,IY,FTMS,30,b2


In [19]:
# remove all sequences more than 30 in length
for i in df.index:
    if len(df['modified_sequence'][i]) >= 30:
        df.drop(i, axis=0, inplace=True)
        s_series.drop(i, inplace=True)

In [20]:
# running augment function
df_augmented = m.augment(df, ION_TYPES, DEFAULT_MAX_CHARGE)

In [21]:
# remove all rows that are completely empty
for i in df_augmented.index:
    if df_augmented['matches_charge1'][i] == df_augmented['matches_charge2'][i] == df_augmented['matches_charge3'][i] == df_augmented['matches_charge4'][i] == df_augmented['matches_charge5'][i] == df_augmented['matches_charge6'][i]:
        df_augmented.drop(i, axis=0, inplace=True)
        s_series.drop(i, inplace=True)

In [22]:
# to run csv function 'charge' needs to be replaced with 'precursor_charge' ?
df_augmented.rename(columns = {'charge':'precursor_charge'}, inplace = True)

In [23]:
df_augmented

Unnamed: 0,MS3_Scan,masses_raw,intensities_raw,precursor_charge,modified_sequence,mass_analyzer,collision_energy,target_fragment,matches_charge1,masses_the_charge1,...,masses_raw_charge4,intensities_raw_charge4,matches_charge5,masses_the_charge5,masses_raw_charge5,intensities_raw_charge5,matches_charge6,masses_the_charge6,masses_raw_charge6,intensities_raw_charge6
40,4621,50.4739990234375 52.20036315917969 52.63237762...,977.42706 1109.9105 849.40424 1452.444 756.425...,2,PGE,FTMS,30,y3,b2,155.081504467,...,,,,,,,,,,
42,4639,57.859413146972656 70.06482696533203 70.788146...,901.5612 7928.145 820.16846 910.6698 943.537 1...,2,PGE,FTMS,30,y3,b2,155.081504467,...,,,,,,,,,,
55,4994,54.40592956542969 54.40852737426758 55.1464157...,977.354 1772.294 1148.3988 899.2873 850.2866 9...,2,HASIQ,FTMS,30,b5,b2,209.103302467,...,,,,,,,,,,
88,5602,55.055606842041016 60.60079574584961 79.244682...,958.7304 940.10425 898.421 895.7952 1551.0371 ...,4,GHE,FTMS,30,y3,b2,195.087652467,...,,,,,,,,,,
133,7299,52.04491424560547 52.17893981933594 54.4083366...,1210.0629 1222.7086 1627.2898 1167.2579 1126.5...,3,KE,FTMS,30,y2,b1,129.102239467,...,,,,,,,,,,
141,7644,54.407997131347656 70.06485748291016 74.969673...,2130.9834 1516.2654 873.34357 897.49316 1045.4...,3,KP,FTMS,30,b2,b1,129.102239467,...,,,,,,,,,,
157,8141,52.6069450378418 54.40809631347656 65.16360473...,911.1296 2101.3508 1145.6494 896.5965 1841.865...,4,RRQLIV,FTMS,30,b6,,,...,,,,,,,,,,
164,8253,51.75946807861328 54.40770721435547 90.9761276...,1154.2935 2343.5 890.11816 909.5369 1300.9065 ...,4,ISGK,FTMS,30,b4,y1,129.102240167,...,,,,,,,,,,
187,9831,59.75191116333008 95.1451644897461 110.0709991...,824.0301 1034.8226 7731.002 4386.921 1129.7994...,3,HLE,FTMS,30,y3,b2,251.15025246699997,...,,,,,,,,,,
190,10058,50.5954475402832 54.406402587890625 54.4088859...,1017.04877 1380.0731 1524.5885 2243.074 927.67...,4,SE,FTMS,30,y2,y1,148.060434167,...,,,,,,,,,,


In [24]:
df_augmented['masses_raw']

40     50.4739990234375 52.20036315917969 52.63237762...
42     57.859413146972656 70.06482696533203 70.788146...
55     54.40592956542969 54.40852737426758 55.1464157...
88     55.055606842041016 60.60079574584961 79.244682...
133    52.04491424560547 52.17893981933594 54.4083366...
141    54.407997131347656 70.06485748291016 74.969673...
157    52.6069450378418 54.40809631347656 65.16360473...
164    51.75946807861328 54.40770721435547 90.9761276...
187    59.75191116333008 95.1451644897461 110.0709991...
190    50.5954475402832 54.406402587890625 54.4088859...
195    54.408653259277344 56.27801513671875 56.641979...
212    66.76849365234375 70.06488037109375 74.0951232...
214    54.40906524658203 60.654029846191406 73.592971...
218    54.4089241027832 54.47573471069336 77.40539550...
223    54.40755844116211 70.06478118896484 71.0680999...
231    54.40690994262695 83.6074447631836 89.56525421...
234    54.408260345458984 63.66166687011719 84.044044...
249    50.37693405151367 54.407

In [25]:
# create dictionary of data
data = tens.csv_training(df_augmented, s_series)

  array = array / maxima[:, numpy.newaxis]


In [23]:
data

{'MS3_Scan': '55472',
 'masses_raw': '51.25165557861328 54.406455993652344 54.409034729003906 104.62690734863281 119.516845703125 149.74021911621094 151.7077178955078 179.1897735595703 181.63304138183594 196.24363708496094 209.4630126953125',
 'intensities_raw': '918.9743 1341.379 2118.155 1132.5398 902.64984 9122.675 946.59155 1029.8043 3490.0686 900.49994 894.94403'}

In [26]:
# convert dictionary data to hdf5 
#to_hdf5(data, 'hdf5_data2.hdf5')