In [1]:
# import libraries
from pyopenms import * # main package used for handling MS data
import os # changing directories
import pandas as pd # creating and manipulating dataframe

In [2]:
# change directory to find data files of interest
# NOTE(review): absolute, machine-specific path; the relative file names used by
# the following cells are resolved against it, so it must be edited to run elsewhere
os.chdir(r'C:\Users\miar\Desktop\data')

In [3]:
# experiment files (relative names, resolved against the current working directory)
# the mzML file and the real-time search table share the same run name
run_name = 'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD10'
mzML = run_name + '.mzML'
log = 'App-2022-06-12_14-16-26.log'
realtime = run_name + '_realtimesearch.tsv'

In [4]:
# load the content of the mzML file into the exp variable of type MSExperiment
# (the return value of load() is discarded; the spectra are read back from `exp`
# in the following cell)
exp = MSExperiment()
MzMLFile().load(mzML, exp)

In [5]:
# filter the MS3 scans; their order defines the row order of the dataframe
specM3 = [s for s in exp.getSpectra() if s.getMSLevel() == 3] # list of MS3 spectra

# one dict per MS3 spectrum (rows of dataframe): the peak list flattened into
# space-separated strings of m/z values and intensities
row_data = []
for spectrum in specM3:
    # obtain mz and intensity values
    mz, intensity = spectrum.get_peaks()

    row_data.append({
        'masses_raw': " ".join(str(m) for m in mz),
        'intensities_raw': " ".join(str(i) for i in intensity),
    })

In [6]:
# create series of all MS3 spectra
# (built from specM3, which was filled in the same loop and order as the
# dataframe rows; later filtering cells drop the same labels from both objects)
s_series = pd.Series(specM3)

In [7]:
# create dataframe: one row per MS3 spectrum with the columns
# 'masses_raw' and 'intensities_raw' collected above
df = pd.DataFrame(row_data)
#df.set_index('MS3_Scan', inplace=True)

In [8]:
# parse functions
def parseScanLine(input):
    """Pull the scan number and the two m/z values out of a custom-scan log line.

    The text following " For: " is expected to read
    "<scan number>, <precursor m/z>;<fragment m/z>".

    Returns:
        [scan_number, precursor_mz, fragment_mz] as strings. Only the fragment
        value is stripped of surrounding whitespace (e.g. the trailing newline).

    Raises:
        IndexError: if " For: " does not occur in the line.
        ValueError: if the remainder does not split into exactly the parts above.
    """
    after_marker = input.split(" For: ")[1]
    scan_number, mz_pair = after_marker.split(", ")
    precursor_mz, fragment_mz = mz_pair.split(";")
    return [scan_number, precursor_mz, fragment_mz.strip()]

def parseTargetIons(input):
    """Return the target fragment label (e.g. 'y4') from a 'Target Fragment:' log line.

    The label is the text between the 'Target Fragment:' marker and the next
    comma (or the end of the line), with surrounding whitespace removed. The
    marker is matched without a trailing space so that it agrees with the
    search string used to select these lines.

    Raises:
        IndexError: if the marker does not occur in the line.
    """
    after_marker = input.split('Target Fragment:')[1]
    # strip so that neither the space after the colon nor a trailing newline
    # (when no comma follows) ends up in the label
    return after_marker.split(',')[0].strip()

In [9]:
# checking lines of log file and creating dictionary of scan numbers and fragment mzs

# words to search for
search = ' Submitted Custom Scan For:'
search_target = 'Target Fragment:'

# scan number (string, as written in the log) -> [precursor m/z, fragment m/z]
scan2frag = {}
# target fragment labels in the order they appear in the log (to add to final dataframe)
target_values = []

try:
    with open(log) as log_file:
        for log_line in log_file:
            if search in log_line:
                scan_id, precursor_text, fragment_text = parseScanLine(log_line)
                scan2frag[scan_id] = [float(precursor_text), float(fragment_text)]
            elif search_target in log_line:
                target_values.append(parseTargetIons(log_line))

    # if the input string doesn't exist in the text file
    if not scan2frag:
        print('\n"{}" is not found in "{}"!'.format(search, log))

except FileNotFoundError:
    # only a missing file is reported here; both containers stay empty
    print("The file does not exist!")

In [10]:
# obtain MS3 scan numbers
# obtain precursor and fragment mzs directly from the MS3 spectrum

# MS3 scan number (int) -> [precursor m/z, fragment m/z]
ms3scan2MZs = {}
for spectrum in specM3:
    # the scan number is the right-hand side of the last space-separated
    # "key=value" token of the native ID
    native_id_tail = spectrum.getNativeID().split(' ')[-1]
    _, scan_text = native_id_tail.split('=')

    # exactly two precursor entries are expected, unpacked as (fragment, precursor)
    fragment_entry, precursor_entry = spectrum.getPrecursors()

    # 4 decimal places, similar to log
    rounded_pair = [round(float(entry.getMZ()), 4) for entry in (precursor_entry, fragment_entry)]
    ms3scan2MZs[int(scan_text)] = rounded_pair

In [11]:
def matchingMS3s(ms2_mzs, ms3_mzs): # either fragment or precursor
    """Check that the custom scans in the log and the MS3 spectra line up one-to-one.

    Reads the module-level dicts ``scan2frag`` (scan number -> [precursor m/z,
    fragment m/z] parsed from the log) and ``ms3scan2MZs`` (MS3 scan number ->
    the same pair read from the spectra) and pairs their entries by position.

    Args:
        ms2_mzs, ms3_mzs: the two lists of m/z pairs. They are only compared
            with ``==`` to decide whether the tolerance check is needed and
            which success message is printed.

    Returns:
        The list of ``scan2frag`` keys when every positional pair agrees within
        the m/z tolerance and within the allowed scan gap; otherwise ``None``
        after printing what went wrong.
    """
    max_scan_gap = 100       # largest allowed distance between paired scan numbers
    mz_tolerance = 0.000101  # slightly above 0.0001 because the difference can come out as e.g. 0.0001000002

    # materialise once instead of rebuilding the lists on every loop iteration
    ms2_scans = list(scan2frag)
    ms3_scans = list(ms3scan2MZs)
    log_mzs = list(scan2frag.values())
    spectrum_mzs = list(ms3scan2MZs.values())

    # positional pairing is meaningless when the two sources differ in size
    if len(ms2_scans) != len(ms3_scans):
        print('Number of scans differs: ' + str(len(ms2_scans)) + ' in the log vs '
              + str(len(ms3_scans)) + ' MS3 spectra')
        return None

    # making sure they are within 100 scans of each other (in either direction)
    too_far = []
    for ms2scan, ms3scan in zip(ms2_scans, ms3_scans):
        if abs(int(ms3scan) - int(ms2scan)) > max_scan_gap:
            too_far.append('Scans are not within 100 scans of each other...' + 'MS2 = ' + str(ms2scan) + ' MS3 = ' + str(ms3scan))

    exact_match = ms2_mzs == ms3_mzs

    # taking into consideration rounding discrepancies between the log and the spectrum;
    # the absolute difference is used so that large negative deviations are caught too
    mismatch = []
    if not exact_match:
        for i, (from_spectrum, from_log) in enumerate(zip(spectrum_mzs, log_mzs)):
            precursor_diff = float(from_spectrum[0]) - float(from_log[0])
            fragment_diff = float(from_spectrum[1]) - float(from_log[1])
            if abs(precursor_diff) >= mz_tolerance or abs(fragment_diff) >= mz_tolerance:
                mismatch.append(i)

    if mismatch:
        print('There is mismatch at the following indicies:') # if this is the case, need to do more work...
        for i in mismatch:
            print(i)
        return None

    if too_far:
        print(too_far)
        return None

    if exact_match:
        print('Scans match up perfectly!')
    else:
        print('Scans match up after taking rounding discrepencies into consideration')
    return ms2_scans

In [12]:
# make sure that MS3 scans are in the same order as MS2 scans
# (arguments follow the parameter order of matchingMS3s: log values first,
# MS3 spectrum values second)
ms2_scans = matchingMS3s(list(scan2frag.values()), list(ms3scan2MZs.values()))

Scans match up after taking rounding discrepencies into consideration


In [13]:
# use realtime file to obtain peptide sequence and charge
# read in peptide sequence from tsv
tsv = pd.read_csv(realtime, sep='\t')

# create dictionary with scan # as key and sequence/charge as values
scan2PeptideCharge = {
    scan_id: [peptide, charge_state]
    for scan_id, peptide, charge_state in zip(tsv['Scan Number'], tsv['Peptide'], tsv['Charge State'])
}

# removing all NaN sequences (not useful): an entry is skipped when the string
# form of its peptide is 'nan'
scan2PeptideCharge_modified = {}
for scan_id, peptide_and_charge in scan2PeptideCharge.items():
    if str(peptide_and_charge[0]) == 'nan':
        continue
    scan2PeptideCharge_modified[scan_id] = peptide_and_charge

In [14]:
# collect data for dataframe
seqs = []
charges = []
analyzer = []
collision = []

# collision energy: the last two characters of the second-to-last
# '_'-separated token of the file name (e.g. '..._HCD10_realtimesearch.tsv' -> 10)
energy = int(realtime.split('_')[-2][-2:])

for scan in ms2_scans:
    scan_key = int(scan)
    # membership is tested on the dict itself (hash lookup) rather than on a
    # key list rebuilt for every scan
    # NOTE(review): scans without a peptide are skipped, which makes these lists
    # shorter than ms2_scans — confirm that cannot happen before they are
    # attached to the dataframe by position
    if scan_key in scan2PeptideCharge_modified:
        sequence, charge = scan2PeptideCharge_modified[scan_key]
        charges.append(charge)
        seqs.append(sequence)

        # every MS3 scan is recorded as an Orbitrap scan ('FTMS')
        # to be added to dataframe with other MS3 info
        analyzer.append('FTMS')

        # all scans have same collision energy
        collision.append(energy)

In [15]:
# add all data columns to dataframe
# NOTE(review): the lists are attached by position, so each needs exactly one
# entry per dataframe row, in row order — confirm no scan was skipped while the
# lists were being built
df = df.assign(charge=charges, modified_sequence=seqs, mass_analyzer=analyzer, collision_energy=collision, target_fragment=target_values)

In [16]:
# remove all modified sequences, i.e. every sequence containing '[' or ']'
# (each bracket is tested on its own: the expression ('[' or ']') evaluates to
# just '[', so a combined test would never look for ']')
sequence_column = df['modified_sequence']
has_modification = (
    sequence_column.str.contains('[', regex=False, na=False)
    | sequence_column.str.contains(']', regex=False, na=False)
)
modified_rows = df.index[has_modification.to_numpy()]

# drop the same labels from the dataframe and from the spectrum series so the
# two stay aligned
df = df.drop(index=modified_rows)
s_series = s_series.drop(index=modified_rows)

In [17]:
# obtain target fragment sequences
fragment_seqs = []
for i in df.index:
    # drop the first two and last two characters of the stored sequence
    # (presumably the flanking residue and separator on each side — TODO confirm)
    trimmed_seq = df['modified_sequence'][i][2:-2]
    peptide_object = AASequence.fromString(trimmed_seq)

    target = df['target_fragment'][i]
    ion_type = target[:1]

    # the ion index can have more than one digit (e.g. 'y12'), so read every
    # digit following the ion letter instead of only the last character
    index_digits = ''
    for ch in target[1:]:
        if not ch.isdigit():
            break
        index_digits += ch
    if not index_digits:
        raise ValueError('No ion index in target fragment ' + repr(target) + ' (row ' + str(i) + ')')
    ion_index = int(index_digits)

    if ion_type == 'y':
        # targeted fragment is a y ion: the last ion_index residues
        full_seq = peptide_object.getSuffix(ion_index)
    elif ion_type == 'b':
        # targeted fragment is a b ion: the first ion_index residues
        full_seq = peptide_object.getPrefix(ion_index)
    else:
        # fail loudly instead of silently reusing the previous row's sequence
        raise ValueError('Unsupported target fragment ' + repr(target) + ' (row ' + str(i) + ')')

    fragment_seqs.append(str(full_seq))

In [18]:
# overwrite 'modified_sequence' with the target fragment sequences computed above
df = df.assign(modified_sequence=fragment_seqs)

In [19]:
# remove all sequences of length 30 or more
# NOTE(review): sequences of exactly 30 residues are dropped as well — confirm
# whether the cut-off should be "> 30" instead
too_long = [row for row in df.index if len(df['modified_sequence'][row]) >= 30]

# drop the same labels from the dataframe and from the spectrum series
df.drop(index=too_long, inplace=True)
s_series.drop(index=too_long, inplace=True)

In [22]:
# importing variables and functions from other scripts
from constants import ION_TYPES, DEFAULT_MAX_CHARGE
import match as m

In [23]:
# for reloading with troubleshooting
# (re-imports the local `match` module so edits to match.py are picked up
# without restarting the kernel; not needed for a clean top-to-bottom run)
import importlib
importlib.reload(m)

<module 'match' from 'C:\\Users\\miar\\Documents\\GitHub\\DeNovo\\match.py'>

In [24]:
# running augment function
# NOTE(review): judging by the result displayed below, this adds per-charge
# matches_/masses_the_/masses_raw_/intensities_raw_ columns — see match.augment
# for the exact contract
df_augmented = m.augment(df, ION_TYPES, DEFAULT_MAX_CHARGE)

In [25]:
# display the augmented dataframe
df_augmented

Unnamed: 0,masses_raw,intensities_raw,charge,modified_sequence,mass_analyzer,collision_energy,target_fragment,matches_charge1,masses_the_charge1,masses_raw_charge1,...,masses_raw_charge4,intensities_raw_charge4,matches_charge5,masses_the_charge5,masses_raw_charge5,intensities_raw_charge5,matches_charge6,masses_the_charge6,masses_raw_charge6,intensities_raw_charge6
0,50.171653747558594 70.50875854492188 108.44799...,803.9648 895.87146 999.27673 921.82806 1097.59...,4,GYIE,FTMS,10,y4,,,,...,,,,,,,,,,
1,54.4091682434082 88.9527587890625 92.004905700...,1480.8969 971.96185 994.46344 915.70667 1061.5...,4,KRP,FTMS,10,b3,,,,...,,,,,,,,,,
3,54.08456039428711 54.40793228149414 79.3719024...,901.14197 2220.223 956.72943 1074.1609 984.140...,4,IRE,FTMS,10,y3,,,,...,,,,,,,,,,
4,50.46807861328125 54.40923309326172 149.738021...,1006.93787 1044.2721 9281.319 1655.2512 870.18...,2,SSYE,FTMS,10,y4,,,,...,,,,,,,,,,
5,77.07954406738281 79.37855529785156 117.531532...,1032.855 980.9786 945.3121 865.2765 953.76636 ...,3,AAVQVV,FTMS,10,b6,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
936,54.40785598754883 67.25708770751953 86.0960617...,1862.5776 957.45294 1108.9438 969.226 1119.701...,4,SP,FTMS,10,b2,,,,...,,,,,,,,,,
937,54.4068603515625 54.409610748291016 86.1714172...,1486.2264 1031.2069 1134.4431 1055.9144 946.46...,4,NNH,FTMS,10,b3,,,,...,,,,,,,,,,
939,54.406558990478516 54.40922164916992 64.626823...,1887.2615 1679.5621 881.72406 911.07367 969.77...,5,LH,FTMS,10,b2,,,,...,,,,,,,,,,
940,52.34059143066406 54.408260345458984 83.345787...,923.42883 2598.8687 890.72424 1071.2543 873.99...,5,GSIGGK,FTMS,10,b6,,,,...,,,,,,,,,,


In [26]:
# remove all rows that are completely empty, i.e. rows without a fragment match
# at any charge state
def _is_blank(value):
    """Return True for None, NaN, blank strings and empty containers."""
    if value is None:
        return True
    if isinstance(value, float):
        return value != value  # NaN is the only float that differs from itself
    if isinstance(value, str):
        return value.strip() == ''
    try:
        return len(value) == 0
    except TypeError:
        return False

# every 'matches_charge<N>' column, however many charge states were generated
match_columns = [col for col in df_augmented.columns if str(col).startswith('matches_charge')]
if not match_columns:
    raise KeyError("no 'matches_charge*' columns found in df_augmented")

# emptiness is tested explicitly: comparing the columns with each other would
# miss NaN (NaN != NaN) and would also drop rows whose matches merely happen to
# be identical at every charge
empty_rows = [
    row for row in df_augmented.index
    if all(_is_blank(df_augmented[col][row]) for col in match_columns)
]

# drop the same labels from the dataframe and from the spectrum series
df_augmented = df_augmented.drop(index=empty_rows)
s_series = s_series.drop(index=empty_rows)

In [27]:
# rename 'charge' to 'precursor_charge'
# NOTE(review): presumably the tensorize csv function reads the column under
# that name — confirm against tensorize.csv_training
df_augmented.rename(columns = {'charge':'precursor_charge'}, inplace = True)

In [28]:
# display the dataframe after removing empty rows and renaming the charge column
df_augmented

Unnamed: 0,masses_raw,intensities_raw,precursor_charge,modified_sequence,mass_analyzer,collision_energy,target_fragment,matches_charge1,masses_the_charge1,masses_raw_charge1,...,masses_raw_charge4,intensities_raw_charge4,matches_charge5,masses_the_charge5,masses_raw_charge5,intensities_raw_charge5,matches_charge6,masses_the_charge6,masses_raw_charge6,intensities_raw_charge6
21,54.40834045410156 72.08840942382812 72.6154479...,1847.457 1044.1062 944.0088 927.47253 941.6198...,2,SE,FTMS,10,y2,y1,148.060434167,148.0601806640625,...,,,,,,,,,,
44,54.40696716308594 54.40972137451172 70.0648117...,1802.849 918.2612 4112.844 878.95984 1029.2969...,2,IP,FTMS,10,b2,y1,98.060041167,98.06021118164062,...,,,,,,,,,,
49,53.29237365722656 54.40982437133789 58.5242881...,848.94354 1105.8538 1257.8536 921.262 1054.482...,2,TK,FTMS,10,b2,y1,129.102240167,129.1013946533203,...,,,,,,,,,,
51,62.79367446899414 66.73774719238281 68.6735992...,910.83417 941.222 894.2407 840.0121 787.6718 8...,2,TK,FTMS,10,b2,y1,129.102240167,129.102294921875,...,,,,,,,,,,
82,50.54601287841797 54.40740966796875 54.4102096...,1046.0387 1604.429 1038.8164 902.5171 960.406 ...,2,SE,FTMS,10,y2,y1,148.060434167,148.0605010986328,...,,,,,,,,,,
105,54.26076889038086 54.40673065185547 54.4090995...,1247.3359 1070.2787 1315.2032 960.42065 1136.5...,2,PTV,FTMS,10,y3,b2,199.10771946699998,199.108154296875,...,,,,,,,,,,
158,54.408485412597656 67.83517456054688 77.749832...,1726.4888 1166.8363 1586.8018 1156.2223 1270.5...,3,KE,FTMS,10,y2,b1,129.102239467,129.10206604003906,...,,,,,,,,,,
189,50.5665168762207 52.24224090576172 54.40814590...,910.6408 1043.2239 1768.921 959.5971 985.99115...,3,GKNFE,FTMS,10,y5,,,,...,,,,,,,,,,
190,50.72300720214844 51.47105407714844 54.4080429...,887.91815 851.0057 2122.2375 920.20667 964.361...,4,ISGK,FTMS,10,b4,y1,129.102240167,129.1019287109375,...,,,,,,,,,,
218,54.405670166015625 54.40823745727539 56.175708...,1195.0625 1576.0629 974.8536 995.3304 1021.442...,2,MCA,FTMS,10,b3,b2,292.0784101906,292.0765380859375,...,,,,,,,,,,


In [29]:
# import another important function from tensorize script
import tensorize as tens

In [30]:
# for reloading with troubleshooting
# (re-imports the local `tensorize` module so edits to tensorize.py are picked
# up without restarting the kernel; not needed for a clean top-to-bottom run)
import importlib
importlib.reload(tens)

<module 'tensorize' from 'C:\\Users\\miar\\Documents\\GitHub\\DeNovo\\tensorize.py'>

In [31]:
# create dictionary of data from the filtered dataframe and the matching spectra
# (the result shown further down is a dict of arrays with keys such as
# 'sequence_integer' and 'precursor_charge_onehot')
# NOTE: this rebinds `data`, which an earlier cell used for per-spectrum row dicts
data = tens.csv_training(df_augmented, s_series)

[[130.04987017  65.52857332  -1.          88.03930447  44.52329047
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.          -1.
   -1.          -1.          -1.          -1.   

  array = array / maxima[:, numpy.newaxis]


In [32]:
# display the dictionary returned by csv_training
data

{'collision_energy_aligned_normed': array([[0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1],
        [0.1]]),
 'sequence_integer': array([[16,  4,  0, ...,  0,  0,  0],
        [ 8, 13,  0, ...,  0,  0,  0],
        [17,  9,  0, ...,  0,  0,  0],
        ...,
        [ 6,  8, 18, ...,  0,  0,  0],
        [ 6,  8, 18, ...,  0,  0,  0],
        [ 8,  1, 14, ...,  0,  0,  0]]),
 'precursor_charge_onehot': array([[0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 

In [26]:
# import another important function from io_local script
#from io_local import to_hdf5, from_hdf5

# convert dictionary data to hdf5 
#to_hdf5(data, 'hdf5_data2.hdf5')

In [None]:
# load in ML packages and created data matrix
#import tensorflow
#from tensorflow import keras
#from keras.utils import HDF5Matrix

# load in hdf5 and create matrix
#tensor = from_hdf5('hdf5_data.hdf5', n_samples=None)

In [None]:
# the following code does not work properly 
# GPU configuration issues
# moved to computer cluster 

In [None]:
# import model 
#import training
#import model as model_lib
# load in model
#model, model_config = model_lib.load(r"C:\Users\miar\Downloads\model_fragmentation_prediction\prosit1", trained=True)

In [None]:
#import importlib
#importlib.reload(model_lib)

In [None]:
#checking for GPU

In [None]:
#from tensorflow.python.client import device_lib
#print(device_lib.list_local_devices())

In [None]:
#import tensorflow as tf 
#tf.test.gpu_device_name()

In [None]:
#from tensorflow.python.client import device_lib
#def get_available_devices():
#    local_device_protos = device_lib.list_local_devices()
#    return [x.name for x in local_device_protos]
#print(get_available_devices()) 