In [1]:
# import libraries
from pyopenms import * # main package used for handling MS data
import os # changing directories
import pandas as pd # creating and manipulating dataframe

In [2]:
# change directory to find data files of interest
# The location can be overridden with the DENOVO_DATA_DIR environment variable so the
# notebook is not tied to a single machine; the original path remains the default.
data_dir = os.environ.get('DENOVO_DATA_DIR', r'C:\Users\miar\Desktop\data')
os.chdir(data_dir)

In [3]:
# experiment files (bare file names, so they are resolved against the current working directory)
# mzML: spectra file that is loaded into the MSExperiment in the next cell
mzML = 'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD10.mzML'
# log: text log that is scanned for ' Submitted Custom Scan For:' lines
log = 'App-2022-06-12_14-16-26.log'
# realtime: tab-separated table read with pandas; its name is also parsed for the collision energy
realtime = 'HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD10_realtimesearch.tsv'

In [4]:
# load the content of the mzML file into the exp variable of type MSExperiment
# (`exp` is created empty and passed to load(); the MS3 filtering cell then reads exp.getSpectra())
exp = MSExperiment()
MzMLFile().load(mzML, exp)

In [5]:
# keep only the MS3 spectra, then build one dataframe row per spectrum
specM3 = [spectrum for spectrum in exp.getSpectra() if spectrum.getMSLevel() == 3] # list of MS3 spectra

row_data = []
for spectrum in specM3:
    # scan number = text after '=' in the last space-separated token of the native ID
    last_token = spectrum.getNativeID().split(' ')[-1]
    _, scan_id = last_token.split('=')

    # peak arrays are stored as space-separated strings
    mz_values, intensity_values = spectrum.get_peaks()

    # one dict per spectrum (rows of dataframe)
    row_data.append({
        'MS3_Scan': scan_id,
        'masses_raw': " ".join(map(str, mz_values)),
        'intensities_raw': " ".join(map(str, intensity_values)),
    })

In [6]:
# build the dataframe from the collected rows and use the MS3 scan number as its index
df = pd.DataFrame(row_data).set_index('MS3_Scan')

In [7]:
# parse functions
def parseScanLine(input):
    """Split a log line of the form '... For: <scan>, <precursor>;<fragment>' into its parts.

    Returns a list ``[scan_number, precursor_mz, fragment_mz]`` of strings; the
    fragment value has surrounding whitespace (including the trailing newline) removed.
    Raises IndexError when ' For: ' is absent and ValueError when the remainder
    does not split into exactly two ', ' parts or two ';' parts.
    """
    payload = input.split(" For: ")[1]
    scan_number, mz_pair = payload.split(", ")
    precursor_mz, fragment_mz = mz_pair.split(";")
    # strip the fragment string to remove the trailing \n
    return [scan_number, precursor_mz, fragment_mz.strip()]

def parseTargetIons(input):
    """Return the text between 'Target Fragment: ' and the next comma (or end of string).

    Raises IndexError when 'Target Fragment: ' does not occur in ``input``.
    """
    after_label = input.split('Target Fragment: ')[1]
    ion, *_ = after_label.split(',')
    return ion

In [8]:
# scan the log file and map each submitted scan number to its [precursor m/z, fragment m/z]

# marker text identifying the log lines of interest
search = ' Submitted Custom Scan For:'

# scan number (string, as written in the log) -> [precursor m/z, fragment m/z] as floats
scan2frag = dict()

try:
    with open(log) as log_file:
        for entry in (text for text in log_file if search in text):
            scan_id, precursor_text, fragment_text = parseScanLine(entry)
            scan2frag[scan_id] = [float(precursor_text), float(fragment_text)]
except FileNotFoundError:
    print("The file does not exist!")
else:
    # the file was readable but none of its lines contained the marker
    if not scan2frag:
        print("\n\"" + search + "\" is not found in \"" + log + "\"!")

In [9]:
# map every MS3 scan number to the [precursor m/z, fragment m/z] stored in the spectrum itself
ms3scan2MZs = dict()
for spectrum in specM3:
    # scan number = text after '=' in the last space-separated token of the native ID
    _, scan_id = spectrum.getNativeID().split(' ')[-1].split('=')

    # getPrecursors() is unpacked into exactly two entries:
    # the first supplies the fragment m/z, the second the precursor m/z
    fragment_entry, precursor_entry = spectrum.getPrecursors()

    # 4 decimal places, similar to log
    ms3scan2MZs[int(scan_id)] = [
        round(float(precursor_entry.getMZ()), 4),
        round(float(fragment_entry.getMZ()), 4),
    ]

In [10]:
def matchingMS3s(ms2_mzs, ms3_mzs): # either fragment or precursor
    """Check that the logged MS2 targets line up, position by position, with the MS3 scans.

    Scan numbers and m/z pairs are read from the notebook-level dictionaries
    ``scan2frag`` (built from the log) and ``ms3scan2MZs`` (built from the spectra);
    entries are paired by insertion order. The two arguments are only used for the
    initial exact-equality test that decides whether a tolerance comparison is needed.

    Parameters
    ----------
    ms2_mzs, ms3_mzs : list
        The [precursor m/z, fragment m/z] pairs taken from the log and from the spectra.

    Returns
    -------
    list or None
        The keys of ``scan2frag`` when every pair matches and every MS3 scan lies within
        100 scans of its MS2 scan; otherwise ``None`` after printing what went wrong.
    """
    max_scan_gap = 100
    # slightly above 0.0001 because a difference of rounded values can come out as e.g. 0.0001000002
    mz_tolerance = 0.000101

    # materialise once instead of rebuilding list(dict.values()) on every iteration
    ms2_items = list(scan2frag.items())
    ms3_items = list(ms3scan2MZs.items())

    # every MS3 scan needs a log entry to be compared with; report it instead of raising IndexError
    if len(ms3_items) > len(ms2_items):
        print('There are more MS3 scans (' + str(len(ms3_items)) + ') than logged MS2 targets (' + str(len(ms2_items)) + ')')
        return None

    # do they match off the bat?
    exact_match = ms2_mzs == ms3_mzs

    too_far = []
    mismatch = []
    for i, ((ms2scan, ms2_pair), (ms3scan, ms3_pair)) in enumerate(zip(ms2_items, ms3_items)):
        # making sure they are within 100 scans of each other (in either direction)
        if abs(int(ms3scan) - int(ms2scan)) > max_scan_gap:
            too_far.append('Scans are not within 100 scans of each other...' + 'MS2 = ' + str(ms2scan) + ' MS3 = ' + str(ms3scan))

        if not exact_match:
            # taking into consideration rounding discrepancies between the log and the spectrum;
            # abs() is required so that large negative differences are flagged as well
            precursor_diff = float(ms3_pair[0]) - float(ms2_pair[0])
            fragment_diff = float(ms3_pair[1]) - float(ms2_pair[1])
            if abs(precursor_diff) >= mz_tolerance or abs(fragment_diff) >= mz_tolerance:
                mismatch.append(i)

    # report every problem that was found, not just the first kind
    if mismatch:
        print('There is mismatch at the following indicies:') # if this is the case, need to do more work...
        for i in mismatch:
            print(i)
    if too_far:
        print(too_far)
    if mismatch or too_far:
        return None

    if exact_match:
        print('Scans match up perfectly!')
    else:
        print('Scans match up after taking rounding discrepencies into consideration')
    return list(scan2frag)

In [11]:
# make sure that MS3 scans are in the same order as MS2 scans
# (arguments follow the parameter order: log-derived MS2 values first, spectrum-derived MS3 values second)
ms2_scans = matchingMS3s(list(scan2frag.values()), list(ms3scan2MZs.values()))

# matchingMS3s returns None after printing a diagnostic when the scans do not line up;
# stop here rather than failing later when ms2_scans is iterated
if ms2_scans is None:
    raise RuntimeError('MS2 and MS3 scans could not be matched; see the messages printed above')

Scans match up after taking rounding discrepencies into consideration


In [12]:
# use realtime file to obtain peptide sequence and charge
# read the tab-separated real-time search table
tsv = pd.read_csv(realtime, sep='\t')

# scan number -> [peptide sequence, charge state]
scan2PeptideCharge = {
    scan_id: [peptide, charge_state]
    for scan_id, peptide, charge_state in zip(tsv['Scan Number'], tsv['Peptide'], tsv['Charge State'])
}

# keep only the entries whose sequence is not NaN (not useful otherwise)
scan2PeptideCharge_modified = dict(
    (scan_id, peptide_charge)
    for scan_id, peptide_charge in scan2PeptideCharge.items()
    if str(peptide_charge[0]) != 'nan'
)

In [13]:
# collect data for dataframe
seqs = []
charges = []
analyzer = []
collision = []

# collision energy = trailing digits of the second-to-last '_'-separated token of the file
# name (e.g. 'HCD10' -> 10); taking all trailing digits instead of the last two characters
# also handles one- or three-digit energies
energy_token = realtime.split('_')[-2]
energy = int(energy_token[len(energy_token.rstrip('0123456789')):])

# NOTE: scans without a peptide entry are skipped, so these lists can end up shorter than ms2_scans
for scan in ms2_scans:
    scan_key = int(scan)
    # dictionary membership test; no need to copy the keys into a list on every iteration
    if scan_key in scan2PeptideCharge_modified:
        sequence, charge = scan2PeptideCharge_modified[scan_key]
        charges.append(charge)

        trimmed_sequence = sequence[2:-2] # remove first two and last two characters
        seqs.append(trimmed_sequence)

        # all MS3 scans are Orbitrap
        # to be added to dataframe with other MS3 info
        analyzer.append('FTMS')

        # all scans have same collision energy
        collision.append(energy)

In [14]:
# add all data columns to dataframe
# NOTE(review): the lists are attached by position, so each one needs exactly one entry per row
# of df (one per MS3 scan) — confirm that no scan was skipped when the lists were built
df = df.assign(charge=charges, modified_sequence=seqs, mass_analyzer=analyzer, collision_energy=collision)

In [15]:
#df

In [16]:
# remove all modified sequences (any sequence containing '[' or ']')
# `('[' or ']') in s` only ever tested for '[' because `'[' or ']'` evaluates to '[';
# test for both brackets explicitly and filter with a boolean mask instead of
# dropping rows one at a time while looping over the index
has_bracket = df['modified_sequence'].str.contains(r'[\[\]]', regex=True, na=False)
df = df[~has_bracket].copy()

In [17]:
#df

In [18]:
# remove all sequences more than 30 in length
# the previous test was `>= 30`, which also discarded sequences of exactly 30 residues and
# so contradicted the rule stated above
# NOTE(review): confirm that a length of exactly 30 is accepted by the downstream code
max_sequence_length = 30
df = df[df['modified_sequence'].str.len() <= max_sequence_length].copy()

In [19]:
#df

In [20]:
# importing variables and functions from other scripts
from constants import ION_TYPES, DEFAULT_MAX_CHARGE
from match import augment

In [21]:
# running augment function
df_augmented = augment(df, ION_TYPES, DEFAULT_MAX_CHARGE)
df_augmented.sort_values(by='matches_charge1', inplace=True)

# remove all rows that are completely empty
# the chained `a == b == ... == f` only checked that the six columns were equal to each other:
# it never dropped all-NaN rows (NaN != NaN) and would have dropped rows holding six identical
# non-empty values, so emptiness ('' or NaN) is tested directly instead
match_columns = ['matches_charge' + str(z) for z in range(1, 7)]
match_values = df_augmented[match_columns]
no_matches = (match_values.isna() | match_values.eq('')).all(axis=1)
df_augmented.drop(index=df_augmented.index[no_matches], inplace=True)

In [96]:
df

Unnamed: 0,masses_raw,intensities_raw,precursor_charge,modified_sequence,mass_analyzer,collision_energy,matches_charge1,masses_the_charge1,masses_raw_charge1,intensities_raw_charge1,...,masses_raw_charge4,intensities_raw_charge4,matches_charge5,masses_the_charge5,masses_raw_charge5,intensities_raw_charge5,matches_charge6,masses_the_charge6,masses_raw_charge6,intensities_raw_charge6
0,54.40858840942383 78.0984115600586 101.2928619...,1879.8071 883.1377 824.6557 841.20575 1010.667...,4,NRQNLLSQSHAYQQFLRD,FTMS,10,,,,,...,,,,,,,,,,
1,54.40818405151367 59.515201568603516 73.793617...,1741.3868 870.67285 860.9621 1102.1569 912.719...,2,HFIQMLNEPVQE,FTMS,10,,,,,...,,,,,,,,,,
2,54.407527923583984 56.788421630859375 57.04209...,2116.7505 998.3178 909.0501 892.9229 1023.9375...,4,HLKNPVIAQKIQKLMD,FTMS,10,,,,,...,,,,,,,,,,
3,54.398983001708984 54.40628433227539 54.408821...,932.71857 1157.4438 1570.9752 945.7519 910.068...,6,AVKQGSATVGLKSKTHAVLVALKRAQSE,FTMS,10,,,,,...,,,,,,,y10,181.6200145836667,181.61807250976562,1719.2412
4,52.09868240356445 54.40813446044922 54.5404930...,836.9001 1246.0452 1011.11926 1026.341 1002.71...,4,KSFRQNSTLIQHKKVHTGQKPFQCTD,FTMS,10,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,50.84954071044922 54.40803146362305 62.4500274...,914.90027 1854.4222 901.7314 944.06946 1007.72...,3,QKGGKPEPPAMPQPVPTA,FTMS,10,y2-H2O,199.10771946699998,199.10772705078125,2392.881,...,,,,,,,,,,
86,54.40839767456055 61.05514907836914 61.9944915...,1929.7275 839.2207 1000.2381 1041.044 1005.574...,3,QKGGKPEPPAMPQPVPTA,FTMS,10,y2-H2O,199.10771946699998,199.10800170898438,1334.5397,...,,,,,,,,,,
87,54.26076889038086 54.40673065185547 54.4090995...,1247.3359 1070.2787 1315.2032 960.42065 1136.5...,2,AAQKAVNSATGVPTV,FTMS,10,y2-H2O;y3,199.10771946699998;316.186698167,199.108154296875;316.1875,1114.1882;12386.037,...,,,,,,,,,,
88,55.38409423828125 69.18057250976562 72.0357437...,954.0856 841.7803 929.5201 923.4518 7007.549 9...,2,HCIFASNTSALPISE,FTMS,10,y3-H2O,298.176132467,298.17486572265625,2386.6758,...,,,,,,,,,,


In [107]:
df['matches_charge6'][0]

''

In [88]:
df.masses_raw[0]

'54.40858840942383 78.0984115600586 101.29286193847656 124.80253601074219 127.69288635253906 149.73477172851562 149.75 181.6172637939453 181.63616943359375 392.8066711425781 448.5534362792969 513.2549438476562 733.4716796875'

In [60]:
df.intensities_raw[0]

'1879.8071 883.1377 824.6557 841.20575 1010.6672 5583.698 4405.5474 1575.8624 2427.5974 977.4921 933.7181 1483.1978 1041.579'

In [23]:
# rename 'charge' to 'precursor_charge' before the frame is handed to tens.csv_training
# NOTE(review): presumably csv_training looks the column up as 'precursor_charge' — confirm in tensorize.py
df_augmented.rename(columns = {'charge':'precursor_charge'}, inplace = True)


In [119]:
import importlib
# `tens` has to be bound before it can be reloaded: this cell sits above the cell that
# imports tensorize, so a top-to-bottom run raised NameError on `tens`
import tensorize as tens
importlib.reload(tens)

<module 'tensorize' from 'C:\\Users\\miar\\Documents\\GitHub\\DeNovo\\tensorize.py'>

In [24]:
# import another important function from tensorize script
import tensorize as tens


In [120]:
# create dictionary of data
# NOTE(review): the recorded run of this cell ended with
# "TypeError: 'in <string>' requires string as left operand, not numpy.float64";
# the follow-up cell notes that the space-separated mass strings still need converting to lists of numbers
data = tens.csv_training(df_augmented)

[184.1043087  513.25281947 257.13004797 171.7557908  678.35695117
 339.68211382 226.79050137 626.33688347 313.67207997 209.4504788
 806.41552917 403.71140282 269.47669403 739.42094747 370.21411197
 247.1451668  969.47885817 485.24306732 323.83113703 826.45297547]


TypeError: 'in <string>' requires string as left operand, not numpy.float64

In [125]:
# inspect the raw mass column: each entry is a single space-separated string
test = list(df['masses_raw'])
test
# TODO: need to convert str to list of numbers


['54.40858840942383 78.0984115600586 101.29286193847656 124.80253601074219 127.69288635253906 149.73477172851562 149.75 181.6172637939453 181.63616943359375 392.8066711425781 448.5534362792969 513.2549438476562 733.4716796875',
 '54.40818405151367 59.515201568603516 73.79361724853516 78.63480377197266 79.80007934570312 83.3802719116211 87.92951965332031 88.4729232788086 89.9736099243164 91.03730773925781 95.8277587890625 121.07746887207031 149.736083984375 149.75115966796875 151.7523956298828 181.62713623046875 285.1358947753906 297.14837646484375 436.96771240234375 577.7107543945312 628.9053955078125 657.8412475585938',
 '54.407527923583984 56.788421630859375 57.04209518432617 63.37095260620117 125.64320373535156 141.4635772705078 149.73719787597656 164.23887634277344 181.63455200195312 216.98635864257812 223.15509033203125 251.15072631835938 296.6070251464844 386.7359619140625 421.3332824707031 452.2904052734375',
 '54.398983001708984 54.40628433227539 54.40882110595703 55.7184295654

In [26]:
# import another important function from io_local script
#from io_local import to_hdf5, from_hdf5

# convert dictionary data to hdf5 
#to_hdf5(data, 'hdf5_data2.hdf5')

In [None]:
# load in ML packages and created data matrix
#import tensorflow
#from tensorflow import keras
#from keras.utils import HDF5Matrix

# load in hdf5 and create matrix
#tensor = from_hdf5('hdf5_data.hdf5', n_samples=None)

In [None]:
# import model 
#import training
#import model as model_lib
# load in model
#model, model_config = model_lib.load(r"C:\Users\miar\Downloads\model_fragmentation_prediction\prosit1", trained=True)

In [None]:
#import importlib
#importlib.reload(model_lib)

In [None]:
#checking for GPU

In [None]:
# disabled together with its import: with the import commented out, the print below
# raised NameError on `device_lib` when the notebook was run top to bottom
#from tensorflow.python.client import device_lib
#print(device_lib.list_local_devices())


In [None]:
#import tensorflow as tf 
#tf.test.gpu_device_name()

In [None]:
#from tensorflow.python.client import device_lib
#def get_available_devices():
#    local_device_protos = device_lib.list_local_devices()
#    return [x.name for x in local_device_protos]
#print(get_available_devices()) 