In [22]:
# import libraries
from pyopenms import * # main package used for handling MS data
import os # changing directories
import pandas as pd # to read in tsv and for dataframe creation/manipulation
import subprocess # running R script 

import tensorize # for formatting data
import match 
from constants import ION_TYPES, DEFAULT_MAX_CHARGE

import losses # for getting spectral angle

In [2]:
# after running main script
# change experiment files in DeNovo_main.py as well for now
from DeNovo_main import *

Experiment Files: 
HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD30.mzML
App-2022-06-12_22-28-53.log
HEK293T_De_Novo_061122_Glu-C_B_BP_anyLength_HCD30_realtimesearch.tsv
-----------------------------------------------------------------------------------------
Number of MS2 scans: 81780
Number of MS3 scans: 681
-----------------------------------------------------------------------------------------
Precursor mzs match up after taking rounding discrepencies into consideration
Fragment mzs match up perfectly!


In [3]:
# inspect the MS3 target-fragment table; it is not defined in this notebook and is
# presumably brought into scope by the `from DeNovo_main import *` cell — TODO confirm
new_fragment_df

Unnamed: 0,Scan_Number,Sequence,Charge,Sequence_Length,Missing_Fragment_Locations,Number_Missing,Target_Fragment,MS3_Scan,Fragment_Sequence,Target_Charge,Locations_Found
13,4039,VSLTQKTDPSVRPMHE,4,16,,0,y4,4048,PMHE,3,[2]
40,4610,AGLNVTTSHSPAAPGE,2,16,13815.0,4,y3,4621,PGE,1,[2]
42,4630,AGLNVTTSHSPAAPGE,2,16,1.0,1,y3,4639,PGE,1,[2]
55,4986,HASIQMNVAE,2,10,,0,b5,4994,HASIQ,1,[2]
60,5057,AAKLQTTKVKKPTGTRNLYLARE,5,23,,0,b4,5062,AAKL,3,[3]
88,5600,FKPNKPKPCGLCNQFGHE,4,18,,0,y3,5602,GHE,1,[2]
91,5713,GSSIKKAQQAVANKALTE,3,18,,0,b2,5723,GS,1,[1]
133,7286,IVGGATRIPAVKE,3,13,2.0,1,y2,7299,KE,1,[1]
141,7640,KPAKAITSSRVPGE,3,14,,0,b2,7644,KP,1,[1]
157,8134,RRQLIVPPHLAHGE,4,14,,0,b6,8141,RRQLIV,2,"[4, 5]"


In [4]:
# creating stick plot for single case 

In [5]:
# choose index where length of fragment sequence is greater than 2
# pretrained model can not predict on peptides less than 3 in length
# NOTE: 212 is hard-coded here and again in the single-case cells below; keep them in sync
s = specM3[212]

# obtain mz and intensity values for this spectrum (used in parallel, one entry per peak)
mz, intensity = s.get_peaks()

In [6]:
# which ion series does the target belong to? (first character of the label: 'b' or 'y')
# caveat: a b target can also give rise to an internal y ion,
# which the pretrained model does not take into account
new_fragment_df.loc[212, 'Target_Fragment'][0]

'y'

In [7]:
# special case when there is an internal y ion in a b fragment
def findTargetMZs(peptide_object, charge, i):
    """Return theoretical m/z values for the sub-fragments of an MS3 target fragment.

    The target label is read from ``new_fragment_df['Target_Fragment'][i]``
    (e.g. ``'y4'`` or ``'b12'``). Its first letter selects the ion series and
    everything after it is parsed as the fragment length ``n``.

    For a y target the fragment is the last ``n`` residues of
    ``peptide_object``; for a b target it is the first ``n`` residues. For every
    sub-length ``1 .. n-1`` and every charge ``1 .. charge-1`` the weight is
    requested from pyopenms and divided by the charge:

    * first the same-series pieces (suffixes as ``YIon`` for a y target,
      prefixes as ``BIon`` for a b target),
    * then the opposite-end pieces (prefixes for a y target, suffixes for a
      b target), requested with ``ResidueType.Internal``.

    Parameters
    ----------
    peptide_object : AASequence
        Full peptide; must provide ``getPrefix`` / ``getSuffix``.
    charge : int
        Exclusive upper bound of the charge states (callers pass
        ``Target_Charge + 1``).
    i : label
        Row label into ``new_fragment_df``.

    Returns
    -------
    list
        m/z values in the order described above; an empty list when the label
        starts with neither ``'y'`` nor ``'b'``.
    """
    target = new_fragment_df['Target_Fragment'][i]

    is_y_target = target.startswith('y')
    if not (is_y_target or target.startswith('b')):
        return []

    # Parse the whole ordinal after the series letter. Taking only the last
    # character (as before) turned 'y12' into 2 and silently produced the
    # wrong fragment for any target of ten or more residues.
    frag_len = int(target[1:])

    if is_y_target:
        # the full sequence of the fragment
        full_seq = peptide_object.getSuffix(frag_len)
        series = [
            (full_seq.getSuffix, Residue.ResidueType.YIon),      # y ions of the fragment
            (full_seq.getPrefix, Residue.ResidueType.Internal),  # its N-terminal pieces
        ]
    else:
        # the full sequence of the fragment
        full_seq = peptide_object.getPrefix(frag_len)
        series = [
            (full_seq.getPrefix, Residue.ResidueType.BIon),      # b ions of the fragment
            (full_seq.getSuffix, Residue.ResidueType.Internal),  # its C-terminal pieces
        ]

    mzs = []
    for cut, residue_type in series:
        # range(1, frag_len) deliberately stops before the full fragment itself
        for ion in range(1, frag_len):
            piece = cut(ion)
            for z in range(1, charge):
                mzs.append(piece.getMonoWeight(residue_type, z) / z)
    return mzs

In [8]:
# create peptide object for the target sequence
peptide_object = AASequence.fromString(new_fragment_df['Sequence'][212])
    
# the charge associated with this sequence
# (+1 because findTargetMZs loops over range(1, charge), whose upper bound is exclusive,
#  so charge states 1 .. Target_Charge are the ones actually computed)
charge = new_fragment_df['Target_Charge'][212] + 1
    
# call findTargetMZs function
# returns list of all mzs associated with target sequence
mzs = findTargetMZs(peptide_object, charge, 212)

In [9]:
# want only mzs/intensities that correspond to the sequence of the target frag
# range of interest: the span of the theoretical target mzs, padded by 0.02 on each side
# (the bounds are computed once instead of once per peak)
lower_bound = min(mzs) - 0.02
upper_bound = max(mzs) + 0.02

mz_new = []
remove_list = []  # indices of the peaks (and hence intensities) outside the range
for position, peak_mz in enumerate(mz):
    if lower_bound < peak_mz < upper_bound:
        mz_new.append(peak_mz)
    else:
        remove_list.append(position)

intensity_new = np.delete(intensity, remove_list)

# normalize intensities (values between 0 and 1) against the largest remaining peak;
# the maximum is looked up once rather than once per element
base_normalized = []
if len(intensity_new) > 0:
    base_peak = intensity_new.max()
    base_normalized = [x / base_peak for x in intensity_new]

In [10]:
# two-column table of the observed peaks (filtered m/z, normalized intensity),
# kept so it can be written to csv and plotted in R
oneScan_mz_intensity = pd.DataFrame()
observed = pd.DataFrame({'mz': mz_new, 'Intensity': base_normalized})
#observed.to_csv('mzs_intensities_one.csv', index=False)

In [11]:
# for a single case:
# calculate spectral angle for pair (observed vs predicted)
# have to get predicted values by running pretrained model on compute cluster

In [12]:
# load in predicted values
# (relative path: predicted.csv has to be present in the notebook's working directory)
predicted = pd.read_csv("predicted.csv") 

In [13]:
# inspect the predicted peaks loaded from predicted.csv
predicted

Unnamed: 0,Intensity,mz,Unnamed: 2,Before normalizing
0,0.075257,148.060434,,0.017473
1,0.344922,205.081898,,0.080085
2,1.0,292.113926,,0.232184
3,0.096844,185.092069,,0.022486
4,0.727335,242.113532,,0.168875


In [14]:
# inspect the observed peaks for side-by-side comparison with `predicted`
observed

Unnamed: 0,mz,Intensity
0,112.087303,0.16226
1,139.945969,0.160119
2,148.060303,0.587331
3,149.728348,0.283135
4,149.748886,0.508689
5,149.756729,0.360357
6,157.096909,1.0
7,181.633377,0.262249
8,185.092239,0.903246
9,292.113983,0.253807


In [11]:
# create dataframe with all mzs and intensities for this experiment
# note: this loop rebinds the globals `s`, `mz` and `intensity` on every pass,
# so afterwards they refer to the last spectrum visited
rows = []
for i in new_fragment_df.index:
    # spectrum for this row and its peak arrays
    s = specM3[i]
    mz, intensity = s.get_peaks()

    # one record per scan: peaks serialised as space-separated strings
    # (intensities are stored raw, not base normalized) plus the target label
    rows.append({
        'mz': " ".join(map(str, mz)),
        'intensity': " ".join(map(str, intensity)),
        'target_fragment': new_fragment_df['Target_Fragment'][i],
    })

# create dataframe
df = pd.DataFrame(rows)

# save
#df.to_csv('mzs_intensities.csv', index=False)

In [13]:
# for all scans in experiment:
# calculate spectral angle for all pairs of observed vs predicted

In [14]:
# create csv file used for input for predicted data
# contains modified_sequence, collision_energy, precursor_charge
mod_seqs = new_fragment_df['Fragment_Sequence']
targ_charge = new_fragment_df['Target_Charge']

# parse collision energy: last two characters of the second-to-last
# '_'-separated token of `realtime`
# (presumably the realtime-search file name, e.g. '..._HCD30_realtimesearch.tsv' -> 30; TODO confirm)
energy = int(realtime.split('_')[-2][-2:])

# every fragment sequence gets the same collision energy and mass analyzer
collision = [energy] * len(mod_seqs)
analyzer = ['FTMS'] * len(mod_seqs)

In [15]:
# create dataframe and convert to csv
peptidelist = pd.DataFrame()
peptidelist = peptidelist.assign(modified_sequence=mod_seqs, collision_energy=collision, precursor_charge=targ_charge)
# after reset_index the rows are labelled 0..n-1, matching the row labels of `df`
# and the positions in `analyzer`
peptidelist.reset_index(inplace=True)

# remove all modified sequences
# A sequence counts as modified when it contains either bracket. The previous
# test `('[' or ']') in seq` only ever looked for '[' because `'[' or ']'`
# evaluates to '['.
sequences = peptidelist['modified_sequence'].astype(str)
has_modification = sequences.str.contains('[', regex=False) | sequences.str.contains(']', regex=False)
modified_rows = list(peptidelist.index[has_modification])

peptidelist.drop(modified_rows, axis=0, inplace=True)
df.drop(modified_rows, axis=0, inplace=True)

# Filter `analyzer` by ORIGINAL position in one pass. Popping inside the loop
# shifted the remaining entries, so later pops hit the wrong element or raised
# IndexError once the list had become shorter than the row label.
dropped_positions = set(modified_rows)
analyzer[:] = [entry for position, entry in enumerate(analyzer) if position not in dropped_positions]

In [79]:
# save 
#peptidelist.to_csv('peptidelist.csv', index=False) 
# next, run pretrained model to get predictions 

In [16]:
# put the raw peak columns (df) and the model-input columns (peptidelist) side by side;
# axis=1 aligns rows on their index labels, so both frames need matching indexes
df_concat = pd.concat([df, peptidelist], axis=1)

In [17]:
# add the mass-analyzer label per row; `analyzer` must hold exactly one entry per row of df_concat
df_concat = df_concat.assign(mass_analyzer=analyzer)

In [18]:

# switch to the column names used by the next step
# (presumably the names match.augment expects — TODO confirm against match.py)
df_concat.rename(columns = {'mz':'masses_raw', 'intensity':'intensities_raw', 'precursor_charge':'charge'}, inplace = True)

In [23]:
# running augment function
# (presumably this adds the matches_charge1..matches_charge6 columns that are
#  inspected afterwards — TODO confirm against match.augment)
test = match.augment(df_concat, ION_TYPES, DEFAULT_MAX_CHARGE)

In [25]:
# remove all rows that are completely empty
# note: the check is "all six matches_charge entries compare equal to their neighbour",
# so a row with six identical non-empty entries would be dropped as well
match_columns = ['matches_charge%d' % z for z in range(1, 7)]
empty_rows = []
for i in test.index:
    per_charge = [test[column][i] for column in match_columns]
    if all(left == right for left, right in zip(per_charge, per_charge[1:])):
        empty_rows.append(i)

# drop the collected labels in a single call
test.drop(empty_rows, axis=0, inplace=True)

In [27]:
# undo the earlier 'precursor_charge' -> 'charge' rename before calling tensorize.csv
# NOTE(review): tensorize.csv apparently needs the column to be named 'precursor_charge'
# — TODO confirm against tensorize.csv
test.rename(columns = {'charge':'precursor_charge'}, inplace = True)

In [39]:
# convert the table into model inputs; the result is iterated with .items() afterwards, so it is dict-like
data = tensorize.csv(test)

In [41]:
# print every entry (name and value) returned by tensorize.csv for inspection
for key, value in data.items():
    print (key, value)

collision_energy_aligned_normed [[0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]
 [0.3]]
sequence_integer [[13  6  4 ...  0  0  0]
 [13  6  4 ...  0  0  0]
 [ 7  1 16 ...  0  0  0]
 ...
 [13 11  5 ...  0  0  0]
 [13  6  4 ...  0  0  0]
 [ 9 15  6 ...  0  0  0]]
precursor_charge_onehot [[1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [0 1 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [0 0 1 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [0 1 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [0 0 1 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [0 0 1 0 0 0]
 [1 0 0 0 0 0]
 [0 1 0 0 0 0]
 [1 0 0 0 0 0

In [14]:
def get_spectral_angle(true, pred, batch_size):
    """Evaluate ``1 - losses.masked_spectral_distance`` batch by batch.

    ``true`` and ``pred`` are sliced in steps of ``batch_size`` along their
    first axis. Each pair of slices is passed to
    ``losses.masked_spectral_distance`` inside a fresh TensorFlow v1 session,
    and ``1 - result`` is written to the matching positions of the output.

    Parameters
    ----------
    true, pred : array-like
        Observed and predicted values; ``true`` must expose ``shape`` and both
        must support slicing.
        NOTE(review): presumably 2-D numeric arrays of equal shape — TODO confirm.
    batch_size : int
        Number of rows evaluated per session; must be positive.

    Returns
    -------
    numpy.ndarray
        One value per row of ``true``, with NaNs replaced via
        ``numpy.nan_to_num``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    # imported locally (like tensorflow) so the function does not depend on
    # `numpy` happening to be in the notebook namespace
    import numpy
    import tensorflow

    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    n = true.shape[0]
    sa = numpy.zeros([n])

    # Stepping through range() yields the final partial batch only when it is
    # non-empty; previously an exact multiple of batch_size sent an extra
    # empty batch through the session.
    for start in range(0, n, batch_size):
        t_b = true[start : start + batch_size]
        p_b = pred[start : start + batch_size]

        # start every batch from an empty default graph
        tensorflow.compat.v1.reset_default_graph()
        with tensorflow.compat.v1.Session() as s:
            sa_graph = losses.masked_spectral_distance(t_b, p_b)
            sa_b = 1 - s.run(sa_graph)
            print(sa_b)
            sa[start : start + sa_b.shape[0]] = sa_b
    sa = numpy.nan_to_num(sa)
    return sa

In [14]:
# NOTE(review): this call does not run as written.
#  - `d` is not defined in any cell visible in this notebook.
#  - `df["intensity"]`, as built in the visible cells, holds space-separated strings.
#  - the recorded run of this cell ends in
#    'TypeError: can only concatenate list (not "int") to list'.
# Presumably both arguments have to be numeric arrays of observed vs. predicted
# intensities — TODO confirm the intended inputs and fix the call.
batch_size = 600
get_spectral_angle(d["intensity"], df["intensity"], batch_size=batch_size)

Using TensorFlow backend.


0     [0.6924702, 0.27853397, 0.28234604, 0.32314664...
1     [0.13462088, 0.15286781, 0.116988316, 0.200044...
2     [0.11371654, 1.0, 0.10345023, 0.11486543, 0.11...
3     [0.10031957, 0.18191542, 0.1178763, 0.09230649...
4     [0.15489565, 0.27973375, 0.17002675, 0.1641234...
5     [0.16630122, 0.16307032, 0.15583996, 0.1553845...
6     [0.58755255, 0.56874394, 0.55277187, 0.5168081...
7     [0.071955085, 0.07270705, 0.096765034, 0.06940...
8     [0.5211301, 0.37080136, 0.2135754, 0.21948116,...
9     [0.11522895, 0.26575413, 0.14488827, 0.1133909...
10    [0.22260197, 0.4519368, 0.17165656, 0.17540142...
11    [0.106587745, 0.13385363, 1.0, 0.56744534, 0.1...
12    [0.25678694, 0.34844422, 0.3849318, 0.56633675...
13    [0.06406655, 0.035669364, 0.03776245, 0.185469...
14    [0.08604191, 1.0, 0.07475805, 0.08578217, 0.07...
15    [0.28541204, 0.16615492, 0.17796668, 0.1543775...
16    [0.18481041, 0.15362445, 0.18787944, 0.1645397...
17    [0.05210185, 1.0, 0.037401684, 0.0213405, 

TypeError: can only concatenate list (not "int") to list