In [1]:
###Import libraries###
from itertools import groupby
from rdkit import Chem
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import warnings
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.inchi import *
from rdkit import RDLogger
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw import MolsToGridImage
import logging
import seaborn as sns
from sklearn.model_selection import train_test_split
from custom_layers import *
from model import *
#silence TensorFlow, Python and RDKit warnings
tf.get_logger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")
RDLogger.DisableLog("rdApp.*")
from utils import *
np.random.seed(123)
tf.random.set_seed(123)

In [2]:
###Predictions can be made for a file with smiles strings

def _nominal_mass(smiles):
    """Return the exact molecular weight of `smiles`, rounded half-up to an int.

    Raises:
        ValueError: if RDKit cannot parse the SMILES string.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit reports an unparsable SMILES by returning None; fail here with
        # the offending string instead of an opaque error inside ExactMolWt.
        raise ValueError('Could not parse SMILES: ' + repr(smiles))
    # int(0.5 + x) rounds a positive mass to the nearest integer.
    return int(0.5 + Descriptors.ExactMolWt(mol))


# 'test.smi' is read as a comma-separated table; it must provide a 'smiles' column.
ds = pd.read_csv('test.smi', sep=',')
ds['mw'] = ds.smiles.apply(_nominal_mass)


In [3]:
ds

Unnamed: 0,smiles,mw
0,CCCCOP(=O)(SCCN(CC)CCC)C(C)C,309


In [4]:
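###Or you can manually set the SMILES and MW in this cell (remove the triple quotes to enable it)
###NOTE(review): the In [3] output shows mw 309 for this SMILES, while the example below uses 308 -- confirm which is intended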
'''
ds = pd.DataFrame({'smiles':'CCCCOP(=O)(SCCN(CC)CCC)C(C)C', "mw":308}, index=[0])
'''

'\nds = pd.DataFrame({\'smiles\':\'CCCCOP(=O)(SCCN(CC)CCC)C(C)C\', "mw":308}, index=[0])\n'

In [5]:
# Batch size; also passed to the MPNN models built in the next cell.
batch_size = 32

# Convert the SMILES column into the graph inputs consumed by the models.
X = graphs_from_smiles(ds['smiles'])

# One all-zero 560-wide target row per molecule.
# NOTE(review): presumably just placeholder labels, since this notebook only
# runs prediction -- confirm against MPNNDataset.
y = np.zeros((len(ds), 560))
dataset = MPNNDataset(X, np.vstack(y))

In [6]:
# Both networks share the same input dimensions, read off the first
# atom-feature and bond-feature vectors of X.
atom_dim = X[0][0][0].shape[0]
bond_dim = X[1][0][0].shape[0]

# Model whose weights come from '1for.h5' (its output is stored as 'pred_s' later).
mpnn_s = MPNNModel(atom_dim=atom_dim, bond_dim=bond_dim, batch_size=batch_size)
mpnn_s.load_weights('1for.h5')

# Model whose weights come from '1nl.h5' (its output is stored as 'pred_n' later).
mpnn_n = MPNNModel(atom_dim=atom_dim, bond_dim=bond_dim, batch_size=batch_size)
mpnn_n.load_weights('1nl.h5')

In [7]:
def neutral_loss(spec, mw):
    """Mirror a spectrum about `mw`: the value at index `p` moves to index `mw - p`.

    Args:
        spec: 1-D NumPy array of intensities.
        mw: integer mass used as the mirror point.

    Returns:
        A float array of the same length as `spec`. Every nonzero entry
        `spec[p]` is stored at position `mw - p`; entries whose target
        position falls outside `[0, len(spec))` are dropped.
    """
    n_bins = spec.shape[0]
    mirrored = np.zeros(n_bins)
    peaks = np.nonzero(spec)[0]
    targets = mw - peaks
    # Drop targets outside the array. In particular a negative target must not
    # be used as an index: NumPy would wrap it around to the end of the array.
    keep = (targets >= 0) & (targets < n_bins)
    mirrored[targets[keep]] = spec[peaks[keep]]
    return mirrored


In [8]:
# Run both networks on the same input dataset.
predicted_spectrum = mpnn_s.predict(dataset)
predicted_nl = mpnn_n.predict(dataset)

# Store one prediction array per molecule.
ds['pred_s'] = list(predicted_spectrum)
ds['pred_n'] = list(predicted_nl)

# Per-bin weights: sqrt(bin index) for pred_s, the reversed vector for pred_n.
m_s = np.arange(0, 560)**0.5
m_n = np.flip(m_s)

# Replace the single zero weight by 1 so the divisions below are safe.
# np.flip returns a view, so both assignments address the same element.
m_s[0]=1
m_n[-1]=1

# NOTE(review): presumably this inverts the intensity transform the models
# were trained on -- confirm against the training code.
ds['pred_s'] = ds['pred_s'].apply(lambda x:x**2/m_s)
ds['pred_n'] = ds['pred_n'].apply(lambda x:x**2/m_n)

# Mirror pred_n about each molecule's mw, then average it with pred_s.
ds['pred_n_r'] = ds.apply(lambda x: neutral_loss(x.pred_n, x.mw), axis=1)
ds['pred_avg'] = (ds.pred_n_r+ds.pred_s)/2


def _scale_to_999(spectrum):
    """Rescale `spectrum` so that its largest value becomes 999."""
    top = np.max(spectrum)
    if top <= 0:
        # An all-zero spectrum would give 0/0 = NaN; NaN counts as nonzero and
        # would slip through the empty-spectrum filter below. Keep it all-zero
        # so that the row is dropped.
        return np.zeros_like(spectrum)
    return 999*spectrum/top


ds['pred_avg'] = ds['pred_avg'].apply(_scale_to_999)
# Zero every value below 1 on the 0-999 scale.
ds['pred_avg'] = ds['pred_avg'].apply(lambda x: np.where(x<1, 0,x))
# Keep only molecules that still have at least one nonzero peak.
ds = ds[ds.pred_avg.apply(lambda x:len(np.nonzero(x)[0])>0)]

In [9]:
ds 

Unnamed: 0,smiles,mw,pred_s,pred_n,pred_n_r,pred_avg
0,CCCCOP(=O)(SCCN(CC)CCC)C(C)C,309,"[9.972725756313139e-09, 1.0394349203579623e-07...","[6.953756426806197e-06, 2.244469587053551e-05,...","[4.4611957088342955e-08, 2.9457883239505495e-0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [10]:
def spectra_to_MSP(df, output_file='predicted_spectra.msp'):
    """Write the spectra stored in `df` to a text file, one record per row.

    Each record consists of a 'Name:' line (the row's `smiles`), an 'MW:' line
    (the row's `mw`), a 'Num Peaks:' line and one '<position> <value>;' line
    per nonzero entry of `pred_avg[1:mw+10]`, where position is the index of
    that entry in `pred_avg`.

    Args:
        df: DataFrame with columns `smiles` (str), `mw` (integer-valued) and
            `pred_avg` (1-D NumPy array). Any index is accepted.
        output_file: path of the file to create or overwrite.
    """
    with open(output_file, "w") as f:
        # Walk the rows by position: the index labels need not be 0..n-1
        # (e.g. after rows were filtered out), so they must not be fed to iloc.
        for pos in range(len(df)):
            row = df.iloc[pos]
            # Skip index 0 and stop 10 positions past the molecular weight.
            window = row.pred_avg[1:int(row.mw)+10]
            peak_idx = np.nonzero(window)[0]

            f.write('Name:'+row.smiles+'\n')
            f.write('MW:'+str(row.mw)+'\n')
            f.write('Num Peaks:'+str(len(peak_idx))+'\n')

            for j in peak_idx:
                # The window starts at index 1, hence the +1 offset.
                f.write(str(j+1)+' ' + str(window[j])+ ';\n')


In [11]:
# Write one record per row of ds to 'test.msp'.
spectra_to_MSP(ds, 'test.msp')