In [1]:
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras import layers
from keras.initializers import RandomNormal
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.model_selection import train_test_split
import numpy as np
import re
import pickle

from classes import Tokenizer, DataGenerator

In [2]:
# Read the SMILES file, strip surrounding whitespace from every line,
# and keep only the first 2000 entries.
with open("data/cleaned_smiles.smi") as file:
    stripped_lines = list(map(str.strip, file))

smiles = stripped_lines[:2000]

# Preview the first three molecules
smiles[:3]

['O=C1OC(=O)c2c1cc1cc3c(cc1c2-c1ccc2c(c1)OCO2)OCO3',
 'CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC5CN(C6CCOCC6)C5)c([N+](=O)[O-])c4)c(Oc4cnc5[nH]ccc5c4)c3)CC2)=C(c2ccc(Cl)cc2)C1',
 'CCC(=O)Oc1ccc2c(=O)n(Cc3cccc(NS(=O)(=O)NC)c3F)c(=O)oc2c1']

In [5]:
tokenizer = Tokenizer()

# Tokenize every SMILES exactly once and keep only the token count; the
# counts drive both the maximum length and the per-string padding below.
token_counts = [len(tokenizer.tokenize(smi)) for smi in smiles]

maxlen = np.max(token_counts)
print(f"Maxumin length is {maxlen}")

# Wrap each SMILES as 'G' + smiles + 'E', then append one 'A' per token it is
# short of the longest molecule (presumably start / end / padding markers,
# each handled as a single token by Tokenizer -- confirm in classes.py).
pad_smiles = ['G' + smi + 'E' + 'A' * (maxlen - n_tokens)
              for smi, n_tokens in zip(smiles, token_counts)]
print(pad_smiles[:5])

tokenizer.fit_on_texts(pad_smiles)

# Input is the padded string without its last character, target is the same
# string without its first character, i.e. the input shifted by one position.
X = tokenizer.texts_to_vector([smi[:-1] for smi in pad_smiles])
y = tokenizer.texts_to_vector([smi[1:] for smi in pad_smiles])

Maxumin length is 124
['GO=C1OC(=O)c2c1cc1cc3c(cc1c2-c1ccc2c(c1)OCO2)OCO3EAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', 'GCC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC5CN(C6CCOCC6)C5)c([N+](=O)[O-])c4)c(Oc4cnc5[nH]ccc5c4)c3)CC2)=C(c2ccc(Cl)cc2)C1E', 'GCCC(=O)Oc1ccc2c(=O)n(Cc3cccc(NS(=O)(=O)NC)c3F)c(=O)oc2c1EAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', 'GCCOC(=O)COc1cc(-c2cc(=O)c3c(O)cc(OCC(=O)N4CC[N+](C)(Cc5ccc(OC)c(OC)c5OC)CC4)cc3o2)ccc1OCEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', 'GCCN(CC)CC1CCCCN1CC(=O)N1c2ccc(Cl)cc2C(=O)Nc2cccnc21EAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA']


In [6]:
# Serialize the tokenizer object to disk with pickle
tokenizer_path = 'models/tokenizer.pkl'
with open(tokenizer_path, 'wb') as file:
    pickle.dump(tokenizer, file)

In [11]:
# Training hyper-parameters; the number of classes is the tokenizer vocabulary size
num_classes = tokenizer.vocab_len
batch_size = 256
n_epoch = 10

# Hold out 10% of the samples for validation (shuffled, fixed random_state)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=11
)

# Training batches are produced by a generator built from X_train and y_train
train_generator = DataGenerator(X_train, y_train, batch_size, num_classes)

# Validation data is passed as in-memory arrays rather than a generator,
# so every validation sequence is one-hot encoded up front.
X_valid, y_valid = (
    np.asarray([to_categorical(seq, num_classes) for seq in split])
    for split in (X_valid, y_valid)
)

In [12]:
# Initializer for the layer kernels (weight matrices, not biases):
# normal distribution with mean 0.0 and stddev 0.05, fixed seed.
# NOTE(review): the same seeded initializer instance is passed to all three
# layers; depending on the Keras version this may produce correlated initial
# weights across layers -- confirm this is intended.
weight_init = RandomNormal(mean=0.0,
                           stddev=0.05,
                           seed=11)

# Build model: two stacked LSTM layers followed by a softmax Dense layer
# over the vocabulary. return_sequences=True on both LSTMs, so the output
# keeps the time dimension (see the "(None, None, 37)" shape in the summary).
model = keras.models.Sequential()

# Only the first layer declares the input shape: variable-length sequences
# of one-hot vectors with num_classes entries.
model.add(layers.LSTM(256, input_shape=(None, num_classes),
                      return_sequences=True, dropout=0.3,
                      kernel_initializer=weight_init))

# Later layers in a Sequential model take their input shape from the
# preceding layer, so no input_shape is passed here.
model.add(layers.LSTM(256,
                      return_sequences=True, dropout=0.5,
                      kernel_initializer=weight_init))

model.add(layers.Dense(num_classes, activation='softmax', kernel_initializer=weight_init))


model.compile(loss='categorical_crossentropy', optimizer='adam')

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, None, 256)         301056    
_________________________________________________________________
lstm_3 (LSTM)                (None, None, 256)         525312    
_________________________________________________________________
dense_1 (Dense)              (None, None, 37)          9509      
Total params: 835,877
Trainable params: 835,877
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Both callbacks monitor the validation loss, where lower is better
watch_val_loss = dict(monitor='val_loss', mode='min')

# Early stopping on val_loss with a patience of 5 epochs
early_stopping = EarlyStopping(patience=5, **watch_val_loss)

# Checkpoint the full model (not only its weights) to disk, keeping only
# the best one according to val_loss
model_checkpoint = ModelCheckpoint(
    filepath='models/best_model.h5',
    save_best_only=True,
    save_weights_only=False,
    **watch_val_loss
)



In [None]:
# Train from the batch generator and validate on the in-memory one-hot arrays.
# steps_per_epoch is the number of full batches in the training split.
# Integer floor division avoids the float round-trip of multiplying by
# batch_size ** -1, which can land just below a whole number and truncate
# one step short for some batch sizes.
model.fit(train_generator, epochs=n_epoch, use_multiprocessing=True,
          steps_per_epoch=len(X_train) // batch_size,
          validation_data=(X_valid, y_valid),
          callbacks=[early_stopping, model_checkpoint]
         )

# Save the model as it stands when training ends; the best-val_loss model is
# written separately to models/best_model.h5 by the checkpoint callback.
model.save("models/end_model.h5")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10