# Training

Train an LSTM to generate molecules.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

from src.features.smiles import SmilesTokenizer
from src.models.lstm_model import build_model

In [None]:
# Report how many GPU devices TensorFlow detects on this machine.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
# Load the training dataset array stored under data/interim.
DATASET_PATH = 'data/interim/training_dataset.npy'
dataset = np.load(DATASET_PATH)

In [None]:
# Prepare the model inputs: shuffle the samples, separate the last position
# of each sample as the prediction target, one-hot encode that target, and
# hold out the tail of the shuffled data for validation.
VAL_SPLIT = .10

# The tokenizer is only needed here for the size of its token table.
st = SmilesTokenizer()
vocab_size = st.table_len

# In-place shuffle along the first axis (one entry per sample).
# NOTE(review): the shuffle is unseeded, so the train/validation split
# differs between runs -- confirm that this is intended.
np.random.shuffle(dataset)

# Along the second axis: everything but the last position is the input,
# the last position is the label.
X = dataset[:, :-1]
labels = dataset[:, -1:]
y = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

# The first (1 - VAL_SPLIT) share of samples is used for training, the rest
# is held out. X and y have the same number of samples, so one index serves
# both.
split_idx = int(X.shape[0] * (1 - VAL_SPLIT))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# Force both input arrays to exactly two dimensions: (samples, positions).
X_train = X_train.reshape(X_train.shape[:2])
X_test = X_test.reshape(X_test.shape[:2])

max_length = X_train.shape[1]
train_size = X_train.shape[0]

print('Vocabulary size: ', vocab_size)
print('Max length: ', max_length)
print('Train size: ', train_size)
for split_array in (X_train, X_test, y_train, y_test):
    print(split_array.shape)

In [None]:
# Build the model together with the callback that is later passed to fit()
# (`es`, presumably early stopping -- confirm in src.models.lstm_model).
# What the positional arguments 128 and .1 control is defined by build_model
# (presumably layer width and dropout rate -- TODO confirm).
(model, es) = build_model(vocab_size, 128, .1, 'nadam')

# summary() prints the layer table itself and returns None; do not wrap it
# in print(), which would add a stray "None" line to the output.
model.summary()

# Optional: uncomment to start from previously saved weights.
#model.load_weights('models/2024-05-27_12-40_model_nadam_128_100epochs_5000batch.h5')

In [None]:
NUM_EPOCHS = 100
# Single source of truth for the batch size: it is used by fit() and by the
# saved file name, so the two cannot drift apart.
BATCH_SIZE = 1000

# Train with the held-out split as validation data. shuffle=False keeps the
# sample order fixed across epochs (the dataset is shuffled once during data
# preparation).
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    shuffle=False,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[es],
)

# Evaluate on the same held-out split that served as validation data.
scores = model.evaluate(X_test, y_test)

# history.history holds the per-epoch metric lists; printing the History
# object itself would only show its repr.
print(history.history)
print(scores)

# Persist the trained model before plotting so that a plotting problem
# cannot cost the training run. NUM_EPOCHS in the file name is the
# configured maximum; the `es` callback may have ended training earlier.
model.save(f'model_nadam_128_{NUM_EPOCHS}epochs_{BATCH_SIZE}batch.h5')

# Training vs. validation loss per epoch, labelled so the curves can be
# told apart.
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()