# Modeling

Evaluates an LSTM model across various combinations of hyperparameters.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras.layers import Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorboard.plugins.hparams import api as hp

from src.features.smiles import SmilesTokenizer
from src.models.lstm_model import build_model

In [None]:
%load_ext tensorboard

In [None]:
!rm -rf ./logs/

## Load training dataset

In [None]:
# Read the training array from disk and report its dimensions.
train_path = 'data/interim/smiles_train.npy'
dataset = np.load(train_path)
print(dataset.shape)

## Prep training dataset

In [None]:
# The tokenizer's table length is used as the number of output classes.
st = SmilesTokenizer()
vocab_size = st.table_len

# Fraction of rows held out at the end of the (shuffled) dataset.
VAL_SPLIT = .10

# Shuffle rows in place before slicing off the held-out portion.
np.random.shuffle(dataset)

# Every column except the last is model input; the last column is the label.
X = dataset[:, :-1]
labels = dataset[:, -1:]
y = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

# X and y have the same number of rows, so one index splits both.
split_at = int(X.shape[0] * (1 - VAL_SPLIT))
X_train, X_test = X[:split_at], X[split_at:]
y_train, y_test = y[:split_at], y[split_at:]

X_train = X_train.reshape((X_train.shape[0], X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1]))

train_size, max_length = X_train.shape[0], X_train.shape[1]

print('Vocabulary size: ', vocab_size)
print('Max length: ', max_length)
print('Train size: ', train_size)
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

## Set Hyperparameters

In [None]:
# Hyperparameters explored by the search and their allowed values.
HP_NUM_UNITS = hp.HParam('num_units', hp.Discrete([128, 256]))
HP_DROPOUT = hp.HParam('dropout', hp.RealInterval(0.1, 0.2))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['nadam', 'adam', 'rmsprop']))

search_space = [HP_NUM_UNITS, HP_DROPOUT, HP_OPTIMIZER]
tracked_metrics = [
    hp.Metric('mae', display_name='MAE'),
    hp.Metric('acc', display_name='Accuracy'),
]

# Record the experiment-level configuration at the root of the tuning logs.
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
    hp.hparams_config(hparams=search_space, metrics=tracked_metrics)

## Build Model

In [None]:
# Upper bound on epochs per trial; early stopping may end a trial sooner.
NUM_EPOCHS = 100

# Where the best-scoring weights are written.
checkpoint_filepath = 'models/checkpoint'

# Callbacks below are created once and passed to every trial's fit() call.
tensorboard = tf.keras.callbacks.TensorBoard(log_dir='logs')

es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=50,
    verbose=1,
)

reduce_lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss')

# NOTE(review): 'val_acc' must match the metric name the model is compiled
# with inside build_model -- confirm.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

# Seeded initializer; not referenced by any other code visible in this notebook.
weight_init = RandomNormal(mean=0.0, stddev=0.05, seed=71)

def run(run_dir, hparams):
    """Train and evaluate one hyperparameter trial, logging under ``run_dir``.

    Builds a model from the trial's hyperparameters, fits it on the training
    split with the held-out split as validation data, evaluates it on the
    held-out split, and writes the resulting scores as TensorBoard scalars.

    Args:
        run_dir: Directory that receives this trial's TensorBoard summaries.
        hparams: Mapping from the ``HP_*`` hyperparameter objects to the
            values chosen for this trial.
    """
    # Use a per-trial TensorBoard callback so each trial's epoch curves land
    # in its own run directory rather than all being written to the single
    # shared `logs` directory, where trials cannot be told apart.
    trial_tensorboard = tf.keras.callbacks.TensorBoard(log_dir=run_dir)

    with tf.summary.create_file_writer(run_dir).as_default():
        # Record the hyperparameter values used by this trial.
        hp.hparams(hparams)

        model = build_model(
            vocab_size,
            hparams[HP_NUM_UNITS],
            hparams[HP_DROPOUT],
            hparams[HP_OPTIMIZER],
        )

        model.fit(
            X_train,
            y_train,
            validation_data=(X_test, y_test),
            shuffle=False,
            epochs=NUM_EPOCHS,
            batch_size=1000,
            callbacks=[
                trial_tensorboard,
                es,
                model_checkpoint_callback,
                reduce_lr_on_plateau,
            ],
        )

        # summary() prints the table itself and returns None, so wrapping it
        # in print() only adds a stray "None" line.
        model.summary()

        scores = model.evaluate(X_test, y_test)

        # NOTE(review): evaluate() returns the loss first, followed by the
        # compiled metrics. scores[0] is therefore the loss, logged here under
        # the 'mae' tag -- confirm build_model compiles with the intended
        # loss and metric.
        tf.summary.scalar('mae', scores[0], step=1)
        tf.summary.scalar('acc', scores[1], step=1)

# Counter used to give each trial a unique run directory name.
session_num = 0

# Every combination of unit count, dropout rate, and optimizer. Only the two
# endpoints of the dropout interval are tried.
trial_grid = (
    (units, rate, opt_name)
    for units in HP_NUM_UNITS.domain.values
    for rate in (HP_DROPOUT.domain.min_value, HP_DROPOUT.domain.max_value)
    for opt_name in HP_OPTIMIZER.domain.values
)

for num_units, dropout_rate, optimizer in trial_grid:
    hparams = {
        HP_NUM_UNITS: num_units,
        HP_DROPOUT: dropout_rate,
        HP_OPTIMIZER: optimizer,
    }
    run_name = "run-%d" % session_num
    print('--- Starting trial: %s' % run_name)
    print({param.name: value for param, value in hparams.items()})
    run('logs/hparam_tuning/' + run_name, hparams)
    session_num += 1

In [None]:
# %tensorboard --logdir logs/hparam_tuning