# Mount Google Drive


In [0]:
# Mount the user's Google Drive into the Colab filesystem at /content/gdrive.
# A later cell changes into a project folder underneath this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Install the pinned TensorFlow release (2.1.0) into the runtime via a shell escape.
!pip3 install tensorflow==2.1.0

In [0]:
# Base model name, exposed as a Colab form field.
model_name = 'lstm_chem' # @param
# Upper-cased identifier derived from the model name; a later cell uses it
# to build the weights file name.
iden = str(model_name)
iden = iden.upper()

# Change your working directory

In [0]:
 # Switch the working directory to the project folder on the mounted Drive.
 # A later cell builds WEIGHT_PATH from os.getcwd(), so this must run first.
 # NOTE(review): the coreLib imports presumably resolve from this folder too - confirm.
 cd /content/gdrive/My\ Drive/pySMD/

# TPU CHECK
The model trains approximately 17 times faster on a TPU.

In [0]:
%tensorflow_version 2.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
  raise BaseException('ERROR: Not connected to a TPU runtime;')

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# FIXED PARAMETERS


In [0]:
from glob import glob
import os

# --- Data location (Colab form fields) ---------------------------------
BUCKET='tfalldata' # @param
TFIDEN='tfrecSMDsample'  # @param

# --- Sample geometry ----------------------------------------------------
# NOTE(review): SMI_MAXLEN is presumably the maximum SMILES string length and
# TOKENIZER_TABLE_LEN the token vocabulary size - confirm against coreLib.
SMI_MAXLEN=77 # @param
TOKENIZER_TABLE_LEN=47 # @param
# Per-sample dimensions handed to data_input_fn.
DATA_DIM=(SMI_MAXLEN+1,TOKENIZER_TABLE_LEN)

# --- Input pipeline sizes -----------------------------------------------
BATCH_SIZE=1024 # @param
BUFFER_SIZE=2048 # @param
TRAIN_DATA=10240*35 # @param
EVAL_DATA=10240*9 # @param

# --- Training schedule --------------------------------------------------
EPOCHS=50 # @param
TOTAL_DATA=TRAIN_DATA+EVAL_DATA
# One epoch is one pass over the TRAINING split only. Validation batches are
# counted separately by EVAL_STEPS, so TOTAL_DATA must not be used here:
# it would request more training batches per epoch than the split holds.
STEPS_PER_EPOCH = TRAIN_DATA//BATCH_SIZE
EVAL_STEPS      = EVAL_DATA//BATCH_SIZE

# Location of the TFRecord data in Google Cloud Storage.
GCS_PATH='gs://{}/{}'.format(BUCKET,TFIDEN)
print(GCS_PATH)

# Weights live under ./weights/<IDEN>.h5 relative to the working directory.
WEIGHT_PATH=os.path.join(os.getcwd(),'weights','{}.h5'.format(iden))
# LOAD_WEIGHTS tells the model-creation cell whether a checkpoint can be restored.
LOAD_WEIGHTS=os.path.exists(WEIGHT_PATH)
if LOAD_WEIGHTS:
  print('FOUND PRETRAINED WEIGHTS')
else:
  print('NO PRETRAINED WEIGHTS FOUND')

# Dataset wrapper with tf.data api

In [0]:
from coreLib.utils import data_input_fn

# Build one dataset per split. Each variable must read the split it is named
# after: train_ds feeds model.fit and eval_ds feeds validation, so swapping
# them would train on the evaluation data and validate on the training data.
train_ds = data_input_fn(GCS_PATH,'Train',DATA_DIM,BATCH_SIZE,BUFFER_SIZE)
eval_ds  = data_input_fn(GCS_PATH,'Eval',DATA_DIM,BATCH_SIZE,BUFFER_SIZE)

# Sanity check: pull a single batch and print the input / target shapes.
for x,y in eval_ds.take(1):
  print(x.shape)
  print(y.shape)

# model creation

In [0]:
from coreLib.models import LSTM_Chem

# Build, compile and (optionally) restore the model inside the TPU strategy
# scope so that all of it happens under `tpu_strategy`.
with tpu_strategy.scope():
  # NOTE(review): 256 is presumably the LSTM hidden size - confirm against
  # coreLib.models.LSTM_Chem.
  model = LSTM_Chem(256,TOKENIZER_TABLE_LEN)
  model.compile(optimizer="Adam",
                loss="categorical_crossentropy",
                metrics=["accuracy"])
  # Resume from the saved checkpoint when the parameters cell found one at WEIGHT_PATH.
  if LOAD_WEIGHTS:
    model.load_weights(WEIGHT_PATH)

# Print the layer-by-layer summary of the model.
model.summary()






# Training

In [0]:
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
# reduces learning rate on plateau
lr_reducer = tf.keras.callbacks.ReduceLROnPlateau(factor=0.1,
                                                  cooldown= 10,
                                                  patience=10,
                                                  verbose =1,
                                                  min_lr=0.1e-5)

mode_autosave = tf.keras.callbacks.ModelCheckpoint(WEIGHT_PATH,
                                                  save_best_only=True, 
                                                  verbose=1, 
                                                  period =10)

# stop learining as metric on validatopn stop increasing
early_stopping = tf.keras.callbacks.EarlyStopping(patience=15,verbose=1, mode = 'auto') 

callbacks = [mode_autosave, lr_reducer,early_stopping ]



history = model.fit(train_ds,
                    steps_per_epoch=STEPS_PER_EPOCH,
                    epochs=EPOCHS,
                    verbose=1,
                    validation_data=eval_ds,
                    validation_steps=EVAL_STEPS,
                    callbacks=callbacks)

# save model
#Final_weights=os.path.join(os.getcwd(),'lstm_chem_final.h5')
#model.save_weights(Final_weights)

def plot_history(history):
  """
  Plots model training history.

  Draws two side-by-side panels: loss on the left and accuracy on the right,
  each showing the training curve and the validation curve against the epoch
  index.

  Args:
    history: object exposing `.epoch` (x values) and `.history`, a mapping
      that contains the keys "loss", "val_loss", "accuracy" and
      "val_accuracy" (as returned by `model.fit` with validation data and
      the "accuracy" metric).

  Raises:
    KeyError: if any of the four expected series is missing from
      `history.history`.
  """
  fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(15,5))
  # (axis, key in history.history, human-readable panel name)
  panels = ((ax_loss, "loss", "Loss"), (ax_acc, "accuracy", "Accuracy"))
  for ax, key, title in panels:
    ax.plot(history.epoch, history.history[key], label="Train " + key)
    ax.plot(history.epoch, history.history["val_" + key], label="Validation " + key)
    # Label every panel so the figure can be read on its own.
    ax.set_title(title)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(title)
    ax.legend()
  plt.show()
# Render the loss and accuracy curves recorded in `history` by model.fit.
plot_history(history)