In [1]:
import tensorflow as tf
import numpy as np
import pickle
import time
import math
import datetime
import collections
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Activation
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, LearningRateScheduler, ModelCheckpoint
import tensorflow.keras.backend as K

In [2]:
# Fetch the PTB archive only when it is not already in the working directory.
import os.path
if not(os.path.isfile('ptb.zip')):
  !wget -O ptb.zip https://www.dropbox.com/s/bs9ztyq27sxa3l7/ptb.zip?dl=0
# Always (re-)extract; -o overwrites previously extracted files without prompting.
!unzip -o ptb.zip

--2020-12-15 21:46:14--  https://www.dropbox.com/s/bs9ztyq27sxa3l7/ptb.zip?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.6.18, 2620:100:6019:18::a27d:412
Connecting to www.dropbox.com (www.dropbox.com)|162.125.6.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/bs9ztyq27sxa3l7/ptb.zip [following]
--2020-12-15 21:46:15--  https://www.dropbox.com/s/raw/bs9ztyq27sxa3l7/ptb.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uce32eb77e8f1cb88bb78a8d49f3.dl.dropboxusercontent.com/cd/0/inline/BFJ2kKkH1V1zh5nYi0rGXrokZyozB56jqDu2nz8uJ8ff6fGQS82KIz6LXubgAnbmiidTdYYuw-XSVlwcfXz1u0qFOi_dh2HkhJG0LYM77yVq_0L8muzX9hB00JgBQJIE3FI/file# [following]
--2020-12-15 21:46:15--  https://uce32eb77e8f1cb88bb78a8d49f3.dl.dropboxusercontent.com/cd/0/inline/BFJ2kKkH1V1zh5nYi0rGXrokZyozB56jqDu2nz8uJ8ff6fGQS82KIz6LXubgAnbmiidTdYYuw-XSVlwcfXz1u0qFOi_dh2HkhJG0LYM77yVq_0L8muzX9hB00

## Load Tensorboard:

In [3]:
# Enable the TensorBoard notebook extension (it provides the %tensorboard magic).
%load_ext tensorboard
# Root directory under which get_callbacks() places the per-model log folders.
logdir = "tensorboard_logs"

## Define Hyperparameters:

In [4]:
# --- Data pipeline ---
batch_size = 20        # sequences per mini-batch
seq_len = 25           # tokens per sequence

# --- Model ---
hidden_size = 200      # embedding width and number of units in each LSTM layer
drop_out_rate = 0.4    # rate of every Dropout layer (only used when dropout is enabled)

# --- Optimisation ---
learning_rate = 1.0    # rate returned by lr_decay() while epoch < epochs_no_decay
momentum = 0.8         # SGD momentum
clip_norm = 5          # passed to SGD as `clipnorm`
decay = 0.98           # per-epoch multiplicative learning-rate factor used by lr_decay()
epochs = 70            # number of epochs passed to fit()
epochs_no_decay = 15   # leading epochs during which lr_decay() keeps `learning_rate`

## Define Custom Method for Features and Labels Creation from sequence:

In [5]:
# Based on Git Repo by @tmatha (2018)
# Modified based on MIT License
def features_labels(data_array, batch_size, seq_len, batch_first=True):
  """Cut a 1-d id array into aligned next-token (features, labels) windows.

  The labels are the features shifted one position ahead in `data_array`.
  The usable prefix of the array is laid out as `batch_size` contiguous rows
  and each row is cut into `steps` windows of `seq_len` ids.

  Args:
    data_array: 1-d array of ids.
    batch_size: number of contiguous rows the array is divided into.
    seq_len: number of ids per window.
    batch_first: if True, the result has shape (steps * batch_size, seq_len)
      and rows k*batch_size .. (k+1)*batch_size - 1 hold window k of every
      row; if False, the (seq_len * steps, batch_size) transpose of the row
      layout is returned instead.

  Returns:
    A tuple `(Data(features, labels), steps)`, where `steps` is the number of
    `seq_len`-long windows per row.

  Raises:
    ValueError: if `data_array` is not one-dimensional.
  """
  shape = data_array.shape
  if len(shape) != 1:
    raise ValueError('Expected 1-d data array, '
                     'instead data array shape is {} '.format(shape))

  # One id is held back so that the labels can be shifted by one position.
  steps = (shape[0] - 1) // (batch_size * seq_len)
  n_used = steps * seq_len * batch_size

  def fold(flat):
    # Row-major layout: row r holds ids [r * seq_len * steps, (r + 1) * seq_len * steps).
    rows = np.reshape(flat, (batch_size, seq_len * steps), order='C')
    if not batch_first:
      return np.transpose(rows)
    windows = np.split(rows, steps, axis=1)
    return np.concatenate(windows, axis=0)

  Data = collections.namedtuple('Data', ['features', 'labels'])
  folded = Data(features=fold(data_array[:n_used]),
                labels=fold(data_array[1:n_used + 1]))
  return folded, steps

## Create Sequences:

In [6]:
# Read every PTB split into a flat list of tokens.
seq = {split: [] for split in ('train', 'valid', 'test')}
for seq_type in list(seq):
  with open('ptb.' + seq_type + '.txt', 'r') as cur_file:
    raw_text = cur_file.read()
  # Each newline becomes an explicit <eos> token; empty strings produced by
  # splitting on consecutive spaces are dropped.
  cur_seq = [token for token in raw_text.replace('\n', '<eos>').split(' ') if token]
  print(f'First ten in {seq_type} sequence: {cur_seq[:10]}')
  print(f'Size of {seq_type} sequence: {len(cur_seq)}')
  seq[seq_type] = cur_seq

First ten in train sequence: ['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec']
Size of train sequence: 929589
First ten in valid sequence: ['consumers', 'may', 'want', 'to', 'move', 'their', 'telephones', 'a', 'little', 'closer']
Size of valid sequence: 73760
First ten in test sequence: ['no', 'it', 'was', "n't", 'black', 'monday', '<eos>', 'but', 'while', 'the']
Size of test sequence: 82430


## Create the Vocabulary and Datasets:

In [7]:
# Distinct tokens of each split.
vocab_train, vocab_valid, vocab_test = (
    set(seq[split]) for split in ('train', 'valid', 'test'))

# The id mapping below is built from the training vocabulary only, so every
# validation/test token has to occur in the training split as well.
assert vocab_valid <= vocab_train
assert vocab_test <= vocab_train
for label, vocab in (('Train', vocab_train),
                     ('Valid', vocab_valid),
                     ('Test', vocab_test)):
  print(f'{label} vocabulary length: {len(vocab)}')

# Sorting makes the word <-> id assignment deterministic across runs.
vocab_train = sorted(vocab_train)
id2word = dict(enumerate(vocab_train))
word2id = {word: idx for idx, word in id2word.items()}

Train vocabulary length: 10000
Valid vocabulary length: 6022
Test vocabulary length: 6049


In [8]:
def tokens_to_ids(tokens):
  """Map a list of tokens to a 1-d, C-ordered array of vocabulary ids."""
  # np.asarray instead of np.array(..., copy=False): converting a Python list
  # always requires a copy, and NumPy >= 2.0 raises ValueError when copy=False
  # is requested but a copy cannot be avoided.
  return np.asarray([word2id[word] for word in tokens], order='C')

ids_train = tokens_to_ids(seq['train'])
ids_valid = tokens_to_ids(seq['valid'])
ids_test  = tokens_to_ids(seq['test'])

# Cut each id stream into (features, labels) windows of seq_len ids.
data_train, steps_train = features_labels(
    ids_train, batch_size, seq_len, batch_first=True)
data_valid, steps_valid = features_labels(
    ids_valid, batch_size, seq_len, batch_first=True)
data_test, steps_test = features_labels(
    ids_test, batch_size, seq_len, batch_first=True)

print(f'Steps Train: {steps_train}')
print(f'Steps Validation: {steps_valid}')
print(f'Steps Test: {steps_test}')

Steps Train: 1859
Steps Validation: 147
Steps Test: 164


In [9]:
def _batched(split_data):
  """Slice `split_data` along its first axis and group the slices into full batches."""
  slices = tf.data.Dataset.from_tensor_slices(split_data)
  # drop_remainder=True keeps every batch at exactly batch_size elements.
  return slices.batch(batch_size, drop_remainder=True)

dataset_train = _batched(data_train)
dataset_valid = _batched(data_valid)
dataset_test = _batched(data_test)

In [10]:
# Sanity check on the first training batch: print the input/target shapes and
# decode the first two rows back into words (each target row should read as
# its input row shifted by one word).
for input_example_batch, target_example_batch in dataset_train.take(1):
  print(f'Shape of input: {input_example_batch.shape}')
  print(f'Shape of target: {target_example_batch.shape}')

  for row in range(2):
    for example_batch in (input_example_batch, target_example_batch):
      words = [id2word[token_id] for token_id in example_batch[row].numpy()]
      print(' '.join(words))

Shape of input: (20, 25)
Shape of target: (20, 25)
aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter <eos>
banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter <eos> pierre
<unk> on airline acquisitions that would so load a carrier up with debt that it would <unk> safety or a carrier 's ability to compete
on airline acquisitions that would so load a carrier up with debt that it would <unk> safety or a carrier 's ability to compete rep.


## Define LSTM Based Sequential Model Class:


In [11]:
class LSTM_Based(Sequential):
  """Two-layer LSTM sequence model: Embedding -> LSTM -> LSTM -> Dense.

  When `Drop` is truthy a Dropout layer is inserted after the embedding and
  after each LSTM layer. Both LSTM layers return full sequences, so the final
  Dense layer (no activation given) produces one `vocabulary_size`-wide output
  vector per time step.
  """

  def __init__(self, name, vocabulary_size, hidden_size, seq_len, Drop=None,
               dropout_rate=None):
    """Assemble the layer stack.

    Args:
      name: model name passed on to `Sequential`.
      vocabulary_size: embedding input dimension and width of the Dense output.
      hidden_size: embedding output dimension and units of each LSTM layer.
      seq_len: input sequence length given to the Embedding layer.
      Drop: truthy to add the Dropout layers; falsy (default) for none.
      dropout_rate: rate of the Dropout layers. Defaults to the notebook-level
        `drop_out_rate`, which is only looked up when `Drop` is truthy and no
        explicit rate is given.
    """
    super().__init__(name = name)
    if Drop and dropout_rate is None:
      # Backward-compatible default: fall back to the global hyperparameter.
      dropout_rate = drop_out_rate

    self.add(Embedding(vocabulary_size, hidden_size, input_length=seq_len))
    if Drop:
      self.add(Dropout(dropout_rate))
    self.add(LSTM(hidden_size, return_sequences=True))
    if Drop:
      self.add(Dropout(dropout_rate))
    self.add(LSTM(hidden_size, return_sequences=True))
    if Drop:
      self.add(Dropout(dropout_rate))
    self.add(Dense(vocabulary_size))

## Define Custom Perplexity Metric, Learning-Rate Schedule and Callbacks:

In [12]:
def perplexity(y_true, y_pred):
    """Metric: exponential of the mean sparse categorical cross-entropy.

    `y_pred` is interpreted as logits (from_logits=True); the mean is taken
    over all entries of the tensors passed in.
    """
    token_losses = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    return K.exp(K.mean(token_losses))

def lr_decay(epoch, lr):
  """Learning-rate schedule for `LearningRateScheduler`.

  Returns `learning_rate` while `epoch < epochs_no_decay`; afterwards returns
  the current rate multiplied by `decay`, never going below 0.01.

  Args:
    epoch: index of the epoch about to start.
    lr: learning rate currently in use.

  Returns:
    The learning rate to use for this epoch.
  """
  if epoch < epochs_no_decay:
    return learning_rate
  # Deliberately not rounded: rounding the product to two decimals cancels a
  # 2 % decay once lr falls to about 0.24 (round(0.24 * 0.98, 2) == 0.24), which
  # froze the schedule and made the 0.01 floor unreachable.
  return max(lr * decay, 0.01)

def get_callbacks(name):
  """Build the training callbacks for the model called `name`.

  Returns a list with, in order:
    * a LearningRateScheduler driven by `lr_decay`,
    * a TensorBoard logger writing to `<logdir>/<name>/<MMDD-HHMM>`,
    * a ModelCheckpoint writing `<name>.weights.hdf5` with save_best_only=True.
  """
  stamp = datetime.datetime.now().strftime("%m%d-%H%M")
  run_dir = "/".join([logdir, name, stamp])

  lr_schedule = LearningRateScheduler(lr_decay, verbose=1)
  board = TensorBoard(log_dir=run_dir, histogram_freq=1)
  best_checkpoint = ModelCheckpoint(filepath=name + '.weights.hdf5',
                                    verbose = 1,
                                    save_best_only=True)
  return [lr_schedule, board, best_checkpoint]

In [13]:
def create_compile(model_def):
  """Build the model described by `model_def` and compile it.

  Args:
    model_def: dict with the keys 'Type' ('LSTM' or 'GRU'), 'hidden_size',
      'seq_len' and 'Drop'; 'name' is additionally read for 'LSTM' models.

  Returns:
    The model, compiled with SGD (momentum, gradient-norm clipping), sparse
    categorical cross-entropy on logits and the `perplexity` metric.

  Raises:
    ValueError: if model_def['Type'] is neither 'LSTM' nor 'GRU'.
  """
  model_type = model_def['Type']
  if model_type == 'LSTM':
    model = LSTM_Based(name = model_def['name'],
                       vocabulary_size = len(word2id),
                       hidden_size = model_def['hidden_size'],
                       seq_len = model_def['seq_len'],
                       Drop = model_def['Drop'])
  elif model_type == 'GRU':
    # NOTE(review): GRU_Based is not defined in the visible part of this
    # notebook, and no name is passed to it - confirm before using 'GRU'.
    model = GRU_Based(vocabulary_size = len(word2id),
                       hidden_size = model_def['hidden_size'],
                       seq_len = model_def['seq_len'],
                       Drop = model_def['Drop'])
  else:
    # Fail early and clearly; an unknown type used to fall through and crash
    # below with an UnboundLocalError on `model`.
    raise ValueError(
        f"Unknown model type {model_type!r}; expected 'LSTM' or 'GRU'")

  # learning_rate here is only the initial value; during training with the
  # callbacks from get_callbacks() the LearningRateScheduler sets the rate.
  optimizer = tf.keras.optimizers.SGD(learning_rate=0.01,
                                      clipnorm=clip_norm,
                                      momentum=momentum)
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  model.compile(optimizer=optimizer, loss=loss, metrics=[perplexity])
  return model

## Train Models:

In [14]:
# One LSTM configuration with dropout and one without; everything else is shared.
model_defs = [
    {'name': model_name, 'Type': 'LSTM', 'hidden_size': hidden_size,
     'seq_len': seq_len, 'Drop': use_dropout}
    for model_name, use_dropout in (('LSTM_Drop', True), ('LSTM_No_Drop', False))
]

In [None]:
# Train every configured model and keep its per-epoch metrics, keyed by name.
history = {}
for model_def in model_defs:
  model = create_compile(model_def)
  model.summary()
  # The datasets are already batched, so batch_size is not passed to fit():
  # Keras documents that it must not be specified for dataset inputs.
  fit_hist = model.fit(x=dataset_train,
                       validation_data=dataset_valid,
                       epochs=epochs,
                       callbacks=get_callbacks(model.name))
  history[model_def['name']] = fit_hist.history
  # Also store the model in its state after the final epoch.
  model.save(model.name + '.h5')

Model: "LSTM_Drop"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 200)           2000000   
_________________________________________________________________
dropout (Dropout)            (None, 25, 200)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 25, 200)           320800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 25, 200)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 200)           320800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 25, 200)           0         
_________________________________________________________________
dense (Dense)                (None, 25, 10000)         20

In [None]:
# Persist the collected training histories so they can be reloaded later.
with open('trainHistoryLSTM', 'wb') as history_file:
  pickle.dump(history, history_file)

## Re-Load Models with best weights and evaluate:

In [None]:
# Rebuild each model, restore the checkpointed weights and score every split.
results = {}
for model_def in model_defs:
  model = create_compile(model_def)
  model.load_weights(model.name + '.weights.hdf5')

  cur_res = {}
  for split, split_dataset in (('train', dataset_train),
                               ('valid', dataset_valid),
                               ('test', dataset_test)):
    # evaluate() yields the loss followed by the compiled perplexity metric.
    split_loss, split_perp = model.evaluate(split_dataset)
    cur_res[split + '_loss'] = split_loss
    cur_res[split + '_perp'] = split_perp
  results[model.name] = cur_res

In [None]:
# Open the TensorBoard UI inline, pointed at the log directory.
%tensorboard --logdir tensorboard_logs/