In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Check GPU 

In [2]:
from tensorflow.python.client import device_lib

# Keep only the GPU entries from TensorFlow's list of local devices.
# An empty list as the cell output means no GPU is visible to this runtime.
[device for device in device_lib.list_local_devices() if device.device_type == 'GPU']

[]

## Clone GitHub repository

In [None]:
# Clone the project repository into the current working directory.
# A later cell changes into its language-model/ subdirectory.
# Re-running this after the clone exists makes git report an error and leaves
# the existing directory untouched.
! git clone https://github.com/nabacg/aml-nlp-notes.git

In [None]:
import os

# Switch the kernel's working directory to the language-model folder of the
# cloned repository. Later cells refer to download_cornell.sh and
# data/starspace_embeddings.tsv by relative path, so they presumably expect to
# run from here (confirm against the repository layout).
# The absolute path assumes the repository was cloned while in /content (Colab).
print(os.getcwd())  # working directory before the switch
os.chdir('/content/aml-nlp-notes/language-model')
print(os.getcwd())  # working directory after the switch

## Download data files

## Cornell dataset

In [None]:
# Run the repository's download script for the Cornell dataset. The script is
# looked up relative to the current working directory; its contents are not
# shown in this notebook.
! sh download_cornell.sh

## Imports required

In [14]:
import tensorflow as tf
import os
from seq2seqdataprocessing import load_dataset, word_to_idx
from sklearn.model_selection import train_test_split
from language_model import Encoder, Decoder, train_model, answer 


1.13.1


## Prepare training and validation datasets

In [43]:
dataset_name = 'cornell'

# Load the encoded input/target tensors for the chosen dataset.
# dict_index[0] is used later as the word -> index mapping; the exact structure
# of the remaining return values is defined by load_dataset (not shown here).
input_tensor, target_tensor, dict_index, max_length_inp, max_length_targ = load_dataset(
    dataset_name=dataset_name, max_sentence_length=20)

# Hold out 10% of the pairs for validation (a 90-10 split).
# random_state is fixed so the partition is identical on every
# Restart & Run All; without it the validation set changes from run to run.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.1, random_state=42)

# Show the sizes of the four resulting splits
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

100%|██████████| 83097/83097 [00:07<00:00, 11151.71it/s]


(22312, 22312, 2480, 2480)

In [44]:
# --- Tunable hyper-parameters ---
BATCH_SIZE = 64
embedding_dim = 256
units = 1024

# --- Values derived from the loaded data ---
BUFFER_SIZE = len(input_tensor_train)
N_BATCH = BUFFER_SIZE // BATCH_SIZE  # number of full batches per pass over the data
word2idx = dict_index[0]
vocab_size = len(word2idx)

# Shuffle with a buffer as large as the training set, then emit fixed-size
# batches; the final partial batch is dropped so every batch has BATCH_SIZE rows.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((input_tensor_train, target_tensor_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

## Create encoder and decoder instances

In [None]:
# Load pre-trained embeddings from a TSV file, passing the vocabulary mapping
# word2idx to the loader.
# NOTE(review): load_embeddings is not imported in any cell visible in this
# notebook, so on a fresh kernel this line raises NameError - confirm which
# project module provides it and add it to the imports cell.
# NOTE: this call rebinds embedding_dim, replacing the value 256 assigned in the
# hyper-parameter cell above.
embedding, embedding_dim, emb_matrix  = load_embeddings('data/starspace_embeddings.tsv', word2idx)

In [None]:
# Embedding layer whose weights start as emb_matrix and are frozen
# (trainable=False), so training does not update them.
# Approach taken from:
#   https://stackoverflow.com/a/52566623
#   https://github.com/tensorflow/tensorflow/issues/14392
pretrained_emb = tf.keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    embeddings_initializer=tf.initializers.constant(emb_matrix),
    trainable=False,
)

In [46]:
# Build the two halves of the seq2seq model from the same hyper-parameters.
# Both receive the *same* pretrained_emb layer object, so the embedding layer
# is shared between encoder and decoder.
encoder = Encoder(vocab_size, embedding_dim, units, BATCH_SIZE, pretrained_emb)
decoder = Decoder(vocab_size, embedding_dim, units, BATCH_SIZE, pretrained_emb)
# Adam optimizer with TensorFlow's default settings (no arguments are passed).
optimizer = tf.train.AdamOptimizer()

## Mount Google Drive to save model checkpoints

In [None]:
# Run this cell to mount your Google Drive at /content/drive.
# google.colab is imported here rather than in the imports cell, presumably
# because the package is only available inside Colab - skip this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

## Checkpoints 

In [40]:
# Checkpoints are written next to the notebook, under ./training_checkpoints,
# with file names starting with "ckpt".
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

# The checkpoint object tracks the optimizer together with both models.
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder,
)

### Set checkpoint directory on Google Drive

In [None]:
# Alternative checkpoint location on the mounted Google Drive (requires the
# Drive mount cell to have been run).
# Running this cell after the local-checkpoint cell rebinds checkpoint_dir,
# checkpoint_prefix and checkpoint, so run only the one you intend to use.
checkpoint_dir = "/content/drive/My Drive/cornell-ds-training"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

# The checkpoint object tracks the optimizer together with both models.
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder,
)

### Restore previous checkpoints 

In [49]:
# Display the path of the most recent checkpoint found in checkpoint_dir
# (per the TensorFlow docs this is None when no checkpoint exists there).
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt-12'

In [None]:
# Restore the latest checkpoint in checkpoint_dir into the objects tracked by
# `checkpoint` (optimizer, encoder, decoder).
# NOTE(review): when checkpoint_dir holds no checkpoint, latest_checkpoint
# returns None; the TensorFlow docs describe restore(None) as restoring nothing
# rather than raising - confirm this is the intended behaviour on a first run.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

## Training 

In [None]:
# Train the encoder/decoder pair for 3 epochs on the batched dataset.
# - N_BATCH is the number of full batches in one pass over `dataset`.
# - start_word_index is the vocabulary index looked up for the '<start>' token.
# - save_checkpoint is a zero-argument callback that writes a checkpoint under
#   checkpoint_prefix; when train_model invokes it is decided inside
#   train_model (not shown in this notebook).
train_model(encoder, 
            decoder, 
            optimizer,
            dataset, 
            BATCH_SIZE, 
            N_BATCH,
            start_word_index=word_to_idx(word2idx, '<start>'),
            epochs=3,
            save_checkpoint=lambda: checkpoint.save(file_prefix = checkpoint_prefix))

## Evaluation

In [None]:
# Smoke-test the trained model with a sample utterance; whatever answer()
# returns is shown as the cell output (presumably the generated reply -
# confirm against language_model.answer).
answer("Hi", encoder, decoder, dict_index, units, max_length_targ)