# Necessary Imports 

In [None]:
!pip install torchtext==0.6.0 --quiet
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, BucketIterator, TabularDataset
import numpy as np
import pandas as pd
import spacy
import random
from torchtext.data.metrics import bleu_score
from pprint import pprint
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary
# Seed every RNG the notebook draws from so that weight initialisation,
# teacher-forcing decisions and shuffling are repeatable across runs.
SEED = 777

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

[K     |████████████████████████████████| 64 kB 1.8 MB/s 
[K     |████████████████████████████████| 1.2 MB 11.3 MB/s 
[?25h

'\n# Seeding for reproducible results everytime\nSEED = 777\n\nrandom.seed(SEED)\nnp.random.seed(SEED)\ntorch.manual_seed(SEED)\ntorch.cuda.manual_seed(SEED)\ntorch.backends.cudnn.deterministic = True'

# 2. Data Preparation & Pre-processing

Load spaCy's English model, which is used for tokenization. spaCy also supports many other languages, such as French and German.



In [None]:
!python -m spacy download en --quiet

[K     |████████████████████████████████| 12.0 MB 5.2 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
# "en" is the shortcut link that the download cell above created for en_core_web_sm.
spacy_english = spacy.load("en")

Now let's create custom tokenization methods for the languages. Tokenization is a process of breaking the sentence into a list of individual tokens (words).

We can make use of PyTorch's TorchText library for data pre-processing and vocabulary building (English and ISL gloss), and SpaCy for tokenization of our data.

In [None]:
def tokenize_english(text):
  """Split `text` into a list of token strings using the spaCy English tokenizer."""
  spacy_tokens = spacy_english.tokenizer(text)
  return list(map(lambda token: token.text, spacy_tokens))

### Sample Run ###

# The tokenizer itself keeps the original casing; lower-casing is done later by the Field.
sample_text_1 = "are you free today"
sample_text_2 = "YOU FREE TODAY"
for sample_text in (sample_text_1, sample_text_2):
  print(tokenize_english(sample_text))

['are', 'you', 'free', 'today']
['YOU', 'FREE', 'TODAY']


In [None]:
# Colab-only: mount Google Drive so the dataset CSVs under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')


So once we get to understand what can be done in torch text, let's talk about how it can be implemented in the torch text module. Here we are going to make use of 3 classes under torch text.

1. Fields :
> This is a class under the torch text, where we specify how the preprocessing should be done on our data corpus.
2. TabularDataset : 
> Using this class, we can actually define the Dataset of columns stored in CSV, TSV, or JSON format and also map them into integers.
3. BucketIterator :
> Using this class, we can group sentences of similar length together, pad them to a common length, and build batches for model training.

Here our source language (SRC - Input) is English and our target language (TRG - Output) is ISL gloss. We also add 2 extra tokens, "start of sequence" `<sos>` and "end of sequence" `<eos>`, for effective model training.

In [None]:
# English is the source side and ISL gloss the target side. Both use the same
# spaCy English tokenizer, are lower-cased, and get <sos>/<eos> markers.
src = Field(tokenize=tokenize_english,
            lower=True,
            init_token="<sos>",
            eos_token="<eos>")

trg = Field(tokenize=tokenize_english,
            lower=True,
            init_token="<sos>",
            eos_token="<eos>")

# CSV column name -> (attribute name on each example, Field that processes it).
fields = {'English': ('src', src), 'ISL-Gloss': ('trg', trg)}

# Single place to edit when the dataset moves; the file names below are
# resolved relative to this directory by `splits`.
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/project'

train_data, valid_data, test_data = TabularDataset.splits(path=DATA_DIR,
                                                          train='data_train.csv',
                                                          validation='data_validate.csv',
                                                          test='data_test.csv',
                                                          format='csv',
                                                          fields=fields)

# Vocabularies are built from the training split only: at most 10000 tokens,
# each seen at least twice; rarer tokens map to <unk>.
src.build_vocab(train_data, max_size=10000, min_freq=2)
trg.build_vocab(train_data, max_size=10000, min_freq=2)

In [None]:
# Vocabulary sizes after build_vocab (these counts include the special tokens).
print(f"Unique tokens in source english vocabulary: {len(src.vocab)}")
print(f"Unique tokens in target isl-gloss vocabulary: {len(trg.vocab)}")

In [None]:
# dir(english.vocab)

# Dump the attributes of the source vocabulary object for inspection.
print(src.vocab.__dict__.keys())
print(list(src.vocab.__dict__.values()))
# `e` stays at module level: the next cell indexes into it (e[3]).
e = list(src.vocab.__dict__.values())
for i in e:
  print(i)

In [None]:
# Token -> index map of the source vocabulary and its inverse. The map is read
# from the named `stoi` attribute instead of by position in
# `__dict__.values()`, which silently breaks if the attribute order changes.
word_2_idx = dict(src.vocab.stoi)
idx_2_word = {index: word for word, index in word_2_idx.items()}

# Dataset sneak peek

In [None]:
# Size of each split.
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

# Inspect one training example: its attribute names, then the values stored under them.
print(train_data[5].__dict__.keys())
pprint(train_data[5].__dict__.values())

After setting the language pre-processing criteria, the next step is to create batches of training, testing and validation data using iterators.

Creating batches is an exhaustive process, luckily we can make use of TorchText's iterator libraries.

Here we are using BucketIterator for effective padding of source and target sentences. We can access the source (English) batch of data using the .src attribute and its corresponding target (ISL gloss) batch of data using the .trg attribute.

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 4

# One iterator per split; examples are sorted inside each batch by source length.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda example: len(example.src),
    device=device,
)

## Tokenized text data before numericalization

In [None]:
# Despite their names, these lists hold the token count of every training
# example; the max/min are taken at the end.
max_len_eng = []
max_len_isl = []
for count, data in enumerate(train_data):
  eng_len = len(data.src)
  isl_len = len(data.trg)
  max_len_eng.append(eng_len)
  max_len_isl.append(isl_len)
  if count >= 10:
    continue
  # Only the first ten examples are printed.
  print("English - ",*data.src, " Length - ", eng_len)
  print("ISL Gloss - ",*data.trg, " Length - ", isl_len)
  print()
count = len(max_len_eng)

print("Maximum Length of English sentence {} and ISL-Gloss sentence {} in the dataset".format(max(max_len_eng),max(max_len_isl)))
print("Minimum Length of English sentence {} and ISL-Gloss sentence {} in the dataset".format(min(max_len_eng),min(max_len_isl)))

In [None]:
# Peek at one numericalized batch. Only the first batch is needed, so stop
# right after it instead of iterating over the whole training set.
for data in train_iterator:
  print("Shapes", data.src.shape, data.trg.shape)
  print()
  print("English - ",*data.src, " Length - ", len(data.src))
  print()
  print("ISL Gloss - ",*data.trg, " Length - ", len(data.trg))
  # Kept for the following cells, which turn this batch into DataFrames.
  temp_eng = data.src
  temp_isl = data.trg
  break

In [None]:
# Move the sampled batch to host memory as NumPy arrays of vocabulary indices.
temp_eng_idx = (temp_eng).cpu().detach().numpy()
temp_isl_idx = (temp_isl).cpu().detach().numpy()

The batch size used here is 4 (`BATCH_SIZE`), and a sample target batch is shown below. The sentences are tokenized into lists of words and indexed according to the vocabulary. The "pad" token gets an index of 1.

Each column corresponds to a sentence indexed into numbers, so a single target batch has one column per sentence (4 here), and the number of rows corresponds to the length of the longest sentence in the batch. Shorter sentences are padded with 1 to compensate.
The table (Idx.csv) contains the numerical indices of the words, which are later fed into the word embedding and converted into a dense representation for Seq2Seq processing.

In [None]:
# One column per sentence in the sampled batch. The column count is taken from
# the array itself so this works for any BATCH_SIZE and for a short last batch.
num_sentences = temp_isl_idx.shape[1]
df_isl_idx = pd.DataFrame(data = temp_isl_idx,
                          columns = [f"S_{n}" for n in range(1, num_sentences + 1)])
df_isl_idx.index.name = 'Time Steps'
df_isl_idx.index = df_isl_idx.index + 1  # time steps are shown 1-based
# df_isl_idx.to_csv('/content/idx.csv')
df_isl_idx

In [None]:
# Decode the target-side indices with the *target* vocabulary; the indices in
# df_isl_idx come from `trg`, so the source vocabulary map would mislabel them.
trg_idx_2_word = dict(enumerate(trg.vocab.itos))
df_isl_word = df_isl_idx.replace(trg_idx_2_word)
# df_isl_word .to_csv('/content/Words.csv')
df_isl_word

# 3. Long Short Term Memory (LSTM) - Under the Hood

# 4. Encoder Model Architecture (Seq2Seq)

# 5. Encoder Code Implementation (Seq2Seq)

In [None]:
class EncoderLSTM(nn.Module):
  """Embed source token indices and encode them with a multi-layer LSTM.

  Args:
    input_size: number of rows in the embedding table (source vocabulary size).
    embedding_size: width of each token embedding.
    hidden_size: width of the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
    p: dropout probability, applied to the embeddings and between LSTM layers.
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
    super().__init__()

    self.hidden_size = hidden_size
    self.num_layers = num_layers

    # Regularization applied to the embedded tokens in forward().
    self.dropout = nn.Dropout(p)
    self.tag = True

    # Lookup table: (input_size, embedding_size).
    self.embedding = nn.Embedding(input_size, embedding_size)

    # Sequence-first LSTM: consumes (seq_len, batch, embedding_size).
    self.LSTM = nn.LSTM(input_size=embedding_size,
                        hidden_size=hidden_size,
                        num_layers=num_layers,
                        dropout=p)

  def forward(self, x):
    """Encode a batch of index sequences.

    Args:
      x: integer tensor of shape (seq_len, batch_size).

    Returns:
      (hidden_state, cell_state), each of shape
      (num_layers, batch_size, hidden_size). The per-step LSTM outputs are
      discarded.
    """
    # (seq_len, batch_size) -> (seq_len, batch_size, embedding_size)
    embedded = self.embedding(x)
    embedded = self.dropout(embedded)

    _, (hidden_state, cell_state) = self.LSTM(embedded)
    return hidden_state, cell_state

# Encoder hyperparameters.
input_size_encoder = len(src.vocab)
encoder_embedding_size = 100
hidden_size = 1024
num_layers = 2
encoder_dropout = 0.5

encoder_lstm = EncoderLSTM(
    input_size=input_size_encoder,
    embedding_size=encoder_embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    p=encoder_dropout,
)
encoder_lstm = encoder_lstm.to(device)
print(encoder_lstm)

# 7. Decoder Code Implementation (Seq2Seq)

In [None]:
class DecoderLSTM(nn.Module):
  """Single-step LSTM decoder producing scores over the target vocabulary.

  Args:
    input_size: number of rows in the embedding table (target vocabulary size).
    embedding_size: width of each token embedding.
    hidden_size: width of the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
    p: dropout probability, applied to the embeddings and between LSTM layers.
    output_size: number of scores produced per step by the final linear layer.
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, output_size):
    super().__init__()

    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.output_size = output_size

    # Regularization applied to the embedded token in forward().
    self.dropout = nn.Dropout(p)

    # Lookup table: (input_size, embedding_size).
    self.embedding = nn.Embedding(input_size, embedding_size)

    # Sequence-first LSTM, driven one time step at a time.
    self.LSTM = nn.LSTM(input_size=embedding_size,
                        hidden_size=hidden_size,
                        num_layers=num_layers,
                        dropout=p)

    # Projects each hidden vector to output_size scores.
    self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

  def forward(self, x, hidden_state, cell_state):
    """Run one decoding step.

    Args:
      x: integer tensor of shape (batch_size,) holding the previous tokens.
      hidden_state: tensor of shape (num_layers, batch_size, hidden_size).
      cell_state: tensor of shape (num_layers, batch_size, hidden_size).

    Returns:
      (predictions, hidden_state, cell_state), where predictions has shape
      (batch_size, output_size) and the states are the updated LSTM states.
    """
    # Add a length-1 time axis: (batch_size,) -> (1, batch_size)
    step_input = x.unsqueeze(0)

    # (1, batch_size, embedding_size)
    embedded = self.dropout(self.embedding(step_input))

    # (1, batch_size, hidden_size) plus the updated states
    outputs, (hidden_state, cell_state) = self.LSTM(embedded, (hidden_state, cell_state))

    # (1, batch_size, output_size) -> (batch_size, output_size)
    predictions = self.fc(outputs).squeeze(0)

    return predictions, hidden_state, cell_state

# Decoder hyperparameters; input and output sizes are both the target vocabulary size.
input_size_decoder = len(trg.vocab)
decoder_embedding_size = 100
hidden_size = 1024
num_layers = 2
decoder_dropout = 0.5
output_size = len(trg.vocab)

decoder_lstm = DecoderLSTM(
    input_size=input_size_decoder,
    embedding_size=decoder_embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    p=decoder_dropout,
    output_size=output_size,
)
decoder_lstm = decoder_lstm.to(device)
print(decoder_lstm)

# 8. Seq2Seq (Encoder + Decoder) Interface

In [None]:
# Grab the first training batch only, to look at the tensor shapes.
for batch in train_iterator:
  print(batch.src.shape)
  print(batch.trg.shape)
  break

# Row 1 of the target batch: the token at time step 1 for every sentence in the batch.
x = batch.trg[1]
print(x)

# 9. Seq2Seq (Encoder + Decoder) Code Implementation

In [None]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes with optional teacher forcing.

  Args:
    Encoder_LSTM: module mapping a (src_len, batch) index tensor to a
      (hidden_state, cell_state) pair.
    Decoder_LSTM: module mapping (tokens, hidden_state, cell_state) to
      (scores, hidden_state, cell_state); it must expose `output_size`, the
      number of scores it produces per step.
  """

  def __init__(self, Encoder_LSTM, Decoder_LSTM):
    super(Seq2Seq, self).__init__()
    self.Encoder_LSTM = Encoder_LSTM
    self.Decoder_LSTM = Decoder_LSTM

  def forward(self, source, target, tfr=0.5):
    """Decode `target` step by step, conditioned on `source`.

    Args:
      source: integer tensor of shape (src_len, batch_size).
      target: integer tensor of shape (trg_len, batch_size); row 0 holds the
        start-of-sequence tokens.
      tfr: teacher-forcing ratio, i.e. the probability of feeding the
        ground-truth token (instead of the model's own prediction) as the
        next decoder input.

    Returns:
      Tensor of shape (trg_len, batch_size, output_size). Row 0 is left as
      zeros because decoding starts at step 1.
    """
    batch_size = source.shape[1]
    target_len = target.shape[0]

    # Taken from the decoder itself rather than from a notebook-level
    # vocabulary object, so the module does not depend on global state.
    target_vocab_size = self.Decoder_LSTM.output_size

    # Allocated on the same device as the input batch.
    outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=source.device)

    # Final encoder states seed the decoder (the context vectors).
    hidden_state, cell_state = self.Encoder_LSTM(source)

    # First decoder input: the <sos> token of every sentence in the batch.
    x = target[0]

    for i in range(1, target_len):
      # output: (batch_size, output_size)
      output, hidden_state, cell_state = self.Decoder_LSTM(x, hidden_state, cell_state)
      outputs[i] = output
      best_guess = output.argmax(1)  # highest-scoring token per sentence
      # One draw per time step: the whole batch is either teacher-forced or not.
      x = target[i] if random.random() < tfr else best_guess

    return outputs


In [None]:
# Hyperparameters

learning_rate = 0.001

# TensorBoard logging: `step` counts optimizer updates across all epochs.
step = 0
writer = SummaryWriter("runs/loss_plot")

# Padding positions in the target do not contribute to the loss.
pad_idx = trg.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

model = Seq2Seq(encoder_lstm, decoder_lstm).to(device)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Display the assembled model's module tree.
model

In [None]:
def translate_sentence(model, sentence, german, english, device, max_length=50):
    """Greedily decode one sentence with a trained Seq2Seq model.

    The parameter names are historical: `german` is the source-side field and
    `english` the target-side field, whatever the actual languages are.

    Args:
        model: object exposing `Encoder_LSTM` and `Decoder_LSTM`.
        sentence: a raw string (tokenized with the spaCy "en" tokenizer) or an
            already tokenized sequence of strings. Tokens are lower-cased in
            both cases.
        german: source field; provides `init_token`, `eos_token`, `vocab.stoi`.
        english: target field; provides `vocab.stoi` and `vocab.itos`.
        device: device on which the input tensors are created.
        max_length: maximum number of decoding steps.

    Returns:
        List of predicted target tokens without the leading "<sos>". The list
        ends with "<eos>" only if the model produced it within `max_length`
        steps.

    Note:
        The model's train/eval mode is not changed here; call `model.eval()`
        first if dropout should be disabled.
    """
    if isinstance(sentence, str):
        # Loading a spaCy model is slow, so it is loaded once and cached on the
        # function. It is not needed at all for pre-tokenized input.
        nlp = getattr(translate_sentence, "_spacy_pipeline", None)
        if nlp is None:
            nlp = spacy.load("en")
            translate_sentence._spacy_pipeline = nlp
        # Only tokenization is needed, so the rest of the pipeline is skipped.
        tokens = [token.text.lower() for token in nlp.tokenizer(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [german.init_token, *tokens, german.eos_token]
    text_to_indices = [german.vocab.stoi[token] for token in tokens]
    # Column vector: (src_len, 1), i.e. a batch holding one sentence.
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_idx = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]

    with torch.no_grad():
        # Encoder states seed the decoder.
        hidden, cell = model.Encoder_LSTM(sentence_tensor)

        for _ in range(max_length):
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.Decoder_LSTM(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Model predicts it's the end of the sentence
            if best_guess == eos_idx:
                break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]
    return translated_sentence[1:]

def bleu(data, model, german, english, device):
    """Corpus BLEU of greedy translations against the reference targets.

    Args:
        data: iterable of examples exposing `src` and `trg` token lists.
        model: Seq2Seq model passed through to `translate_sentence`.
        german: source field (historical name), passed to `translate_sentence`.
        english: target field (historical name), passed to `translate_sentence`.
        device: device used for decoding.

    Returns:
        The value returned by `bleu_score` for all predictions and references.
    """
    targets = []
    outputs = []

    # Evaluate with dropout disabled, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        for example in data:
            source_tokens = vars(example)["src"]
            target_tokens = vars(example)["trg"]

            prediction = translate_sentence(model, source_tokens, german, english, device)
            # Strip the end marker only when the model actually produced one;
            # a translation cut off at max_length keeps all of its tokens.
            if prediction and prediction[-1] == "<eos>":
                prediction = prediction[:-1]

            targets.append([target_tokens])
            outputs.append(prediction)
    finally:
        model.train(was_training)

    return bleu_score(outputs, targets)

def checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss):
    """Write a training checkpoint and a separate state-dict file under /content.

    Args:
        model: model to save (stored whole in the checkpoint, and as a
            state dict in the second file).
        best_loss: best loss so far, stored in the checkpoint.
        epoch: epoch index, stored in the checkpoint.
        optimizer: optimizer whose state dict is stored in the checkpoint.
        epoch_loss: accepted for call compatibility; not used.
    """
    print('saving', end='\n\n')
    state = dict(
        model=model,
        best_loss=best_loss,
        epoch=epoch,
        rng_state=torch.get_rng_state(),
        optimizer=optimizer.state_dict(),
    )
    torch.save(state, '/content/checkpoint-NMT')
    torch.save(model.state_dict(),'/content/checkpoint-NMT-SD')

# 10. Seq2Seq Model Training

In [None]:
num_epochs = 100
patience = 10            # epochs without improvement before stopping early
best_loss = float("inf")
best_epoch = -1
epoch_loss = 0.0
sentence1 = "I am sleeping."
ts1  = []                # translation of sentence1 at the start of every epoch

for epoch in range(num_epochs):
  print("Epoch - {} / {}".format(epoch+1, num_epochs))

  # Track progress on a fixed example, with dropout disabled.
  model.eval()
  translated_sentence1 = translate_sentence(model, sentence1, src, trg, device, max_length=50)
  print(f"Translated example sentence 1: \n {translated_sentence1}")
  ts1.append(translated_sentence1)

  model.train(True)
  # Reset per epoch; otherwise the sum only grows and the best-loss check
  # below can succeed in the first epoch alone.
  epoch_loss = 0.0
  for batch_idx, batch in enumerate(train_iterator):
    inputs = batch.src.to(device)
    target = batch.trg.to(device)

    # Pass the input and target for model's forward method
    output = model(inputs, target)
    # Drop time step 0 (<sos>) and flatten to (steps * batch, vocab) / (steps * batch,)
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    # Clear the accumulating gradients
    optimizer.zero_grad()

    # Loss for this batch
    loss = criterion(output, target)

    # Calculate the gradients for weights & biases using back-propagation
    loss.backward()

    # Clip the gradient norm if it exceeds 1
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    # Update the weights using the gradients computed above
    optimizer.step()
    step += 1
    epoch_loss += loss.item()
    writer.add_scalar("Training loss", loss.item(), global_step=step)

  # Mean batch loss of this epoch (the original printed the last batch's loss).
  print("Epoch_Loss - {}".format(epoch_loss / len(train_iterator)))
  print()

  if epoch_loss < best_loss:
    best_loss = epoch_loss
    best_epoch = epoch
    checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss)
  elif (epoch - best_epoch) >= patience:
    # Checked only on non-improving epochs; inside the improvement branch the
    # difference is always zero.
    print(f"no improvement in {patience} epochs, break")
    break

writer.flush()
print(epoch_loss / len(train_iterator))

# Evaluate with dropout disabled.
model.eval()
# NOTE(review): the slice starts at 1, so the first test example is skipped — confirm this is intended.
score = bleu(test_data[1:100], model, src, trg, device)
print(f"Bleu score {score*100:.2f}")

In [None]:
# BLEU over the whole test split. Switch to eval mode first so dropout does
# not perturb the translations; the training cell may leave the model in
# train mode. The score is printed without the x100 scaling used after training.
model.eval()
score = bleu(test_data, model, src, trg, device)
print(f"Bleu score {score}")

In [None]:
%load_ext tensorboard
%tensorboard --logdir runs/

# 11. Seq2Seq Model Inference

In [None]:
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer

# One detokenized string per entry of ts1 (the example translation recorded each epoch).
progress = [TreebankWordDetokenizer().detokenize(sen) for sen in ts1]
print(progress)

In [None]:
# One row per entry of `progress`; the row index is labelled "Epochs".
progress_df = pd.DataFrame(data = progress, columns=['Predicted Sentence'])
progress_df.index.name = "Epochs"
# Persist the table to CSV.
progress_df.to_csv('/content/predicted_sentence.csv')
progress_df.head()

# Model Inference

In [None]:
# Translate a few hand-picked sentences and compare with their reference glosses.
model.eval()
test_sentences  = ["He stuck the broken pieces together.", "The food is getting cold.", "I cooked dinner."]
actual_sentences  = ["he broken piece stick broken", "food cold get", "i dinner cook"]
# Predictions are collected here rather than appended to `progress`, which
# holds the per-epoch training history and would otherwise be polluted.
pred_sentences = []
detokenizer = TreebankWordDetokenizer()

for english_sentence, actual_gloss in zip(test_sentences, actual_sentences):
  translated_sentence = translate_sentence(model, english_sentence, src, trg, device, max_length=50)
  pred_sentences.append(detokenizer.detokenize(translated_sentence))
  print("English : {}".format(english_sentence))
  print("Actual Sentence in ISL Gloss : {}".format(actual_gloss))
  print("Predicted Sentence in ISL Gloss : {}".format(pred_sentences[-1]))
  print()
