<a href="https://colab.research.google.com/github/oikn2018/CS6910_assignment_3/blob/main/AP_NMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torchtext==0.6.0
import locale
locale.getpreferredencoding = lambda: "UTF-8"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
# from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

In [None]:
# Fetch the German and English spaCy model packages so they can be loaded by
# name with spacy.load().
!python3 -m spacy download de_core_news_md
!python3 -m spacy download en_core_web_md 


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de-core-news-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.5.0/de_core_news_md-3.5.0-py3-none-any.whl (44.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-md
Successfully installed de-core-news-md-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_md')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0

In [None]:
# Load the German and English spaCy pipelines; the tokenizer functions in this
# cell use their `.tokenizer` attribute.
spacy_ger = spacy.load('de_core_news_md')
spacy_eng = spacy.load('en_core_web_md')

def tokenizer_ger(text):
  """Tokenize German text with spaCy and return the tokens as a list of strings."""
  doc = spacy_ger.tokenizer(text)
  return [token.text for token in doc]

def tokenizer_eng(text):
  """Tokenize English text with spaCy and return the tokens as a list of strings."""
  doc = spacy_eng.tokenizer(text)
  return [token.text for token in doc]

In [None]:
# One Field per language: text is lower-cased, tokenized with the matching
# spaCy tokenizer, and wrapped in <sos> ... <eos> markers.
german = Field(
    tokenize=tokenizer_ger,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)
english = Field(
    tokenize=tokenizer_eng,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

In [None]:
# Load the Multi30k train/validation/test splits; the '.de' side is processed
# by the `german` field and the '.en' side by the `english` field.
train_data, val_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(german, english),
)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 1.05MB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 283kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 276kB/s]


In [None]:
# Build both vocabularies from the training split only. A token must occur at
# least twice (min_freq=2) to get its own entry, and max_size=10000 caps the
# vocabulary size.
german.build_vocab(
    train_data,
    max_size=10000,
    min_freq=2,
)
english.build_vocab(
    train_data,
    max_size=10000,
    min_freq=2,
)

In [None]:
class Encoder(nn.Module):
  """LSTM encoder that summarises a source sentence as its final LSTM states.

  Args:
    input_size: number of entries in the source vocabulary.
    embedding_size: dimensionality of each token embedding.
    hidden_size: width of the LSTM hidden/cell state.
    num_layers: number of stacked LSTM layers.
    p: dropout probability, used on the embeddings and passed to nn.LSTM.
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
    super().__init__()
    self.hidden_size, self.num_layers = hidden_size, num_layers

    self.dropout = nn.Dropout(p)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.rnn = nn.LSTM(
        input_size=embedding_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=p,
    )

  def forward(self, x):
    """Encode a batch of token-index sequences.

    Args:
      x: token indices of shape (seq_length, N), where N is the batch size.

    Returns:
      The (hidden, cell) pair produced by the LSTM; together they form the
      context handed to the decoder. The per-step outputs are discarded.
    """
    # (seq_length, N) -> (seq_length, N, embedding_size)
    embedded = self.dropout(self.embedding(x))

    _, (hidden, cell) = self.rnn(embedded)
    return hidden, cell

class Decoder(nn.Module):
  """LSTM decoder that predicts one target token per call for a whole batch.

  Args:
    input_size: number of entries in the target vocabulary.
    embedding_size: dimensionality of each token embedding.
    hidden_size: width of the LSTM state; it has to equal the encoder's
      hidden size because the encoder states initialise this LSTM.
    output_size: number of scores produced per token (same as input_size
      when predicting over the target vocabulary).
    num_layers: number of stacked LSTM layers.
    p: dropout probability, used on the embeddings and passed to nn.LSTM.
  """

  def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
    super().__init__()
    self.hidden_size, self.num_layers = hidden_size, num_layers

    self.dropout = nn.Dropout(p)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.rnn = nn.LSTM(
        input_size=embedding_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=p,
    )

    # Projects each LSTM output onto the vocabulary scores.
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x, hidden, cell):
    """Run a single decoding step.

    Args:
      x: previous token index for every sentence in the batch, shape (N).
      hidden: LSTM hidden state from the previous step (or the encoder).
      cell: LSTM cell state from the previous step (or the encoder).

    Returns:
      (predictions, hidden, cell) where predictions has shape
      (N, output_size) and hidden/cell are the updated LSTM states.
    """
    # The LSTM expects a sequence axis: (N) -> (1, N).
    step_input = x.unsqueeze(0)

    # (1, N) -> (1, N, embedding_size)
    embedded = self.dropout(self.embedding(step_input))

    # rnn_out: (1, N, hidden_size)
    rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))

    # (1, N, output_size) -> (N, output_size)
    predictions = self.fc(rnn_out).squeeze(0)

    return predictions, hidden, cell


class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes the target one token at a time.

  Args:
    encoder: module whose forward(source) returns (hidden, cell).
    decoder: module whose forward(x, hidden, cell) returns
      (predictions, hidden, cell) and whose final layer is exposed as `fc`
      (an nn.Linear); its `out_features` gives the target vocabulary size.
  """

  def __init__(self, encoder, decoder):
    super(Seq2Seq, self).__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, teacher_force_ratio = 0.5):
    """Produce vocabulary scores for every target position.

    Args:
      source: source token indices, shape (source_len, N) with N the batch size.
      target: target token indices, shape (target_len, N); target[0] is the
        start token fed to the decoder first.
      teacher_force_ratio: probability, per step, of feeding the ground-truth
        token instead of the decoder's own best guess as the next input.

    Returns:
      Tensor of shape (target_len, N, target_vocab_size). Row 0 is never
      written and stays zero because decoding starts at position 1.
    """
    batch_size = source.shape[1]
    target_len = target.shape[0]
    # Read the vocabulary size from the decoder's output layer and the device
    # from the input, so the model does not depend on notebook-level globals.
    target_vocab_size = self.decoder.fc.out_features

    # One score vector over the whole vocabulary per (position, sentence).
    outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=source.device)
    hidden, cell = self.encoder(source)

    # Grab start token
    x = target[0]

    # Feed the decoder one token at a time.
    for t in range(1, target_len):
      output, hidden, cell = self.decoder(x, hidden, cell)

      outputs[t] = output
      # output: (N, target_vocab_size) -> index of the highest-scoring token.
      best_guess = output.argmax(1)

      # Teacher forcing: sometimes feed the ground truth, otherwise the guess.
      x = target[t] if random.random() < teacher_force_ratio else best_guess

    return outputs



In [None]:
### The model is now ready to train

In [None]:
# Reproducibility: seed the RNGs that drive weight initialisation, dropout
# and the teacher-forcing coin flips so a fresh run gives the same results.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Training Hyperparameters
num_epochs = 20
learning_rate = 0.001
batch_size = 64

# Model Hyperparameters
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard
writer = SummaryWriter('runs/loss_plot')
step = 0

# Batch examples of similar source length together (sorted within each batch)
# to minimise padding and save compute; batches are placed on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda example: len(example.src),
    device=device,
)

# Build the encoder/decoder pair and wrap them in the seq2seq model.
encoder_net = Encoder(
    input_size_encoder,
    encoder_embedding_size,
    hidden_size,
    num_layers,
    enc_dropout,
).to(device)
decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Target positions holding the <pad> index are excluded from the loss.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
  # Restore model and optimizer state directly: no `load_checkpoint` helper is
  # defined in this notebook. The keys match the `checkpoint` dict assembled
  # in the training loop. map_location lets a GPU checkpoint load on CPU.
  saved_state = torch.load('my_checkpoint.pth.ptar', map_location=device)
  model.load_state_dict(saved_state['state_dict'])
  optimizer.load_state_dict(saved_state['optimizer'])


# Example sentence and reference translation for the (currently disabled)
# per-epoch translation check below.
sentence = 'Ein Boot mit anderen Männern wird von einem großen Pferdegespann ans Ufer gezogen.'
og_translation = 'a boat with other men is pulled to the shore by a large team of horses.'
for epoch in range(num_epochs):
  print(f'Epoch [{epoch} / {num_epochs}]')

  # Snapshot of the current model/optimizer state.
  # NOTE(review): this dict is never written to disk in this cell -- confirm
  # whether a save step is intended.
  checkpoint = {
      'state_dict': model.state_dict(),
      'optimizer': optimizer.state_dict()
  }

  model.eval() # turns off Dropout
  # translated_sentence = translate(model, sentence, german, english, device, max_length=50)
  # print(f'Translated example sentence \n {og_translation},{translated_sentence}')
  model.train()

  # Accumulate the loss over the whole epoch so the reported value is the mean
  # over all batches rather than whatever the last mini-batch happened to be.
  epoch_loss = 0.0
  num_batches = 0

  for batch_idx, batch in enumerate(train_iterator):
    inp_data = batch.src.to(device)
    target = batch.trg.to(device)

    output = model(inp_data, target)
    # output shape: (target_len, batch_size, output_dim)
    # CrossEntropyLoss expects (N, C) scores and (N) targets, e.g. (N, 10) and
    # (N) for MNIST, so flatten the time and batch axes together.

    # Drop position 0 (the start token) and keep the last dim (output_dim).
    output = output[1:].reshape(-1, output.shape[2])
    # target -> (target_len, batch_size)
    target = target[1:].reshape(-1)
    optimizer.zero_grad()
    loss = criterion(output, target)

    loss.backward()

    # to avoid exploding gradients, clip them when they are above a threshold
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()

    # .item() yields a plain float, so no autograd graph is kept alive.
    epoch_loss += loss.item()
    num_batches += 1
    # writer.add_scalar('Training Loss: ', loss, global_step=step)
    # step+=1

  print('Training Loss: ', epoch_loss / max(num_batches, 1))


Epoch [0 / 20]
Training Loss:  tensor(4.1217, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch [1 / 20]
Training Loss:  tensor(3.8926, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch [2 / 20]
Training Loss:  tensor(2.8902, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch [3 / 20]
Training Loss:  tensor(3.9690, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch [4 / 20]
Training Loss:  tensor(3.5430, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch [5 / 20]
Training Loss:  tensor(3.5687, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch [6 / 20]
Training Loss:  tensor(2.6264, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch [7 / 20]
Training Loss:  tensor(1.9372, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch [8 / 20]
Training Loss:  tensor(2.3012, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch [9 / 20]
Training Loss:  tensor(1.9115, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch [10 / 20]
Training Loss:  tensor(1.9724, device='cuda:0', grad_fn=<NllLossBackward0>