<a href="https://colab.research.google.com/github/nrajmalwar/END2.0/blob/main/Session_09/Bleu_Perplexity_Bert_Score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download the English and German languages for tokenization from spacy
%%bash
python -m spacy download en
python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [2]:
!pip install transformers==3.0.0
!pip install bert_score==0.3.4



In [3]:
# Check GPU
!nvidia-smi

Sun Jul 11 04:06:57 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P8    12W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

# Import classes and functions for modern way of building data pipeline
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from typing import Iterable, List

import spacy
import numpy as np

import random
import math
import time

# remove truncation warning for bert tokenizer
import logging
logging.basicConfig(level=logging.ERROR)

Then set a random seed for deterministic results/reproducability.

In [5]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

Instantiate our German and English spaCy models.

In [6]:
# Define Source and Target Language
SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

Create our fields to process our data. This will append the "start of sentence" and "end of sentence" tokens as well as converting all words to lowercase.

In [7]:
# Place-holders
token_transform = {}
vocab_transform = {}

# Create source and target language tokenizer. Make sure to install the dependencies.
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en')

# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

Load our data.

In [8]:
# We use train, valid and test data as dataloader objects defined later in model training and evaluation functions 
train_iterator = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))

We'll also print out an example just to double check they're not reversed.

In [9]:
print(next(train_iterator))

('Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.\n', 'Two young, White males are outside near many bushes.\n')


Then create our vocabulary, converting all tokens appearing less than twice into `<unk>` tokens.

In [10]:
# Modern way of building vocab
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  # Training data Iterator 
  train_iterator = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
  
  # Create torchtext's Vocab object 
  vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iterator, ln),
                                                    min_freq=2,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set UNK_IDX as the default index. This index is returned when the token is not found. 
# If not set, it throws RuntimeError when the queried token is not found in the Vocabulary. 
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)

Device:  cuda


## Building the Seq2Seq Model

### Encoder


In [12]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        
        self.embedding = nn.Embedding(input_dim, emb_dim) #no dropout as only one layer!
        
        self.rnn = nn.GRU(emb_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, hidden = self.rnn(embedded) #no cell state!
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        
        return hidden

## Decoder

In [13]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)
        
        self.fc_out = nn.Linear(emb_dim + hid_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, context):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #context = [n layers * n directions, batch size, hid dim]
        
        #n layers and n directions in the decoder will both always be 1, therefore:
        #hidden = [1, batch size, hid dim]
        #context = [1, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        emb_con = torch.cat((embedded, context), dim = 2)
            
        #emb_con = [1, batch size, emb dim + hid dim]
            
        output, hidden = self.rnn(emb_con, hidden)
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        
        #seq len, n layers and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        output = torch.cat((embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)), 
                           dim = 1)
        
        #output = [batch size, emb dim + hid dim * 2]
        
        prediction = self.fc_out(output)
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden

## Seq2Seq Model


In [14]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is the context
        context = self.encoder(src)
        
        #context also used as the initial hidden state of the decoder
        hidden = context
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden state and the context state
            #receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, context)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

# Training the Seq2Seq Model


In [15]:
# Modern way of retrieving vocab of source and target language
INPUT_DIM = len(vocab_transform[SRC_LANGUAGE])
OUTPUT_DIM = len(vocab_transform[TGT_LANGUAGE])

ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# We define batch size that will be used for the dataloader objects
BATCH_SIZE = 128

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)

In [16]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(8015, 256)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(6192, 256)
    (rnn): GRU(768, 512)
    (fc_out): Linear(in_features=1280, out_features=6192, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [17]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 14,720,816 trainable parameters


We initiaize our optimizer.

In [18]:
optimizer = optim.Adam(model.parameters())

We also initialize the loss function, making sure to ignore the loss on `<pad>` tokens.

In [19]:
# Pad token idx is already predefined in the modern way
loss_fn = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [20]:
######################################################################
# Collation
# ---------
#   
# As seen in the ``Data Sourcing and Processing`` section, our data iterator yields a pair of raw strings. 
# We need to convert these string pairs into the batched tensors that can be processed by our ``Seq2Seq`` network 
# defined previously. Below we define our collate function that convert batch of raw strings into batch tensors that
# can be fed directly into our model.   
#

from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]), 
                      torch.tensor(token_ids), 
                      torch.tensor([EOS_IDX])))

# src and tgt language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tesors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

We then create the training loop...

In [21]:
######################################################################
# Modern way of training using dataloaders with collate function 
# Let's define training and evaluation loop that will be called for each 
# epoch.
#

from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    
    for src, tgt in train_dataloader:
        src = src.to(device)
        tgt = tgt.to(device)


        optimizer.zero_grad()
        output = model(src, tgt)
        output = output[1:].view(-1, output.shape[-1])
        tgt = tgt[1:].view(-1)
        loss = loss_fn(output, tgt)
        loss.backward()
        clip = 1
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        losses += loss.item()

    return losses / len(train_dataloader)

...and the evaluation loop, remembering to set the model to `eval` mode and turn off teaching forcing.

In [22]:
######################################################################
# Modern way of evaluating using dataloaders with collate function 

def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(device)
        tgt = tgt.to(device)

        output = model(src, tgt)
        output = output[1:].view(-1, output.shape[-1])
        tgt = tgt[1:].view(-1)
        loss = loss_fn(output, tgt)
        losses += loss.item()

    return losses / len(val_dataloader)

We'll also define the function that calculates how long an epoch takes.

In [23]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

Then, we train our model, saving the parameters that give us the best validation loss.

# BLEU Score

In [24]:
from torchtext.data.metrics import bleu_score

def calculate_bleu(model):
  model.eval()
  losses = 0

  # calculate score on test data
  test_iter = Multi30k(split='test', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
  test_dataloader = DataLoader(test_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

  tgt_sentences = []
  output_sentences = []

  for src, tgt in test_dataloader:
      src = src.to(device)
      tgt = tgt.to(device)

      output = model(src, tgt)
      output = torch.argmax(output, dim=2)

      # transpose to have batch first, size=[batch_size, text_length]
      tgt_list = torch.transpose(tgt, 1, 0).tolist()
      output_list = torch.transpose(output, 1, 0).tolist()

      for x, y in zip(tgt_list, output_list):
        # change integer to strings
        tgt_line = vocab_transform[TGT_LANGUAGE].lookup_tokens(x)
        # remove first token and truncate at first <eos> token found
        tgt_line = tgt_line[1:tgt_line.index("<eos>")]

        output_line = vocab_transform[TGT_LANGUAGE].lookup_tokens(y)
        # only if <eos> is found, we truncate the line at that point
        if "<eos>" in output_line:
          output_line = output_line[1:output_line.index("<eos>")]

        # collect all the lines in a list
        tgt_sentences.append([tgt_line])
        output_sentences.append(output_line)

  return bleu_score(output_sentences, tgt_sentences)

# BERT Score

In [25]:
from bert_score import score

def calculate_bert(model):
  model.eval()
  losses = 0

  # calculate score on test data
  test_iter = Multi30k(split='test', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
  test_dataloader = DataLoader(test_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

  tgt_sentences = []
  output_sentences = []

  for src, tgt in test_dataloader:
    src = src.to(device)
    tgt = tgt.to(device)

    output = model(src, tgt)
    output = torch.argmax(output, dim=2)

    # transpose to have batch first, size=[batch_size, text_length]
    tgt_list = torch.transpose(tgt, 1, 0).tolist()
    output_list = torch.transpose(output, 1, 0).tolist()

    for x, y in zip(tgt_list, output_list):
      # change integer to strings
      tgt_line = vocab_transform[TGT_LANGUAGE].lookup_tokens(x)
      # remove first token and truncate at first <eos> token found
      tgt_line = tgt_line[1:tgt_line.index("<eos>")]
      # create a single string for each line. This is the requirement for BERT score
      tgt_line = " ".join(tgt_line)

      output_line = vocab_transform[TGT_LANGUAGE].lookup_tokens(y)
      # only if <eos> is found, we truncate the line at that point
      if "<eos>" in output_line:
        output_line = output_line[1:output_line.index("<eos>")]
      output_line = " ".join(output_line)

      # collect all the lines in a list
      tgt_sentences.append([tgt_line])
      output_sentences.append(output_line)

  P, R, F1 = score(output_sentences, tgt_sentences, lang="en", verbose=False)

  return P.mean(), R.mean(), F1.mean()

# Model Training

In [26]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    # Use dataloaders directly in train and evaluate functions, instead of passing iterator objects
    train_loss = train_epoch(model, optimizer)
    valid_loss = evaluate(model)

    end_time = time.time()
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    score_bleu = calculate_bleu(model)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    print(f'\t BLEU Score: {score_bleu*100:.2f}')

    # Since RAM is limited, we calculate BERT score for the last 4 epochs only. Otherwise, the session crashes.
    if epoch > N_EPOCHS-5:
      P, R, F1 = calculate_bert(model)
      print(f'\t BERT Score: Precision={P:.3f}, Recall={R:.3f}, F1 Score={F1:.3f}')

Epoch: 01 | Time: 0m 43s
	Train Loss: 5.035 | Train PPL: 153.672
	 Val. Loss: 4.450 |  Val. PPL:  85.589
	 BLEU Score: 2.61
Epoch: 02 | Time: 0m 42s
	Train Loss: 4.327 | Train PPL:  75.735
	 Val. Loss: 4.157 |  Val. PPL:  63.908
	 BLEU Score: 3.63
Epoch: 03 | Time: 0m 42s
	Train Loss: 4.081 | Train PPL:  59.222
	 Val. Loss: 3.988 |  Val. PPL:  53.956
	 BLEU Score: 4.07
Epoch: 04 | Time: 0m 42s
	Train Loss: 3.825 | Train PPL:  45.826
	 Val. Loss: 3.811 |  Val. PPL:  45.183
	 BLEU Score: 6.71
Epoch: 05 | Time: 0m 42s
	Train Loss: 3.534 | Train PPL:  34.278
	 Val. Loss: 3.488 |  Val. PPL:  32.734
	 BLEU Score: 9.17
Epoch: 06 | Time: 0m 42s
	Train Loss: 3.263 | Train PPL:  26.122
	 Val. Loss: 3.354 |  Val. PPL:  28.605
	 BLEU Score: 11.81
Epoch: 07 | Time: 0m 42s
	Train Loss: 3.006 | Train PPL:  20.208
	 Val. Loss: 3.188 |  Val. PPL:  24.248
	 BLEU Score: 15.04
	 BERT Score: Precision=0.892, Recall=0.890, F1 Score=0.891
Epoch: 08 | Time: 0m 43s
	Train Loss: 2.754 | Train PPL:  15.709
	 Val