!pip install transformers -U

In [4]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/ac/aa/1437691b0c7c83086ebb79ce2da16e00bef024f24fec2a5161c35476f499/sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2MB)
[K     |▎                               | 10kB 20.0MB/s eta 0:00:01[K     |▌                               | 20kB 24.8MB/s eta 0:00:01[K     |▉                               | 30kB 27.3MB/s eta 0:00:01[K     |█                               | 40kB 24.4MB/s eta 0:00:01[K     |█▍                              | 51kB 15.8MB/s eta 0:00:01[K     |█▋                              | 61kB 13.3MB/s eta 0:00:01[K     |██                              | 71kB 14.6MB/s eta 0:00:01[K     |██▏                             | 81kB 14.9MB/s eta 0:00:01[K     |██▍                             | 92kB 13.5MB/s eta 0:00:01[K     |██▊                             | 102kB 14.5MB/s eta 0:00:01[K     |███                             | 112kB 14.5MB/s eta 0:00:01

In [2]:
import os

# NOTE(review): debugging aid — presumably set so CUDA errors surface at the failing call.
# Synchronous kernel launches slow GPU work; confirm this is still wanted for full training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
import math
import random
import time
from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import sentencepiece as spm
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

import spacy
import torchtext
from torchtext.legacy.data import BucketIterator, Field
from torchtext.legacy.datasets import Multi30k, TranslationDataset

from models import *
from utils import *
from translate import *

In [5]:
# One seed shared by every random number generator the notebook touches.
SEED = 1234

# Seed Python, NumPy and PyTorch (CPU and CUDA) in a single pass.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)
del _seed_fn

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# En-De Translation Data

In [None]:
!mkdir data
!curl https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en --output data/train.en
!curl https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de --output data/train.de

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  3  615M    3 20.8M    0     0  6388k      0  0:01:38  0:00:03  0:01:35 6386k

In [None]:
def restructure_data(
    train_en: Path,
    train_de: Path,
    train_data_output: Path,
    validation_data_output,
    test_size=0.33,
    random_state=42,
) -> tuple:
    """
    Pairs each English line with the German line at the same position, splits the
    pairs into train/validation sets and writes both sets to disk as JSON lists of
    {"src": <english>, "trg": <german>} dictionaries.

    args :
      train_en : [ Path ] : Training data path for English Sentences
      train_de : [ Path ] : Training data path for German Sentences
      train_data_output : [ Path ] : Training data output path
      validation_data_output : [ Path ] : Validation data path
      test_size : [ float ] : size of test split (OPTIONAL) DEFAULT=0.33
      random_state : [ int ] : random state of train-test-split (OPTIONAL) DEFAULT=42

    Returns :
        [tuple]: (training pairs, validation pairs), each a list of dicts

    Raises :
        ValueError: if the two input files do not have the same number of lines
    """
    # Imported here because the notebook's import cell does not import json explicitly.
    import json

    # Context managers close the handles; explicit UTF-8 avoids locale-dependent decoding.
    with Path(train_en).open(encoding="utf-8") as en_file:
        en_lines = [line.rstrip("\n") for line in en_file]
    with Path(train_de).open(encoding="utf-8") as de_file:
        de_lines = [line.rstrip("\n") for line in de_file]

    # zip() would silently truncate a misaligned corpus, so fail loudly instead.
    if len(en_lines) != len(de_lines):
        raise ValueError(
            f"Parallel files differ in length: {len(en_lines)} English lines "
            f"vs {len(de_lines)} German lines"
        )

    data = [{"src": en, "trg": de} for en, de in zip(en_lines, de_lines)]
    train_data, dev_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    with Path(train_data_output).open("w", encoding="utf-8") as train_file:
        json.dump(train_data, train_file, indent=2)
    with Path(validation_data_output).open("w", encoding="utf-8") as dev_file:
        json.dump(dev_data, dev_file, indent=2)
    return train_data, dev_data

In [None]:
def split_train_test(train_path, output_dir, lang, test_size=0.33, random_state=42):
    """
    Splits the lines of one training file into train, dev and test samples and writes
    them to `<output_dir>/{train,test,dev}_sample.<lang>`, one sentence per line.

    The test sample is `test_size` of all lines; the dev sample is `test_size` of the
    lines that remain after the test sample has been removed.

    args:
      train_path : [ Path/string ] : Training data path for lang
      output_dir : [ Path/string ] : Output directory for the splits
      lang : [ str ] : Language of the training file
      test_size : [ float ] : size of test split (OPTIONAL) DEFAULT=0.33
      random_state : [ int ] : random state of train-test-split (OPTIONAL) DEFAULT=42
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Drop the line terminators while reading: keeping them and then joining with "\n"
    # would put an empty line after every sentence in the output files.
    with Path(train_path).open(encoding="utf-8") as source:
        data = [line.rstrip("\n") for line in source]

    train_data, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    train_data, dev_data = train_test_split(
        train_data, test_size=test_size, random_state=random_state
    )

    splits = (("train", train_data), ("test", test_data), ("dev", dev_data))
    for split_name, lines in splits:
        # The context manager flushes and closes each file before the next is opened.
        with (output_dir / f"{split_name}_sample.{lang}").open("w", encoding="utf-8") as out:
            out.writelines(f"{line}\n" for line in lines)

In [None]:
# Raw corpus files and JSON split outputs all live under one data directory.
data_path = Path("data")
train_en, train_de = (data_path / f"train.{lang}" for lang in ("en", "de"))
train_data_output, validation_data_output = (
    data_path / f"{split}.json" for split in ("train", "dev")
)

In [None]:
# Alternative JSON-based pipeline, currently disabled:
# train_data, dev_data = restructure_data(train_en, train_de, train_data_output, validation_data_output)
# Both languages are split with the same test_size and random_state; presumably this is what
# keeps the English and German samples line-aligned (needs equal line counts) — TODO confirm.
split_train_test(train_en, "data", "en", test_size=0.33, random_state=SEED)
split_train_test(train_de, "data", "de", test_size=0.33, random_state=SEED)

# SentencePiece Training

In [None]:
sp = spm.SentencePieceProcessor()


def train_sp(data_path, out_path, vocab_size=32000):
    """
    Trains a SentencePiece tokenizer model.

    args:
      data_path : [ Path/str ] : Path of the text file used for training
      out_path : [ Path/str ] : Prefix (path plus name) used for the output model files
      vocab_size : [ int ] : Size of the learned vocabulary (OPTIONAL) DEFAULT=32000
    """
    # Keyword arguments instead of one flag string, so paths containing spaces
    # are passed through intact.
    spm.SentencePieceTrainer.train(
        input=str(data_path),
        model_prefix=str(out_path),
        vocab_size=vocab_size,
    )


# One model per language, trained on the training sample only.
train_sp("data/train_sample.en", "en")
train_sp("data/train_sample.de", "de")

# Tokenisation

SentencePiece Tokenisation

In [None]:
# Loaded SentencePiece processors keyed by model file, so each model is read from
# disk once rather than on every tokenize call.
_SP_PROCESSORS = {}


def _get_sp_processor(model_path):
    """
    Returns the SentencePieceProcessor for `model_path`, loading it on first use.
    """
    processor = _SP_PROCESSORS.get(model_path)
    if processor is None:
        processor = spm.SentencePieceProcessor()
        processor.load(model_path)
        _SP_PROCESSORS[model_path] = processor
    return processor


def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (SentencePiece pieces)
    """
    return _get_sp_processor("de.model").encode_as_pieces(text)


def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (SentencePiece pieces)
    """
    return _get_sp_processor("en.model").encode_as_pieces(text)

In [None]:
# The datasets are loaded with exts=(".en", ".de"), so English is the source and German
# the target; each field must therefore use the tokenizer of its own language.
SRC = Field(
    tokenize=tokenize_en,
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    truncate_first=True,
    fix_length=64,
    batch_first=True,
)

TRG = Field(
    tokenize=tokenize_de,
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    truncate_first=True,
    fix_length=64,
    batch_first=True,
)

In [None]:
# Load the three splits written earlier. The order of `exts` sets the direction:
# the ".en" file feeds the first field (SRC) and the ".de" file the second (TRG).
train_data, valid_data, test_data = (
    TranslationDataset(
        path=f"data/{split}_sample", exts=(".en", ".de"), fields=(SRC, TRG)
    )
    for split in ("train", "dev", "test")
)

In [None]:
# Both vocabularies are built from the training split only, with min_freq=2
# passed to build_vocab for each field.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Batch size requested from every iterator.
BATCH_SIZE = 128

# One iterator per split, unpacked in the same order as the datasets tuple.
# `device` is handed to torchtext; presumably batches are created on it — TODO confirm.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device
)

## Training the Seq2Seq Model

In [None]:
# Vocabulary sizes give the input/output dimensions of the model.
INPUT_DIM, OUTPUT_DIM = len(SRC.vocab), len(TRG.vocab)

# Encoder and decoder use the same hyper-parameter values.
HID_DIM = 256
ENC_LAYERS = DEC_LAYERS = 3
ENC_HEADS = DEC_HEADS = 8
ENC_PF_DIM = DEC_PF_DIM = 512
ENC_DROPOUT = DEC_DROPOUT = 0.01

# Encoder/Decoder come from the star-imported project modules.
enc = Encoder(
    INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, ENC_DROPOUT, device
)
dec = Decoder(
    OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM, DEC_DROPOUT, device
)

In [None]:
# Vocabulary index of the padding token for each field.
SRC_PAD_IDX, TRG_PAD_IDX = (
    field.vocab.stoi[field.pad_token] for field in (SRC, TRG)
)

# Assemble the full model from encoder and decoder, then move it to the chosen device.
model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device)
model = model.to(device)

In [None]:
# Pass the star-imported initialize_weights helper to model.apply; the trailing
# semicolon suppresses the cell's output.
model.apply(initialize_weights);

In [None]:
# Step size for Adam.
LEARNING_RATE = 5e-4

# Adam over every parameter of the model.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Cross-entropy over target tokens; positions whose target is the padding index
# are excluded from the loss via ignore_index.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [None]:
N_EPOCHS, CLIP = 10, 1

# Lowest validation loss seen so far; starts at +inf so the first epoch always checkpoints.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    # Time one full train + validation pass.
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights of the best epoch on the validation set.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'ende-model.pt')

    # Perplexity is reported as exp(loss).
    print(
        f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s',
        f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}',
        f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}',
        sep='\n',
    )

In [None]:
# Restore the best checkpoint written by the training loop. map_location remaps the
# saved tensors onto the current device, so a checkpoint saved on GPU also loads on a
# CPU-only kernel.
model.load_state_dict(torch.load('ende-model.pt', map_location=device))

test_loss = evaluate(model, test_iterator, criterion)

# Perplexity is reported as exp(loss).
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

## Inference

Now we can get translations from our model with the `translate_sentence` function below.

The steps taken are:
- tokenize the source sentence if it has not been tokenized (is a string)
- append the `<sos>` and `<eos>` tokens
- numericalize the source sentence
- convert it to a tensor and add a batch dimension
- create the source sentence mask
- feed the source sentence and mask into the encoder
- create a list to hold the output sentence, initialized with an `<sos>` token
- while we have not hit a maximum length
  - convert the current output sentence prediction into a tensor with a batch dimension
  - create a target sentence mask
  - place the current output, encoder output and both masks into the decoder
  - get next output token prediction from decoder along with attention
  - add prediction to current output sentence prediction
  - break if the prediction was an `<eos>` token
- convert the output sentence from indexes to tokens
- return the output sentence (with the `<sos>` token removed) and the attention from the last layer

In [None]:
example_idx = 8

# Pull the source/target pair of one training example for a quick look.
src, trg = (
    vars(train_data.examples[example_idx])[side] for side in ("src", "trg")
)

print(f"src = {src}")
print(f"trg = {trg}")

In [None]:
# Translate the `src` selected in the previous cell. Per the notes above,
# translate_sentence returns the predicted tokens and the last layer's attention.
translation, attention = translate_sentence(src, SRC, TRG, model, device)

print(f'predicted trg = {translation}')

In [None]:
example_idx = 6

# Same inspection as before, this time on an example from the validation split.
src, trg = (
    vars(valid_data.examples[example_idx])[side] for side in ('src', 'trg')
)

print(f'src = {src}')
print(f'trg = {trg}')

## BLEU

Finally we calculate the BLEU score for the Transformer.

In [None]:
from torchtext.data.metrics import bleu_score

def calculate_bleu(data, src_field, trg_field, model, device, max_len = 50):
    """
    Translates every example in `data` and returns the corpus BLEU score of the
    predictions against the reference translations.

    args:
      data : iterable of examples exposing `src` and `trg` attributes
      src_field : field for the source language, passed through to translate_sentence
      trg_field : field for the target language; its eos_token is stripped from predictions
      model : translation model, passed through to translate_sentence
      device : device passed through to translate_sentence
      max_len : [ int ] : maximum prediction length (OPTIONAL) DEFAULT=50

    Returns:
      BLEU score as returned by torchtext's bleu_score
    """
    # Imported under a local alias so the function keeps working even if the
    # notebook-level name `bleu_score` is rebound to a number by another cell.
    from torchtext.data.metrics import bleu_score as corpus_bleu

    trgs = []
    pred_trgs = []

    for datum in data:

        src = vars(datum)['src']
        trg = vars(datum)['trg']

        pred_trg, _ = translate_sentence(src, src_field, trg_field, model, device, max_len)

        # Cut off the <eos> token only when it is there: a prediction stopped by
        # max_len ends in a real token, which must not be dropped.
        if pred_trg and pred_trg[-1] == trg_field.eos_token:
            pred_trg = pred_trg[:-1]

        pred_trgs.append(pred_trg)
        # bleu_score expects a list of reference translations per candidate.
        trgs.append([trg])

    return corpus_bleu(pred_trgs, trgs)

In [None]:
# Stored under its own name: assigning to `bleu_score` would shadow the imported
# torchtext function and break calculate_bleu on any re-run.
test_bleu = calculate_bleu(test_data, SRC, TRG, model, device)

print(f'BLEU score = {test_bleu*100:.2f}')