# Seq2seq NMT with RNN



[Neural Machine Translation by Jointly Learning to Align and Translate](https://arxiv.org/abs/1409.0473)

**NOTE:**

-  use clean bpe data
-  use a piece of the training data while coding or when low on credits

You have to implement:

- Encoder
- Attention (Bahdanau)
- Decoder

Goal:

- Loss in training, validation and test





In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
import random
import time

In [None]:
#if you dont have bpe data use sacremoses tokenizer
#!pip install sacremoses

In [None]:
SEED = 42

# Seed the Python, NumPy and PyTorch (CPU and CUDA) random number generators
# so that repeated runs draw the same random numbers.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
from torchtext.utils import download_from_url, extract_archive
import io

# NOTE: this notebook has been run from more than one Colab account because
# of RAM limits. `path` is prefixed to every data file name below; leave it
# empty to read the files from the current working directory.
path = ""
# Index 0 = English (source), index 1 = German (target).
# Source and target data.
# NOTE: use the clean BPE data!
train_filepaths = [path+'train.en-de.bpe.en', path+'train.en-de.bpe.de']
val_filepaths = [path+'dev.en-de.bpe.en', path+'dev.en-de.bpe.de']
test_filepaths = [path+'test.en-de.bpe.en', path+'test.en-de.bpe.de']


#de_tokenizer = get_tokenizer('moses', language='de')
#en_tokenizer = get_tokenizer('moses', language='en')


def build_vocab(filepath, tokenizer=None):
  """Build a torchtext vocabulary from every token in a UTF-8 text file.

  Args:
    filepath: path of the text file, one sentence per line.
    tokenizer: optional callable mapping a line (str) to an iterable of
      tokens. When None (the default) each line is split on whitespace,
      which is what pre-tokenized BPE data needs. Any code that later
      numericalizes text with this vocabulary must tokenize the same way.

  Returns:
    The vocabulary built by ``vocab`` from the token counts, with the
    specials '<unk>', '<pad>', '<bos>' and '<eos>' registered.
  """
  # Honour a caller-supplied tokenizer instead of silently ignoring it.
  tokenize = str.split if tokenizer is None else tokenizer
  counter = Counter()
  with io.open(filepath, encoding="utf8") as f:
    for line in f:
      counter.update(tokenize(line))
  return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

#Vocab
# One vocabulary per language, built from the training files only.
en_vocab = build_vocab(train_filepaths[0])
de_vocab = build_vocab(train_filepaths[1])

# Tokens that are not in a vocabulary are looked up as its '<unk>' index.
en_vocab.set_default_index(en_vocab['<unk>'])
de_vocab.set_default_index(de_vocab['<unk>'])

def data_process(filepaths):
  """Numericalize a parallel corpus into (English, German) tensor pairs.

  Args:
    filepaths: two-element sequence; index 0 is the English file and
      index 1 the German file, both UTF-8 with one sentence per line.

  Returns:
    A list of ``(en_tensor, de_tensor)`` tuples of dtype ``torch.long``,
    one per line pair. Lines are split on whitespace and each token is
    looked up in ``en_vocab`` / ``de_vocab``. ``zip`` stops at the shorter
    file, so surplus lines in the longer file are ignored.
  """
  data = []
  # Context managers guarantee both files are closed, even on error.
  with io.open(filepaths[0], encoding="utf8") as en_file, \
       io.open(filepaths[1], encoding="utf8") as de_file:
    for raw_en, raw_de in zip(en_file, de_file):
      en_tensor_ = torch.tensor([en_vocab[token] for token in raw_en.split()],
                                dtype=torch.long)
      de_tensor_ = torch.tensor([de_vocab[token] for token in raw_de.split()],
                                dtype=torch.long)
      data.append((en_tensor_, de_tensor_))
  return data

#pre-process
# Each split becomes a list of (English tensor, German tensor) pairs.
train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)



In [None]:
#NOTE: if you are low on credits or testing only use a piece of the data
# Keeps at most the first 1000 sentence pairs of the training split.
train_data = train_data[:1000]

In [None]:
len(train_data)

100

Define the device.

In [None]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


Create the iterators.

In [None]:
# Number of sentence pairs per batch.
BATCH_SIZE = 8
# Special-token indices are read from the German vocabulary; build_vocab
# registers the same list of specials for both vocabularies.
PAD_IDX = de_vocab['<pad>']
BOS_IDX = de_vocab['<bos>']
EOS_IDX = de_vocab['<eos>']

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
    """Collate (English, German) tensor pairs into two padded batches.

    Every sequence is wrapped as ``<bos> ... <eos>`` and then right-padded
    with ``PAD_IDX`` to the longest sequence in the batch.

    Returns:
        ``(en_batch, de_batch)``, each batch-first: [batch, longest length].
    """
    bos = torch.tensor([BOS_IDX])
    eos = torch.tensor([EOS_IDX])

    def wrap(sequence):
        # <bos> + tokens + <eos>
        return torch.cat([bos, sequence, eos], dim=0)

    en_wrapped = [wrap(en_item) for en_item, _ in data_batch]
    de_wrapped = [wrap(de_item) for _, de_item in data_batch]

    en_padded = pad_sequence(en_wrapped, padding_value=PAD_IDX, batch_first=True)
    de_padded = pad_sequence(de_wrapped, padding_value=PAD_IDX, batch_first=True)
    return en_padded, de_padded

# Only the training loader shuffles; all three loaders build their batches
# with generate_batch.
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
valid_iter = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=False, collate_fn=generate_batch)
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=False, collate_fn=generate_batch)


# Encoder

The encoder is a single-layer GRU; unlike a plain forward-only encoder, we now use a *bidirectional RNN*.



$$\begin{align*}
h_t^\rightarrow &= \text{EncoderGRU}^\rightarrow(e(x_t^\rightarrow),h_{t-1}^\rightarrow)\\
h_t^\leftarrow &= \text{EncoderGRU}^\leftarrow(e(x_t^\leftarrow),h_{t-1}^\leftarrow)
\end{align*}$$


The GRU returns `outputs` and `hidden`.

`outputs`  size **[batch size, srclen, H * num directions]**


`hidden` size **[n layers * num directions, batch size, hid dim]**

 **[-2, :, :]**  top layer forward RNN hidden state after the final time-step

 **[-1, :, :]** top layer backward RNN hidden state after the final time-step

The decoder needs a single context vector (`hidden`) $z$,  as the initial hidden state,

$$z=\tanh(g(h_T^\rightarrow, h_T^\leftarrow)) = \tanh(g(z^\rightarrow, z^\leftarrow)) = s_0$$


In [None]:
class Encoder(nn.Module):
    """Bidirectional single-layer GRU encoder.

    Embeds the source tokens, runs them through a batch-first bidirectional
    GRU and condenses the two final directional states into one vector
    sized for the decoder.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True, batch_first=True)
        # Maps the concatenated forward/backward state to the decoder size.
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source token ids.

        Args:
            src: token ids, [B, srclen].

        Returns:
            ``(outputs, hidden)``: ``outputs`` is [B, srclen, enc_hid_dim * 2]
            (every time step, both directions); ``hidden`` is
            [B, dec_hid_dim], the initial decoder state.
        """
        # [B, srclen] -> [B, srclen, emb]
        embedded = self.dropout(self.embedding(src))

        # all_states: [B, srclen, enc_hid_dim * 2]
        # last_states: [num_layers * num_directions, B, enc_hid_dim]; the GRU
        # keeps this layout even with batch_first=True.
        all_states, last_states = self.rnn(embedded)

        # The last two rows are the top layer's final forward and backward states.
        final_forward = last_states[-2]
        final_backward = last_states[-1]
        joined = torch.cat([final_forward, final_backward], dim=1)

        # [B, enc_hid_dim * 2] -> [B, dec_hid_dim]
        initial_decoder_state = torch.tanh(self.fc(joined))

        return all_states, initial_decoder_state

# Attention

## Luong Attention

Takes in the previous hidden state of the decoder, $s_{t-1}$, and all of the stacked forward and backward hidden states from the encoder, $H$.

The layer will output an attention vector, $a_t$, that is the length of the source sentence, each element is between 0 and 1 and the entire vector sums to 1.


Compute the *score* $E_t$ between the previous decoder hidden state and the encoder hidden states by concatenating them together and passing them through a linear layer (`attn`) and a $\tanh$ activation function.

$$E_t = \tanh(\text{attn}(s_{t-1}, H))$$

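$E_t$ holds one vector per source position; the learned vector $v$ reduces each of them to a single unnormalized score: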

$$\hat{a}_t = v E_t$$



The attention vector is constrained to lie between 0 and 1 and to sum to 1 by passing it through a $\text{softmax}$.

$$a_t = \text{softmax}(\hat{a_t})$$




In [None]:
class LuongAttention(nn.Module):
    """Concatenation-based attention.

    Scores each encoder state as ``v(tanh(attn([s; h])))`` and normalizes
    the scores over the source positions.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the source positions.

        Args:
            hidden: decoder state (the query), [B, dec_hid_dim].
            encoder_outputs: encoder states (the keys),
                [B, srclen, enc_hid_dim * 2].

        Returns:
            Weights of shape [B, srclen]; every row sums to 1.
        """
        src_len = encoder_outputs.shape[1]

        # Pair the decoder state with every source position: [B, srclen, dec_hid_dim]
        tiled_state = hidden.unsqueeze(1).expand(-1, src_len, -1)
        paired = torch.cat((tiled_state, encoder_outputs), dim = 2)

        # [B, srclen, dec_hid_dim]
        energy = torch.tanh(self.attn(paired))

        # One score per source position: [B, srclen]
        raw_scores = self.v(energy).squeeze(2)

        return F.softmax(raw_scores, dim=1)

In [None]:
#this thing took ages off of my life

class BahdanauAttention(nn.Module):
    """Additive (Bahdanau) attention.

    Scores each encoder state with ``Va(tanh(Wa(s) + Ua(h)))`` and
    normalizes the scores over the source positions with a softmax.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        # Projection of the decoder state (query).
        self.Wa = nn.Linear(dec_hid_dim, dec_hid_dim, bias=False)
        # Projection of the bidirectional encoder states (keys).
        self.Ua = nn.Linear(enc_hid_dim * 2, dec_hid_dim, bias=False)
        # Reduces each combined vector to one scalar score.
        self.Va = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the source positions.

        Args:
            hidden: decoder state (the query), [B, dec_hid_dim].
            encoder_outputs: encoder states (the keys),
                [B, srclen, enc_hid_dim * 2].

        Returns:
            Weights of shape [B, srclen]; every row sums to 1.
        """
        # Project the query once and let broadcasting spread it over the
        # source positions; repeating it srclen times first would redo the
        # same matrix product for every position.
        projected_query = self.Wa(hidden).unsqueeze(1)   # [B, 1, dec_hid_dim]
        projected_keys = self.Ua(encoder_outputs)        # [B, srclen, dec_hid_dim]

        scores = self.Va(torch.tanh(projected_query + projected_keys)).squeeze(2)

        # [B, srclen]
        return torch.softmax(scores, dim=1)

# Decoder


The decoder contains the attention `attention`, and we use this attention vector to create a weighted source vector, $w_t$ `weighted`, which is a weighted sum of the encoder hidden states, $H$, using $a_t$ as the weights.

$$w_t = a_t H$$

The embedded input word, $d(y_t)$ `embedded`, the weighted source vector, $w_t$, and the previous decoder hidden state, $s_{t-1}$, are all passed into the decoder RNN, with $d(y_t)$ and $w_t$ concatenated.

$$s_t = \text{DecoderGRU}(d(y_t), w_t, s_{t-1})$$

Pass $d(y_t)$, $w_t$ and $s_t$ through the linear layer, $f$, to make a prediction of the next word in the target sentence, $\hat{y}_{t+1}$ `prediction`.

$$\hat{y}_{t+1} = f(d(y_t), w_t, s_t)$$


In [None]:
class Decoder(nn.Module):
    """Single-step GRU decoder with attention.

    Each call consumes one target token per batch element, attends over the
    encoder states and returns vocabulary logits plus the new hidden state.
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # The GRU input is the token embedding concatenated with the context vector.
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim, batch_first=True)
        # The output layer sees the GRU output, the context vector and the embedding.
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input: current target token ids, [B].
            hidden: previous decoder state, [B, dec_hid_dim].
            encoder_outputs: encoder states, [B, srclen, enc_hid_dim * 2].

        Returns:
            ``(prediction, hidden)``: logits [B, output_dim] and the new
            decoder state [B, dec_hid_dim].
        """
        # [B] -> [B, 1] -> [B, 1, emb]
        token_emb = self.dropout(self.embedding(input.unsqueeze(1)))

        # Attention weights [B, srclen] -> [B, 1, srclen]
        weights = self.attention(hidden, encoder_outputs).unsqueeze(1)

        # Weighted sum of the encoder states: [B, 1, enc_hid_dim * 2]
        context = torch.bmm(weights, encoder_outputs)

        # [B, 1, emb + enc_hid_dim * 2]; the GRU wants its state as [1, B, dec_hid_dim]
        step_input = torch.cat((token_emb, context), dim=2)
        step_output, new_hidden = self.rnn(step_input, hidden.unsqueeze(0))

        # Drop the length-1 time axis before the output projection.
        features = torch.cat(
            (step_output.squeeze(1), context.squeeze(1), token_emb.squeeze(1)),
            dim=1,
        )

        # [B, output_dim], [B, dec_hid_dim]
        return self.fc_out(features), new_hidden.squeeze(0)

# Seq2Seq

contains encoder, and decoder

steps:
- the `outputs` is created to hold all predictions, $\hat{Y}$
- the source sequence, $X$, is fed into the encoder to receive $z$ and $H$
- the initial decoder hidden state is set to be the `context` vector, $s_0 = z = h_T$
- a batch of `<bos>` tokens is used as the first `input`, $y_1$
- decode within a loop:
  - inserting the input token $y_t$, previous hidden state, $s_{t-1}$, and all encoder outputs, $H$, into the decoder
  - prediction, $\hat{y}_{t+1}$, and a new hidden state, $s_t$
  - decide if we are going to teacher force or not, setting the next input as appropriate


In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that unrolls the decoder over the target."""

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Decode the whole target sequence and collect the per-step logits.

        Args:
            src: source token ids, [B, srclen].
            trg: target token ids, [B, trglen]; column 0 is the start token.
            teacher_forcing_ratio: probability, drawn independently at each
                step, of feeding the gold token instead of the model's own
                greedy prediction (0.75 means 75% of the steps).

        Returns:
            Logits of shape [B, trglen, target vocab]; position 0 is never
            written and stays all zeros.
        """
        n_batch, n_steps = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # Holds one logit vector per target position.
        outputs = torch.zeros(n_batch, n_steps, vocab_size, device=self.device)

        # All encoder states plus the initial decoder state.
        encoder_outputs, hidden = self.encoder(src)

        # Decoding starts from the first target column (the start tokens).
        next_token = trg[:, 0]

        for t in range(1, n_steps):
            step_logits, hidden = self.decoder(next_token, hidden, encoder_outputs)
            outputs[:, t] = step_logits

            # One random draw per step decides gold token vs. greedy prediction.
            use_gold = random.random() < teacher_forcing_ratio
            next_token = trg[:, t] if use_gold else step_logits.argmax(1)

        return outputs

## Training the Seq2Seq Model



In [None]:
INPUT_DIM = len(en_vocab) # source (English) vocabulary size
OUTPUT_DIM = len(de_vocab) # target (German) vocabulary size
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.2

#attn = LuongAttention(ENC_HID_DIM, DEC_HID_DIM)
# The decoder is given the Bahdanau attention module.
attn = BahdanauAttention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

# Build the full model and move its parameters to the selected device.
model = Seq2Seq(enc, dec, device).to(device)

In [None]:
print(len(en_vocab)) #BPE size 16k approx
print(len(de_vocab))

17949
19168


In [None]:
def init_weights(m):
    """Re-initialize every parameter reachable from module ``m``.

    Parameters whose name contains 'weight' are drawn from N(0, 0.01^2);
    all other parameters (e.g. biases) are set to zero.
    """
    for param_name, tensor in m.named_parameters():
        if 'weight' not in param_name:
            nn.init.constant_(tensor.data, 0)
            continue
        nn.init.normal_(tensor.data, mean=0, std=0.01)

# apply() calls init_weights on every submodule and on the model itself;
# because init_weights walks named_parameters(), parameters of nested modules
# are re-initialized more than once. The last pass is the one that remains.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(17949, 256)
    (rnn): GRU(256, 512, batch_first=True, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): BahdanauAttention(
      (Wa): Linear(in_features=512, out_features=512, bias=False)
      (Ua): Linear(in_features=1024, out_features=512, bias=False)
      (Va): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(19168, 256)
    (rnn): GRU(1280, 512, batch_first=True)
    (fc_out): Linear(in_features=1792, out_features=19168, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
)

In [None]:
def count_parameters(model):
    """Return the total number of trainable scalar parameters in ``model``."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 50,302,944 trainable parameters


We create an optimizer.

In [None]:
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

We initialize the loss function.

In [None]:
TRG_PAD_IDX = de_vocab['<pad>'] # index of the padding token in the target vocabulary

# Target positions equal to the padding index do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [None]:
print(TRG_PAD_IDX)

1


In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: called as ``model(src, trg)``; expected to return logits
            shaped [B, trg len, vocab].
        iterator: yields ``(src, trg)`` batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss taking flattened logits and flattened gold ids.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()

    running_loss = 0

    for src, trg in tqdm(iterator):
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()

        logits = model(src, trg)
        vocab_size = logits.shape[-1]

        # Go time-major, drop step 0 (the start token is never predicted),
        # then flatten to [(trg len - 1) * B, vocab] and [(trg len - 1) * B].
        flat_logits = logits.permute(1, 0, 2)[1:].reshape(-1, vocab_size)
        flat_gold = trg.permute(1, 0)[1:].reshape(-1)

        batch_loss = criterion(flat_logits, flat_gold)
        batch_loss.backward()

        # Clip before the update to keep the gradient norm bounded.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss of ``model`` over ``iterator``.

    Runs in eval mode, without gradients and with teacher forcing off, so
    the decoder always consumes its own greedy predictions.

    Args:
        model: called as ``model(src, trg, 0)``; expected to return logits
            shaped [B, trg len, vocab].
        iterator: yields ``(src, trg)`` batches.
        criterion: loss taking flattened logits and flattened gold ids.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for src, trg in iterator:
            src = src.to(device)
            trg = trg.to(device)

            # A ratio of 0 turns teacher forcing off.
            logits = model(src, trg, 0)
            vocab_size = logits.shape[-1]

            # Go time-major, drop step 0, then flatten to
            # [(trg len - 1) * B, vocab] and [(trg len - 1) * B].
            flat_logits = logits.permute(1, 0, 2)[1:].reshape(-1, vocab_size)
            flat_gold = trg.permute(1, 0)[1:].reshape(-1)

            running_loss += criterion(flat_logits, flat_gold).item()

    return running_loss / len(iterator)

In [None]:
N_EPOCHS = 5
# Maximum gradient norm used for clipping inside train().
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iter, criterion)

    # Checkpoint only when the validation loss improves, so 'model.pt'
    # always holds the best epoch seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')

    # Perplexity is reported as exp(loss).
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}\tTrain PPL: {np.exp(train_loss):7.3f}')
    print(f'\t Validation Loss: {valid_loss:.3f}\tValidation PPL: {np.exp(valid_loss):7.3f}')

100%|██████████| 13/13 [00:05<00:00,  2.37it/s]


Epoch: 01
	Train Loss: 9.321	Train PPL: 11167.060
	 Validation Loss: 8.799	Validation PPL: 6627.976


100%|██████████| 13/13 [00:05<00:00,  2.32it/s]


Epoch: 02
	Train Loss: 7.600	Train PPL: 1999.181
	 Validation Loss: 9.166	Validation PPL: 9570.115


100%|██████████| 13/13 [00:05<00:00,  2.35it/s]


Epoch: 03
	Train Loss: 7.205	Train PPL: 1346.577
	 Validation Loss: 9.860	Validation PPL: 19155.350


100%|██████████| 13/13 [00:05<00:00,  2.25it/s]


Epoch: 04
	Train Loss: 7.014	Train PPL: 1112.299
	 Validation Loss: 9.891	Validation PPL: 19758.134


100%|██████████| 13/13 [00:06<00:00,  2.14it/s]


Epoch: 05
	Train Loss: 6.852	Train PPL: 945.661
	 Validation Loss: 10.307	Validation PPL: 29927.322


In [None]:
# Restore the weights saved at the best validation loss, then score the
# test split.
model.load_state_dict(torch.load('model.pt'))

test_loss = evaluate(model, test_iter, criterion)

print(f'\tTest Loss: {test_loss:.3f}\tTest PPL: {np.exp(test_loss):7.3f}')

	Test Loss: 8.796	Test PPL: 6609.470


In [None]:
# Free memory: drop the references to the model and the data loaders, then
# ask PyTorch to release its cached CUDA memory.
del model
del train_iter
del valid_iter
del test_iter
torch.cuda.empty_cache()