# Introduction to Seq2Seq Networks using Pytorch

https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
!python -m spacy download en
!python -m spacy download de

You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [4]:
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [5]:
def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [6]:
SRC = Field(tokenize=tokenize_de,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

TRG = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

In [7]:
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(SRC, TRG))

In [8]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [9]:
print(vars(train_data.examples[0]))

{'src': ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [10]:
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [11]:
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 7855
Unique tokens in target (en) vocabulary: 5893


In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
batch_size = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    device=device)

In [14]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [15]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.hid_dim = hid_dim
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dense = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.dense(output.squeeze(0))
        return prediction, hidden, cell

In [16]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, "hid_dim must be the same"
        assert encoder.n_layers == decoder.n_layers, "n_layers must be the same"
    
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        hidden, cell = self.encoder(src)
        
        input = trg[0,:]
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        
        return outputs

In [17]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [18]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dense): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [19]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 13,899,013 trainable parameters


In [20]:
optimizer = optim.Adam(model.parameters())
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

In [21]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [22]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            
            src = batch.src
            trg = batch.trg
            
            output = model(src, trg, 0)
            
            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            
            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
            
    return epoch_loss / len(iterator)

In [23]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [24]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '../models/2-torch-seq-to-seq-introduction-model.pt')
        
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 34s
	Train Loss: 5.055 | Train PPL: 156.769
	 Val. Loss: 4.938 |  Val. PPL: 139.522
Epoch: 02 | Time: 0m 34s
	Train Loss: 4.485 | Train PPL:  88.658
	 Val. Loss: 4.812 |  Val. PPL: 122.961
Epoch: 03 | Time: 0m 33s
	Train Loss: 4.185 | Train PPL:  65.666
	 Val. Loss: 4.560 |  Val. PPL:  95.580
Epoch: 04 | Time: 0m 33s
	Train Loss: 3.968 | Train PPL:  52.894
	 Val. Loss: 4.479 |  Val. PPL:  88.144
Epoch: 05 | Time: 0m 34s
	Train Loss: 3.811 | Train PPL:  45.185
	 Val. Loss: 4.395 |  Val. PPL:  81.016
Epoch: 06 | Time: 0m 34s
	Train Loss: 3.693 | Train PPL:  40.175
	 Val. Loss: 4.327 |  Val. PPL:  75.739
Epoch: 07 | Time: 0m 33s
	Train Loss: 3.584 | Train PPL:  36.015
	 Val. Loss: 4.220 |  Val. PPL:  68.001
Epoch: 08 | Time: 0m 33s
	Train Loss: 3.457 | Train PPL:  31.710
	 Val. Loss: 4.082 |  Val. PPL:  59.244
Epoch: 09 | Time: 0m 34s
	Train Loss: 3.330 | Train PPL:  27.941
	 Val. Loss: 4.040 |  Val. PPL:  56.820
Epoch: 10 | Time: 0m 34s
	Train Loss: 3.248 | Train PPL

In [25]:
model.load_state_dict(torch.load('../models/5-torch-seq-to-seq-improvements-attention-2.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 4.002 | Test PPL:  54.699 |


In [26]:
test_batch = next(iter(test_iterator))
src = test_batch.src
trg = test_batch.trg

model.eval()
output = torch.argmax(model.forward(src, trg, 0), -1)
output, trg

(tensor([[  0,   0,   0,  ...,   0,   0,   0],
         [ 16,  48,   4,  ...,   4,   4,  16],
         [ 63, 112,  34,  ...,  14,  24,  30],
         ...,
         [  3,   3,   3,  ...,   3,   3,   3],
         [  3,   3,   3,  ...,   3,   3,   3],
         [  3,   3,   3,  ...,   3,   3,   3]], device='cuda:0'),
 tensor([[   2,    2,    2,  ...,    2,    2,    2],
         [  16,  110,    4,  ...,    4,   24,   16],
         [1909,   19,   34,  ...,   14,   14,   30],
         ...,
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0'))

In [27]:
def tokens_to_sentence(tokens, vocab):
    words = list()
    for tok in tokens:
        words.append(vocab.itos[tok])
    return ' '.join(words)
for i, (out, tar) in enumerate(zip(output.permute(1, 0), trg.permute(1, 0))):
    print("Sentence ", i)
    print("Translated Sentence:", tokens_to_sentence(out, TRG.vocab))
    print("Expected Sentence:", tokens_to_sentence(tar, TRG.vocab), '\n')

Sentence  0
Translated Sentence: <unk> two children are running in the snow . <eos> <eos> <eos> <eos> <eos>
Expected Sentence: <sos> two medium sized dogs run across the snow . <eos> <pad> <pad> <pad> 

Sentence  1
Translated Sentence: <unk> three dogs play in a field . <eos> <eos> <eos> <eos> <eos> <eos>
Expected Sentence: <sos> four people are playing soccer on a beach . <eos> <pad> <pad> <pad> 

Sentence  2
Translated Sentence: <unk> a boy is riding a skateboard on a skateboard . <eos> <eos> <eos>
Expected Sentence: <sos> a boy riding a skateboard on a skateboarding ramp <eos> <pad> <pad> <pad> 

Sentence  3
Translated Sentence: <unk> a dog jumps over a water . <eos> <eos> <eos> <eos> <eos> <eos>
Expected Sentence: <sos> a dog is jumping through a <unk> obstacle . <eos> <pad> <pad> <pad> 

Sentence  4
Translated Sentence: <unk> two children play in the grass . <eos> <eos> <eos> <eos> <eos> <eos>
Expected Sentence: <sos> the two kids are playing at the playground . <eos> <pad> <pad> 

Translated Sentence: <unk> a dog is his his hand . <eos> <eos> <eos> <eos> <eos> <eos>
Expected Sentence: <sos> a cowboy wrapping up his arm with a bandage . <eos> <pad> <pad> 

Sentence  65
Translated Sentence: <unk> an asian american - haired man is to the . <eos> <eos> <eos>
Expected Sentence: <sos> the african american man <unk> against <unk> <unk> . <eos> <pad> <pad> <pad> 

Sentence  66
Translated Sentence: <unk> an old man walking down the street . <eos> <eos> <eos> <eos> <eos>
Expected Sentence: <sos> an african american man walking down the street . <eos> <pad> <pad> <pad> 

Sentence  67
Translated Sentence: <unk> a young man is a tennis ball . <eos> <eos> <eos> <eos> <eos>
Expected Sentence: <sos> a young man about to throw a football . <eos> <pad> <pad> <pad> 

Sentence  68
Translated Sentence: <unk> a group of people are walking . <eos> <eos> <eos> <eos> <eos> <eos>
Expected Sentence: <sos> a group of people do tricks on motorbikes . <eos> <pad> <pad> <pad> 

Sentence  69
T

In [28]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):

    model.eval()
        
    if isinstance(sentence, str):
        nlp = spacy.load('de')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
        
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    src_len = torch.LongTensor([len(src_indexes)]).to(device)
    
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor, src_len)

    mask = model.create_mask(src_tensor)
        
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    attentions = torch.zeros(max_len, 1, len(src_indexes)).to(device)
    
    for i in range(max_len):

        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
                
        with torch.no_grad():
            output, hidden, attention = model.decoder(trg_tensor, hidden, encoder_outputs, mask)

        attentions[i] = attention
            
        pred_token = output.argmax(1).item()
        
        trg_indexes.append(pred_token)

        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break
    
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:], attentions[:len(trg_tokens)-1]

In [29]:
def display_attention(sentence, translation, attention):
    
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)
    
    attention = attention.squeeze(1).cpu().detach().numpy()
    
    cax = ax.matshow(attention, cmap='bone')
   
    ax.tick_params(labelsize=15)
    ax.set_xticklabels(['']+['<sos>']+[t.lower() for t in sentence]+['<eos>'], 
                       rotation=45)
    ax.set_yticklabels(['']+translation)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
    plt.close()

In [30]:
example_idx = 12

src = vars(train_data.examples[example_idx])['src']
trg = vars(train_data.examples[example_idx])['trg']

print(f'src = {src}')
print(f'trg = {trg}')

src = ['.', 'kämpfen', 'hund', 'gefleckter', 'ein', 'und', 'hund', 'schwarzer', 'ein']
trg = ['a', 'black', 'dog', 'and', 'a', 'spotted', 'dog', 'are', 'fighting']


In [31]:
translation, attention = translate_sentence(src, SRC, TRG, model, device)

print(f'predicted trg = {translation}')

TypeError: forward() takes 2 positional arguments but 3 were given

In [32]:
display_attention(src, translation, attention)

NameError: name 'translation' is not defined

In [33]:
example_idx = 14

src = vars(valid_data.examples[example_idx])['src']
trg = vars(valid_data.examples[example_idx])['trg']

print(f'src = {src}')
print(f'trg = {trg}')

src = ['.', 'geige', 'ihrer', 'auf', 'lied', 'ein', 'spielt', 'frau', 'eine']
trg = ['a', 'female', 'playing', 'a', 'song', 'on', 'her', 'violin', '.']


In [34]:
translation, attention = translate_sentence(src, SRC, TRG, model, device)

print(f'predicted trg = {translation}')

display_attention(src, translation, attention)

TypeError: forward() takes 2 positional arguments but 3 were given

In [None]:


example_idx = 18

src = vars(test_data.examples[example_idx])['src']
trg = vars(test_data.examples[example_idx])['trg']

print(f'src = {src}')
print(f'trg = {trg}')



In [None]:


translation, attention = translate_sentence(src, SRC, TRG, model, device)

print(f'predicted trg = {translation}')

display_attention(src, translation, attention)



In [None]:
from torchtext.data.metrics import bleu_score

def calculate_bleu(data, src_field, trg_field, model, device, max_len = 50):
    
    trgs = []
    pred_trgs = []
    
    for datum in data:
        
        src = vars(datum)['src']
        trg = vars(datum)['trg']
        
        pred_trg, _ = translate_sentence(src, src_field, trg_field, model, device, max_len)
        
        #cut off <eos> token
        pred_trg = pred_trg[:-1]
        
        pred_trgs.append(pred_trg)
        trgs.append([trg])
        
    return bleu_score(pred_trgs, trgs)

In [None]:
bleu_score = calculate_bleu(test_data, SRC, TRG, model, device)

print(f'BLEU score = {bleu_score*100:.2f}')