In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

# Data

In [2]:
# Load the spaCy pipelines by their full package names. The short aliases
# ('de', 'en') were shortcut links that spaCy v3 removed; the full names work
# on both spaCy 2.x and 3.x as long as the small models are installed.
spacyGer = spacy.load('de_core_news_sm')
spacyEng = spacy.load('en_core_web_sm')

def tokenizerGer(text):
    """Tokenize a German string with the ``spacyGer`` tokenizer.

    Returns the ``.text`` of every token, in order, as a list of strings.
    """
    tokenized = spacyGer.tokenizer(text)
    return [token.text for token in tokenized]
def tokenizerEng(text):
    """Tokenize an English string with the ``spacyEng`` tokenizer.

    Returns the ``.text`` of every token, in order, as a list of strings.
    """
    tokenized = spacyEng.tokenizer(text)
    return [token.text for token in tokenized]

# One Field per language: each uses its own spaCy tokenizer and is configured
# with lower=True plus '<sos>' / '<eos>' as the init and end-of-sequence tokens.
german = Field(
    tokenize=tokenizerGer,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)
english = Field(
    tokenize=tokenizerEng,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

# Multi30k German -> English pairs; the field order matches the extension order.
trainData, validationData, testData = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(german, english),
)

# Vocabularies are built from the training split only, with max_size=10000
# and min_freq=2 for both languages.
german.build_vocab(trainData, max_size=10000, min_freq=2)
english.build_vocab(trainData, max_size=10000, min_freq=2)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 540kB/s] 


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 85.5kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 119kB/s] 


# Model

In [43]:
class Encoder(nn.Module):
    """LSTM encoder that summarises a source sequence into its final state.

    Args:
        inputSize: number of rows in the embedding table (source vocab size).
        embeddingSize: dimensionality of each token embedding.
        hiddenSize: LSTM hidden-state size.
        numLayers: number of stacked LSTM layers.
        p: dropout probability, used on the embeddings and passed to the LSTM.
    """

    def __init__(self, inputSize, embeddingSize, hiddenSize, numLayers, p):
        super().__init__()
        self.hiddenSize = hiddenSize
        self.numLayers = numLayers

        # Submodule names and registration order are kept stable so that
        # existing checkpoints (state_dict keys) remain loadable.
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(inputSize, embeddingSize)
        self.lstm = nn.LSTM(
            input_size=embeddingSize,
            hidden_size=hiddenSize,
            num_layers=numLayers,
            dropout=p,
        )

    def forward(self, x):
        """Encode ``x`` of shape (seq length, batch size).

        Returns:
            The ``(hidden, cell)`` pair produced by the LSTM; the per-step
            outputs are discarded.
        """
        # embedded shape : (seq length, batch size, embedding size)
        embedded = self.dropout(self.embedding(x))

        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

In [61]:
class Decoder(nn.Module):
    """LSTM decoder that predicts one target token per call.

    Args:
        inputSize: number of rows in the embedding table (target vocab size).
        embeddingSize: dimensionality of each token embedding.
        hiddenSize: LSTM hidden-state size.
        outputSize: number of output scores produced per step.
        numLayers: number of stacked LSTM layers.
        p: dropout probability, used on the embeddings and passed to the LSTM.
    """

    def __init__(self, inputSize, embeddingSize, hiddenSize, outputSize, numLayers, p):
        super().__init__()
        self.hiddenSize = hiddenSize
        self.numLayers = numLayers

        # Submodule names and registration order are kept stable so that
        # existing checkpoints (state_dict keys) remain loadable.
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(inputSize, embeddingSize)
        self.lstm = nn.LSTM(
            input_size=embeddingSize,
            hidden_size=hiddenSize,
            num_layers=numLayers,
            dropout=p,
        )
        self.fc = nn.Linear(hiddenSize, outputSize)

    def forward(self, x, hidden, cell):
        """Run a single decoding step.

        Args:
            x: token indices of shape (batch size).
            hidden, cell: LSTM state to continue from.

        Returns:
            ``(predictions, hidden, cell)`` where predictions has shape
            (batch size, outputSize) and hidden/cell are the updated state.
        """
        # Add a leading sequence axis of length 1: (batch size) -> (1, batch size)
        stepInput = x.unsqueeze(0)

        # embedded shape : (1, batch size, embedding size)
        embedded = self.dropout(self.embedding(stepInput))

        # lstmOut shape : (1, batch size, hidden size)
        lstmOut, (hidden, cell) = self.lstm(embedded, (hidden, cell))

        # Project to vocabulary scores, then drop the length-1 sequence axis.
        predictions = self.fc(lstmOut).squeeze(0)
        return predictions, hidden, cell

In [62]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: module mapping a source batch to an ``(hidden, cell)`` state.
        decoder: module mapping ``(token, hidden, cell)`` to
            ``(predictions, hidden, cell)``; it must expose its output layer
            as ``fc`` so the vocabulary size can be read from it.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacherForceRatio=0.5):
        """Decode ``target.shape[0] - 1`` steps, conditioned on ``source``.

        Args:
            source: source token indices, shape (source length, batch size).
            target: target token indices, shape (target length, batch size);
                row 0 is fed to the decoder as the first input token.
            teacherForceRatio: probability of feeding the ground-truth token
                (instead of the model's own best guess) as the next input.

        Returns:
            Tensor of shape (target length, batch size, target vocab size).
            Row 0 is left as zeros because no prediction is made for the
            start token.
        """
        batchSize = source.shape[1]
        targetLen = target.shape[0]
        # Read the vocabulary size from the decoder's output layer and the
        # device from the input, instead of relying on the notebook globals
        # `english` and `device` being defined when this is called.
        targetVocabSize = self.decoder.fc.out_features

        outputs = torch.zeros(targetLen, batchSize, targetVocabSize,
                              device=source.device)

        hidden, cell = self.encoder(source)

        # start token
        x = target[0]

        for t in range(1, targetLen):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            bestGuess = output.argmax(1)
            # Teacher forcing: sometimes feed the true token so that early
            # mistakes do not derail the rest of the sequence during training.
            x = target[t] if random.random() < teacherForceRatio else bestGuess

        return outputs

# Train

In [63]:
# Training hyperparameters
numEpochs = 20        # passes over the training iterator
learningRate = 1e-3   # step size handed to the optimizer
batchSize = 64        # examples per batch

In [64]:
# Model hyperparameters
loadModel = False  # when True, the training cell restores 'my_checkpoint.pth.tar' first

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Layer sizes tied to the vocabularies: German feeds the encoder, English is
# both the decoder's input vocabulary and its prediction space.
inputSizeEncoder = len(german.vocab)
inputSizeDecoder = len(english.vocab)
outputSize = len(english.vocab)

encoderEmbeddingSize = 300
decoderEmbeddingSize = 300
hiddenSize = 1024
numLayers = 2
encoderDropout = 0.5
decoderDropout = 0.5

In [65]:
# Tensorboard: scalars are written under runs/lossPlot; `step` is the global
# step counter that the training loop increments once per batch.
writer = SummaryWriter('runs/lossPlot')
step = 0

# One iterator per split, configured with the source length as the sort key
# and sort_within_batch=True, and yielding tensors on `device`.
trainIterator, valIterator, testIterator = BucketIterator.splits(
    (trainData, validationData, testData),
    batch_size=batchSize,
    sort_within_batch=True,
    sort_key=lambda example: len(example.src),
    device=device,
)

encoderNet = Encoder(
    inputSize=inputSizeEncoder,
    embeddingSize=encoderEmbeddingSize,
    hiddenSize=hiddenSize,
    numLayers=numLayers,
    p=encoderDropout,
).to(device)

decoderNet = Decoder(
    inputSize=inputSizeDecoder,
    embeddingSize=decoderEmbeddingSize,
    hiddenSize=hiddenSize,
    outputSize=outputSize,
    numLayers=numLayers,
    p=decoderDropout,
).to(device)

In [66]:
# Wrap the encoder and decoder into the full translation model and let Adam
# optimise every parameter of both networks.
model = Seq2Seq(encoder=encoderNet, decoder=decoderNet).to(device)
optimizer = optim.Adam(params=model.parameters(), lr=learningRate)

In [67]:
# Display the assembled module tree so the layer sizes can be checked against the hyperparameters.
model

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(7854, 300)
    (lstm): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(5893, 300)
    (lstm): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=5893, bias=True)
  )
)

In [68]:
# Look up the '<pad>' index in the English vocabulary and pass it as
# ignore_index so padded target positions do not contribute to the loss.
padIdx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(
    ignore_index=padIdx,
)

In [69]:
if loadModel:
    # map_location lets a checkpoint written on a GPU machine be restored on
    # whatever device this session is actually using.
    load_checkpoint(torch.load('my_checkpoint.pth.tar', map_location=device), model, optimizer)

# Fixed German sentence translated at the start of every epoch as a
# qualitative progress check.
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

for epoch in range(numEpochs):
    print(f'[Epoch {epoch+1} / {numEpochs}]')

    # Snapshot the weights as they stand before this epoch's updates.
    checkpoint = {'state_dict' : model.state_dict(), 'optimizer' : optimizer.state_dict()}
    save_checkpoint(checkpoint)

    # Switch to eval mode (disables dropout) for the sample translation,
    # then back to train mode for the optimisation pass.
    model.eval()

    translatedSentence = translate_sentence(model, sentence, german, english, device, max_length=50)

    print(translatedSentence)

    model.train()

    for batchIdx, batch in enumerate(trainIterator):
        inputData = batch.src.to(device)
        target = batch.trg.to(device)

        output = model(inputData, target)
        # output shape : (trg len, batch size, output dim)

        # Drop position 0 (the start token, never predicted) and flatten the
        # time and batch axes so CrossEntropyLoss sees (N, classes) vs (N).
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)
        loss.backward()

        # Clip the global gradient norm to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()

        # Log a plain Python float rather than a graph-attached tensor.
        writer.add_scalar("training loss", loss.item(), global_step=step)
        step += 1

# The in-loop checkpoint is taken *before* each epoch trains, so without this
# final save the weights produced by the last epoch would never reach disk.
checkpoint = {'state_dict' : model.state_dict(), 'optimizer' : optimizer.state_dict()}
save_checkpoint(checkpoint)

# The loop leaves the model in train mode; switch to eval so dropout is
# disabled and the BLEU score is not degraded or made non-deterministic.
model.eval()

# NOTE(review): the slice [1:100] skips test example 0 and scores 99 sentences;
# confirm whether [:100] was intended.
score = bleu(testData[1:100], model, german, english, device)
print(f'Bleu score {score*100:.2f}')

[Epoch 1 / 20]
=> Saving checkpoint
['think', 'bloom', 'lettuce', 'lettuce', 'lots', 'outfielder', 'outfielder', 'outfielder', 'mr.', 'lighted', 'sing', 'sing', 'think', 'railroad', 'length', 'vine', 'pineapple', 'pineapple', 'dives', 'dives', 'lap', 'escalators', 'reacting', 'tackle', 'tackle', 'pile', 'travel', 'standing', 'mattress', 'mattress', 'hulk', 'hulk', 'attempt', 'attempt', 'dishes', 'dishes', 'dishes', 'player', 'time', 'enjoy', 'enjoy', 'bmw', 'tourist', 'experiment', 'experiment', 'photographers', 'cherry', 'cherry', 'experiment', 'cherry']
[Epoch 2 / 20]
=> Saving checkpoint
['a', 'child', 'in', 'a', 'blue', 'shirt', 'is', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
[Epoch 3 / 20]
=> Saving checkpoint
['a', '<unk>', 'player', 'with', 'a', '<unk>', '<unk>', 'a', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
[Epoch 4 / 20]
=> Saving checkpoint
['a', 'skier', 'with', 'a', '<unk>', 'is', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
[Epoch 5 / 20]
=> Saving checkpoint
['a'