In [19]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import torchtext
from torchtext import data
from torchtext.datasets import Multi30k
import spacy
import random

In [16]:
# German spaCy pipeline; only its tokenizer is used below.
# NOTE(review): the 'de' shortcut name depends on the installed spaCy
# version/model links — confirm it resolves in the target environment.
spacy_german = spacy.load('de')


def tokenize_german(text):
    """Split ``text`` with the German spaCy tokenizer and return the token
    strings in reversed order (last token first)."""
    tokens = [token.text for token in spacy_german.tokenizer(text)]
    tokens.reverse()  # the source sentence is fed to the model back-to-front
    return tokens

# Both fields share the same special tokens and lower-casing; only the
# tokenizer differs between the source and target side.
_shared_field_options = dict(init_token='<sos>',
                             eos_token='<eos>',
                             lower=True)

# Source side: custom German tokenizer (which also reverses the sentence).
SOURCE = data.Field(tokenize=tokenize_german, **_shared_field_options)

# Target side: torchtext's built-in 'spacy' tokenizer option.
TARGET = data.Field(tokenize='spacy', **_shared_field_options)

In [27]:
# Load the Multi30k splits; '.de' files are processed by SOURCE and '.en'
# files by TARGET (the two tuples are matched by position).
multi30k_splits = Multi30k.splits(exts=('.de', '.en'),
                                  fields=(SOURCE, TARGET))
train_data, valid_data, test_data = multi30k_splits

# Vocabularies are built from the training split only, source first.
for field in (SOURCE, TARGET):
    field.build_vocab(train_data)

# One bucketing iterator per split, all with the same batch size.
bucket_iterators = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=256)
train_iterator, valid_iterator, test_iterator = bucket_iterators

In [21]:
class Encoder(nn.Module):
    """Embed source token ids and encode them with a stacked LSTM.

    Args:
        vocab_size: number of entries in the embedding table.
        embedding_dim: size of each embedding vector (the LSTM input size).
        hidden_dim: size of the LSTM hidden and cell states.
        num_layers: number of stacked LSTM layers. Defaults to 2, the depth
            that used to be hard-coded. A decoder that consumes the returned
            states must use the same depth.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers)

    def forward(self, inputs):
        """Encode a batch of token-id sequences.

        Args:
            inputs: integer tensor of token ids. The LSTM is created without
                ``batch_first``, so the layout is (seq_len, batch).

        Returns:
            Tuple ``(hidden, cell)`` with the final LSTM states, each of
            shape (num_layers, batch, hidden_dim). The per-step outputs are
            discarded.
        """
        embedded = self.embedding(inputs)
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder that maps one token per sequence to vocabulary scores.

    Args:
        vocab_size: number of entries in the embedding table and size of the
            prediction layer's output.
        embedding_dim: size of each embedding vector (the LSTM input size).
        hidden_dim: size of the LSTM hidden and cell states.
        num_layers: number of stacked LSTM layers. Defaults to 2, the depth
            that used to be hard-coded. It must match the depth of the
            states passed to ``forward``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers)
        self.prediction_layer = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs, hidden, cell):
        """Run one decoding step.

        Args:
            inputs: integer tensor of shape (batch,) holding the current
                input token for every sequence in the batch.
            hidden: LSTM hidden state, shape (num_layers, batch, hidden_dim).
            cell: LSTM cell state, same shape as ``hidden``.

        Returns:
            Tuple ``(prediction, hidden, cell)`` where ``prediction`` has
            shape (batch, vocab_size) and ``hidden`` / ``cell`` are the
            updated LSTM states.
        """
        # Add a length-1 sequence axis so the LSTM sees (1, batch).
        step_tokens = inputs.unsqueeze(0)
        embedded = self.embedding(step_tokens)
        step_output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        # Drop the length-1 sequence axis again before projecting to the vocabulary.
        prediction = self.prediction_layer(step_output.squeeze(0))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: module whose call returns ``(hidden, cell)`` for a source batch.
        decoder: module exposing ``vocab_size`` and whose call
            ``decoder(inputs, hidden, cell)`` returns
            ``(prediction, hidden, cell)``.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Decode ``target`` conditioned on ``source``.

        Args:
            source: source batch, passed unchanged to the encoder.
            target: integer tensor of shape (max_length, batch) whose first
                row is used as the initial decoder input.
            teacher_forcing_ratio: probability, at each step, of feeding the
                ground-truth token instead of the model's own top prediction
                as the next input. Defaults to 0.5 (the former hard-coded
                value); 1.0 always teacher-forces and 0.0 never does.

        Returns:
            Float tensor of shape (max_length, batch, decoder.vocab_size).
            Row 0 is never written and stays all-zero; rows 1.. hold the
            decoder scores for each step.
        """
        batch_size = target.shape[1]
        max_length = target.shape[0]
        target_vocab_size = self.decoder.vocab_size

        # Allocate on the same device as the batch so the per-step writes
        # below do not fail when the model and data live on a GPU.
        outputs = torch.zeros(max_length, batch_size, target_vocab_size,
                              device=target.device)
        hidden, cell = self.encoder(source)
        inputs = target[0, :]

        for t in range(1, max_length):
            output, hidden, cell = self.decoder(inputs, hidden, cell)
            outputs[t] = output
            # One random draw per step decides between ground truth and prediction.
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            inputs = target[t] if teacher_force else top1

        return outputs

In [24]:
# Embedding and hidden sizes shared by the encoder and the decoder.
EMBEDDING_DIM = 100
HIDDEN_DIM = 200

# Each side gets an embedding table sized to its own vocabulary.
encoder = Encoder(len(SOURCE.vocab), EMBEDDING_DIM, HIDDEN_DIM)
decoder = Decoder(len(TARGET.vocab), EMBEDDING_DIM, HIDDEN_DIM)

model = Seq2Seq(encoder=encoder, decoder=decoder)

def init_weights(model):
    """Overwrite every parameter of ``model`` (nested ones included) in place
    with values drawn uniformly from [-0.08, 0.08]."""
    low, high = -0.08, 0.08
    for parameter in model.parameters():
        nn.init.uniform_(parameter.data, low, high)

# Run init_weights over the model. As the cell's final expression, the value
# returned by apply (the model) is what the notebook displays below.
# NOTE(review): apply presumably visits every submodule while init_weights
# already walks nested parameters, so tensors may be re-drawn more than once —
# harmless for the final distribution, but confirm if exact seeding matters.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(18660, 100)
    (lstm): LSTM(100, 200, num_layers=2)
  )
  (decoder): Decoder(
    (embedding): Embedding(9799, 100)
    (lstm): LSTM(100, 200, num_layers=2)
    (prediction_layer): Linear(in_features=200, out_features=9799, bias=True)
  )
)

In [None]:
# Adam with default hyper-parameters; padding positions are excluded from the loss.
optimizer = optim.Adam(model.parameters())
pad_index = TARGET.vocab.stoi['<pad>']
loss_function = nn.CrossEntropyLoss(ignore_index=pad_index)

# One pass over the training iterator, accumulating the per-batch loss.
model.train()
epoch_loss = 0
for batch in train_iterator:
    optimizer.zero_grad()

    logits = model(batch.src, batch.trg)

    # Position 0 is skipped on both sides: the decoding loop in
    # Seq2Seq.forward starts at t=1, so row 0 of the output is never filled.
    # Both tensors are then flattened to one row per (step, sequence) pair.
    flat_logits = logits[1:].view(-1, logits.shape[-1])
    flat_targets = batch.trg[1:].view(-1)

    loss = loss_function(flat_logits, flat_targets)
    loss.backward()

    # Clip the global gradient norm to 1 before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()

    epoch_loss += loss.item()

# Mean loss per batch for this pass.
print(epoch_loss / len(train_iterator))

Thanks to https://github.com/bentrevett/pytorch-seq2seq for reference

