In [0]:
import copy
import math
import random

import numpy as np
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchtext import data
from torchtext.datasets import Multi30k

In [0]:
# spaCy pipeline loaded under the name 'de'; only its tokenizer is used, by
# tokenize_german, to split source sentences into tokens.
# NOTE(review): 'de' looks like a spaCy shortcut link; newer spaCy releases may
# require the full package name instead — confirm against the installed version.
spacy_german = spacy.load('de')

def tokenize_german(text):
    """Tokenize ``text`` with the German spaCy tokenizer and reverse the order.

    Args:
        text: sentence to tokenize.

    Returns:
        A list of token strings, last token of the sentence first.
    """
    tokens = [token.text for token in spacy_german.tokenizer(text)]
    # The source sequence is fed to the model back to front.
    tokens.reverse()
    return tokens

# Source-side field: tokenized (and reversed) by tokenize_german, lower-cased,
# and wrapped in <sos>/<eos> markers.
SOURCE = data.Field(
    tokenize=tokenize_german,
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
)

# Target-side field: same markers and lower-casing, but tokenized by
# torchtext's built-in 'spacy' tokenizer.
# NOTE(review): presumably the English spaCy model — confirm against the
# installed torchtext version.
TARGET = data.Field(
    tokenize='spacy',
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
)

In [0]:
# Multi30k splits, with the '.de' side bound to SOURCE and the '.en' side
# bound to TARGET.
train, valid, test = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SOURCE, TARGET),
)

# Vocabularies are built from the training split only, with min_freq=3.
SOURCE.build_vocab(train, min_freq=3)
TARGET.build_vocab(train, min_freq=3)

# One bucketing iterator per split, all using a batch size of 256.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    batch_size=256,
)

In [0]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper tying together embeddings, encoder and decoder.

    Args:
        encoder: module called as ``encoder(embedded_source, source_mask)``.
        decoder: module called as
            ``decoder(embedded_target, memory, source_mask, target_mask)``.
        source_embed: module mapping source inputs to the encoder's input.
        target_embed: module mapping target inputs to the decoder's input.
        fully_connected: output projection; stored on the model but not
            applied by ``forward`` (callers apply it to the decoder output).
    """

    def __init__(self, encoder, decoder, source_embed, target_embed, fully_connected):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.source_embed = source_embed
        self.target_embed = target_embed
        self.fully_connected = fully_connected

    def forward(self, source, target, source_mask, target_mask):
        """Encode ``source`` and decode ``target`` against the resulting memory.

        Returns:
            The decoder output (before ``fully_connected``).
        """
        memory = self.encode(source, source_mask)
        return self.decode(memory, source_mask, target, target_mask)

    def encode(self, source, source_mask):
        """Embed ``source`` and run it through the encoder.

        The source embedding must be applied here, mirroring ``decode``;
        previously the raw ``source`` was handed to the encoder and
        ``source_embed`` was never used.
        """
        return self.encoder(self.source_embed(source), source_mask)

    def decode(self, memory, source_mask, target, target_mask):
        """Embed ``target`` and run the decoder over it and the encoder ``memory``."""
        return self.decoder(self.target_embed(target), memory, source_mask, target_mask)

In [0]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable gain and bias.

    Args:
        features: size of the last dimension of the inputs.
        eps: constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        # Gain starts at 1 and bias at 0, so the layer initially only normalizes.
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, inputs):
        """Normalize ``inputs`` along the last dimension, then scale and shift.

        Note that ``eps`` is added to the standard deviation (not the variance).
        """
        mean = inputs.mean(-1, keepdim=True)
        std = inputs.std(-1, keepdim=True)
        return self.a_2 * (inputs - mean) / (std + self.eps) + self.b_2

In [0]:
class Sublayer(nn.Module):
    """Residual wrapper: normalize, apply a sublayer, drop out, add the input back.

    Args:
        size: feature size handed to the LayerNorm.
        dropout: dropout probability applied to the sublayer output.
    """

    def __init__(self, size, dropout):
        super(Sublayer, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, sublayer):
        """Return ``inputs + dropout(sublayer(norm(inputs)))``.

        ``sublayer`` is any callable taking the normalized inputs.
        """
        normalized = self.norm(inputs)
        transformed = self.dropout(sublayer(normalized))
        return inputs + transformed

In [0]:
class Encoder(nn.Module):
    """Stack of ``depth`` independent copies of ``layer`` followed by a LayerNorm.

    Args:
        layer: prototype layer; must expose a ``size`` attribute and be callable
            as ``layer(inputs, mask)``. It is deep-copied, so the stacked layers
            do not share parameters.
        depth: number of copies to stack.
    """

    def __init__(self, layer, depth):
        super(Encoder, self).__init__()
        # `copies` is a static method, so it must be reached through the
        # instance; a bare `copies(...)` raises NameError.
        self.layers = self.copies(layer, depth)
        self.norm = LayerNorm(layer.size)

    def forward(self, inputs, mask):
        """Pass ``inputs`` (with ``mask``) through every layer, then normalize."""
        for layer in self.layers:
            inputs = layer(inputs, mask)
        return self.norm(inputs)

    @staticmethod
    def copies(module, count):
        """Return an ``nn.ModuleList`` holding ``count`` deep copies of ``module``."""
        return nn.ModuleList([copy.deepcopy(module) for _ in range(count)])

In [0]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a Sublayer.

    Args:
        size: feature size, passed to each Sublayer and exposed as ``self.size``.
        attention: callable used as ``attention(query, key, value, mask)``.
        feedforward: callable applied to the output of the attention sublayer.
        dropout: dropout probability passed to each Sublayer.
    """

    def __init__(self, size, attention, feedforward, dropout):
        super(EncoderLayer, self).__init__()
        self.attention = attention
        # The parameter is spelled `feedforward`; the attribute keeps the
        # `feed_forward` name that `forward` reads.
        self.feed_forward = feedforward
        # `copies` is a static method and must be reached through the instance.
        self.sublayers = self.copies(Sublayer(size, dropout), 2)
        self.size = size

    def forward(self, inputs, mask):
        """Apply masked self-attention, then the feed-forward block."""
        x = self.sublayers[0](inputs, lambda inputs: self.attention(inputs, inputs, inputs, mask))
        return self.sublayers[1](x, self.feed_forward)

    @staticmethod
    def copies(module, count):
        """Return an ``nn.ModuleList`` holding ``count`` deep copies of ``module``."""
        return nn.ModuleList([copy.deepcopy(module) for _ in range(count)])

In [0]:
class Decoder(nn.Module):
    """Stack of ``depth`` independent copies of ``layer`` followed by a LayerNorm.

    Args:
        layer: prototype layer; must expose a ``size`` attribute and be callable
            as ``layer(inputs, memory, src_mask, tgt_mask)``. It is deep-copied,
            so the stacked layers do not share parameters.
        depth: number of copies to stack.
    """

    def __init__(self, layer, depth):
        super(Decoder, self).__init__()
        # `copies` is a static method, so it must be reached through the
        # instance; a bare `copies(...)` raises NameError.
        self.layers = self.copies(layer, depth)
        self.norm = LayerNorm(layer.size)

    def forward(self, inputs, memory, src_mask, tgt_mask):
        """Pass ``inputs`` through every layer, each attending to ``memory``, then normalize."""
        for layer in self.layers:
            inputs = layer(inputs, memory, src_mask, tgt_mask)
        return self.norm(inputs)

    @staticmethod
    def copies(module, count):
        """Return an ``nn.ModuleList`` holding ``count`` deep copies of ``module``."""
        return nn.ModuleList([copy.deepcopy(module) for _ in range(count)])

In [0]:
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, source attention, then feed-forward.

    Each of the three steps is wrapped in its own Sublayer.

    Args:
        size: feature size, passed to each Sublayer and exposed as ``self.size``.
        attention: callable used as ``attention(query, key, value, target_mask)``
            over the decoder inputs themselves.
        source_attention: callable used as
            ``source_attention(query, memory, memory, source_mask)``.
        feed_forward: callable applied to the output of the source-attention step.
        dropout: dropout probability passed to each Sublayer.
    """

    def __init__(self, size, attention, source_attention, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.attention = attention
        self.source_attention = source_attention
        self.feed_forward = feed_forward
        # Uses this file's `Sublayer` wrapper and a `copies` helper, matching
        # Encoder/EncoderLayer/Decoder (the previously referenced `clones` and
        # `SublayerConnection` are not defined here).
        self.sublayers = self.copies(Sublayer(size, dropout), 3)

    def forward(self, inputs, memory, source_mask, target_mask):
        """Run the three sublayers in order and return the result."""
        inputs = self.sublayers[0](inputs, lambda inputs: self.attention(inputs, inputs, inputs, target_mask))
        inputs = self.sublayers[1](inputs, lambda inputs: self.source_attention(inputs, memory, memory, source_mask))
        return self.sublayers[2](inputs, self.feed_forward)

    @staticmethod
    def copies(module, count):
        """Return an ``nn.ModuleList`` holding ``count`` deep copies of ``module``."""
        return nn.ModuleList([copy.deepcopy(module) for _ in range(count)])

In [0]:
def subsequent_mask(size):
    """Build a ``(1, size, size)`` mask that hides later positions.

    Entry ``[0, i, j]`` is true when ``j <= i`` and false when ``j > i``.
    """
    # Ones strictly above the diagonal mark the positions to hide.
    hidden = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    # Comparing with zero flips that into a mask of allowed positions.
    return torch.from_numpy(hidden) == 0

def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Args:
        query, key, value: tensors whose last two dimensions are combined as
            ``softmax(query @ key^T / sqrt(d_k)) @ value``, where ``d_k`` is the
            size of ``query``'s last dimension.
        mask: optional tensor; positions where ``mask == 0`` get a score of
            ``-1e9`` before the softmax, so they receive (near-)zero weight.
        dropout: optional callable applied to the attention weights.

    Returns:
        A tuple ``(output, weights)``: the weighted sum of ``value`` and the
        attention weights (after dropout, if given).
    """
    d_k = query.size(-1)
    # Requires the module-level `import math`; scaling by sqrt(d_k) keeps the
    # dot products from growing with the feature size.
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn