# Transformer

In [1]:
import os
import copy
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from utils import Tokenizer, Iterator, batch_size_fn

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data

In [3]:
from Embedder import Embedder
from PositionalEncoder import PositionalEncoder
from Sublayers import Norm, MultiHeadSelfAttention, FeedForward
from Layers import EncoderLayer, DecoderLayer

## Set Configs

In [4]:
# --- Model architecture (handed to Transformer(...) further down) ---
D_MODEL = 512            # passed as d_model
EMBEDDING_DIM = D_MODEL  # alias, always equal to D_MODEL
N_LAYERS = 6             # passed as N: number of layers in each stack
N_HEADS = 8              # passed as heads
DROPOUT = 0.1            # passed as dropout

# --- Data ---
MAX_LENGTH = 80    # sentences containing this many spaces or more are filtered out
BATCH_SIZE = 1500  # given to Iterator together with batch_size_fn

# --- Optimisation ---
# NOTE(review): LR / BETAS* / EPS look like Adam-style optimizer settings; the
# optimizer itself is not created in this part of the notebook -- confirm.
N_EPOCHS = 2
LR = 0.0001
BETAS1 = 0.9
BETAS2 = 0.98
EPS = 1e-9

In [5]:
# Pick the compute device once: the GPU when CUDA is available, otherwise the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

## Load Datasets

In [6]:
# Language codes handed to the tokenizers below.
SOURCE_LANG = 'fr'
TARGET_LANG = 'en'

# Read each corpus as a list of lines (one sentence per line).
# - `with` closes the file handles deterministically instead of leaking them.
# - The encoding is explicit because the French text contains non-ASCII
#   characters; relying on the platform default can raise or mis-decode.
#   NOTE(review): assumes both files are stored as UTF-8 -- confirm.
with open('./datasets/french.txt', encoding='utf-8') as source_file:
    SOURCE_DATA = source_file.read().strip().split('\n')
with open('./datasets/english.txt', encoding='utf-8') as target_file:
    TARGET_DATA = target_file.read().strip().split('\n')

## Pre-process Datasets

In [7]:
# One tokenizer per language; their `tokenize` methods are what the fields call.
tok_source = Tokenizer(SOURCE_LANG)
tok_target = Tokenizer(TARGET_LANG)

# Source side: lower-cased text, no special start/end tokens.
SOURCE_FIELD = data.Field(
    tokenize=tok_source.tokenize,
    lower=True,
)

# Target side: lower-cased text wrapped in explicit <SOS> ... <EOS> markers.
TARGET_FIELD = data.Field(
    tokenize=tok_target.tokenize,
    lower=True,
    init_token='<SOS>',
    eos_token='<EOS>',
)

In [8]:
# Put the two corpora side by side: row i holds source sentence i and target sentence i.
raw_data = dict(
    SOURCE=list(SOURCE_DATA),
    TARGET=list(TARGET_DATA),
)
df_datasets = pd.DataFrame(raw_data, columns=['SOURCE', 'TARGET'])
df_datasets.head()  # notebook preview of the first rows

Unnamed: 0,SOURCE,TARGET
0,Va !,Go.
1,Cours !,Run!
2,Courez !,Run!
3,Au feu !,Fire!
4,À l'aide !,Help!


In [9]:
# Keep only sentence pairs in which BOTH sides contain fewer than MAX_LENGTH
# spaces (a cheap proxy for sentence length before tokenization).
mask = (
    (df_datasets['SOURCE'].str.count(' ') < MAX_LENGTH)
    & (df_datasets['TARGET'].str.count(' ') < MAX_LENGTH)
)
df_datasets = df_datasets.loc[mask]

# Round-trip through a temporary CSV so torchtext can build the dataset from it.
df_datasets.to_csv('./datasets/translate_transformer_temp.csv', index=False)
data_fields = [('SOURCE', SOURCE_FIELD), ('TARGET', TARGET_FIELD)]

# to_csv writes a "SOURCE,TARGET" header line. With a list of fields,
# TabularDataset would load that line as a training example unless it is
# told to skip it.
train = data.TabularDataset(
    './datasets/translate_transformer_temp.csv',
    format='csv',
    fields=data_fields,
    skip_header=True,
)

# Batches are built with the project's Iterator / batch_size_fn helpers;
# examples are sorted by (source length, target length).
train_iter = Iterator(train, batch_size=BATCH_SIZE, device=device, repeat=False,
                      sort_key=lambda x: (len(x.SOURCE), len(x.TARGET)),
                      batch_size_fn=batch_size_fn, train=True, shuffle=True)

In [10]:
# Build one vocabulary per field from the training examples.
SOURCE_FIELD.build_vocab(train)
TARGET_FIELD.build_vocab(train)

# Persist the fitted fields. `with` guarantees the files are flushed and
# closed; the bare open(...) handles used before were never closed.
with open('datasets/SOURCE.pkl', 'wb') as source_pkl:
    pickle.dump(SOURCE_FIELD, source_pkl)
with open('datasets/TARGET.pkl', 'wb') as target_pkl:
    pickle.dump(TARGET_FIELD, target_pkl)

# Vocabulary indices of the padding token on each side.
SOURCE_PAD = SOURCE_FIELD.vocab.stoi['<pad>']
TARGET_PAD = TARGET_FIELD.vocab.stoi['<pad>']

# Walk the iterator once to learn how many batches an epoch yields.
# TRAIN_LENGTH ends up as the zero-based index of the LAST batch,
# i.e. (number of batches - 1).
for i, b in enumerate(train_iter):
    TRAIN_LENGTH = i

In [11]:
# Vocabulary sizes; these become the embedding and output dimensions of the model.
source_vocab_size, target_vocab_size = len(SOURCE_FIELD.vocab), len(TARGET_FIELD.vocab)

## Build [Transformer](https://arxiv.org/pdf/1706.03762.pdf) Network

In [12]:
class Encoder(nn.Module):
    """Encoder stack: embedding -> positional encoding -> N encoder layers -> norm.

    Args:
        vocab_size: size of the source vocabulary (passed to ``Embedder``).
        d_model: model width shared by all sub-modules.
        N: number of stacked ``EncoderLayer`` copies.
        heads: number of attention heads in each layer.
        dropout: dropout rate for the positional encoder and the layers.
    """

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()

        self.N = N
        self.embedding_layer = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)

        # Build a single layer and deep-copy it N times so that every layer
        # in the stack owns its own parameters.
        template = EncoderLayer(d_model, heads, dropout)
        self.encoder_layer = nn.ModuleList(
            [copy.deepcopy(template) for _ in range(N)]
        )
        self.norm = Norm(d_model)

    def forward(self, input_seq, mask):
        """Encode ``input_seq``; ``mask`` is forwarded unchanged to every layer."""
        hidden = self.pe(self.embedding_layer(input_seq))

        for layer_idx in range(self.N):
            layer = self.encoder_layer[layer_idx]
            hidden = layer(hidden, mask)

        return self.norm(hidden)

In [13]:
class Decoder(nn.Module):
    """Decoder stack: embedding -> positional encoding -> N decoder layers -> norm.

    Args:
        vocab_size: size of the target vocabulary (passed to ``Embedder``).
        d_model: model width shared by all sub-modules.
        N: number of stacked ``DecoderLayer`` copies.
        heads: number of attention heads in each layer.
        dropout: dropout rate for the positional encoder and the layers.
    """

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super(Decoder, self).__init__()

        def get_clones(module, N):
            # Independent deep copies, so each layer owns its own parameters.
            return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

        self.N = N
        self.embedding_layer = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.decoder_layer = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, target_seq, encoder_outputs, source_mask, target_mask):
        """Decode ``target_seq`` while attending to ``encoder_outputs``.

        ``encoder_outputs``, ``source_mask`` and ``target_mask`` are forwarded
        unchanged to every decoder layer. Returns the normalised output of the
        last layer.
        """
        x = self.embedding_layer(target_seq)
        x = self.pe(x)
        for i in range(self.N):
            x = self.decoder_layer[i](x, encoder_outputs, source_mask, target_mask)

        # The final norm must receive the hidden states; calling it with no
        # argument (as before) raises TypeError on the first forward pass.
        x = self.norm(x)
        return x

In [14]:
class Transformer(nn.Module):
    """Full encoder-decoder model with a linear projection onto the target vocabulary.

    Args:
        source_vocab: source vocabulary size (encoder embedding).
        target_vocab: target vocabulary size (decoder embedding and output width).
        d_model: model width.
        N: number of layers in both the encoder and the decoder.
        heads: number of attention heads.
        dropout: dropout rate passed to both stacks.
    """

    def __init__(self, source_vocab, target_vocab, d_model, N, heads, dropout):
        super().__init__()

        self.encoder = Encoder(source_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(target_vocab, d_model, N, heads, dropout)
        # Maps each d_model-wide decoder state to one score per target token.
        self.fc_layer = nn.Linear(d_model, target_vocab)

    def forward(self, source_seq, target_seq, source_mask, target_mask):
        """Return the output of the final linear layer (no softmax is applied here)."""
        memory = self.encoder(source_seq, source_mask)
        decoded = self.decoder(target_seq, memory, source_mask, target_mask)
        return self.fc_layer(decoded)

#### Initialize Transformer Network

In [15]:
# Instantiate the model from the configured hyper-parameters and move it to the chosen device.
transformer = Transformer(
    source_vocab_size,
    target_vocab_size,
    D_MODEL,
    N_LAYERS,
    N_HEADS,
    DROPOUT,
)
transformer.to(device)

# Xavier-uniform initialisation for every parameter with more than one
# dimension; parameters with at most one dimension keep their default values.
for p in transformer.parameters():
    if p.dim() <= 1:
        continue
    nn.init.xavier_uniform_(p)

---