# Transformer

In [1]:
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
from Embedder import Embedder
from PositionalEncoder import PositionalEncoder
from Layers import EncoderLayer, DecoderLayer

## Set Configs

In [3]:
# --- Model / training schedule -------------------------------------------
N_EPOCHS = 2
N_LAYERS = 6
N_HEADS = 8

# --- Regularisation and batching -----------------------------------------
DROPOUT = 0.1
BATCH_SIZE = 1500

# --- Optimizer settings ---------------------------------------------------
# NOTE(review): no optimizer is constructed in the visible cells; the names
# below presumably map to an Adam-style optimizer's lr / betas / eps -- confirm.
LR = 1e-4
BETAS1 = 0.9
BETAS2 = 0.98
EPS = 1e-9

In [4]:
# Pick the compute device once: CUDA when torch reports it as available,
# otherwise fall back to the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

## Load Datasets

In [5]:
def _read_lines(path):
    """Return the text file at *path* as a list of lines.

    Leading/trailing whitespace of the whole file is stripped before
    splitting on newlines. The file is opened in a ``with`` block so the
    handle is closed deterministically instead of being leaked.
    """
    # NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm.
    with open(path, encoding='utf-8') as handle:
        return handle.read().strip().split('\n')


# One sentence per line; source is French, target is English.
SOURCE_DATA = _read_lines('./datasets/french.txt')
TARGET_DATA = _read_lines('./datasets/english.txt')

## Pre-process Datasets

In [6]:
# NOTE: pre-processing is not inlined in this notebook; presumably it lives in
# tokenize.py, batch.py and process.py -- confirm.

## Build [Transformer](https://arxiv.org/pdf/1706.03762.pdf) Network

In [7]:
class Encoder(nn.Module):
    """Encoder stack of the Transformer.

    Embeds the input sequence, applies the positional encoder, runs the
    result through ``N`` independent copies of ``EncoderLayer`` and finally
    through ``Norm``.

    Args:
        vocab_size: passed to ``Embedder`` as its first argument.
        d_model: model width, passed to every sub-module.
        N: number of encoder layers to stack.
        heads: passed to ``EncoderLayer``.
        dropout: passed to ``PositionalEncoder`` and ``EncoderLayer``.
    """

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super(Encoder, self).__init__()

        self.N = N
        self.embedding_layer = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)

        # Build one layer and deep-copy it N times so every layer owns its
        # own parameters (all copies start from the same initial values).
        prototype = EncoderLayer(d_model, heads, dropout)
        self.encoder_layer = nn.ModuleList(
            [copy.deepcopy(prototype) for _ in range(N)]
        )

        # NOTE(review): ``Norm`` is not imported in any visible cell; it must
        # be brought into scope (module unknown from here) or this line
        # raises NameError.
        self.norm = Norm(d_model)

    def forward(self, input_seq, mask):
        """Encode ``input_seq``; ``mask`` is forwarded to every encoder layer."""
        x = self.embedding_layer(input_seq)
        x = self.pe(x)
        for layer in self.encoder_layer:
            x = layer(x, mask)
        return self.norm(x)

In [8]:
class Decoder(nn.Module):
    """Decoder stack of the Transformer.

    Embeds the target sequence, applies the positional encoder, runs the
    result through ``N`` independent copies of ``DecoderLayer`` (each of
    which also receives the encoder outputs and both masks) and finally
    through ``Norm``.

    Args:
        vocab_size: passed to ``Embedder`` as its first argument.
        d_model: model width, passed to every sub-module.
        N: number of decoder layers to stack.
        heads: passed to ``DecoderLayer``.
        dropout: passed to ``PositionalEncoder`` and ``DecoderLayer``.
    """

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super(Decoder, self).__init__()

        self.N = N
        self.embedding_layer = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)

        # Build one layer and deep-copy it N times so every layer owns its
        # own parameters (all copies start from the same initial values).
        prototype = DecoderLayer(d_model, heads, dropout)
        self.decoder_layer = nn.ModuleList(
            [copy.deepcopy(prototype) for _ in range(N)]
        )

        # NOTE(review): ``Norm`` is not imported in any visible cell; it must
        # be brought into scope (module unknown from here) or this line
        # raises NameError.
        self.norm = Norm(d_model)

    def forward(self, target_seq, encoder_outputs, source_mask, target_mask):
        """Decode ``target_seq`` against ``encoder_outputs``.

        ``source_mask`` and ``target_mask`` are forwarded unchanged to every
        decoder layer.
        """
        x = self.embedding_layer(target_seq)
        x = self.pe(x)
        for layer in self.decoder_layer:
            x = layer(x, encoder_outputs, source_mask, target_mask)

        # The final norm must receive the activations; the previous
        # ``self.norm()`` call passed nothing and raised TypeError.
        return self.norm(x)

In [9]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer with a linear output projection.

    Args:
        source_vocab: vocabulary size handed to the ``Encoder``.
        target_vocab: vocabulary size handed to the ``Decoder``; also the
            output width of the final linear layer.
        d_model: model width shared by encoder, decoder and projection.
        N: number of layers in each of the encoder and decoder stacks.
        heads: forwarded to ``Encoder`` and ``Decoder``.
        dropout: forwarded to ``Encoder`` and ``Decoder``.
    """

    def __init__(self, source_vocab, target_vocab, d_model, N, heads, dropout):
        super(Transformer, self).__init__()

        self.encoder = Encoder(source_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(target_vocab, d_model, N, heads, dropout)
        # Was ``slef.fc_layer`` -- a typo that raised NameError and left
        # ``self.fc_layer`` undefined for ``forward``.
        self.fc_layer = nn.Linear(d_model, target_vocab)

    def forward(self, source_seq, target_seq, source_mask, target_mask):
        """Run the encoder, then the decoder, then project with ``fc_layer``.

        Returns the raw output of the linear layer (no softmax is applied
        here).
        """
        encoder_outputs = self.encoder(source_seq, source_mask)
        decoder_output = self.decoder(
            target_seq, encoder_outputs, source_mask, target_mask
        )
        return self.fc_layer(decoder_output)

#### Initialize Transformer Network

In [None]:
# NOTE(review): ``source_vocab``, ``target_vocab`` and ``d_model`` are not
# defined in any visible cell -- presumably they come from the pre-processing
# step; confirm before running. Layer count, head count and dropout now use
# the constants from the config cell instead of undefined lowercase names.
transformer = Transformer(source_vocab, target_vocab, d_model, N_LAYERS, N_HEADS, DROPOUT)
transformer.to(device)

# Re-initialise every parameter with more than one dimension using Xavier
# uniform; 1-D parameters keep their default initialisation.
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

---