<a href="https://colab.research.google.com/github/onism/MyLearning/blob/master/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import math
import time
import spacy
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

import torchtext
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

In [3]:
SEED = 1
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
!python -m spacy download de


Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 3.9MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907056 sha256=1cf02b248a2ffb5b866103febc3c41defdfd167c3bf701e6acc45fd4be6fb469
  Stored in directory: /tmp/pip-ephem-wheel-cache-dxosjxy8/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/

In [5]:
# use spacy for tokening the data
spacy_en = spacy.load('en')
spacy_de = spacy.load('de')

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1] ## list[::-1] used to reverse the list

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

German is the src (input) English is the trg (output)

Append <sos> (start of sentence), <eos>(end of sentence) tokens to all sentences. This can be done simply by specifing the init_token, eos_token arguments in the field

In [6]:
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)

In [7]:
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))
print('Loaded data...')
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 803kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 244kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 234kB/s]


Loaded data...
Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [8]:
print(f"src: {vars(train_data.examples[0])['src']}")
print(f"trg: {vars(train_data.examples[0])['trg']}")

src: ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei']
trg: ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [9]:
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
print('Vocab builded...')
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

BATCH_SIZE = 32

Vocab builded...
Unique tokens in source (de) vocabulary: 7855
Unique tokens in target (en) vocabulary: 5893


We use BucketIterator instead of the standard Iterator as it creates batches in such a way that it minimizes the amount of padding in both the sourcee and target sentences

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)

![Multi-Head Attention Layer](https://ravirajag.dev/assets/images/transformer/multihead.png)

In [11]:
# https://ravirajag.dev/machine%20learning/data%20science/deep%20learning/attention/transformers/sequence-to-sequence/2019/03/22/transformerimp.html
class SelfAttention(nn.Module):

    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        self.hid_dim = hid_dim 
        self.n_heads = n_heads 
        self.w_q = nn.Linear(hid_dim, hid_dim)
        self.w_k = nn.Linear(hid_dim, hid_dim)
        self.w_v = nn.Linear(hid_dim, hid_dim)

        self.do = nn.Dropout(dropout)

        self.fc = nn.Linear(hid_dim, hid_dim)
        # scale factor to be applied in calculation of self-attention. This is the sqrt of dimension of key vector.

        self.scale = torch.sqrt(torch.FloatTensor([hid_dim // n_heads])).to(device)
    
    def forward(self, query, key, value, mask=None):

        batch_size,_, hidden_dim = query.shape 
        assert self.hid_dim == hidden_dim, "Hidden dim must match"

        Q = self.w_q(query)
        K = self.w_k(key)
        V = self.w_v(value)
        # Q, K, V: [batch_size, sent_len, hidden_dim]

        Q = Q.view(batch_size, -1, self.n_heads, self.hid_dim//self.n_heads).permute(0,2,1,3)
        K = K.view(batch_size, -1, self.n_heads, self.hid_dim//self.n_heads).permute(0,2,1,3)
        V = V.view(batch_size, -1, self.n_heads, self.hid_dim//self.n_heads).permute(0,2,1,3)
        # Q, K, V: [batch_size, n_heads, sent_len, hid_dim//n_heads]

        # z = softmax( (Q,K)/sqrt(q_dim)   ).V

        energy = torch.matmul(  Q,  K.permute(0,1,3,2) ) / self.scale

        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = self.do( F.softmax(energy, dim=-1) )
        x = x.permute(0,2,1,3).contiguous()
        # x: [batch_size, sent_len, n_heads, hid_dim//h_heads]

        # combine all heads
        x = x.view(batch_size, -1, self.hid_dim)

        x = self.fc(x)
        return x




**Positionwise Feedforward Layer**
The second layer present in Encoder is a position-wise feed forward layer.

A Feed Forward Network is applied to each position separately and identically, containing 1 hidden_layer with ReLU activation

In [12]:
class PositionwiseFeedforward(nn.Module):

    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        self.hid_dim = hid_dim 
        self.pf_dim  = pf_dim 

        self.fc_1 = nn.Conv1d(hid_dim, pf_dim, 1)
        self.fc_2 = nn.Conv1d(pf_dim, hid_dim, 1)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        # x: batch_size, sent_len, hidden_dim

        x = x.permute(0,2,1)

        x = self.dropout(F.relu(self.fc_1(x)))
        # x : [batch_size, pf_dim, sent_len]
        x = self.fc_2(x)
        # x: batch_size, hid_dim, sent_len
        x = x.permute(0,2,1)
        return x

**Positional Encoding**

Since the transformer contains no recurrence and no covolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence.

In [13]:
class PositionEncoding(nn.Module):

    def __init__(self, d_model, dropout, device, max_len=1000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model).to(device)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    
    def forward(self, x):

        x = x + Variable(self.pe[:,:x.size(1)], requires_grad=False)
        return self.dropout(x)

Each encoder block contains layers:
1. self-attention layer followed by layer normalization
2. positionwise feed-forward layer followd by layer normalization

In [14]:
class EncoderLayer(nn.Module):

    def __init__(self, hid_dim, n_heads, pf_dim, self_attention, positionwise_feedward, dropout, device):
        super().__init__()
        self.sa = self_attention(hid_dim, n_heads, dropout, device)
        self.pf = positionwise_feedward(hid_dim, pf_dim, dropout)
        self.ln = nn.LayerNorm(hid_dim)
        self.do = nn.Dropout(dropout)

    
    def forward(self, src, src_mask):
        src = self.ln( src + self.do(self.sa(src, src, src, src_mask)))
        src = self.ln(src + self.do(self.src))
        return src


In [15]:
# Encoder contains multiple EncoderLayers. N=6 given in paper.
class Encoder(nn.Module):

    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pd_dim, encoder_layer, self_attention, positionwise_feedforward,
                 positional_encoding, dropout, device):
        super().__init__()
        self.input_dim = input_dim 
        self.hid_dim = hid_dim 
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pf_dim = pf_dim 
        self.encoder_layer = encoder_layer
        self.self_attention = self_attention
        self.positionwise_feedforward = positionwise_feedforward
        self.positional_encoding = positional_encoding
        self.device = device

        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(1000, hid_dim)

        self.layers = nn.ModuleList([encoder_layer(hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device) for _ in range(n_layers)])
        self.dropout = nn.Dropout(dropout)

        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
    
    def forward(self, src, src_mask):

        src = self.dropout( (self.tok_embedding(src)*self.scale) )
        src = self.positional_encoding(src)
        for layer in self.layers:
            src = layer(src, src_mask)
        return src



In [16]:
class DecoderLayer(nn.Module):

    def __init__(self, hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward,
                 dropout, device):
        super().__init__()
        self.sa = self_attention(hid_dim, n_heads, dropout, device)
        self.ea = self_attention(hid_dim, n_heads, dropout, device)
        self.pf = positionwise_feedforward(hid_dim, pf_dim, dropout)
        self.ln = nn.LayerNorm(hid_dim)
        self.do = nn.Dropout(dropout)

    def forward(self, trg, src, trg_mask, src_mask):
        trg = self.ln( trg + self.do( self.sa(trg, trg, trg, trg_mask) ))
        trg = self.ln( trg + self.do( self.ea(trg, src, src, src_mask) ))
        trg = self.ln(trg + self.do(self.pf(trg)))
        return trg

In [17]:
class Decoder(nn.Module):

    def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim,
                 decoder_layer, self_attention, positionwise_feedforward, positional_encoding, dropout, devic):
        super().__init__()
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(1000, hid_dim)

        self.layers = nn.ModuleList(
            [decoder_layer(hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device) for _ in range(n_layers)]
        )
        self.fc = nn.Linear(hid_dim, output_dim)
        self.do = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
      
    def forward(self, trg, src, trg_mask, src_mask):

        trg = self.do((  self.tok_embedding(trg) * self.scale ))
        trg = self.positional_encoding(trg)
        for layer in self.layers:
            trg = layer(trg, src, trg_mask, src_mask)
        trg = self.fc(trg)
        return trg

In [21]:
class Transformer(nn.Module):

    def __init__(self, encoder, decoder, pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.device = device 
    
    def make_masks(self, src, trg):
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
        trg_pad_mask = (trg != self.pad_idx).unsqueeze(1).unsqueeze(3)

        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), dtype=torch.uint8, device=self.device))

        trg_mask = trg_pad_mask & trg_sub_mask

        return src_mask, trg_mask
    
    def forward(self, src, trg):
        src_mask, trg_mask = self.make_masks(src, trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, trg_mask, src_mask)
        return out 

In [22]:
input_dim = len(SRC.vocab)
output_dim = len(TRG.vocab)
hid_dim = 512
n_layers = 6
n_heads = 8
pf_dim = 2048
dropout = 0.1
pad_idx = SRC.vocab.stoi['<pad>']

PE = PositionEncoding(hid_dim, dropout, device)
enc = Encoder(input_dim, hid_dim, n_layers, n_heads, pf_dim, EncoderLayer, SelfAttention, PositionwiseFeedforward, PE, dropout, device)
dec = Decoder(output_dim, hid_dim, n_layers, n_heads, pf_dim, DecoderLayer, SelfAttention, PositionwiseFeedforward, PE, dropout, device)
model = Transformer(enc, dec, pad_idx, device).to(device)

In [23]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameters(model) } trainable parameters")

for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

The model has 55206149 trainable parameters


In [24]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))

In [25]:
optimizer = NoamOpt(hid_dim, 1, 2000, torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))


In [26]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)


In [27]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.optimizer.zero_grad()

        output = model(src, trg[:,:-1])
                
        #output = [batch size, trg sent len - 1, output dim]
        #trg = [batch size, trg sent len]
            
        output = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:,1:].contiguous().view(-1)
                
        #output = [batch size * trg sent len - 1, output dim]
        #trg = [batch size * trg sent len - 1]
            
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [28]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg[:,:-1])
            
            #output = [batch size, trg sent len - 1, output dim]
            #trg = [batch size, trg sent len]
            
            output = output.contiguous().view(-1, output.shape[-1])
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg sent len - 1, output dim]
            #trg = [batch size * trg sent len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [29]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [30]:
N_EPOCHS = 10
CLIP = 1
SAVE_DIR = '.'
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'transformer-seq2seq.pt')

best_valid_loss = float('inf')

if not os.path.isdir(f'{SAVE_DIR}'):
    os.makedirs(f'{SAVE_DIR}')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
    
    print(f'| Epoch: {epoch+1:03} | Time: {epoch_mins}m {epoch_secs}s| Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} |')

RuntimeError: ignored