In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IWSLT
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
import time


from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn import TransformerDecoder, TransformerDecoderLayer
import torch.nn.functional as F

import warnings
warnings.filterwarnings("ignore")

In [2]:
!wget https://github.com/maroxtn/IWSLT-BACKUP/archive/main.zip -q
!unzip -q main.zip

!mkdir .data
!mv IWSLT-BACKUP-main/iwslt .data

!rm -rf main.zip IWSLT-BACKUP-main

In [3]:
CFG = {"IN_LANG":"de", "OUT_LANG": "en"}

In [4]:
import spacy.cli
import en_core_web_sm
import de_core_news_sm


spacy.cli.download("en_core_web_sm")
spacy.cli.download("de_core_news_sm")


if CFG["IN_LANG"] == "en":
    spacy_in_lang = en_core_web_sm.load()
    spacy_out_lang = de_core_news_sm.load()
else:
    spacy_in_lang = de_core_news_sm.load()
    spacy_out_lang = en_core_web_sm.load()
    

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
✔ Download and installation successful
You can now load the model via spacy.load('de_core_news_sm')


In [5]:
def tokenizer_in(text):
    return [tok.text for tok in spacy_in_lang.tokenizer(text)]

def tokenizer_out(text):
    return [tok.text for tok in spacy_out_lang.tokenizer(text)]

in_lang = Field(tokenize=tokenizer_in, lower=True)
out_lang = Field(tokenize=tokenizer_out, lower=True, init_token="<sos>", eos_token="<eos>")

In [6]:
MAX_LEN = 25

train_data, valid_data, test_data = IWSLT.splits(
        exts=("."+CFG["IN_LANG"], "."+CFG["OUT_LANG"]), fields=(in_lang, out_lang ),filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
            len(vars(x)['trg']) <= MAX_LEN)

In [7]:
in_lang.build_vocab(train_data, min_freq=2)
out_lang.build_vocab(train_data, min_freq=2)

In [8]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.scale = nn.Parameter(torch.ones(1))

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(
            0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.scale * self.pe[:x.size(0), :]
        return self.dropout(x)

In [9]:
class TransformerModel(nn.Module):
    
    def __init__(self, intoken, outtoken ,hidden, enc_layers=2, dec_layers=2, dropout=.1, nheads=2, ff_model=128):
        super(TransformerModel, self).__init__()
        
        self.encoder = nn.Embedding(intoken, hidden)
        self.pos_encoder = PositionalEncoding(hidden, dropout)

        self.decoder = nn.Embedding(outtoken, hidden) 
        self.pos_decoder = PositionalEncoding(hidden, dropout)
        
        
        encoder_layers = TransformerEncoderLayer(d_model=hidden, nhead = nheads, dim_feedforward = ff_model, dropout=dropout, activation='relu')
        self.transformer_encoder = TransformerEncoder(encoder_layers, enc_layers)

        encoder_layers = TransformerDecoderLayer(hidden, nheads, ff_model, dropout, activation='relu')
        self.transformer_decoder = TransformerDecoder(encoder_layers, dec_layers)        

        self.fc_out = nn.Linear(hidden, outtoken)

        self.src_mask = None
        self.trg_mask = None
        self.memory_mask = None

        
    def generate_square_subsequent_mask(self, sz, sz1=None):
        
        if sz1 == None:
            mask = torch.triu(torch.ones(sz, sz), 1)
        else:
            mask = torch.triu(torch.ones(sz, sz1), 1)
            
        return mask.masked_fill(mask==1, float('-inf'))

    def make_len_mask_enc(self, inp):
        return (inp == in_pad_idx).transpose(0, 1)   #(batch_size, output_seq_len)
    
    def make_len_mask_dec(self, inp):
        return (inp == out_pad_idx).transpose(0, 1) #(batch_size, input_seq_len)
    


    def forward(self, src, trg): #SRC: (seq_len, batch_size)

        if self.trg_mask is None or self.trg_mask.size(0) != len(trg):
            self.trg_mask = self.generate_square_subsequent_mask(len(trg)).to(trg.device)
            

        #Adding padding mask
        src_pad_mask = self.make_len_mask_enc(src)
        trg_pad_mask = self.make_len_mask_dec(trg)
             

        #Add embeddings Encoder
        src = self.encoder(src)  #Embedding, (seq_len, batch_size, d_model)
        src = self.pos_encoder(src)   #Pos embedding
        
        
        #Add embedding decoder
        trg = self.decoder(trg) #(seq_len, batch_size, d_model)
        trg = self.pos_decoder(trg)

        
        memory = self.transformer_encoder(src, None, src_pad_mask)
        output = self.transformer_decoder(tgt = trg, memory = memory, tgt_mask = self.trg_mask, memory_mask = None, 
                                          tgt_key_padding_mask = trg_pad_mask, memory_key_padding_mask = src_pad_mask)

        output = self.fc_out(output)

        return output

In [10]:
def rebatch(batch):
    return Batch(batch.src, batch.trg)

In [11]:
class Batch:

    def __init__(self, src, trg):
        
        self.src = src
        
        self.trg = None
        self.trg_y = None
        self.ntokens = None

        if trg is not None:
            self.trg = trg[:-1,]
            self.trg_y = trg[1:].reshape(-1)
            self.ntokens = (self.trg_y != out_pad_idx).sum().item()  
        
        if device == torch.device('cuda'):
            self.src = self.src.cuda()

            if trg is not None:
                self.trg = self.trg.cuda()
                self.trg_y = self.trg_y.cuda()
                

In [12]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))

In [13]:
#Training Hyperparameters
num_epochs = 30
batch_size = 256
maxlen = 50

In [14]:
#Model Hyperparameter
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(in_lang.vocab)
input_size_decoder = len(out_lang.vocab)
output_size = len(out_lang.vocab)

d_model = 256

import math 

model = TransformerModel(input_size_encoder, input_size_decoder ,d_model, enc_layers=1, dec_layers=1, dropout=.1, nheads=1, ff_model=1028).to(device)

In [15]:
in_pad_idx = in_lang.vocab.stoi['<pad>']
out_pad_idx = out_lang.vocab.stoi['<pad>']


train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

criterion = nn.CrossEntropyLoss(ignore_index=out_pad_idx)
optimizer = NoamOpt(d_model, 1, 4000 ,optim.Adam(model.parameters(), lr=0))

In [16]:
def translate_sentence_transformer(model, sentence, max_length=50):
    model.eval()
    tokens = [token.text.lower() for token in spacy_in_lang(sentence)]

    text_to_indices = [in_lang.vocab.stoi[token] for token in tokens]

    # Convert to Tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    preds = [out_lang.vocab.stoi[out_lang.init_token]]

    with torch.no_grad():
        
        emb_src = model.encoder(sentence_tensor)
        emb_src = model.pos_encoder(emb_src)

        memory = model.transformer_encoder(emb_src)

        for i in range(50):

            trg = torch.Tensor(preds).long().unsqueeze(1).to(device)
            trg = model.decoder(trg)
            trg = model.pos_decoder(trg)

            out = model.transformer_decoder(tgt = trg, memory = memory)
            out = model.fc_out(out)
            
            

            new = out.squeeze(1)[-1].argmax().item()
            preds.append(new)
            if new == out_lang.vocab.stoi["<eos>"]:
                break

    
    return " ".join([out_lang.vocab.itos[i] for i in preds][1:-1])

In [17]:
def get_out_encoder(src):
    
    model.eval()
    tokens = [token.text.lower() for token in spacy_in_lang(src)]

    text_to_indices = [in_lang.vocab.stoi[token] for token in tokens]

    # Convert to Tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)    

    with torch.no_grad():
        
        emb_src = model.encoder(sentence_tensor)
        emb_src = model.pos_encoder(emb_src)

        memory = model.transformer_encoder(emb_src)

        return memory

In [18]:
def beam(phrase, k):
    
    model.eval()
    memory = get_out_encoder(phrase)

    sos = out_lang.vocab.stoi["<sos>"]
    tgt = [sos]

    with torch.no_grad():

        trg = torch.Tensor(tgt).long().unsqueeze(1).to(device)
        trg = model.decoder(trg)
        trg = model.pos_decoder(trg)

        out = model.transformer_decoder(tgt = trg, memory = memory)
        out = F.softmax(model.fc_out(out), dim=-1)[-1].squeeze()

        args = out.argsort()[-k:].detach().cpu().numpy()
        probs = out[args].detach().cpu().numpy()

        probs = np.log(probs)
        possible = list(zip([tgt + [args[i]] for i in range(k)], probs))
        
        for i in range(maxlen):

            test=  []
            for j in range(k):

                tmp_tgt, tmp_prob = possible[j]

                if tmp_tgt[-1] == out_lang.vocab.stoi["<eos>"]:
                    test.append(possible[j])

                else:
                    trg = torch.Tensor(tmp_tgt).long().unsqueeze(1).to(device)
                    trg = model.decoder(trg)
                    trg = model.pos_decoder(trg)

                    out = model.transformer_decoder(tgt = trg, memory = memory)
                    out = F.softmax(model.fc_out(out), dim=-1)[-1].squeeze()

                    tmp_args = out.argsort()[-k:].detach().cpu().numpy()
                    tmp_probs = out[tmp_args].detach().cpu().numpy()
                    tmp_probs = (tmp_prob + np.log(tmp_probs))/(len(tmp_tgt)-1)

                    for r in range(k): 
                        test.append((tmp_tgt + [tmp_args[r]], tmp_probs[r]))


            possible = sorted(test, key=lambda x:x[1], reverse=True)[:k]
            
    return possible

In [19]:
def convert(x):
    
    sentence = x[0]
    sentence = [out_lang.vocab.itos[i] for i in sentence]
    
    return (" ".join(sentence), x[1])

In [20]:
def run_epoch(iterator, log_every=100):
    
    model.train()
    total_loss = 0
    
    start = time.time()
    n_tokens = 0
    
    print("")
    
    for batch_idx, batch in enumerate(iterator):
        
        inp_data = batch.src
        target = batch.trg

        output = model(inp_data, target)
        output = output.reshape(-1, output.shape[2])

        optimizer.optimizer.zero_grad()
        loss = criterion(output, batch.trg_y)
        total_loss += loss
        
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        

        n_tokens += batch.ntokens
        if (batch_idx % log_every == 0) and (batch_idx > 0):
            tokens_per_sec = n_tokens/(time.time() - start)
            print(" Step %d - Loss %f - Tokens per Sec %f" % (batch_idx, loss.item(), tokens_per_sec))
            
        
    return total_loss/len(train_iterator)

In [21]:
def run_validation(iterator, log_every=100):
    
    model.eval()
    
    total_loss = 0
    
    with torch.no_grad():
    
        for batch_idx, batch in enumerate(iterator):

            inp_data = batch.src.to(device)
            target = batch.trg.to(device)

            output = model(inp_data, target[:-1, ])
            output = output.reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            loss = criterion(output, target)
            total_loss += loss


    return total_loss/len(valid_iterator)

In [22]:
best_loss = 6486468 

for epoch in range(num_epochs):
    
    print(f'Epoch [{epoch} / {num_epochs}]\n')
    
    loss = run_epoch((rebatch(b) for b in train_iterator))
    validation_loss = run_validation((rebatch(b) for b in valid_iterator))
    
    
    rand_i01 = np.random.randint(0, len(train_data))
    rand_i02 = np.random.randint(0, len(valid_data))
    rand_i03 = np.random.randint(0, len(test_data))
    
    sentence01, expected01 = " ".join(train_data[rand_i01].src), " ".join(train_data[rand_i01].trg)
    sentence02, expected02 = " ".join(valid_data[rand_i02].src), " ".join(valid_data[rand_i02].trg)
    sentence03, expected03 = " ".join(test_data[rand_i03].src), " ".join(test_data[rand_i03].trg)

    translated_sentence01 = translate_sentence_transformer(model, sentence01, max_length=50)
    translated_sentence02 = translate_sentence_transformer(model, sentence02, max_length=50)
    translated_sentence03 = translate_sentence_transformer(model, sentence03, max_length=50)
    
    print(f"\nExample #1 (from Train data): \nTranslation: { translated_sentence01 }\nExpected: { expected01 }")
    print(f"\nExample #2 (from Validation): \nTranslation: { translated_sentence02 }\nExpected: { expected02 }")
    print(f"\nExample #3 (from Test data): \nTranslation: { translated_sentence03 }\nExpected: { expected03 }\n")
    
    print(f"\n Train loss {loss} | Validation loss {validation_loss} \n\n\n")
    
    
    if validation_loss < best_loss:
        torch.save(model.state_dict(), "../models/new_transformer")
        best_loss = validation_loss
    

Epoch [0 / 30]


 Step 100 - Loss 9.263897 - Tokens per Sec 47121.283573
 Step 200 - Loss 7.610021 - Tokens per Sec 50239.160757
 Step 300 - Loss 5.980570 - Tokens per Sec 50995.913070
 Step 400 - Loss 5.920236 - Tokens per Sec 52100.895727
 Step 500 - Loss 5.591289 - Tokens per Sec 52586.715767

Example #1 (from Train data): 
Translation: so you can you to be the world .
Expected: and when you do that , the ground around you starts to shift just a little bit .

Example #2 (from Validation): 
Translation: i 'm a lot .
Expected: so i had to find a way of solving this problem .

Example #3 (from Test data): 
Translation: it 's a lot of the world .
Expected: disability is as visual as race .


 Train loss 6.9769978523254395 | Validation loss 4.979701995849609 



Epoch [1 / 30]


 Step 100 - Loss 4.633193 - Tokens per Sec 51526.444654
 Step 200 - Loss 5.227591 - Tokens per Sec 53038.572382
 Step 300 - Loss 4.168788 - Tokens per Sec 53555.258197
 Step 400 - Loss 3.819081 - Tokens per Sec 5

In [23]:
model.load_state_dict(torch.load("../models/new_transformer"))

run_validation((rebatch(b) for b in valid_iterator))

tensor(2.5430, device='cuda:0')