In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import IWSLT

from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
import sys 
import time


import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


import warnings
warnings.filterwarnings("ignore")

In [2]:
# Download the IWSLT-BACKUP archive from GitHub and unpack it quietly.
!wget https://github.com/maroxtn/IWSLT-BACKUP/archive/main.zip -q
!unzip -q main.zip

# Move the extracted `iwslt` directory into a local `.data` directory.
!mkdir .data
!mv IWSLT-BACKUP-main/iwslt .data

# Remove the downloaded archive and the extraction directory.
!rm -rf main.zip IWSLT-BACKUP-main

In [3]:
# Translation direction: IN_LANG is the source language, OUT_LANG the target.
# The values are used below to pick the spaCy models and the dataset file extensions.
CFG = {"IN_LANG":"de", "OUT_LANG": "en"}

In [4]:
import importlib

import spacy.cli

# Install the spaCy models *before* importing their packages: on a fresh
# environment importing them first raises ModuleNotFoundError.
spacy.cli.download("en_core_web_sm")
spacy.cli.download("de_core_news_sm")

# Make packages installed during this session visible to the import system.
importlib.invalidate_caches()

import en_core_web_sm
import de_core_news_sm


# Pick the source / target pipelines according to the configured direction.
if CFG["IN_LANG"] == "en":
    spacy_in_lang = en_core_web_sm.load()
    spacy_out_lang = de_core_news_sm.load()
else:
    spacy_in_lang = de_core_news_sm.load()
    spacy_out_lang = en_core_web_sm.load()
    

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
✔ Download and installation successful
You can now load the model via spacy.load('de_core_news_sm')


In [5]:
def tokenizer_in(text):
    """Split ``text`` into a list of token strings with the source-language spaCy tokenizer."""
    pieces = []
    for token in spacy_in_lang.tokenizer(text):
        pieces.append(token.text)
    return pieces

def tokenizer_out(text):
    """Split ``text`` into a list of token strings with the target-language spaCy tokenizer."""
    pieces = []
    for token in spacy_out_lang.tokenizer(text):
        pieces.append(token.text)
    return pieces


# Source field: tokenised with `tokenizer_in`; no <sos>/<eos> markers are configured.
# `include_lengths=True` makes each batch attribute a (tensor, lengths) pair,
# which is what `Batch.__init__` unpacks.
in_lang = Field(tokenize=tokenizer_in, lower=True, include_lengths=True)
# Target field: same options plus explicit <sos>/<eos> markers around every sentence.
out_lang = Field(tokenize=tokenizer_out, lower=True, init_token="<sos>", eos_token="<eos>", include_lengths=True)

In [6]:
# Sentence pairs with more than MAX_LEN tokens on either side are filtered out.
MAX_LEN = 25

# NOTE(review): the download cell places the corpus under `.data`, while this
# call reads from `../data` -- confirm which location is intended.
train_data, valid_data, test_data = IWSLT.splits(
    root="../data",
    exts=("." + CFG["IN_LANG"], "." + CFG["OUT_LANG"]),
    fields=(in_lang, out_lang),
    filter_pred=lambda x: not (
        len(vars(x)['src']) > MAX_LEN or len(vars(x)['trg']) > MAX_LEN
    ),
)

In [7]:
# Build both vocabularies from the training split only, with a minimum
# token frequency of 2 (`min_freq=2`).
in_lang.build_vocab(train_data, min_freq=2)
out_lang.build_vocab(train_data, min_freq=2)

In [8]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder.

    Args:
        input_size: number of rows of the embedding table (source vocabulary size).
        embedding_size: dimension of the token embeddings.
        hidden_size: hidden size of each GRU direction.
        num_layers: number of stacked GRU layers.
        p: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):

        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size, num_layers, bidirectional=True)

        # Projects the concatenated forward/backward final states
        # (2 * hidden_size) back down to hidden_size.
        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(p)

    def forward(self, x, inp_length=None):
        """Encode a batch of source sentences.

        Args:
            x: LongTensor of token ids, shape (seq_len, batch).
            inp_length: optional 1-D tensor with the true length of every
                sequence. When given, the input is packed so the GRU skips
                padding; lengths must be sorted in decreasing order
                (``pack_padded_sequence`` default).

        Returns:
            encoder_states: (seq_len, batch, 2 * hidden_size) per-step outputs.
            hidden: (num_layers, batch, hidden_size) projected final state.
        """
        embedding = self.dropout(self.embedding(x))

        if inp_length is None:
            encoder_states, hidden = self.rnn(embedding)
        else:
            # Packing lets the GRU skip the padded positions.
            packed = pack_padded_sequence(embedding, inp_length.cpu())
            encoder_states, hidden = self.rnn(packed)
            # total_length keeps the time dimension equal to the input's, so
            # the states always line up with a mask built from ``x``.
            encoder_states, _ = pad_packed_sequence(encoder_states, total_length=x.shape[0])

        # h_n is laid out as (num_layers * 2, batch, hidden): split the
        # direction axis and join forward/backward states per layer. For a
        # single layer this equals cat(hidden[0:1], hidden[1:2]).
        hidden = hidden.reshape(self.num_layers, 2, hidden.shape[1], self.hidden_size)
        hidden = self.fc_hidden(torch.cat((hidden[:, 0], hidden[:, 1]), dim=2))

        return encoder_states, hidden


In [9]:
class Decoder(nn.Module):
    """Single-step GRU decoder with additive attention over the encoder states.

    Args:
        input_size: number of rows of the embedding table (target vocabulary size).
        embedding_size: dimension of the token embeddings.
        hidden_size: GRU hidden size; encoder states are expected to have
            ``2 * hidden_size`` features.
        output_size: number of output classes (target vocabulary size).
        num_layers: number of stacked GRU layers.
        p: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):

        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        # The GRU consumes the attention context (2 * hidden) joined with the
        # embedding of the previous token.
        self.rnn = nn.GRU(hidden_size * 2 + embedding_size, hidden_size, num_layers)

        self.energy = nn.Linear(hidden_size, 1)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p)

        # fc_key projects the decoder state, fc_query the encoder states.
        self.fc_key = nn.Linear(hidden_size, hidden_size)
        self.fc_query = nn.Linear(hidden_size*2, hidden_size)

    def forward(self, x, encoder_states, hidden, source, inp_mask):
        """Run one decoding step.

        Args:
            x: LongTensor (batch,) with the previous target token ids.
            encoder_states: (seq_len, batch, 2 * hidden_size) encoder outputs.
            hidden: (num_layers, batch, hidden_size) current decoder state.
            source: unused; kept so existing callers keep working.
            inp_mask: optional bool tensor (seq_len, batch), True at source
                positions that must receive no attention, or None.

        Returns:
            predictions: (batch, output_size) unnormalised scores.
            hidden: (num_layers, batch, hidden_size) updated decoder state.
        """
        x = x.unsqueeze(0)
        embedding = self.dropout(self.embedding(x))

        # Attention is driven by the top decoder layer only; for a one-layer
        # decoder ``hidden[-1:]`` is ``hidden`` itself.
        key = self.fc_key(hidden[-1:])
        query = self.fc_query(encoder_states)

        # key (1, batch, hidden) broadcasts over the time axis of query.
        energy = self.energy(torch.tanh(key + query))

        if inp_mask is not None:
            # Out-of-place fill: masked positions get zero weight after softmax.
            energy = energy.squeeze(-1).masked_fill(inp_mask, -float('inf')).unsqueeze(-1)

        attention = F.softmax(energy, dim=0) #(seq_len, batch, 1)
                                             #(seq_len, batch, hidden*2)

        # (batch, 1, seq_len) x (batch, seq_len, hidden*2) -> (1, batch, hidden*2)
        context_vector = torch.bmm(attention.permute(1, 2, 0), encoder_states.permute(1, 0, 2)).permute(1,0,2)

        # Concatenate the context vector with the embedding of the previous word, and feed it to the GRU
        rnn_input = torch.cat((context_vector, embedding), dim=2)
        outputs, hidden = self.rnn(rnn_input, hidden)

        predictions = self.fc(outputs).squeeze(0)

        return predictions, hidden

In [10]:
class Seq2Seq(nn.Module):
    """Couples an encoder and an attention decoder for training-time decoding."""

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, inp_length, inp_mask):
        """Decode the whole target sequence, feeding the gold tokens as input.

        Args:
            source: source token ids, passed to the encoder and decoder.
            target: target token ids, indexed along dim 0 by time step;
                ``target[0]`` is the <sos> row.
            inp_length: source lengths forwarded to the encoder (may be None).
            inp_mask: source padding mask forwarded to the decoder (may be None).

        Returns:
            The per-step decoder outputs for steps 1..target_len-1,
            concatenated along dim 0 (time-major order).
        """
        target_len = target.shape[0]

        outputs = []

        encoder_states, hidden = self.encoder(source, inp_length)

        x = target[0] #<SOS>

        for t in range(1, target_len):

            output, hidden = self.decoder(x, encoder_states, hidden, source, inp_mask)
            outputs.append(output)

            # Full teacher forcing: the next input is always the gold token,
            # never the model's own prediction.
            x = target[t]

        outputs = torch.cat(outputs)

        return outputs

In [11]:
def rebatch(batch):
    """Wrap an iterator batch (with ``src`` and ``trg`` attributes) in a ``Batch``."""
    source, target = batch.src, batch.trg
    return Batch(source, target)

In [12]:
class Batch:
    """Holds one batch plus the derived tensors needed for training.

    Relies on the notebook globals ``device``, ``in_pad_idx`` and
    ``out_pad_idx`` being defined before the first instance is created.

    Attributes:
        src: source token ids, moved to ``device``.
        src_lengths: source lengths as delivered by the iterator.
        src_mask: bool tensor, True where ``src`` equals ``in_pad_idx``.
        trg: target token ids on ``device`` (None if no target was given).
        trg_lengths: target lengths (None if no target was given).
        trg_y: ``trg[1:]`` flattened to 1-D, i.e. the gold tokens to predict.
        ntokens: number of entries of ``trg_y`` that are not ``out_pad_idx``.
    """

    def __init__(self, src, trg):

        # Fields use include_lengths=True, so each side is (tensor, lengths).
        src, src_lengths = src

        # ``.to(device)`` also covers indexed devices such as ``cuda:0``,
        # which an equality test against torch.device('cuda') would miss.
        self.src = src.to(device)
        self.src_lengths = src_lengths
        self.src_mask = (self.src == in_pad_idx)

        self.trg = None
        self.trg_y = None
        self.trg_lengths = None
        self.ntokens = None

        if trg is not None:
            trg, trg_lengths = trg
            self.trg = trg.to(device)
            self.trg_lengths = trg_lengths
            # Drop the first row (<sos>) and flatten to line up with the
            # concatenated decoder outputs.
            self.trg_y = self.trg[1:].reshape(-1)
            self.ntokens = (self.trg_y != out_pad_idx).sum().item()
                

In [13]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training hyperparameters
num_epochs = 30
lr = 0.0003
batch_size = 64
d_model = 256  # shared width for both embeddings and the GRU hidden state (see below)

# Vocabulary sizes drive the embedding tables and the output projection.
input_size_encoder = len(in_lang.vocab)
input_size_decoder = len(out_lang.vocab)
output_size = len(out_lang.vocab)


encoder_embedding_size = d_model
decoder_embedding_size = d_model
hidden_size = d_model

num_layers = 1
dropout = 0.1

In [14]:
# Bucket examples by source length and sort inside each batch by that same key.
# NOTE(review): Encoder.forward packs with pack_padded_sequence defaults, which
# need lengths in decreasing order -- presumably guaranteed by sort_within_batch; confirm.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

In [15]:
# Instantiate the encoder and decoder with the hyperparameters defined above
# and place them on the selected device.
encoder_net = Encoder(input_size_encoder, encoder_embedding_size, 
                      hidden_size, num_layers, dropout).to(device)

decoder_net = Decoder(input_size_decoder, decoder_embedding_size, 
                      hidden_size, output_size, num_layers, dropout).to(device)


# Full sequence-to-sequence model used by the training and decoding code below.
model = Seq2Seq(encoder_net, decoder_net).to(device)

In [16]:
def get_n_params(model):
    """Return the total number of scalar parameters in ``model``.

    Args:
        model: any object exposing ``parameters()`` that yields tensors
            (e.g. an ``nn.Module``).

    Returns:
        int: sum of the element counts of all parameters.
    """
    # numel() is the product of a tensor's dimensions; using it also avoids
    # shadowing the module alias ``nn`` with a local counter.
    return sum(p.numel() for p in model.parameters())

# Display the total parameter count of the assembled model.
get_n_params(model)

23195573

In [17]:
# Padding indices: used by Batch to build the source mask / count target
# tokens, and by the loss to ignore padded target positions.
in_pad_idx = in_lang.vocab.stoi['<pad>']
out_pad_idx = out_lang.vocab.stoi['<pad>']


optimizer = optim.Adam(model.parameters(), lr=lr)
# Padded target positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=out_pad_idx)

In [18]:
def translate_sentence_bahdanau(model, sentence, max_length=50):
    """Greedily translate ``sentence`` with the attention model.

    Args:
        model: object exposing ``encoder`` and ``decoder`` (see ``Seq2Seq``).
        sentence: raw source-language string.
        max_length: maximum number of decoding steps.

    Returns:
        str: the predicted target tokens joined by spaces, without the
        <sos>/<eos> markers. An empty string is returned for input that
        yields no tokens.

    Note:
        Leaves ``model`` in eval mode.
    """
    model.eval()

    # Same tokeniser + lower-casing as the source Field uses at training time.
    tokens = [token.text.lower() for token in spacy_in_lang.tokenizer(sentence)]
    if not tokens:
        # Nothing to encode; the GRU cannot process a zero-length sequence.
        return ""

    text_to_indices = [in_lang.vocab.stoi[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_idx = out_lang.vocab.stoi[out_lang.eos_token]
    previous = out_lang.vocab.stoi[out_lang.init_token]
    preds = []

    with torch.no_grad():

        encoder_states, hidden = model.encoder(sentence_tensor)

        for _ in range(max_length):

            trg = torch.LongTensor([previous]).to(device)

            output, hidden = model.decoder(trg, encoder_states, hidden, sentence_tensor, None)
            previous = output.argmax(1).item()

            # Stop at <eos> without storing it, so a translation cut off by
            # ``max_length`` keeps its final real word.
            if previous == eos_idx:
                break

            preds.append(previous)

    return " ".join(out_lang.vocab.itos[i] for i in preds)

In [19]:
def beam(phrase, k):  #K: beam width
    """Translate ``phrase`` with beam search using the global ``model``.

    Args:
        phrase: raw source-language string.
        k: beam width (number of hypotheses kept after every step).

    Returns:
        list of ``(token_ids, score, hidden)`` tuples, best first, where
        ``token_ids`` starts with the <sos> index, ``score`` is the
        length-normalised log-probability of the hypothesis and ``hidden``
        is the decoder state after its last step. An empty list is returned
        for input that yields no tokens.

    Note:
        Leaves ``model`` in eval mode.
    """
    model.eval()

    sos_idx = out_lang.vocab.stoi["<sos>"]
    eos_idx = out_lang.vocab.stoi["<eos>"]
    max_steps = 51  # one initial expansion + 50 further steps, as before

    # Prepare sentence. The source Field defines no <sos>/<eos> markers, so
    # none are added here either.
    tokens = [token.text.lower() for token in spacy_in_lang.tokenizer(phrase)]
    if not tokens:
        return []

    text_to_indices = [in_lang.vocab.stoi[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    def normalised(log_prob, token_ids):
        # Average log-probability per generated token (<sos> excluded).
        return log_prob / max(len(token_ids) - 1, 1)

    with torch.no_grad():

        # Get encoder output
        encoder_states, hidden = model.encoder(sentence_tensor)

        # Each hypothesis: (token ids, summed log-probability, decoder state).
        # The raw sum is kept so normalisation is applied exactly once.
        hypotheses = [([sos_idx], 0.0, hidden)]

        for _ in range(max_steps):

            if all(ids[-1] == eos_idx for ids, _, _ in hypotheses):
                break  # every surviving hypothesis has ended

            candidates = []
            for ids, log_prob, state in hypotheses:

                if ids[-1] == eos_idx:  # finished: carry over unchanged
                    candidates.append((ids, log_prob, state))
                    continue

                trg = torch.LongTensor([ids[-1]]).to(device)

                # A single un-padded sentence needs no source mask.
                output, new_state = model.decoder(trg, encoder_states, state, sentence_tensor, None)
                log_probs = F.log_softmax(output, dim=1).squeeze(0)

                top_log_probs, top_ids = log_probs.topk(min(k, log_probs.shape[0]))

                for step_log_prob, token_id in zip(top_log_probs.tolist(), top_ids.tolist()):
                    candidates.append((ids + [token_id], log_prob + step_log_prob, new_state))

            hypotheses = sorted(
                candidates,
                key=lambda cand: normalised(cand[1], cand[0]),
                reverse=True,
            )[:k]

    return [(ids, normalised(log_prob, ids), state) for ids, log_prob, state in hypotheses]



def convert(x):
    """Turn a hypothesis ``(token_ids, score, ...)`` into ``(text, score)``.

    The token ids are looked up in the target vocabulary and joined with spaces.
    """
    token_ids, score = x[0], x[1]
    words = " ".join([out_lang.vocab.itos[i] for i in token_ids])
    return (words, score)

In [20]:
def run_epoch(iterator, log_every=100):
    """Train the global ``model`` for one pass over ``iterator``.

    Args:
        iterator: iterable of ``Batch`` objects.
        log_every: print the running statistics every this many steps.

    Returns:
        float: mean batch loss over the batches actually consumed
        (0 if the iterator was empty).
    """
    model.train()
    total_loss = 0
    n_batches = 0

    start = time.time()
    n_tokens = 0

    print("")

    for batch_idx, batch in enumerate(iterator):

        output = model(batch.src, batch.trg, batch.src_lengths, batch.src_mask)

        optimizer.zero_grad()

        loss = criterion(output, batch.trg_y)
        loss.backward()

        # Clip the global gradient norm to 1.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        total_loss += loss.item()
        n_tokens += batch.ntokens
        n_batches += 1

        if (batch_idx % log_every == 0) and (batch_idx > 0):
            tokens_per_sec = n_tokens/(time.time() - start)
            print(" Step %d - Loss %f - Tokens per Sec %f" % (batch_idx, loss.item(), tokens_per_sec))

    # Average over the batches seen here rather than over a global iterator,
    # so the result is right for any iterable passed in.
    return total_loss / max(n_batches, 1)

In [21]:
def run_validation(iterator):
    """Evaluate the global ``model`` on ``iterator`` without updating it.

    Args:
        iterator: iterable of ``Batch`` objects.

    Returns:
        float: mean batch loss over the batches actually consumed
        (0 if the iterator was empty).
    """
    model.eval()
    total_loss = 0
    n_batches = 0

    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for batch in iterator:

            output = model(batch.src, batch.trg, batch.src_lengths, batch.src_mask)

            loss = criterion(output, batch.trg_y)
            total_loss += loss.item()
            n_batches += 1

    # Average over the batches seen here rather than over a global iterator.
    return total_loss / max(n_batches, 1)

In [22]:
# Any real validation loss is smaller than +inf, so the first epoch always saves.
best_loss = float("inf")

# Splits from which one random example is translated after every epoch.
example_sources = (
    ("Train data", train_data),
    ("Validation", valid_data),
    ("Test data", test_data),
)

for epoch in range(num_epochs):

    print(f'Epoch [{epoch} / {num_epochs}]')

    loss = run_epoch((rebatch(b) for b in train_iterator))
    validation_loss = run_validation((rebatch(b) for b in valid_iterator))

    # Qualitative check: greedy-translate one random sentence per split.
    # (`beam(sentence, 3)` + `convert` can be used for beam-search output instead.)
    for number, (label, dataset) in enumerate(example_sources, start=1):
        example = dataset[np.random.randint(0, len(dataset))]
        sentence, expected = " ".join(example.src), " ".join(example.trg)
        translation = translate_sentence_bahdanau(model, sentence, max_length=50)
        print(f"\nExample #{number} (from {label}): \nTranslation: { translation }\nExpected: { expected }")
    print()

    print(f"\n Train loss {loss} | Validation loss {validation_loss} \n\n\n")

    # Keep only the checkpoint with the lowest validation loss so far.
    if validation_loss < best_loss:
        torch.save(model.state_dict(), "best_model")
        best_loss = validation_loss

Epoch [0 / 100]

 Step 100 - Loss 5.982434 - Tokens per Sec 13758.775884
 Step 200 - Loss 3.893672 - Tokens per Sec 14209.725900
 Step 300 - Loss 4.406250 - Tokens per Sec 14478.539284
 Step 400 - Loss 5.292997 - Tokens per Sec 14530.438233
 Step 500 - Loss 4.351924 - Tokens per Sec 14466.739310
 Step 600 - Loss 5.011229 - Tokens per Sec 14463.826019
 Step 700 - Loss 4.707049 - Tokens per Sec 14511.547157
 Step 800 - Loss 4.504529 - Tokens per Sec 14592.792488
 Step 900 - Loss 5.019985 - Tokens per Sec 14607.787666
 Step 1000 - Loss 4.487456 - Tokens per Sec 14613.575630
 Step 1100 - Loss 4.043857 - Tokens per Sec 14630.879244
 Step 1200 - Loss 3.763046 - Tokens per Sec 14673.933677
 Step 1300 - Loss 3.820462 - Tokens per Sec 14662.261897
 Step 1400 - Loss 3.974252 - Tokens per Sec 14666.482315
 Step 1500 - Loss 4.023894 - Tokens per Sec 14662.590000
 Step 1600 - Loss 4.462840 - Tokens per Sec 14668.570615
 Step 1700 - Loss 3.377359 - Tokens per Sec 14685.289045
 Step 1800 - Loss 4.651