In [27]:
# utils
from utils import eval, count_parameters
import torch
import random

# data
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

# model
import torch.nn as nn
import torch.nn.functional as F


# training
import torch.optim as optim
import tqdm

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cuda')

## Data Preparation

In [4]:
batch_size = 64
max_len = 128

In [5]:
# create the fields
# source language is German and target language is 
SRC = Field(
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    tokenize="spacy",
    tokenizer_language="de",
    batch_first=True,
    include_lengths=True
)

TRG = Field(
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    tokenize="spacy",
    tokenizer_language="en",
    batch_first=True,
#     include_lengths=True
)

In [6]:
# download the dataset
train, val, test = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(SRC, TRG)
)

In [7]:
# build the vocab
SRC.build_vocab(train)
TRG.build_vocab(train)

In [8]:
# data loaders
train_loader, val_loader, test_loader = BucketIterator.splits(
    datasets=(train, val, test),
    batch_size=batch_size,
    sort_within_batch = True,
    sort_key = lambda x : len(x.src),
    device=device,
)

In [9]:
batch = next(iter(train_loader))

In [10]:
src, src_len = batch.src
trg =  batch.trg

## Model

- Hidden Size of Encoder and Decoder recurrent net will be same

#### Encoder Model

In [13]:
class Encoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        
        # embedding layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        
        # reccurent layer
        self.gru  = nn.GRU(
            input_size = embedding_dim, 
            hidden_size=hidden_size,
            num_layers = 1,
            batch_first = True,
            bidirectional=True
        )
        
        # forward and backward output will be stacked
        self.fc = nn.Linear(in_features=2*hidden_size, out_features=hidden_size)
    
    def forward(self, src, src_len):
        
        embedded =  self.embedding(src)
        
        rnn_input = nn.utils.rnn.pack_padded_sequence(input=embedded, lengths=src_len, batch_first=True)
        packed_outputs, hidden = self.gru(rnn_input)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs, batch_first=True)
        
        # hidden contains both forward pass hidden state as well backward pass hidden state concat the both
        concated = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        
        # new hidden state 
        hidden = torch.tanh(self.fc(concated))
        
        return outputs, hidden
        
               

#### Attention Model

In [14]:
class Attention(nn.Module):
    
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        
        # attention layers
        self.attention = nn.Linear(in_features=3*hidden_size, out_features=hidden_size)
        self.v = nn.Linear(in_features=hidden_size, out_features=1, bias=False)
        
    
    def forward(self, hidden, encoder_outputs, mask=None):
        
        """
            mask is basically pad token
            hidden.shape -> [batch, hidden_size]
            encoder_outputs.shape -> [batch, seq_len, 2*hidden_size]
        """
        
        src_len = encoder_outputs.shape[1]
        
        # repeat the hidden state upto T(src_len) times but first add the additional axis
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        # concate the hidden state and encoder_outputs and pass to the attention module to calculate the energy
        energy = torch.tanh(self.attention(torch.cat((hidden, encoder_outputs), dim=2)))
        # energy.shape -> [batch, seq_len, hidden_size]
        
        attention = self.v(energy).squeeze()
        # attention.shape -> [batch, seq_len]
        
#         attention = attention.masked_fill(mask==0, -1e10)
        
        return F.softmax(attention, dim=1)

#### Decoder Model

In [15]:
class Decoder(nn.Module):
    
    def __init__(self, attention, vocab_size, embedding_dim, hidden_size, num_layers=1):
        super(Decoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.attention = attention
        
        
        # embedding layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # reccurent net
        self.gru = nn.GRU(
            input_size=(2*hidden_size)+embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        
        # fully connected layer
        self.fc = nn.Linear(in_features=3*hidden_size+embedding_dim, out_features=vocab_size)
    
    def forward(self, input, hidden, encoder_outputs):
        
        """
            input.size() -> [batch, 1] - At time t only one token of each sample will be decoded
            hidden.size() ->[batch, hidden_size]
            encoder_outputs -> [batch, seq_len, 2*hidden_size]
        """
        
        embedded = self.embedding(input.to(device))
        # embedded[batch, seq_len, embedding_dim] embedded will have only two dim if seq_len is 1 (ie: at time t)
             
        attn = self.attention(hidden, encoder_outputs)
        attn = attn.unsqueeze(1)
    
        # calculate the weighted sum
        weighted = torch.bmm(attn, encoder_outputs)
        
        embedded = embedded.unsqueeze(1)
        
#         print(f'weighted shape {weighted.shape} embedded shape {embedded.shape}')
        rnn_input = torch.cat((embedded, weighted), dim=2)
        
        output, hidden = self.gru(rnn_input, hidden.unsqueeze(0))
        
        # prepare the input for fully connected layer and make predicitons
        fc_input = torch.cat((weighted, output, embedded), dim=2)
        prediction = self.fc(fc_input)
        
        return prediction, hidden.squeeze()

#### Seq2Seq Model

In [16]:
class Seq2Seq(nn.Module):
    
    def __init__(self, encoder, decoder, teacher_forcing_ratio=0.25):
        super(Seq2Seq, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.teacher_forcing_ratio = teacher_forcing_ratio
    
    def forward(self, batch):
        
        src, src_len = batch.src
        trg = batch.trg
        
        """
            trg.size() -> [batch, seq_len]
            src.size() -> [batch, seq_len]
        """
        
        encoder_outputs, hidden = self.encoder(src, src_len)
        
        batch, seq_len, vocab_size =  trg.shape[0], trg.shape[1], self.decoder.vocab_size
        
        outputs = torch.zeros((batch, seq_len, vocab_size), device=device)
        
        # take the first token of each samples in the batch and calculate the attention for the same
        input = trg[:, 0]
        
        
        for t in range(1, seq_len):
            output, hidden = decoder(input, hidden, encoder_outputs)
            outputs[:, t] = output.squeeze()

            # is teacher force
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < self.teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.squeeze().argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[:, t] if teacher_force else top1
        
        return outputs

## Training

In [57]:
def eval(model, data, criterion):
    loss, ppl = [], []
    with torch.no_grad():
        for batch in data:
            outputs = model(batch)
            batch_size, seq_len = outputs.size(0), outputs.size(1)
            l = criterion(outputs.view(batch_size*seq_len, -1).contiguous().to(device), batch.trg.view(-1))
            p = torch.exp(l)
            loss.append(l.item())
            ppl.append(p.item())
    return sum(loss)/len(loss), sum(ppl)/len(ppl)

def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

In [58]:
src_vocab = len(SRC.vocab)
trg_vocab = len(TRG.vocab)
hidden_size = 512 # same for encoder and decoder
embedding_dim =  256 # same for encoder and decoder

In [59]:
encoder =  Encoder(vocab_size=src_vocab, embedding_dim=embedding_dim, hidden_size=hidden_size).to(device)
attention = Attention(hidden_size=hidden_size).to(device)
decoder = Decoder(attention, vocab_size=trg_vocab, embedding_dim=embedding_dim, hidden_size=hidden_size).to(device)
model = Seq2Seq(encoder=encoder, decoder=decoder).to(device)

In [60]:
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(18660, 256)
    (gru): GRU(256, 512, batch_first=True, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attention): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(9799, 256)
    (gru): GRU(1280, 512, batch_first=True)
    (fc): Linear(in_features=1792, out_features=9799, bias=True)
  )
)

In [61]:
count_parameters(model)

+------------------------------------+------------+
|              Modules               | Parameters |
+------------------------------------+------------+
|      encoder.embedding.weight      |  4776960   |
|      encoder.gru.weight_ih_l0      |   393216   |
|      encoder.gru.weight_hh_l0      |   786432   |
|       encoder.gru.bias_ih_l0       |    1536    |
|       encoder.gru.bias_hh_l0       |    1536    |
|  encoder.gru.weight_ih_l0_reverse  |   393216   |
|  encoder.gru.weight_hh_l0_reverse  |   786432   |
|   encoder.gru.bias_ih_l0_reverse   |    1536    |
|   encoder.gru.bias_hh_l0_reverse   |    1536    |
|         encoder.fc.weight          |   524288   |
|          encoder.fc.bias           |    512     |
| decoder.attention.attention.weight |   786432   |
|  decoder.attention.attention.bias  |    512     |
|     decoder.attention.v.weight     |    512     |
|      decoder.embedding.weight      |  2508544   |
|      decoder.gru.weight_ih_l0      |  1966080   |
|      decod

31288391

In [62]:
epochs = 5
lr = 1e-3
PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
total_steps = len(train_loader)*epochs

In [63]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX).to(device)
optimizer = optim.Adam(params=model.parameters(), lr=lr)

In [64]:
steps = 0

# epoch progress bar
epoch_progress = tqdm.tqdm(total=epochs, desc="Epoch", position=0)

for epoch in range(epochs):
    
    # step progress bar
    step_progress = tqdm.tqdm(total=total_steps, desc="Steps", position=0)
    
    epoch_loss = []
    epoch_ppl = []
    
    for batch in train_loader:
        
        outputs = model(batch)
        
        batch_size, seq_len = outputs.shape[0], outputs.shape[1]
        
        outputs = outputs.view((batch_size*seq_len, -1))
        labels = batch.trg.view(-1)
        
        loss = criterion(outputs, labels)
        ppl = torch.exp(loss)
        
        # backprograpage the loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        epoch_loss.append(loss.item())
        epoch_ppl.append(ppl.item())
        
#         if steps % 200 == 0:
#             print(f'Steps {steps}/{total_steps} | Train_loss {loss.item():.4f} | Train_ppl {ppl.item():.4f}')
        steps += 1
        step_progress.update(1)
    
    avg_loss = sum(epoch_loss)/len(epoch_loss)
    avg_ppl = sum(epoch_ppl)/len(epoch_ppl)
    
    val_loss, val_ppl = eval(model, val_loader, criterion)
    print(f'Epoch {epoch}/{epochs} | Steps {steps}/{total_steps} | Train_loss {avg_loss:.4f} | Train_ppl {avg_ppl:.4f} | Val_loss {val_loss:.4f} | Val_ppl {val_ppl:.4f}')
    epoch_progress.update(1)

Steps:   0%|          | 1/2270 [00:00<04:55,  7.68it/s]s]

Epoch 0/5 | Steps 454/2270 | Train_loss 5.3352 | Train_ppl 332.4269 | Val_loss 4.7795 | Val_ppl 122.9651


Steps:   0%|          | 1/2270 [00:00<06:16,  6.02it/s]s]

Epoch 1/5 | Steps 908/2270 | Train_loss 4.2993 | Train_ppl 79.2242 | Val_loss 3.9038 | Val_ppl 52.2731


Steps:   0%|          | 1/2270 [00:00<05:18,  7.12it/s]s]

Epoch 2/5 | Steps 1362/2270 | Train_loss 3.5430 | Train_ppl 36.4696 | Val_loss 3.5598 | Val_ppl 36.9900


Steps:   0%|          | 1/2270 [00:00<05:26,  6.95it/s]s]

Epoch 3/5 | Steps 1816/2270 | Train_loss 3.0317 | Train_ppl 21.8867 | Val_loss 3.4040 | Val_ppl 31.6249


Epoch: 100%|██████████| 5/5 [05:40<00:00, 68.50s/it]it/s]

Epoch 4/5 | Steps 2270/2270 | Train_loss 2.6168 | Train_ppl 14.4381 | Val_loss 3.3914 | Val_ppl 31.1631


In [66]:
loss, ppl = eval(model, test_loader, criterion)
print(f'Test_loss {loss:.4f} | Test_PPL {ppl}')

Test_loss 3.3686 | Test_PPL 30.097521543502808
