In [2]:
# utils
import torch
from util import count_parameters
import random

# data
from torchtext.data import Field, BucketIterator
from torchtext.datasets import Multi30k

# model
import torch.nn as nn
import torch.nn.functional as F

# training
import tqdm
import torch.optim as optim


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
device

device(type='cuda')

### Prepare the dataset

In [5]:
batch_size = 64
max_len = 128

In [6]:
# create the fields
SRC = Field(
    lower=True,
    tokenize="spacy",
    tokenizer_language="de",
    batch_first=True
)

TRG = Field(
    lower=True,
    tokenize="spacy",
    tokenizer_language="en",
    batch_first=True
)

In [7]:
# download the dataset
train, val, test = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(SRC, TRG)
)

downloading training.tar.gz
training.tar.gz: 100%|██████████| 1.21M/1.21M [00:08<00:00, 135kB/s]
downloading validation.tar.gz
validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 103kB/s]
downloading mmt_task1_test2016.tar.gz
mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 101kB/s]


In [8]:
# build the vocab
SRC.build_vocab(train)
TRG.build_vocab(train)

In [10]:
# data loaders
train_loader, val_loader, test_loader = BucketIterator.splits(
    datasets=(train, val, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    device=device
)

In [11]:
for batch in train_loader:
    print(batch.src.shape, batch.trg.shape)
    break

torch.Size([64, 27]) torch.Size([64, 25])


## Model
- Encoder hidden_dim and decoder hidden_dim will be same to avoid discrepancy

#### Encoder Model

In [12]:
x = batch.src
y = batch.trg

In [13]:
class Encoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers=1):
        
        super(Encoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        
        # embedding layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        
        # gru layer
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True
        )
        # input_features = 2*hidden_dim, output_features
        self.fc = nn.Linear(in_features=2*hidden_size, out_features=hidden_size)
        self.dropout = nn.Dropout(p=0.15)
        
    
    def forward(self, x):
        
        # x.shape -> [batch, seq_len]
        
        # compute the embedding
        embedded = self.embedding(x)
        # embedded.shape -> [batch, seq_len, embedding_dim]
        
        # pass the embedding to gru layer
        outputs, hidden = self.gru(embedded)   
        # outputs.shape -> [batch, seq_len, 2*hidden_size] and hidden.shape -> [2, batch, hidden_size]
        
        # hidden contains both forward pass hidden state as well backward pass hidden state concat the both
        concated = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        
        # new hidden state 
        hidden = torch.tanh(self.fc(concated))
        
        return outputs, hidden

#### Self-attention Model

In [14]:
class Attention(nn.Module):
    
    def __init__(self, hidden_size):
        
        """It is advised to use same encoder_hidden_dim and decoder_hidden_dim"""
        super(Attention, self).__init__()
        
        # attention layer and params
        self.attn = nn.Linear(in_features=(3*hidden_size), out_features=hidden_size)
        self.v = nn.Linear(in_features=hidden_size, out_features=1, bias=False)
        
    
    def forward(self, hidden, encoder_outputs):
        
        # hidden.shape -> [batch, hidden_dim]
        # encoder_outptus.shape -> [batch, seq_len, 2*hidden_dim]
        seq_len = encoder_outputs.shape[1]
        
        hidden = hidden.unsqueeze(1).repeat(1, seq_len, 1)  
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        #energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze()
        #attention= [batch size, src len]
        
        return F.softmax(attention, dim=1)

#### Decoder Model

In [24]:
class Decoder(nn.Module):
    
    def __init__(self, attention, vocab_size, embedding_dim, hidden_size, num_layers=1):
        super(Decoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.attention = attention
        
        
        # embedding layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # reccurent net
        self.gru = nn.GRU(
            input_size=(2*hidden_size)+embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        
        # fully connected layer
        self.fc = nn.Linear(in_features=3*hidden_size+embedding_dim, out_features=vocab_size)
    
    def forward(self, input, hidden, encoder_outputs):
        
        """
            input.size() -> [batch, 1] - At time t only one token of each sample will be decoded
            hidden.size() ->[batch, hidden_size]
            encoder_outputs -> [batch, seq_len, 2*hidden_size]
        """
        
        embedded = self.embedding(input.to(device))
        # embedded[batch, seq_len, embedding_dim] embedded will have only two dim if seq_len is 1 (ie: at time t)
             
        attn = self.attention(hidden, encoder_outputs)
        attn = attn.unsqueeze(1)
    
        # calculate the weighted sum
        weighted = torch.bmm(attn, encoder_outputs)
        
        embedded = embedded.unsqueeze(1)
        
#         print(f'weighted shape {weighted.shape} embedded shape {embedded.shape}')
        rnn_input = torch.cat((embedded, weighted), dim=2)
        
        output, hidden = self.gru(rnn_input, hidden.unsqueeze())
        
        # prepare the input for fully connected layer and make predicitons
        fc_input = torch.cat((weighted, output, embedded), dim=2)
        prediction = self.fc(fc_input)
        
        return prediction, hidden.squeeze()
        


#### Seq2Seq

In [16]:
class Seq2Seq(nn.Module):
    
    def __init__(self, encoder, decoder, teacher_forcing_ratio=0.25):
        super(Seq2Seq, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.teacher_forcing_ratio = teacher_forcing_ratio
    
    def forward(self, src, trg):
        """
            trg.size() -> [batch, seq_len]
            src.size() -> [batch, seq_len]
        """
        encoder_outputs, hidden = self.encoder(src)
        
        batch, seq_len, vocab_size =  trg.shape[0], trg.shape[1], self.decoder.vocab_size
        
        outputs = torch.zeros((batch, seq_len, vocab_size), device=device)
        
        # take the first token of each samples in the batch and calculate the attention for the same
        input = trg[:, 0]
        
        
        for t in range(1, seq_len):
            output, hidden = decoder(input, hidden, encoder_outputs)
            outputs[:, t] = output.squeeze()

            # is teacher force
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < self.teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.squeeze().argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[:, t] if teacher_force else top1
        
        return outputs
            

## Training

In [17]:
def eval(model, data, criterion):
    loss, ppl = [], []
    with torch.no_grad():
        for batch in data:
            outputs = model(batch.src, batch.trg)
            batch_size, seq_len = outputs.size(0), outputs.size(1)
            l = criterion(outputs.view(batch_size*seq_len, -1).contiguous().to(device), batch.trg.view(-1))
            p = torch.exp(l)
            loss.append(l.item())
            ppl.append(p.item())
    return sum(loss)/len(loss), sum(ppl)/len(ppl)

def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            

In [25]:
src_vocab = len(SRC.vocab)
trg_vocab = len(TRG.vocab)
hidden_size = 512 # same for encoder and decoder
embedding_dim =  256 # same for encoder and decoder

In [26]:
encoder =  Encoder(vocab_size=src_vocab, embedding_dim=embedding_dim, hidden_size=hidden_size).to(device)
attention = Attention(hidden_size=hidden_size).to(device)
decoder = Decoder(attention, vocab_size=trg_vocab, embedding_dim=embedding_dim, hidden_size=hidden_size).to(device)
model = Seq2Seq(encoder=encoder, decoder=decoder).to(device)

In [27]:
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(18658, 256)
    (gru): GRU(256, 512, batch_first=True, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.15, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(9797, 256)
    (gru): GRU(1280, 512, batch_first=True)
    (fc): Linear(in_features=1792, out_features=9797, bias=True)
  )
)

In [63]:
count_parameters(model)

+----------------------------------+------------+
|             Modules              | Parameters |
+----------------------------------+------------+
|     encoder.embedding.weight     |  4776960   |
|     encoder.gru.weight_ih_l0     |   393216   |
|     encoder.gru.weight_hh_l0     |   786432   |
|      encoder.gru.bias_ih_l0      |    1536    |
|      encoder.gru.bias_hh_l0      |    1536    |
| encoder.gru.weight_ih_l0_reverse |   393216   |
| encoder.gru.weight_hh_l0_reverse |   786432   |
|  encoder.gru.bias_ih_l0_reverse  |    1536    |
|  encoder.gru.bias_hh_l0_reverse  |    1536    |
|        encoder.fc.weight         |   524288   |
|         encoder.fc.bias          |    512     |
|  decoder.attention.attn.weight   |   786432   |
|   decoder.attention.attn.bias    |    512     |
|    decoder.attention.v.weight    |    512     |
|     decoder.embedding.weight     |  2508544   |
|     decoder.gru.weight_ih_l0     |  1966080   |
|     decoder.gru.weight_hh_l0     |   786432   |


31288391

In [28]:
epochs = 10
lr = 1e-3
PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
total_steps = len(train_loader)

In [29]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX).to(device)
optimizer = optim.Adam(params=model.parameters(), lr=lr)

In [30]:
steps = 0

# epoch progress bar
epoch_progress = tqdm.tqdm(total=epochs, desc="Epoch", position=0)

for epoch in range(epochs):
    
    # step progress bar
    step_progress = tqdm.tqdm(total=total_steps, desc="Steps", position=0)
    
    epoch_loss = []
    epoch_ppl = []
    
    for batch in train_loader:
        
        outputs = model(batch.src, batch.trg)
        
        batch_size, seq_len = outputs.shape[0], outputs.shape[1]
        
        outputs = outputs.view((batch_size*seq_len, -1))
        labels = batch.trg.view(-1)
        
        loss = criterion(outputs, labels)
        ppl = torch.exp(loss)
        
        # backprograpage the loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        epoch_loss.append(loss.item())
        epoch_ppl.append(ppl.item())
        
        if steps % 200 == 0:
            print(f'Steps {steps}/{total_steps} | Train_loss {loss.item():.4f} | Train_ppl {ppl.item():.4f}')
        steps += 1
        step_progress.update(1)
    
    avg_loss = sum(epoch_loss)/len(epoch_loss)
    avg_ppl = sum(epoch_ppl)/len(epoch_ppl)
    
    val_loss, val_ppl = eval(model, val_loader, criterion)
    print(f'Epoch {epoch}/{epochs} | Train_loss {avg_loss:.4f} | Train_ppl {avg_ppl:.4f} | Val_loss {val_loss:.4f} | Val_ppl {val_ppl:.4f}')
    epoch_progress.update(1)
    

Steps:   0%|          | 0/454 [00:00<?, ?it/s]Epoch 0/10 | Train_loss 5.6659 | Train_ppl 412.2913 | Val_loss 5.1656 | Val_ppl 177.7235
Steps:   0%|          | 0/454 [00:00<?, ?it/s]Epoch 1/10 | Train_loss 4.6013 | Train_ppl 103.7042 | Val_loss 4.2714 | Val_ppl 74.1122
Steps:   0%|          | 0/454 [00:00<?, ?it/s]Epoch 2/10 | Train_loss 3.8116 | Train_ppl 46.0777 | Val_loss 3.8905 | Val_ppl 50.7565
Steps:   0%|          | 0/454 [00:00<?, ?it/s]Epoch 3/10 | Train_loss 3.2173 | Train_ppl 25.3009 | Val_loss 3.7378 | Val_ppl 43.9298
Steps:   0%|          | 0/454 [00:00<?, ?it/s]Epoch 4/10 | Train_loss 2.7660 | Train_ppl 16.0822 | Val_loss 3.7351 | Val_ppl 43.7428
Steps:   0%|          | 0/454 [00:00<?, ?it/s]Epoch 5/10 | Train_loss 2.4687 | Train_ppl 11.9568 | Val_loss 3.7069 | Val_ppl 43.8507
Steps:   0%|          | 0/454 [00:00<?, ?it/s]Epoch 6/10 | Train_loss 2.2636 | Train_ppl 9.7247 | Val_loss 3.8148 | Val_ppl 47.2787
Steps:   0%|          | 0/454 [00:00<?, ?it/s]Epoch 7/10 | Train_lo

#### Save, Load and Test

In [34]:
torch.save(model, "model.pth")

In [35]:
trained = torch.load("model.pth")

In [38]:
loss, ppl = eval(trained, test_loader, criterion)
print(f'Test_loss {loss:.4f} | Test_PPL {ppl}')

Test_loss 4.0117 | Test_PPL 59.965221643447876
