In [1]:
# utils
import torch
from util import count_parameters
import random

# data
from torchtext.data import Field, BucketIterator
from torchtext.datasets import Multi30k

# model
import torch.nn as nn
import torch.nn.functional as F

# training
import tqdm
import torch.optim as optim


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cuda')

### Prepare the dataset

In [4]:
batch_size = 64
max_len = 128

In [5]:
# create the fields
SRC = Field(
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    tokenize="spacy",
    tokenizer_language="de",
    batch_first=True
)

TRG = Field(
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    tokenize="spacy",
    tokenizer_language="en",
    batch_first=True
)

In [6]:
# download the dataset
train, val, test = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(SRC, TRG)
)

In [7]:
# build the vocab
SRC.build_vocab(train)
TRG.build_vocab(train)

In [8]:
# data loaders
train_loader, val_loader, test_loader = BucketIterator.splits(
    datasets=(train, val, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    device=device
)

In [9]:
for batch in train_loader:
    print(batch.src.shape, batch.trg.shape)
    break

torch.Size([64, 26]) torch.Size([64, 26])


## Model
- Encoder hidden_dim and decoder hidden_dim will be same to avoid discrepancy

#### Encoder Model

In [10]:
x = batch.src
y = batch.trg

In [11]:
class Encoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers=1):
        
        super(Encoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        
        # embedding layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        
        # gru layer
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True
        )
        # input_features = 2*hidden_dim, output_features
        self.fc = nn.Linear(in_features=2*hidden_size, out_features=hidden_size)
        self.dropout = nn.Dropout(p=0.15)
        
    
    def forward(self, x):
        
        # x.shape -> [batch, seq_len]
        
        # compute the embedding
        embedded = self.embedding(x)
        # embedded.shape -> [batch, seq_len, embedding_dim]
        
        # pass the embedding to gru layer
        outputs, hidden = self.gru(embedded)   
        # outputs.shape -> [batch, seq_len, 2*hidden_size] and hidden.shape -> [2, batch, hidden_size]
        
        # hidden contains both forward pass hidden state as well backward pass hidden state concat the both
        concated = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        
        # new hidden state 
        hidden = torch.tanh(self.fc(concated))
        
        return outputs, hidden

#### Self-attention Model

In [12]:
class Attention(nn.Module):
    
    def __init__(self, hidden_size):
        
        """It is advised to use same encoder_hidden_dim and decoder_hidden_dim"""
        super(Attention, self).__init__()
        
        # attention layer and params
        self.attn = nn.Linear(in_features=(3*hidden_size), out_features=hidden_size)
        self.v = nn.Linear(in_features=hidden_size, out_features=1, bias=False)
        
    
    def forward(self, hidden, encoder_outputs):
        
        # hidden.shape -> [batch, hidden_dim]
        # encoder_outptus.shape -> [batch, seq_len, 2*hidden_dim]
        seq_len = encoder_outputs.shape[1]
        
        hidden = hidden.unsqueeze(1).repeat(1, seq_len, 1)  
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        #energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze()
        #attention= [batch size, src len]
        
        return F.softmax(attention, dim=1)

#### Decoder Model

In [13]:
class Decoder(nn.Module):
    
    def __init__(self, attention, vocab_size, embedding_dim, hidden_size, num_layers=1):
        super(Decoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.attention = attention
        
        
        # embedding layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # reccurent net
        self.gru = nn.GRU(
            input_size=(2*hidden_size)+embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        
        # fully connected layer
        self.fc = nn.Linear(in_features=3*hidden_size+embedding_dim, out_features=vocab_size)
    
    def forward(self, input, hidden, encoder_outputs):
        
        """
            input.size() -> [batch, 1] - At time t only one token of each sample will be decoded
            hidden.size() ->[batch, hidden_size]
            encoder_outputs -> [batch, seq_len, 2*hidden_size]
        """
        
        embedded = self.embedding(input.to(device))
        # embedded[batch, seq_len, embedding_dim] embedded will have only two dim if seq_len is 1 (ie: at time t)
             
        attn = self.attention(hidden, encoder_outputs)
        attn = attn.unsqueeze(1)
    
        # calculate the weighted sum
        weighted = torch.bmm(attn, encoder_outputs)
        
        embedded = embedded.unsqueeze(1)
        
#         print(f'weighted shape {weighted.shape} embedded shape {embedded.shape}')
        rnn_input = torch.cat((embedded, weighted), dim=2)
        
        output, hidden = self.gru(rnn_input)
        
        # prepare the input for fully connected layer and make predicitons
        fc_input = torch.cat((weighted, output, embedded), dim=2)
        prediction = self.fc(fc_input)
        
        return prediction, hidden.squeeze()
        


#### Seq2Seq

In [50]:
class Seq2Seq(nn.Module):
    
    def __init__(self, encoder, decoder, teacher_forcing_ratio=0.25):
        super(Seq2Seq, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.teacher_forcing_ratio = teacher_forcing_ratio
    
    def forward(self, src, trg):
        """
            trg.size() -> [batch, seq_len]
            src.size() -> [batch, seq_len]
        """
        encoder_outputs, hidden = self.encoder(src)
        
        batch, seq_len, vocab_size =  trg.shape[0], trg.shape[1], self.decoder.vocab_size
        
        outputs = torch.zeros((batch, seq_len, vocab_size), device=device)
        
        # take the first token of each samples in the batch and calculate the attention for the same
        input = trg[:, 0]
        
        
        for t in range(1, seq_len):
            output, hidden = decoder(input, hidden, encoder_outputs)
            outputs[:, t] = output.squeeze()

            # is teacher force
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < self.teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.squeeze().argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[:, t] if teacher_force else top1
        
        return outputs
            

## Training

In [59]:
def eval(model, data, criterion):
    loss, ppl = [], []
    with torch.no_grad():
        for batch in data:
            outputs = model(batch.src, batch.trg)
            batch_size, seq_len = outputs.size(0), outputs.size(1)
            l = criterion(outputs.view(batch_size*seq_len, -1).contiguous().to(device), batch.trg.view(-1))
            p = torch.exp(l)
            loss.append(l.item())
            ppl.append(p.item())
    return sum(loss)/len(loss), sum(ppl)/len(ppl)

def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            

In [60]:
src_vocab = len(SRC.vocab)
trg_vocab = len(TRG.vocab)
hidden_size = 512 # same for encoder and decoder
embedding_dim =  256 # same for encoder and decoder

In [61]:
encoder =  Encoder(vocab_size=src_vocab, embedding_dim=embedding_dim, hidden_size=hidden_size).to(device)
attention = Attention(hidden_size=hidden_size).to(device)
decoder = Decoder(attention, vocab_size=trg_vocab, embedding_dim=embedding_dim, hidden_size=hidden_size).to(device)
model = Seq2Seq(encoder=encoder, decoder=decoder).to(device)

In [62]:
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(18660, 256)
    (gru): GRU(256, 512, batch_first=True, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.15, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(9799, 256)
    (gru): GRU(1280, 512, batch_first=True)
    (fc): Linear(in_features=1792, out_features=9799, bias=True)
  )
)

In [63]:
count_parameters(model)

+----------------------------------+------------+
|             Modules              | Parameters |
+----------------------------------+------------+
|     encoder.embedding.weight     |  4776960   |
|     encoder.gru.weight_ih_l0     |   393216   |
|     encoder.gru.weight_hh_l0     |   786432   |
|      encoder.gru.bias_ih_l0      |    1536    |
|      encoder.gru.bias_hh_l0      |    1536    |
| encoder.gru.weight_ih_l0_reverse |   393216   |
| encoder.gru.weight_hh_l0_reverse |   786432   |
|  encoder.gru.bias_ih_l0_reverse  |    1536    |
|  encoder.gru.bias_hh_l0_reverse  |    1536    |
|        encoder.fc.weight         |   524288   |
|         encoder.fc.bias          |    512     |
|  decoder.attention.attn.weight   |   786432   |
|   decoder.attention.attn.bias    |    512     |
|    decoder.attention.v.weight    |    512     |
|     decoder.embedding.weight     |  2508544   |
|     decoder.gru.weight_ih_l0     |  1966080   |
|     decoder.gru.weight_hh_l0     |   786432   |


31288391

In [64]:
epochs = 10
lr = 1e-3
PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
total_steps = len(train_loader)

In [65]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX).to(device)
optimizer = optim.Adam(params=model.parameters(), lr=lr)

In [66]:
steps = 0

# epoch progress bar
epoch_progress = tqdm.tqdm(total=epochs, desc="Epoch", position=0)

for epoch in range(epochs):
    
    # step progress bar
    step_progress = tqdm.tqdm(total=total_steps, desc="Steps", position=0)
    
    epoch_loss = []
    epoch_ppl = []
    
    for batch in train_loader:
        
        outputs = model(batch.src, batch.trg)
        
        batch_size, seq_len = outputs.shape[0], outputs.shape[1]
        
        outputs = outputs.view((batch_size*seq_len, -1))
        labels = batch.trg.view(-1)
        
        loss = criterion(outputs, labels)
        ppl = torch.exp(loss)
        
        # backprograpage the loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        epoch_loss.append(loss.item())
        epoch_ppl.append(ppl.item())
        
        if steps % 200 == 0:
            print(f'Steps {steps}/{total_steps} | Train_loss {loss.item():.4f} | Train_ppl {ppl.item():.4f}')
        steps += 1
        step_progress.update(1)
    
    avg_loss = sum(epoch_loss)/len(epoch_loss)
    avg_ppl = sum(epoch_ppl)/len(epoch_ppl)
    
    val_loss, val_ppl = eval(model, val_loader, criterion)
    print(f'Epoch {epoch}/{epochs} | Train_loss {avg_loss:.4f} | Train_ppl {avg_ppl:.4f} | Val_loss {val_loss:.4f} | Val_ppl {val_ppl:.4f}')
    epoch_progress.update(1)
    

Steps:   0%|          | 1/454 [00:00<02:25,  3.12it/s]

Steps 0/454 | Train_loss 9.1900 | Train_ppl 9798.4600


Steps:  44%|████▍     | 201/454 [00:48<01:09,  3.66it/s]

Steps 200/454 | Train_loss 5.3987 | Train_ppl 221.1098


Steps:  88%|████████▊ | 401/454 [01:42<00:14,  3.77it/s]

Steps 400/454 | Train_loss 5.2952 | Train_ppl 199.3795


Steps:   0%|          | 0/454 [00:00<?, ?it/s].97s/it]s]

Epoch 0/10 | Train_loss 5.5405 | Train_ppl 378.7909 | Val_loss 5.1971 | Val_ppl 183.9508


Steps:  32%|███▏      | 147/454 [00:44<01:26,  3.55it/s]

Steps 600/454 | Train_loss 5.0191 | Train_ppl 151.2736


Steps:  76%|███████▋  | 347/454 [01:49<00:37,  2.89it/s]

Steps 800/454 | Train_loss 4.9254 | Train_ppl 137.7439


Steps:   0%|          | 0/454 [00:00<?, ?it/s].40s/it]s]

Epoch 1/10 | Train_loss 4.8522 | Train_ppl 130.4754 | Val_loss 4.6669 | Val_ppl 110.1469


Steps:  20%|██        | 93/454 [00:30<02:05,  2.88it/s]

Steps 1000/454 | Train_loss 4.1686 | Train_ppl 64.6269


Steps:  65%|██████▍   | 293/454 [01:28<00:47,  3.42it/s]

Steps 1200/454 | Train_loss 4.0167 | Train_ppl 55.5161


Steps:   0%|          | 0/454 [00:00<?, ?it/s].99s/it]s]

Epoch 2/10 | Train_loss 4.0960 | Train_ppl 61.6530 | Val_loss 3.7980 | Val_ppl 46.9670


Steps:   9%|▊         | 39/454 [00:10<01:52,  3.70it/s]

Steps 1400/454 | Train_loss 3.7101 | Train_ppl 40.8581


Steps:  53%|█████▎    | 239/454 [01:06<01:04,  3.32it/s]

Steps 1600/454 | Train_loss 3.3538 | Train_ppl 28.6109


Steps:  97%|█████████▋| 439/454 [02:04<00:03,  3.98it/s]

Steps 1800/454 | Train_loss 3.3611 | Train_ppl 28.8223


Steps:   0%|          | 0/454 [00:00<?, ?it/s].58s/it]s]

Epoch 3/10 | Train_loss 3.4940 | Train_ppl 33.3905 | Val_loss 3.6220 | Val_ppl 38.8636


Steps:  41%|████      | 185/454 [00:53<01:11,  3.75it/s]

Steps 2000/454 | Train_loss 3.1117 | Train_ppl 22.4589


Steps:  85%|████████▍ | 385/454 [01:52<00:21,  3.21it/s]

Steps 2200/454 | Train_loss 3.3167 | Train_ppl 27.5679


Steps:   0%|          | 0/454 [00:00<?, ?it/s].91s/it]s]

Epoch 4/10 | Train_loss 3.0847 | Train_ppl 22.1661 | Val_loss 3.4998 | Val_ppl 34.6025


Steps:  29%|██▉       | 131/454 [00:37<01:31,  3.54it/s]

Steps 2400/454 | Train_loss 2.7832 | Train_ppl 16.1706


Steps:  73%|███████▎  | 331/454 [01:37<00:36,  3.38it/s]

Steps 2600/454 | Train_loss 2.9247 | Train_ppl 18.6294


Steps:   0%|          | 0/454 [00:00<?, ?it/s].94s/it]s]

Epoch 5/10 | Train_loss 2.7419 | Train_ppl 15.7333 | Val_loss 3.4048 | Val_ppl 31.8850


Steps:  17%|█▋        | 77/454 [00:24<01:52,  3.34it/s]

Steps 2800/454 | Train_loss 2.3164 | Train_ppl 10.1389


Steps:  61%|██████    | 277/454 [01:30<01:17,  2.29it/s]

Steps 3000/454 | Train_loss 2.6070 | Train_ppl 13.5578


Steps:   0%|          | 0/454 [00:00<?, ?it/s].63s/it]s]

Epoch 6/10 | Train_loss 2.4778 | Train_ppl 12.0515 | Val_loss 3.3929 | Val_ppl 31.4242


Steps:   5%|▌         | 23/454 [00:08<02:47,  2.58it/s]

Steps 3200/454 | Train_loss 2.2414 | Train_ppl 9.4064


Steps:  49%|████▉     | 223/454 [01:18<01:30,  2.56it/s]

Steps 3400/454 | Train_loss 2.2448 | Train_ppl 9.4387


Steps:  93%|█████████▎| 423/454 [02:33<00:11,  2.65it/s]

Steps 3600/454 | Train_loss 2.5149 | Train_ppl 12.3654


Steps:   0%|          | 0/454 [00:00<?, ?it/s].96s/it]s]

Epoch 7/10 | Train_loss 2.2866 | Train_ppl 9.9442 | Val_loss 3.4511 | Val_ppl 33.5790


Steps:  37%|███▋      | 169/454 [01:05<01:38,  2.88it/s]

Steps 3800/454 | Train_loss 2.2173 | Train_ppl 9.1822


Steps:  81%|████████▏ | 369/454 [02:17<00:26,  3.20it/s]

Steps 4000/454 | Train_loss 2.3647 | Train_ppl 10.6408


Steps:   0%|          | 0/454 [00:00<?, ?it/s].31s/it]s]

Epoch 8/10 | Train_loss 2.1417 | Train_ppl 8.5995 | Val_loss 3.4507 | Val_ppl 33.4235


Steps:  25%|██▌       | 115/454 [00:41<01:42,  3.32it/s]

Steps 4200/454 | Train_loss 1.8659 | Train_ppl 6.4615


Steps:  69%|██████▉   | 315/454 [01:55<00:42,  3.30it/s]

Steps 4400/454 | Train_loss 1.8664 | Train_ppl 6.4653


Epoch: 100%|██████████| 10/10 [24:07<00:00, 157.17s/it]]

Epoch 9/10 | Train_loss 2.0027 | Train_ppl 7.4876 | Val_loss 3.4755 | Val_ppl 34.7025


#### Save, Load and Test

In [68]:
torch.save(model, "model.pth")

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [69]:
trained = torch.load("model.pth")

In [72]:
loss, ppl = eval(trained, test_loader, criterion)
print(f'Test_loss {loss:.4f} | Test_PPL {ppl}')

Test_loss 3.4197 | Test_PPL 33.099823236465454
