In [3]:
# utils
import torch

# data
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

# model
import torch.nn as nn
import torch.nn.functional as F

# training and evaluation
import torch.optim as optim
import tqdm

In [23]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [24]:
device

device(type='cpu')

#### Data Preparation

In [25]:
max_len = 120
BATCH_SIZE = 128

In [26]:
# create source and target field
source = Field(
    init_token="<sos>",
    eos_token="<eos>",
    tokenize="spacy",
    tokenizer_language="de",
    lower=True,
    batch_first=True,    
)

target = Field(
    init_token="<sos>",
    eos_token="<eos>",
    tokenize="spacy",
    tokenizer_language="en",
    lower=True,
    batch_first=True,    
)

In [27]:
# create datasets
train, test, val = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(source, target)
)

In [28]:
# build vocabulary
source.build_vocab(train, min_freq=2)
target.build_vocab(train, min_freq=2)

In [29]:
# data loaders
train_loader, test_loader, val_loader = BucketIterator.splits(
    datasets=(train, test, val),
    batch_size=BATCH_SIZE,
    device=device
)

In [30]:
len(train_loader)

227

In [31]:
for batch in train_loader:
    print(batch.src.size(), batch.trg.size())
    break

torch.Size([128, 32]) torch.Size([128, 29])


### Model

#### Encoder

In [32]:
class Encoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_size, dropout=0.20, num_layers=1):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.seq = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
    
    
    def forward(self, src):
        # src.size() -> [batch, seq_len]
        embedded = self.embedding(src)
        outputs, hidden = self.seq(embedded)
        return hidden, hidden

#### Decoder

![seq2seq](./seq2seq7.png)

In [33]:
class Decoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_size, dropout=0.20, num_layers=1):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.seq = nn.GRU(
            input_size=embedding_dim + hidden_size, 
            hidden_size=hidden_size, 
            num_layers=num_layers, 
#             batch_first=True
        )
        self.fc = nn.Linear(in_features=2*hidden_size+embedding_dim, out_features=vocab_size)
    
    def forward(self, input, hidden, context):
        
        # a single token will be fed to decoder 
        # initially hidden vector and context vector will be same
        """
            input.size()   -> [1, batch]
            hidden.size()  -> [batch, 1, hidden_size]
            context.size() -> [batch, 1,  hidden_size]
        """
        
        embedded = self.embedding(input)
        # embedded.size() -> [batch_size, emb_dim]
        
        embedded_context = torch.cat((embedded.unsqueeze(0), context.type_as(embedded)), dim=2)
        # embedded_context.size() -> [1, batch_size, emb_dim+hidden_size]
    
        outputs, hidden = self.seq(embedded_context, hidden)
        # both outputs and hidden will be same
        

        # output, embedding and conext vector
        combined = torch.cat((outputs, context, embedded.unsqueeze(0)), dim=2)
        
        predictions  = F.softmax(self.fc(combined.squeeze()), dim=1)
        
        return predictions, hidden


#### Seq2Seq

In [34]:
class Seq2Seq(nn.Module):
    
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, src, trg):
        
        """
            src.size() -> [batch, seq_len]
            trg.size() -> [batch, seq_len]
        """
        batch = trg.size(0)
        seq_len = trg.size(1)
        vocab_size = self.decoder.vocab_size
        
        hidden, context = self.encoder(src)
        
        # create an empty tensor to store the outputs
        # outputs.shape -> [batch_size, seq_len, vocab_size]
        outputs = torch.empty((batch, seq_len, vocab_size))
        
        # take first token of each samples in the batch
        input = trg[:, 0]
        
        for i in range(1, seq_len):
            predictions, hidden = self.decoder(input, hidden, context)
            input = trg[:, i]
            outputs[:, i-1, :] = predictions
        return outputs 

### Training

In [35]:
def eval(model, data, criterion):
    loss, ppl = [], []
    with torch.no_grad():
        for batch in data:
            outputs = model(batch.src, batch.trg)
            bs, seq_len, vocab_size = outputs.size(0), outputs.size(1), outputs.size(2)
            l = criterion(outputs.view(bs*seq_len, vocab_size), batch.trg.view(-1))
            p = torch.exp(l)
            loss.append(l.item())
            ppl.append(p.item())
    return sum(loss)/len(loss), sum(ppl)/len(ppl)
        

In [40]:
# model params
embedding_dim = 100
hidden_size = 64

# training configuration
epochs = 10
lr = 0.1
PAD_IDX = target.vocab[target.pad_token]

In [41]:
# create the models
encoder = Encoder(vocab_size=len(source.vocab), embedding_dim=embedding_dim, hidden_size=hidden_size).to(device)
decoder = Decoder(vocab_size=len(source.vocab), embedding_dim=embedding_dim, hidden_size=hidden_size).to(device)
model = Seq2Seq(encoder, decoder).to(device)

In [42]:
# optimizer and criterion
optimizer = optim.Adam(params=model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [None]:
epoch_progress = tqdm.tqdm(total=epochs, desc="Epoch", position=0)
steps = 0

for epoch in range(epochs):
    epoch_loss, epoch_ppl = [], []
    
    for batch in train_loader:
        
        # forward pass
        outputs = model(batch.src, batch.trg)
        bs, seq_len =  outputs.size(0), outputs.size(1)
        
        # calculate loss and backpropagate the gradients
        optimizer.zero_grad()
        loss = criterion(outputs.view(bs*seq_len, -1), batch.trg.view(-1))
        loss.backward()
        optimizer.step()
        
        if steps % 200 == 0:
            val_loss, val_ppl = eval(model, val_loader, criterion)
            print(f'Epochs {epoch} | Steps {steps}')
            print(f'train_loss {loss.item():.4f} | train_ppl {torch.exp(loss).item():.4f}')
            print(f'val_loss {val_loss:.4f} | val_ppl {val_ppl:.4f}')
        steps += 1
    epoch_progress.update(1)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Epochs 0 | Steps 0
train_loss 8.9687 | train_ppl 7853.5610
val_loss 8.3490 | val_ppl 4226.8874
Epochs 0 | Steps 200
train_loss 8.1844 | train_ppl 3584.6763
val_loss 8.1785 | val_ppl 3563.7547


Epoch:  10%|█         | 1/10 [10:19<1:32:52, 619.19s/it]

Epochs 1 | Steps 400
train_loss 8.1833 | train_ppl 3580.7642
val_loss 8.1785 | val_ppl 3563.6953


Epoch:  20%|██        | 2/10 [17:17<1:14:31, 558.89s/it]

Epochs 2 | Steps 600
train_loss 8.2058 | train_ppl 3662.2341
val_loss 8.1784 | val_ppl 3563.5409


Epoch:  30%|███       | 3/10 [21:45<55:01, 471.70s/it]  

Epochs 3 | Steps 800
train_loss 8.1762 | train_ppl 3555.1658
val_loss 8.1784 | val_ppl 3563.5422


Epoch:  40%|████      | 4/10 [25:41<40:06, 401.06s/it]

Epochs 4 | Steps 1000
train_loss 8.2028 | train_ppl 3651.2561
val_loss 8.1784 | val_ppl 3563.5443


Epoch:  50%|█████     | 5/10 [29:35<29:14, 350.88s/it]

Epochs 5 | Steps 1200
train_loss 8.1843 | train_ppl 3584.3447
val_loss 8.1784 | val_ppl 3563.5443


Epoch:  60%|██████    | 6/10 [33:43<21:19, 319.87s/it]

Epochs 6 | Steps 1400
train_loss 8.1880 | train_ppl 3597.5122
val_loss 8.1882 | val_ppl 3598.8373


Epoch:  70%|███████   | 7/10 [38:00<15:03, 301.21s/it]

Epochs 7 | Steps 1600
train_loss 8.2086 | train_ppl 3672.5132
val_loss 8.1883 | val_ppl 3599.5074
