In [1]:
import torch
import torch.optim as optim
import torch.nn as nn
import torchtext
import tqdm
from torch.utils import tensorboard

In [2]:
# !pip install torchtext==0.6.0

In [3]:
# !python -m spacy download en
# !python -m spacy download de

### Data
- Multi30K dataset

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cpu')

In [4]:
# SRC Field
source = torchtext.data.Field(
    init_token="<sos>",
    eos_token="<eos>",
    tokenize="spacy",
    tokenizer_language="de",
    batch_first=True,
    lower=True
)


# TRG Field
target = torchtext.data.Field(
    init_token="<sos>",
    eos_token="<eos>",
    tokenize="spacy",
    tokenizer_language="en",
    batch_first=True,
    lower=True
)

In [5]:
train, valid, test = torchtext.datasets.Multi30k.splits(
    exts=(".de", ".en"),
    fields=(source, target)
)

In [6]:
source.build_vocab(train, min_freq=2)
target.build_vocab(train, min_freq=2)

In [7]:
BATCH_SIZE = 512

train_loader, test_loader, val_loader = torchtext.data.BucketIterator.splits(
    datasets=(train, test, valid), 
    batch_size=BATCH_SIZE,
    device=device
)

In [8]:
for batch in train_loader:
    print(batch.src.size())
    break

torch.Size([512, 38])


### Model

#### Encoder

In [9]:
class Encoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=4, dropout = 0.25):
        super(Encoder, self).__init__()
        
        self.vocab_size = vocab_size
        
        # transoform the int tokens into embedding
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim
        )
        
        # reccurent layer
        self.seq = nn.LSTM(
            input_size = embedding_dim,
            hidden_size = hidden_dim,
            num_layers = num_layers,
            batch_first = True,
            dropout = dropout,
            bidirectional = True
            
        )
    
    def forward(self, src):
        """
            outputs -> is the output at each time-steps, if the bidirectional is True it will concatenated
            hidden -> hidden state of the last time step
            cell -> cell state at the last time step
        """
        
        embedded =  self.embedding(src)
        outputs, (hidden, cell) = self.seq(embedded)
        return hidden, cell
    

#### Decoder

In [11]:
class Decoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers = 4, dropout = 0.25):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        
        # get the embedding of the int tokens
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim
        )
        
        # recurrent layer
        self.seq = nn.LSTM(
            input_size = embedding_dim,
            hidden_size = hidden_dim,
            num_layers = num_layers,
            batch_first = True,
            dropout = dropout,
            bidirectional = True
        )
        
        # outputs should be from each time step, since it will be applied to each 
        # in_features is double of embedding dim because reccurent layer is bidirection and both gets concatenated
        self.fc = nn.Linear(in_features=2*hidden_dim, out_features=vocab_size)
        
    
    def forward(self, trg, hidden, cell):
        """ We have to pass the hidden state and cell state of the encoder network to decoder dims should be the same"""
        embedded = self.embedding(trg)
        outputs, (hidden, cell) = self.seq(embedded, (hidden, cell))
        prediction = self.fc(outputs.squeeze())
        return prediction, hidden, cell
        

#### Seq2Seq

In [12]:
class Seq2Seq(nn.Module):
    
    def __init__(self, encoder, decoder, teacher_forcing_ratio=0.25):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.teacher_forcing_ratio = teacher_forcing_ratio    
    
    def forward(self, src, trg):
        
        batch_size = trg.size(0)
        trg_seq_size = trg.size(1)
        
        outputs = torch.zeros((batch_size, trg_seq_size, self.decoder.vocab_size))
        
        hidden, cell = self.encoder(src)
        
        # hidden and cell size-> [2*num_layers, batch, hidden_dim] to [batch, 2*num_layers, hidden_dim]
#         hidden = hidden.permute(1, 0, 2)
#         cell = cell.permute(1, 0, 2)
        
        input = trg[:, 0]
        
        for t in range(1, trg_seq_size):
            
            output, hidden, cell = self.decoder(input.unsqueeze(1), hidden, cell)
#             print(f'Yaha tak ok -1')
            outputs[:, t] = output
            
            # is teacher force
            teacher_force = torch.rand(1).item() < self.teacher_forcing_ratio
            top1 = torch.argmax(output, 1)
            
            input = trg[:, t] if teacher_force else top1
            
        # output = self.decoder(input.unsqueeze(1), hidden, cell)
        # outputs[:, trg_seq_size-1] = output  
        return outputs
    

### Training

In [16]:
def eval(model, data, criterion):
    loss, ppl = [], []
    with torch.no_grad():
        for batch in data:
            outputs = model(batch.src, batch.trg)
            batch_size, seq_len = outputs.size(0), outputs.size(1)
            l = criterion(outputs.view(batch_size*seq_len, -1).contiguous().to(device), batch.trg.view(-1))
            p = torch.exp(l)
            loss.append(l.item())
            ppl.append(p.item())
    return sum(loss)/len(loss), sum(ppl)/len(ppl)

In [13]:
encoder = Encoder(vocab_size=len(source.vocab), embedding_dim=100, hidden_dim=64).to(device)
decoder = Decoder(vocab_size=len(target.vocab), embedding_dim=100, hidden_dim=64).to(device)
model = Seq2Seq(encoder=encoder, decoder=decoder).to(device)

In [17]:
lr = 0.01
epochs = 10
PAD_IDX = target.vocab.stoi[target.pad_token]

In [18]:
# optimizer
optimizer = optim.Adam(params=model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX).to(device)
writer = tensorboard.SummaryWriter()

In [19]:
epoch_progress = tqdm.tqdm(total=epochs, desc="Epoch", position=0)
steps = 0

for epoch in range(epochs):
    
    step_progress = tqdm.tqdm(total=len(train_loader), desc="Step", position=0)
    
    for batch in train_loader:
        
        # compute  the outputs
        outputs = model(batch.src, batch.trg)
        batch_size, seq_len  = outputs.size(0), outputs.size(1)
        
        # compute the loss, ppl, gradient and backpropagate the loss
        optimizer.zero_grad()
        loss = criterion(outputs.view(batch_size*seq_len, -1).contiguous().to(device), batch.trg.view(-1).contiguous())
        ppl = torch.exp(loss)
        loss.backward()
        optimizer.step()
        
        # compute the validaiton loss and ppl
        val_loss, val_ppl = eval(model, val_loader, criterion)
        
        if steps%50==0:
            print(f'Epoch {epoch} | Steps {steps} | Train_loss {loss.item():.4f} | Train_PPL {ppl.item():.4f} | Val_loss {val_loss:.4f} | Val_PPL {val_ppl:.4f}')
        
        writer.add_scalar("train_loss", loss.item(), steps)
        writer.add_scalar("train_ppl", ppl.item(), steps)
        writer.add_scalar("val_loss", val_loss, steps)
        writer.add_scalar("val_ppl", val_ppl, steps)

        steps += 1
        step_progress.update(1)
        
    epoch_progress.update(1)

Step:   0%|          | 0/57 [00:00<?, ?it/s]Epoch 0 | Steps 0 | Train_loss 8.6810 | Train_PPL 5890.1455 | Val_loss 8.5210 | Val_PPL 5019.2913
Step:  89%|████████▉ | 51/57 [05:58<00:47,  7.88s/it]Epoch 0 | Steps 50 | Train_loss 5.2808 | Train_PPL 196.5227 | Val_loss 5.2446 | Val_PPL 189.5772
Step:  77%|███████▋  | 44/57 [05:18<01:22,  6.36s/it]Epoch 1 | Steps 100 | Train_loss 5.1508 | Train_PPL 172.5658 | Val_loss 5.1352 | Val_PPL 170.2341
Step:  65%|██████▍   | 37/57 [04:10<02:12,  6.63s/it]Epoch 2 | Steps 150 | Train_loss 5.0915 | Train_PPL 162.6417 | Val_loss 5.0376 | Val_PPL 154.4008
Step:  53%|█████▎    | 30/57 [03:51<03:16,  7.29s/it]Epoch 3 | Steps 200 | Train_loss 4.9676 | Train_PPL 143.6830 | Val_loss 4.9274 | Val_PPL 138.1978
Step:  40%|████      | 23/57 [02:21<03:32,  6.24s/it]Epoch 4 | Steps 250 | Train_loss 4.8634 | Train_PPL 129.4582 | Val_loss 4.8321 | Val_PPL 125.7165
Step:  28%|██▊       | 16/57 [01:41<04:32,  6.65s/it]Epoch 5 | Steps 300 | Train_loss 4.7151 | Train_PPL

##### Test

In [20]:
loss, ppl = eval(model, test_loader, criterion)
print(f'Test_loss {loss:.4f} | Test_PPL {ppl}')

Test_loss 4.3522 | Test_PPL 77.95200729370117
