In [1]:
import torch
import torch.optim as optim
import torch.nn as nn
import torchtext
import tqdm
from torch.utils import tensorboard

In [2]:
# !pip install torchtext==0.6.0

In [4]:
# !python -m spacy download en
# !python -m spacy download de

### Data
- Multi30K dataset

In [2]:
BATCH_SIZE = 512
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cuda', index=0)

In [4]:
# SRC Field
source = torchtext.data.Field(
    init_token="<sos>",
    eos_token="<eos>",
    tokenize="spacy",
    tokenizer_language="de",
    batch_first=True,
    lower=True
)


# TRG Field
target = torchtext.data.Field(
    init_token="<sos>",
    eos_token="<eos>",
    tokenize="spacy",
    tokenizer_language="en",
    batch_first=True,
    lower=True
)

In [5]:
train, valid, test = torchtext.datasets.Multi30k.splits(
    exts=(".de", ".en"),
    fields=(source, target)
)

In [6]:
source.build_vocab(train, min_freq=2)
target.build_vocab(train, min_freq=2)

In [7]:
train_loader, test_loader, val_loader = torchtext.data.BucketIterator.splits(
    datasets=(train, test, valid), 
    batch_size=BATCH_SIZE,
    device=device
)

In [8]:
for batch in train_loader:
    print(batch.src.size())
    break

torch.Size([512, 32])


### Model

#### Encoder

In [9]:
class Encoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=4, dropout = 0.25):
        super(Encoder, self).__init__()
        
        self.vocab_size = vocab_size
        
        # transoform the int tokens into embedding
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim
        )
        
        # reccurent layer
        self.seq = nn.LSTM(
            input_size = embedding_dim,
            hidden_size = hidden_dim,
            num_layers = num_layers,
            batch_first = True,
            dropout = dropout,
            bidirectional = True
            
        )
    
    def forward(self, src):
        """
            outputs -> is the output at each time-steps, if the bidirectional is True it will concatenated
            hidden -> hidden state of the last time step
            cell -> cell state at the last time step
        """
        
        embedded =  self.embedding(src)
        outputs, (hidden, cell) = self.seq(embedded)
        return hidden, cell
    

#### Decoder

In [10]:
class Decoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers = 4, dropout = 0.25):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        
        # get the embedding of the int tokens
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim
        )
        
        # recurrent layer
        self.seq = nn.LSTM(
            input_size = embedding_dim,
            hidden_size = hidden_dim,
            num_layers = num_layers,
            batch_first = True,
            dropout = dropout,
            bidirectional = True
        )
        
        # outputs should be from each time step, since it will be applied to each 
        # in_features is double of embedding dim because reccurent layer is bidirection and both gets concatenated
        self.fc = nn.Linear(in_features=2*hidden_dim, out_features=vocab_size)
        
    
    def forward(self, trg, hidden, cell):
        """ We have to pass the hidden state and cell state of the encoder network to decoder dims should be the same"""
        embedded = self.embedding(trg)
        outputs, (_, _) = self.seq(embedded, (hidden, cell)) ## here only outputs is relevant 
        prediction = self.fc(outputs.squeeze())
        return prediction
        

#### Seq2Seq

In [11]:
class Seq2Seq(nn.Module):
    
    def __init__(self, encoder, decoder, teacher_forcing_ratio=0.25):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.teacher_forcing_ratio = teacher_forcing_ratio    
    
    def forward(self, src, trg):
        
        batch_size = trg.size(0)
        trg_seq_size = trg.size(1)
        
        outputs = torch.zeros((batch_size, trg_seq_size, self.decoder.vocab_size))
        
        hidden, cell = self.encoder(src)
        
        # hidden and cell size-> [2*num_layers, batch, hidden_dim] to [batch, 2*num_layers, hidden_dim]
#         hidden = hidden.permute(1, 0, 2)
#         cell = cell.permute(1, 0, 2)
        
        input = trg[:, 0]
        
        for t in range(1, trg_seq_size):
            
            output = self.decoder(input.unsqueeze(1), hidden, cell)
#             print(f'Yaha tak ok -1')
            outputs[:, t-1] = output
            
            # is teacher force
            teacher_force = torch.rand(1).item() < self.teacher_forcing_ratio
            top1 = torch.argmax(output, 1)
            
            input = trg[:, t] if teacher_force else top1
            
        output = self.decoder(input.unsqueeze(1), hidden, cell)
        outputs[:, trg_seq_size-1] = output  
        return outputs
    

### Training

In [12]:
def eval(model, data, criterion):
    loss, ppl = [], []
    with torch.no_grad():
        for batch in data:
            outputs = model(batch.src, batch.trg)
            batch_size, seq_len = outputs.size(0), outputs.size(1)
            l = criterion(outputs.view(batch_size*seq_len, -1).contiguous().to(device), batch.trg.view(-1))
            p = torch.exp(l)
            loss.append(l.item())
            ppl.append(p.item())
    return sum(loss)/len(loss), sum(ppl)/len(ppl)

In [13]:
encoder = Encoder(vocab_size=len(source.vocab), embedding_dim=100, hidden_dim=64).to(device)
decoder = Decoder(vocab_size=len(target.vocab), embedding_dim=100, hidden_dim=64).to(device)
model = Seq2Seq(encoder=encoder, decoder=decoder).to(device)

In [14]:
lr = 0.01
epochs = 10
PAD_IDX = target.vocab.stoi[target.pad_token]

In [15]:
# optimizer
optimizer = optim.Adam(params=model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX).to(device)
writer = tensorboard.SummaryWriter()

In [17]:
epoch_progress = tqdm.tqdm(total=epochs, desc="Epoch", position=0)
steps = 0

for epoch in range(epochs):
    
    step_progress = tqdm.tqdm(total=len(train_loader), desc="Step", position=0)
    
    for batch in train_loader:
        
        # compute  the outputs
        outputs = model(batch.src, batch.trg)
        batch_size, seq_len  = outputs.size(0), outputs.size(1)
        
        # compute the loss, ppl, gradient and backpropagate the loss
        optimizer.zero_grad()
        loss = criterion(outputs.view(batch_size*seq_len, -1).contiguous().to(device), batch.trg.view(-1).contiguous())
        ppl = torch.exp(loss)
        loss.backward()
        optimizer.step()
        
        # compute the validaiton loss and ppl
        val_loss, val_ppl = eval(model, val_loader, criterion)
        
        if steps%50 ==0:
            print(f'Epoch {epoch} | Steps {steps} | Train_loss {loss.item():.4f} | Train_PPL {ppl.item():.4f} | Val_loss {val_loss:.4f} | Val_PPL {val_ppl:.4f}')
        
        writer.add_scalar("train_loss", loss.item(), steps)
        writer.add_scalar("train_ppl", ppl.item(), steps)
        writer.add_scalar("val_loss", val_loss, steps)
        writer.add_scalar("val_ppl", val_ppl, steps)

        steps += 1
        step_progress.update(1)
        
    epoch_progress.update(1)

Epoch:   0%|          | 0/10 [00:09<?, ?it/s]
Step:   0%|          | 0/57 [00:09<?, ?it/s]
Step:   2%|▏         | 1/57 [00:05<05:28,  5.87s/it]

Epoch 0 | Steps 0 | Train_loss 8.6991 | Train_PPL 5997.6294 | Val_loss 8.5711 | Val_PPL 5276.7410


Step:  89%|████████▉ | 51/57 [04:15<00:31,  5.27s/it]

Epoch 0 | Steps 50 | Train_loss 5.3659 | Train_PPL 213.9825 | Val_loss 5.2647 | Val_PPL 194.6020


Step:  77%|███████▋  | 44/57 [03:46<00:56,  4.38s/it]

Epoch 1 | Steps 100 | Train_loss 5.4100 | Train_PPL 223.6370 | Val_loss 5.2515 | Val_PPL 191.9643


Step:  65%|██████▍   | 37/57 [03:08<01:38,  4.90s/it]

Epoch 2 | Steps 150 | Train_loss 5.3308 | Train_PPL 206.6102 | Val_loss 5.2600 | Val_PPL 193.5812


Step:  53%|█████▎    | 30/57 [02:36<02:22,  5.28s/it]

Epoch 3 | Steps 200 | Train_loss 5.4122 | Train_PPL 224.1146 | Val_loss 5.2553 | Val_PPL 192.8116


Step:  40%|████      | 23/57 [01:58<02:55,  5.17s/it]

Epoch 4 | Steps 250 | Train_loss 5.3221 | Train_PPL 204.8108 | Val_loss 5.2546 | Val_PPL 192.5648


Step:  28%|██▊       | 16/57 [01:26<03:21,  4.91s/it]

Epoch 5 | Steps 300 | Train_loss 5.3591 | Train_PPL 212.5379 | Val_loss 5.2498 | Val_PPL 191.5727


Step:  16%|█▌        | 9/57 [00:47<03:53,  4.87s/it]

Epoch 6 | Steps 350 | Train_loss 5.3553 | Train_PPL 211.7194 | Val_loss 5.2599 | Val_PPL 193.5982


Step:   4%|▎         | 2/57 [00:09<03:58,  4.33s/it]

Epoch 7 | Steps 400 | Train_loss 5.3207 | Train_PPL 204.5289 | Val_loss 5.2625 | Val_PPL 194.0469


Step:  91%|█████████ | 52/57 [04:25<00:25,  5.01s/it]

Epoch 7 | Steps 450 | Train_loss 5.3549 | Train_PPL 211.6450 | Val_loss 5.2561 | Val_PPL 192.9969


Step:  79%|███████▉  | 45/57 [03:50<00:59,  4.94s/it]

Epoch 8 | Steps 500 | Train_loss 5.3891 | Train_PPL 219.0150 | Val_loss 5.2510 | Val_PPL 191.7467


Step:  67%|██████▋   | 38/57 [03:06<01:25,  4.49s/it]

Epoch 9 | Steps 550 | Train_loss 5.3142 | Train_PPL 203.1951 | Val_loss 5.2533 | Val_PPL 192.3628


Epoch: 100%|██████████| 10/10 [48:05<00:00, 286.61s/it]

##### Test

In [19]:
loss, ppl = eval(model, test_loader, criterion)
print(f'Test_loss {loss:.4f} | Test_PPL {ppl}')

Test_loss 5.2583 | Test_PPL 193.05118560791016
