In [48]:
import torch
import torch.optim as optim
import torch.nn as nn
import torchtext
import tqdm
from torch.utils import tensorboard

In [2]:
# !pip install torchtext==0.6.0

In [4]:
# !python -m spacy download en
# !python -m spacy download de

### Data
- Multi30K dataset

In [5]:
BATCH_SIZE = 64
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
device

device(type='cuda', index=0)

In [7]:
# SRC Field
source = torchtext.data.Field(
    init_token="<sos>",
    eos_token="<eos>",
    tokenize="spacy",
    tokenizer_language="de",
    batch_first=True,
    lower=True
)


# TRG Field
target = torchtext.data.Field(
    init_token="<sos>",
    eos_token="<eos>",
    tokenize="spacy",
    tokenizer_language="en",
    batch_first=True,
    lower=True
)

In [8]:
train, valid, test = torchtext.datasets.Multi30k.splits(
    exts=(".de", ".en"),
    fields=(source, target)
)

training.tar.gz:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:00<00:00, 4.96MB/s]
validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 1.38MB/s]

downloading validation.tar.gz
downloading mmt_task1_test2016.tar.gz



mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 1.34MB/s]


In [9]:
source.build_vocab(train, min_freq=2)
target.build_vocab(train, min_freq=2)

In [10]:
train_loader, test_loader, val_loader = torchtext.data.BucketIterator.splits(
    datasets=(train, test, valid), 
    batch_size=BATCH_SIZE,
    device=device
)

In [11]:
for batch in train_loader:
    print(batch.src.size())
    break

torch.Size([64, 23])


### Model

#### Encoder

In [27]:
class Encoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=4, dropout = 0.25):
        super(Encoder, self).__init__()
        
        self.vocab_size = vocab_size
        
        # transoform the int tokens into embedding
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim
        )
        
        # reccurent layer
        self.seq = nn.LSTM(
            input_size = embedding_dim,
            hidden_size = hidden_dim,
            num_layers = num_layers,
            batch_first = True,
            dropout = dropout,
            bidirectional = True
            
        )
    
    def forward(self, src):
        """
            outputs -> is the output at each time-steps, if the bidirectional is True it will concatenated
            hidden -> hidden state of the last time step
            cell -> cell state at the last time step
        """
        
        embedded =  self.embedding(src)
        outputs, (hidden, cell) = self.seq(embedded)
        return hidden, cell
    

#### Decoder

In [28]:
class Decoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers = 4, dropout = 0.25):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        
        # get the embedding of the int tokens
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim
        )
        
        # recurrent layer
        self.seq = nn.LSTM(
            input_size = embedding_dim,
            hidden_size = hidden_dim,
            num_layers = num_layers,
            batch_first = True,
            dropout = dropout,
            bidirectional = True
        )
        
        # outputs should be from each time step, since it will be applied to each 
        # in_features is double of embedding dim because reccurent layer is bidirection and both gets concatenated
        self.fc = nn.Linear(in_features=2*hidden_dim, out_features=vocab_size)
        
    
    def forward(self, trg, hidden, cell):
        """ We have to pass the hidden state and cell state of the encoder network to decoder dims should be the same"""
        embedded = self.embedding(trg)
        outputs, (_, _) = self.seq(embedded, (hidden, cell)) ## here only outputs is relevant 
        prediction = self.fc(outputs.squeeze())
        return prediction
        

#### Seq2Seq

In [29]:
class Seq2Seq(nn.Module):
    
    def __init__(self, encoder, decoder, lr=1e-3, teacher_forcing_ratio=0.25):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.lr = lr
    
    
    def forward(self, src, trg):
        
        batch_size = trg.size(0)
        trg_seq_size = trg.size(1)
        
        outputs = torch.zeros((batch_size, trg_seq_size, self.decoder.vocab_size))
        
        hidden, cell = self.encoder(src)
        
        # hidden and cell size-> [2*num_layers, batch, hidden_dim] to [batch, 2*num_layers, hidden_dim]
        hidden = hidden.permute(1, 0, 2)
        cell = cell.permute(1, 0, 2)
        
        input = trg[0]
        
        for t in range(1, trg_seq_size):
            output = self.decoder(input.unsqueeze(0).contiguous(), hidden[t-1].unsqueeze(1).contiguous(), cell[t-1].unsqueeze(1).contiguous())
            outputs[t-1] = output
            
            # is teacher force
            teacher_force = torch.rand(1).item() < self.teacher_forcing_ratio
            top1 = torch.argmax(output, 1)
            
            input = trg[t] if teacher_force else top1
            
        output = self.decoder(input.unsqueeze(0).contiguous(), hidden[trg_seq_size-1].unsqueeze(1).contiguous(), cell[trg_seq_size-1].unsqueeze(1).contiguous())
        outputs[trg_seq_size-1] = output  
        return outputs
    

### Training

In [38]:
def eval(model, data, criterion):
    loss, ppl = [], []
    with torch.no_grad():
        for batch in data:
            outputs = model(batch.src, batch.trg)
            batch_size, seq_len = outputs.size(0), outputs.size(1)
            
            l = criterion(outputs.view(batch_size*seq_len, -1).contiguous().to(device), batch.trg.view(-1))
            p = torch.exp(l)
            
            loss.append(l.item())
            ppl.append(p.item())
    return sum(loss)/len(loss), sum(ppl)/len(ppl)

In [31]:
encoder = Encoder(vocab_size=len(source.vocab), embedding_dim=100, hidden_dim=64).to(device)
decoder = Decoder(vocab_size=len(target.vocab), embedding_dim=100, hidden_dim=64).to(device)
model = Seq2Seq(encoder=encoder, decoder=decoder).to(device)

In [32]:
lr = 1e-3
epochs = 10
PAD_IDX = target.vocab.stoi[target.pad_token]

In [35]:
# optimizer
optimizer = optim.Adam(params=model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX).to(device)
writer = tensorboard.SummaryWriter()

In [44]:
epoch_progress = tqdm.tqdm(total=epochs, desc="Epoch", position=0)
steps = 0

for epoch in range(epochs):
    
    step_progress = tqdm.tqdm(total=len(train_loader), desc="Step", position=0)
    
    for batch in train_loader:
        
        # compute  the outputs
        outputs = model(batch.src, batch.trg)
        batch_size, seq_len  = outputs.size(0), outputs.size(1)
        
        # compute the loss, ppl, gradient and backpropagate the loss
        optimizer.zero_grad()
        loss = criterion(outputs.view(batch_size*seq_len, -1).contiguous().to(device), batch.trg.view(-1).contiguous())
        ppl = torch.exp(loss)
        loss.backward()
        optimizer.step()
        
        # compute the validaiton loss and ppl
        val_loss, val_ppl = eval(model, val_loader, criterion)
        
        if steps%200 ==0:
            print(f'Epoch {epoch} | Steps {steps} | Train_loss {loss.item():.4f} | Train_PPL {ppl.item():.4f} | Val_loss {val_loss:.4f} | Val_PPL {val_ppl:.4f}')
        
        writer.add_scalar("train_loss", loss.item(), steps)
        writer.add_scalar("train_ppl", ppl.item(), steps)
        writer.add_scalar("val_loss", val_loss, steps)
        writer.add_scalar("val_ppl", val_ppl, steps)

        steps += 1
        step_progress.update(1)
        
    epoch_progress.update(1)
        
    
    

Step:   0%|          | 1/454 [00:01<14:05,  1.87s/it]

Epoch 0 | Steps 0 | Train_loss 8.3769 | Train_PPL 4345.3335 | Val_loss 8.3967 | Val_PPL 4465.9990


Step:  44%|████▍     | 201/454 [06:30<08:05,  1.92s/it]

Epoch 0 | Steps 200 | Train_loss 6.7218 | Train_PPL 830.3455 | Val_loss 7.5428 | Val_PPL 1987.8233


Step:  88%|████████▊ | 401/454 [12:52<01:38,  1.86s/it]

Epoch 0 | Steps 400 | Train_loss 6.7529 | Train_PPL 856.5743 | Val_loss 7.4994 | Val_PPL 1907.5278


Step:  95%|█████████▌| 433/454 [13:52<00:41,  1.99s/it]

IndexError: ignored

In [47]:
out = model(batch.src, batch.trg)

IndexError: ignored

In [45]:
batch.trg.shape

torch.Size([8, 28])

In [46]:
batch.src.shape

torch.Size([8, 23])