In [1]:
# model
import torch
import torch.nn as nn

# data 
import torchtext
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

#training 
import tqdm 

In [2]:
### Random seed for deterministic results
SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
device

device(type='cuda')

### Data 

In [10]:
BATCH_SIZE = 128
max_len = 48

In [11]:
# SRC Field
SRC = Field(
    tokenize="spacy",
    tokenizer_language="de",
    batch_first=True,
    lower=True
)

In [12]:
# TRG Field
TRG = Field(
    tokenize="spacy",
    tokenizer_language="en",
    batch_first=True,
    lower=True
)

In [13]:
## download the dataset
train, valid, test = Multi30k.splits(exts=(".de", ".en"), fields=(SRC, TRG))

In [14]:
# Bulding the vocab
SRC.build_vocab(train, min_freq=2)
TRG.build_vocab(train, min_freq=2)

In [15]:
# Data loader
train_loader, val_loader, test_loader = BucketIterator.splits(
    datasets=(train, valid, test),
    batch_size=BATCH_SIZE,
    device=device
)

In [16]:
for i, batch in enumerate(train_loader):
    print(batch.src.size())
    print(batch.trg.size())
    break

torch.Size([128, 30])
torch.Size([128, 32])


In [17]:
## Vocan Insepction
print(f'English vocab {len(TRG.vocab)}\nGerman vocab {len(SRC.vocab)}')

English vocab 5891
German vocab 7853


### Seq2Seq Model

#### Encoder

In [18]:
class Encoder(nn.Module):
    
    def __init__(self, hidden_dim, embedding_dim, vocab_size, num_layers=2, dropout=0.25):
        super(Encoder, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.seq = nn.LSTM(
            input_size=embedding_dim, 
            hidden_size=hidden_dim, 
            num_layers=num_layers, 
            batch_first=True, 
            dropout=dropout
        )
        self.dropout = nn.Dropout(p=dropout)
    
    def forward(self, src):
        
        embedded = self.embedding(src)
        
        outputs, (hidden, cell) = self.seq(embedded)
        
        return hidden, cell        

#### Decoder

In [19]:
class Decoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2, dropout=0.25):
        super(Decoder, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.output_dim = vocab_size
        
        self.embedding  = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.seq = nn.LSTM(
            input_size=embedding_dim, 
            hidden_size=hidden_dim, 
            num_layers=num_layers, 
            batch_first=True, 
            dropout=dropout
        )
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)
    
    def forward(self, x, hidden, cell):
        
        x = x.unsqueeze(1)
        
        embedded = self.dropout(self.embedding(x))
        
        output, (hidden, cell) = self.seq(embedded, (hidden, cell))
        
        prediction = self.fc(output.squeeze(0))
        
        return prediction, hidden, cell    

In [20]:
## Hidden_dim dim and num_layers of encoder and decoder must be sam 

In [21]:
class Seq2Seq(nn.Module):
    
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder  = encoder
        self.decoder = decoder
        
    
    def forward(self, src, trg):

        
        batch_size = src.size(0)
        trg_len = trg.size(1)
        
        trg_vocab_size = self.decoder.output_dim
        
        hidden, cell = encoder(src)

        outputs = torch.zeros((batch_size, trg_len, self.decoder.output_dim), device=device)

        input = trg[:, 0]

        for t in range(1, trg_len):
            prediction, hidden, cell = self.decoder(input, hidden, cell)

            outputs[:, t] = prediction.squeeze()
            input = trg[:, t]
        
        return outputs 
             

### Training

In [32]:
encoder = Encoder(hidden_dim=64, embedding_dim=100, vocab_size=len(SRC.vocab)).to(device)
decoder = Decoder(vocab_size=len(TRG.vocab), embedding_dim=100, hidden_dim=64).to(device)
model = Seq2Seq(encoder, decoder).to(device)

In [33]:
# model(batch.src, batch.trg).shape

In [34]:
lr = 1e-3
epochs = 10
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]


In [35]:
###
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX).to(device)

In [36]:
def eval(model, data, criterion):
    losses = []
    with torch.no_grad():
        for batch in data:
            outputs = model(batch.src, batch.trg)
            loss = criterion(outputs.view(outputs.shape[0]*outputs.shape[1], outputs.shape[2]), batch.trg.view(-1))
            losses.append(loss.item())
    return sum(losses)/len(losses)
            

In [37]:
steps = 0

for epoch in range(epochs):
    
    train_loss = []
    
    for batch in train_loader:
        src = batch.src
        trg = batch.trg
        
        outputs = model(src, trg)
        
        loss = criterion(outputs.view(outputs.shape[0]*outputs.shape[1], outputs.shape[2]), trg.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss.append(loss.item())

        steps += 1

    
    avg_loss = sum(train_loss)/len(train_loss)
    avg_ppl = torch.exp(torch.tensor([avg_loss])).item()

    # val_loss = eval(model, val_loader, criterion)
    # val_ppl = torch.exp(torch.tensor([val_loss])).item()
    
    print(f'Epoch {epoch}/{epochs} | Step {steps}/{len(train_loader)} | Train_loss {avg_loss:.2f} | Train_ppl {avg_ppl:.2f}')



Epoch 0/10 | Step 227/227 | Train_loss 6.15 | Train_ppl 471.05
Epoch 1/10 | Step 454/227 | Train_loss 5.62 | Train_ppl 274.53
Epoch 2/10 | Step 681/227 | Train_loss 5.30 | Train_ppl 201.33
Epoch 3/10 | Step 908/227 | Train_loss 5.03 | Train_ppl 153.12
Epoch 4/10 | Step 1135/227 | Train_loss 4.83 | Train_ppl 125.82
Epoch 5/10 | Step 1362/227 | Train_loss 4.70 | Train_ppl 109.98
Epoch 6/10 | Step 1589/227 | Train_loss 4.60 | Train_ppl 99.35
Epoch 7/10 | Step 1816/227 | Train_loss 4.50 | Train_ppl 90.34
Epoch 8/10 | Step 2043/227 | Train_loss 4.42 | Train_ppl 82.77
Epoch 9/10 | Step 2270/227 | Train_loss 4.34 | Train_ppl 76.90


In [38]:
eval(model, val_loader, criterion)

4.311398983001709