In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from collections import Counter
import numpy as np
from tqdm import tqdm
import random
import spacy
import sacrebleu
import math
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# Set device

In [32]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [33]:
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Load spaCy models

In [34]:
try:
    spacy_eng = spacy.load("en_core_web_sm")
    spacy_ger = spacy.load("de_core_news_sm")
    print("spaCy models loaded successfully.")
except:
    print("Installing spaCy models...")
    import os
    os.system("python -m spacy download en_core_web_sm")
    os.system("python -m spacy download de_core_news_sm")
    spacy_eng = spacy.load("en_core_web_sm")
    spacy_ger = spacy.load("de_core_news_sm")
    print("spaCy models installed and loaded successfully.")

spaCy models loaded successfully.


# Tokenization functions

In [35]:
def tokenize_en(text):
    return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

def tokenize_de(text):
    return [tok.text.lower() for tok in spacy_ger.tokenizer(text)]

# Vocabulary building function

In [36]:
def build_vocab(sentences, tokenizer, min_freq=2, max_size=7000):
    counter = Counter()
    for s in sentences:
        counter.update(tokenizer(s))
    vocab = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
    words_and_frequencies = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    for word, freq in words_and_frequencies:
        if freq >= min_freq:
            if len(vocab) >= max_size:
                break
            vocab[word] = len(vocab)
    return vocab

# Load and preprocess dataset


In [37]:
print("Loading dataset...")
dataset = load_dataset("opus_books", "de-en")
train_data = dataset['train']
en_sentences = [ex['translation']['en'] for ex in train_data]
de_sentences = [ex['translation']['de'] for ex in train_data]

Loading dataset...


In [38]:
print("Building vocabularies...")
en_vocab = build_vocab(en_sentences, tokenize_en, min_freq=2)
de_vocab = build_vocab(de_sentences, tokenize_de, min_freq=2)

Building vocabularies...


In [39]:
print(f"English vocabulary size: {len(en_vocab)}")
print(f"German vocabulary size: {len(de_vocab)}")

English vocabulary size: 7000
German vocabulary size: 7000


# Dataset class


In [40]:
class TranslationDataset(Dataset):
    def __init__(self, data_pairs, src_vocab, tgt_vocab, src_tokenizer, tgt_tokenizer, max_len=100):
        self.data_pairs = []
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        for pair in data_pairs:
            src_tokens = src_tokenizer(pair['en'])
            tgt_tokens = tgt_tokenizer(pair['de'])
            if len(src_tokens) <= max_len and len(tgt_tokens) <= max_len:
                self.data_pairs.append(pair)
        print(f"Retained {len(self.data_pairs)} of {len(data_pairs)} examples after length filtering")

    def __len__(self):
        return len(self.data_pairs)

    def __getitem__(self, idx):
        pair = self.data_pairs[idx]
        src_tokens = self.src_tokenizer(pair['en'])
        src_indices = [self.src_vocab.get(token, self.src_vocab['<UNK>']) for token in src_tokens]
        src_indices = [self.src_vocab['<SOS>']] + src_indices + [self.src_vocab['<EOS>']]
        src_tensor = torch.LongTensor(src_indices)

        tgt_tokens = self.tgt_tokenizer(pair['de'])
        tgt_indices = [self.tgt_vocab.get(token, self.tgt_vocab['<UNK>']) for token in tgt_tokens]
        tgt_indices = [self.tgt_vocab['<SOS>']] + tgt_indices + [self.tgt_vocab['<EOS>']]
        tgt_tensor = torch.LongTensor(tgt_indices)

        return src_tensor, tgt_tensor

# Collate function


In [41]:
def collate_fn(batch):
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    src_seqs, tgt_seqs = zip(*batch)
    src_lengths = torch.LongTensor([len(s) for s in src_seqs])
    tgt_lengths = torch.LongTensor([len(t) for t in tgt_seqs])
    src_padded = pad_sequence(src_seqs, batch_first=True, padding_value=0)
    tgt_padded = pad_sequence(tgt_seqs, batch_first=True, padding_value=0)
    return src_padded, tgt_padded, src_lengths, tgt_lengths

# Create datasets and dataloaders


In [42]:
data_pairs = [{'en': en, 'de': de} for en, de in zip(en_sentences, de_sentences)]
train_size = min(30000, len(data_pairs))
val_size = min(3000, len(data_pairs) - train_size)
indices = list(range(len(data_pairs)))
random.shuffle(indices)
train_indices = indices[:train_size]
val_indices = indices[train_size:train_size+val_size]
train_pairs = [data_pairs[i] for i in train_indices]
val_pairs = [data_pairs[i] for i in val_indices]

In [43]:
train_dataset = TranslationDataset(train_pairs, en_vocab, de_vocab, tokenize_en, tokenize_de, max_len=80)
val_dataset = TranslationDataset(val_pairs, en_vocab, de_vocab, tokenize_en, tokenize_de, max_len=80)

Retained 29151 of 30000 examples after length filtering
Retained 2907 of 3000 examples after length filtering


In [44]:
BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Encoder class


In [45]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, bidirectional=True, dropout=dropout if n_layers > 1 else 0, batch_first=True)
        self.fc_hidden = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc_cell = nn.Linear(hidden_dim * 2, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_lengths):
        embedded = self.dropout(self.embedding(src))
        packed_embedded = pack_padded_sequence(embedded, src_lengths.cpu(), batch_first=True)
        packed_outputs, (hidden, cell) = self.rnn(packed_embedded)
        outputs, _ = pad_packed_sequence(packed_outputs, batch_first=True)
        hidden_processed = []
        cell_processed = []
        for layer in range(self.n_layers):
            forward_hidden = hidden[2*layer]
            backward_hidden = hidden[2*layer+1]
            combined_hidden = torch.cat([forward_hidden, backward_hidden], dim=1)
            processed_hidden = torch.tanh(self.fc_hidden(combined_hidden))
            hidden_processed.append(processed_hidden)

            forward_cell = cell[2*layer]
            backward_cell = cell[2*layer+1]
            combined_cell = torch.cat([forward_cell, backward_cell], dim=1)
            processed_cell = torch.tanh(self.fc_cell(combined_cell))
            cell_processed.append(processed_cell)
        hidden_final = torch.stack(hidden_processed)
        cell_final = torch.stack(cell_processed)
        return outputs, hidden_final, cell_final

# Attention class


In [46]:
class Attention(nn.Module):
    def __init__(self, enc_hidden_dim, dec_hidden_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hidden_dim * 2) + dec_hidden_dim, dec_hidden_dim)
        self.v = nn.Linear(dec_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask=None):
        batch_size = encoder_outputs.size(0)
        src_len = encoder_outputs.size(1)
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        if mask is not None:
            attention = attention.masked_fill(mask == 0, -1e10)
        attention_weights = F.softmax(attention, dim=1)
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)
        return context.squeeze(1), attention_weights

# Decoder class


In [47]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hidden_dim, dec_hidden_dim, n_layers, attention, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim + (enc_hidden_dim * 2), dec_hidden_dim, n_layers, dropout=dropout if n_layers > 1 else 0, batch_first=True)
        self.fc_out = nn.Linear((enc_hidden_dim * 2) + dec_hidden_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs, mask=None):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        context, attn_weights = self.attention(hidden[-1], encoder_outputs, mask)
        context = context.unsqueeze(1)
        rnn_input = torch.cat((embedded, context), dim=2)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        output = output.squeeze(1)
        context = context.squeeze(1)
        embedded = embedded.squeeze(1)
        prediction = self.fc_out(torch.cat((output, context, embedded), dim=1))
        return prediction, hidden, cell, attn_weights

# Seq2Seq class


In [48]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.device = device

    def create_mask(self, src):
        mask = (src != self.src_pad_idx).float()
        return mask

    def forward(self, src, src_lengths, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        src_mask = self.create_mask(src)
        encoder_outputs, hidden, cell = self.encoder(src, src_lengths)
        input = trg[:, 0]
        for t in range(1, trg_len):
            output, hidden, cell, _ = self.decoder(input, hidden, cell, encoder_outputs, src_mask)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1
        return outputs

# Initialize model parameters


In [49]:
INPUT_DIM = len(en_vocab)
OUTPUT_DIM = len(de_vocab)
ENC_EMB_DIM = 512
DEC_EMB_DIM = 512
ENC_HIDDEN_DIM = 512
DEC_HIDDEN_DIM = 512
ENC_LAYERS = 2
DEC_LAYERS = 2
ENC_DROPOUT = 0.3
DEC_DROPOUT = 0.3
SRC_PAD_IDX = en_vocab['<PAD>']
TRG_PAD_IDX = de_vocab['<PAD>']

# Initialize encoder, decoder, and seq2seq model


In [50]:
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HIDDEN_DIM, ENC_LAYERS, ENC_DROPOUT)
attn = Attention(ENC_HIDDEN_DIM, DEC_HIDDEN_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, DEC_LAYERS, attn, DEC_DROPOUT)
model = Seq2Seq(enc, dec, SRC_PAD_IDX, device).to(device)

# Initialize model weights


In [51]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.orthogonal_(param.data)
        elif 'bias' in name:
            nn.init.constant_(param.data, 0)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7000, 512)
    (rnn): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
    (fc_hidden): Linear(in_features=1024, out_features=512, bias=True)
    (fc_cell): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(7000, 512)
    (rnn): LSTM(1536, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc_out): Linear(in_features=2048, out_features=7000, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
)

# Count trainable parameters


In [52]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 40,149,848 trainable parameters


# Define optimizer and loss function


In [53]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

# Training function


In [54]:
def train_epoch(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(tqdm(iterator, desc="Training")):
        src, trg, src_len, _ = batch
        src, trg = src.to(device), trg.to(device)
        src_len = src_len.to(device)
        optimizer.zero_grad()
        output = model(src, src_len, trg)
        output_dim = output.shape[-1]
        output = output[:, 1:].contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

# Evaluation function


In [55]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(tqdm(iterator, desc="Evaluating")):
            src, trg, src_len, _ = batch
            src, trg = src.to(device), trg.to(device)
            src_len = src_len.to(device)
            output = model(src, src_len, trg, 0)
            output_dim = output.shape[-1]
            output = output[:, 1:].contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

# Translation function


In [56]:
def translate_sentence(model, sentence, src_vocab, tgt_vocab, src_tokenizer, device, max_len=50):
    model.eval()
    if isinstance(sentence, str):
        tokens = src_tokenizer(sentence)
    else:
        tokens = sentence
    tokens = [token.lower() for token in tokens]
    src_indices = [src_vocab.get(token, src_vocab['<UNK>']) for token in tokens]
    src_indices = [src_vocab['<SOS>']] + src_indices + [src_vocab['<EOS>']]
    src_tensor = torch.LongTensor(src_indices).unsqueeze(0).to(device)
    src_len = torch.LongTensor([len(src_indices)]).to(device)
    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(src_tensor, src_len)
    trg_indices = [tgt_vocab['<SOS>']]
    attention_scores = []
    for i in range(max_len):
        trg_tensor = torch.LongTensor([trg_indices[-1]]).to(device)
        with torch.no_grad():
            output, hidden, cell, attention = model.decoder(trg_tensor, hidden, cell, encoder_outputs, mask=model.create_mask(src_tensor))
        attention_scores.append(attention.detach().cpu().numpy())
        pred_token = output.argmax(1).item()
        trg_indices.append(pred_token)
        if pred_token == tgt_vocab['<EOS>']:
            break
    trg_tokens = []
    for idx in trg_indices[1:]:
        if idx == tgt_vocab['<EOS>']:
            break
        for token, index in tgt_vocab.items():
            if index == idx:
                trg_tokens.append(token)
                break
    return trg_tokens, attention_scores

# BLEU score calculation function


In [57]:
def calculate_bleu(model, data_loader, src_vocab, tgt_vocab, src_tokenizer):
    model.eval()
    references = []
    hypotheses = []
    for batch in tqdm(data_loader, desc="Calculating BLEU"):
        src, trg, src_len, _ = batch
        src, trg = src.to(device), trg.to(device)
        src_len = src_len.to(device)
        for i in range(len(src)):
            src_sentence = []
            for idx in src[i]:
                idx_item = idx.item()
                if idx_item != src_vocab['<PAD>'] and idx_item != src_vocab['<SOS>'] and idx_item != src_vocab['<EOS>']:
                    for token, index in src_vocab.items():
                        if index == idx_item:
                            src_sentence.append(token)
                            break
            trg_sentence = []
            for idx in trg[i]:
                idx_item = idx.item()
                if idx_item != tgt_vocab['<PAD>'] and idx_item != tgt_vocab['<SOS>'] and idx_item != tgt_vocab['<EOS>']:
                    for token, index in tgt_vocab.items():
                        if index == idx_item:
                            trg_sentence.append(token)
                            break
            translation, _ = translate_sentence(model, src_sentence, src_vocab, tgt_vocab, src_tokenizer, device)
            references.append([" ".join(trg_sentence)])
            hypotheses.append(" ".join(translation))
    bleu = sacrebleu.corpus_bleu(hypotheses, list(zip(*references)))
    return bleu.score

# Training loop


In [58]:
def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, n_epochs=20, clip=1.0):
    best_valid_loss = float('inf')
    best_bleu_score = 0
    patience = 3
    patience_counter = 0
    for epoch in range(n_epochs):
        print(f"Epoch: {epoch+1}/{n_epochs}")
        train_loss = train_epoch(model, train_loader, optimizer, criterion, clip)
        valid_loss = evaluate(model, val_loader, criterion)
        scheduler.step(valid_loss)
        if epoch % 3 == 0 or epoch == n_epochs - 1:
            subset_size = min(len(val_loader.dataset), 500)
            subset_indices = random.sample(range(len(val_loader.dataset)), subset_size)
            subset_data = [val_loader.dataset.data_pairs[i] for i in subset_indices]
            subset_dataset = TranslationDataset(subset_data, en_vocab, de_vocab, tokenize_en, tokenize_de)
            subset_loader = DataLoader(subset_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
            bleu_score = calculate_bleu(model, subset_loader, en_vocab, de_vocab, tokenize_en)
            print(f"BLEU Score: {bleu_score:.2f}")
            if bleu_score > best_bleu_score:
                best_bleu_score = bleu_score
                patience_counter = 0
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'bleu': bleu_score,
                    'loss': valid_loss,
                }, 'best_model_bleu.pt')
                print(f"Model saved with BLEU: {bleu_score:.2f}")
            else:
                patience_counter += 1
        else:
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                patience_counter = 0
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': valid_loss,
                }, 'best_model_loss.pt')
                print(f"Model saved with loss: {valid_loss:.4f}")
            else:
                patience_counter += 1
        print(f"Train Loss: {train_loss:.4f} | Val. Loss: {valid_loss:.4f}")
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break
    return model

# Test translation function


In [59]:
def test_translation(model, sentence, src_vocab, tgt_vocab, src_tokenizer):
    translated_tokens, attention = translate_sentence(model, sentence, src_vocab, tgt_vocab, src_tokenizer, device)
    translated_sentence = " ".join(translated_tokens)
    print(f"Source: {sentence}")
    print(f"Translated: {translated_sentence}")
    return translated_sentence

# Main execution


In [60]:
if __name__ == "__main__":
    N_EPOCHS = 20
    CLIP = 1.0
    model.apply(init_weights)
    print(f'The model has {count_parameters(model):,} trainable parameters')
    print("Starting training...")
    model = train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, n_epochs=N_EPOCHS, clip=CLIP)
    checkpoint = torch.load('best_model_bleu.pt', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Loaded best model with BLEU score: {checkpoint['bleu']:.2f}")
    val_bleu = calculate_bleu(model, val_loader, en_vocab, de_vocab, tokenize_en)
    print(f"Validation BLEU score: {val_bleu:.2f}")
    test_sentences = [
        "This is a wonderful day to test translation.",
        "I love learning about natural language processing.",
        "The book is on the table.",
        "She walked to the store yesterday.",
        "Can you help me translate this sentence?"
    ]
    print("\nSample translations:")
    for sentence in test_sentences:
        translated = test_translation(model, sentence, en_vocab, de_vocab, tokenize_en)
    torch.save({
        'model_state_dict': model.state_dict(),
        'en_vocab': en_vocab,
        'de_vocab': de_vocab,
        'model_params': {
            'INPUT_DIM': INPUT_DIM,
            'OUTPUT_DIM': OUTPUT_DIM,
            'ENC_EMB_DIM': ENC_EMB_DIM,
            'DEC_EMB_DIM': DEC_EMB_DIM,
            'ENC_HIDDEN_DIM': ENC_HIDDEN_DIM,
            'DEC_HIDDEN_DIM': DEC_HIDDEN_DIM,
            'ENC_LAYERS': ENC_LAYERS,
            'DEC_LAYERS': DEC_LAYERS,
            'ENC_DROPOUT': ENC_DROPOUT,
            'DEC_DROPOUT': DEC_DROPOUT
        }
    }, 'translation_model_complete.pt')
    print("Model saved as 'translation_model_complete.pt'")

The model has 40,149,848 trainable parameters
Starting training...
Epoch: 1/20


Training: 100%|██████████| 456/456 [07:27<00:00,  1.02it/s]
Evaluating: 100%|██████████| 46/46 [00:18<00:00,  2.55it/s]


Retained 500 of 500 examples after length filtering


Calculating BLEU: 100%|██████████| 8/8 [00:16<00:00,  2.11s/it]


BLEU Score: 8.76
Model saved with BLEU: 8.76
Train Loss: 5.4909 | Val. Loss: 5.5241
Epoch: 2/20


Training: 100%|██████████| 456/456 [07:30<00:00,  1.01it/s]
Evaluating: 100%|██████████| 46/46 [00:17<00:00,  2.63it/s]


Model saved with loss: 5.1866
Train Loss: 4.8520 | Val. Loss: 5.1866
Epoch: 3/20


Training: 100%|██████████| 456/456 [07:29<00:00,  1.01it/s]
Evaluating: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s]


Model saved with loss: 5.0397
Train Loss: 4.4007 | Val. Loss: 5.0397
Epoch: 4/20


Training: 100%|██████████| 456/456 [07:33<00:00,  1.01it/s]
Evaluating: 100%|██████████| 46/46 [00:17<00:00,  2.60it/s]


Retained 500 of 500 examples after length filtering


Calculating BLEU: 100%|██████████| 8/8 [00:15<00:00,  2.00s/it]


BLEU Score: 15.42
Model saved with BLEU: 15.42
Train Loss: 4.0427 | Val. Loss: 4.9944
Epoch: 5/20


Training: 100%|██████████| 456/456 [07:29<00:00,  1.01it/s]
Evaluating: 100%|██████████| 46/46 [00:17<00:00,  2.60it/s]


Model saved with loss: 5.0298
Train Loss: 3.7239 | Val. Loss: 5.0298
Epoch: 6/20


Training: 100%|██████████| 456/456 [07:32<00:00,  1.01it/s]
Evaluating: 100%|██████████| 46/46 [00:17<00:00,  2.60it/s]


Train Loss: 3.4532 | Val. Loss: 5.0555
Epoch: 7/20


Training: 100%|██████████| 456/456 [07:30<00:00,  1.01it/s]
Evaluating: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s]


Retained 500 of 500 examples after length filtering


Calculating BLEU: 100%|██████████| 8/8 [00:16<00:00,  2.07s/it]


BLEU Score: 16.69
Model saved with BLEU: 16.69
Train Loss: 3.1829 | Val. Loss: 5.1463
Epoch: 8/20


Training: 100%|██████████| 456/456 [07:31<00:00,  1.01it/s]
Evaluating: 100%|██████████| 46/46 [00:17<00:00,  2.63it/s]


Train Loss: 2.8289 | Val. Loss: 5.1785
Epoch: 9/20


Training: 100%|██████████| 456/456 [07:29<00:00,  1.01it/s]
Evaluating: 100%|██████████| 46/46 [00:17<00:00,  2.58it/s]


Train Loss: 2.6342 | Val. Loss: 5.3020
Epoch: 10/20


Training: 100%|██████████| 456/456 [07:30<00:00,  1.01it/s]
Evaluating: 100%|██████████| 46/46 [00:17<00:00,  2.63it/s]


Retained 500 of 500 examples after length filtering


Calculating BLEU: 100%|██████████| 8/8 [00:16<00:00,  2.08s/it]


BLEU Score: 16.31
Train Loss: 2.5149 | Val. Loss: 5.3401
Early stopping at epoch 10
Loaded best model with BLEU score: 16.69


Calculating BLEU: 100%|██████████| 46/46 [01:43<00:00,  2.26s/it]


Validation BLEU score: 16.14

Sample translations:
Source: This is a wonderful day to test translation.
Translated: das ist eine <UNK> tag , <UNK> .
Source: I love learning about natural language processing.
Translated: ich liebe ich sehr <UNK> <UNK> sprache <UNK> .
Source: The book is on the table.
Translated: der buch steht auf tische .
Source: She walked to the store yesterday.
Translated: sie ging zu , , die <UNK> .
Source: Can you help me translate this sentence?
Translated: ihr ihr mich dieser <UNK> <UNK> ?
Model saved as 'translation_model_complete.pt'
