In [3]:
# %% [code] Cell 1: Imports, Setup, and Download Dataset Using Kaggle API
import math
import random
import time
import os
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

import kagglehub

# Set the dataset path (adjust this path if needed)
dataset_path = "/Users/amolgaur/.cache/kagglehub/datasets/mohamedlotfy50/wmt-2014-english-german/versions/1"
print("Dataset downloaded to:", dataset_path)

# Set seed for reproducibility
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

%matplotlib inline


Dataset downloaded to: /Users/amolgaur/.cache/kagglehub/datasets/mohamedlotfy50/wmt-2014-english-german/versions/1
Using device: cpu


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# %% [code] Cell 2: Load spaCy Models and Define Tokenizers
import spacy

# Ensure you have installed spaCy and downloaded the German and English models:
# pip install spacy
# python3 -m spacy download de_core_news_sm
# python3 -m spacy download en_core_web_sm

spacy_de = spacy.load("de_core_news_sm")
spacy_en = spacy.load("en_core_web_sm")

def tokenize_de(text):
    if not isinstance(text, str):
        return []  # Return empty list for non-string inputs
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    if not isinstance(text, str):
        return []
    return [tok.text for tok in spacy_en.tokenizer(text)]



In [5]:
# %% [code] Cell 3: Load the WMT-2014 CSV Files (Columns: de,en)
import pandas as pd

def load_translation_pairs(split="train"):
    # Files are named like: wmt14_translate_de-en_train.csv, etc.
    filename = f"wmt14_translate_de-en_{split}.csv"
    csv_file = os.path.join(dataset_path, filename)
    # Use the python engine and skip bad lines
    df = pd.read_csv(csv_file, engine="python", on_bad_lines="skip")
    # Expect columns "de" and "en" (if your CSV header is exactly "de,en")
    pairs = df.to_dict(orient="records")
    return pairs

train_pairs = load_translation_pairs("train")
valid_pairs = load_translation_pairs("validation")
test_pairs  = load_translation_pairs("test")
print("Number of training pairs:", len(train_pairs))


Number of training pairs: 4509785


In [6]:
# %% [code] Cell 4: Build Vocabularies from the Dataset (Updated)
from collections import Counter

SPECIALS = ["<sos>", "<eos>", "<pad>"]

def build_vocab(tokenizer, pairs, key, specials=SPECIALS, max_size=10000):
    counter = Counter()
    for pair in pairs:
        text = pair.get(key)
        # Skip if text is not a string (or is NaN)
        if not isinstance(text, str):
            continue  
        tokens = tokenizer(text)
        counter.update(tokens)
    # Select the most common tokens, leaving space for special tokens
    common_tokens = [token for token, _ in counter.most_common(max_size - len(specials))]
    # Initialize vocab with special tokens
    vocab = {token: idx for idx, token in enumerate(specials)}
    for token in common_tokens:
        vocab[token] = len(vocab)
    # Build inverse vocabulary mapping (index → token)
    inv_vocab = {idx: token for token, idx in vocab.items()}
    return vocab, inv_vocab

# For translation from German (source) to English (target)
vocab_src, inv_vocab_src = build_vocab(tokenize_de, train_pairs, key="de", specials=SPECIALS, max_size=10000)
vocab_trg, inv_vocab_trg = build_vocab(tokenize_en, train_pairs, key="en", specials=SPECIALS, max_size=10000)

print("German vocab size (source):", len(vocab_src))
print("English vocab size (target):", len(vocab_trg))


German vocab size (source): 10000
English vocab size (target): 10000


In [7]:
# %% [code] Cell 5: Data Processing and Collate Function
def process_example(example):
    if example.get("de") is None or example.get("en") is None:
        return None  # Skip this example
    # For translation from German (source) to English (target)
    src_tokens = ["<sos>"] + tokenize_de(example["de"]) + ["<eos>"]
    trg_tokens = ["<sos>"] + tokenize_en(example["en"]) + ["<eos>"]
    src_indices = [vocab_src.get(token, vocab_src["<pad>"]) for token in src_tokens]
    trg_indices = [vocab_trg.get(token, vocab_trg["<pad>"]) for token in trg_tokens]
    src_tensor = torch.tensor(src_indices, dtype=torch.long)
    trg_tensor = torch.tensor(trg_indices, dtype=torch.long)
    return src_tensor, trg_tensor

def process_split(pairs):
    processed = [process_example(item) for item in pairs]
    return [item for item in processed if item is not None]


train_data = process_split(train_pairs)
valid_data = process_split(valid_pairs)
test_data  = process_split(test_pairs)

# %% [code] New Cell: Subsample the Dataset for Memory Efficiency
# For early experimentation, only use a subset of the full dataset.
subsample_size = 100000  # adjust as needed
train_data = train_data[:subsample_size]
valid_data = valid_data[:5000]  # or an appropriate size for validation
test_data  = test_data[:5000]
print("Subsampled training pairs:", len(train_data))


from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=vocab_src["<pad>"], batch_first=True)
    trg_batch = pad_sequence(trg_batch, padding_value=vocab_trg["<pad>"], batch_first=True)
    return src_batch, trg_batch

from torch.utils.data import DataLoader

BATCH_SIZE = 128
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(test_data,  batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)


Subsampled training pairs: 100000


In [8]:
# %% [code] Cell 6.1: Positional Encoding Module (Reused)
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]


In [9]:
# %% [code] Cell 7: Define the Convolutional Encoder
class ConvEncoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, kernel_size=3, dropout=0.1):
        super(ConvEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.pos_embedding = PositionalEncoding(embed_size)
        self.fc = nn.Linear(embed_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.kernel_size = kernel_size
        self.num_layers = num_layers
        self.conv_layers = nn.ModuleList([
            nn.Conv1d(in_channels=hidden_size, out_channels=hidden_size * 2,
                      kernel_size=kernel_size, padding=(kernel_size - 1))
            for _ in range(num_layers)
        ])
    def forward(self, src):
        # src: (batch_size, src_seq_len)
        emb = self.embedding(src)               # (batch_size, src_seq_len, embed_size)
        emb = self.pos_embedding(emb)
        emb = self.fc(emb)                      # (batch_size, src_seq_len, hidden_size)
        emb = self.dropout(emb)
        conv_input = emb.transpose(1, 2)        # (batch_size, hidden_size, src_seq_len)
        for conv in self.conv_layers:
            conv_out = conv(conv_input)
            conv_out = conv_out[:, :, :conv_input.size(2)]
            glu_out = F.glu(conv_out, dim=1)     # (batch_size, hidden_size, src_seq_len)
            conv_input = (glu_out + conv_input) * math.sqrt(0.5)
        encoder_outputs = conv_input.transpose(1, 2)  # (batch_size, src_seq_len, hidden_size)
        return encoder_outputs

# Quick test
src_batch, _ = next(iter(train_loader))
encoder = ConvEncoder(len(vocab_src), embed_size=256, hidden_size=256, num_layers=4, kernel_size=3, dropout=0.1).to(device)
enc_out = encoder(src_batch.to(device))
print("Encoder output shape:", enc_out.shape)


Encoder output shape: torch.Size([128, 112, 256])


In [10]:
# %% [code] Cell 8: Define the Convolutional Decoder with Attention
class ConvDecoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, kernel_size=3, dropout=0.1):
        super(ConvDecoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.pos_embedding = PositionalEncoding(embed_size)
        self.fc = nn.Linear(embed_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.kernel_size = kernel_size
        self.num_layers = num_layers
        self.conv_layers = nn.ModuleList([
            nn.Conv1d(in_channels=hidden_size, out_channels=hidden_size * 2,
                      kernel_size=kernel_size, padding=(kernel_size - 1))
            for _ in range(num_layers)
        ])
        self.attn_linear = nn.Linear(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, vocab_size)
    def forward(self, tgt, encoder_outputs):
        # tgt: (batch_size, tgt_seq_len)
        emb = self.embedding(tgt)               # (batch_size, tgt_seq_len, embed_size)
        emb = self.pos_embedding(emb)
        emb = self.fc(emb)                      # (batch_size, tgt_seq_len, hidden_size)
        emb = self.dropout(emb)
        conv_input = emb.transpose(1, 2)        # (batch_size, hidden_size, tgt_seq_len)
        for conv in self.conv_layers:
            conv_out = conv(conv_input)
            conv_out = conv_out[:, :, :conv_input.size(2)]
            glu_out = F.glu(conv_out, dim=1)
            conv_input = (glu_out + conv_input) * math.sqrt(0.5)
        conv_output = conv_input.transpose(1, 2) # (batch_size, tgt_seq_len, hidden_size)
        queries = self.attn_linear(conv_output)   # (batch_size, tgt_seq_len, hidden_size)
        attn_scores = torch.bmm(queries, encoder_outputs.transpose(1,2))  # (batch_size, tgt_seq_len, src_seq_len)
        attn_weights = F.softmax(attn_scores, dim=-1)
        context = torch.bmm(attn_weights, encoder_outputs)  # (batch_size, tgt_seq_len, hidden_size)
        combined = conv_output + context
        output = self.out(combined)  # (batch_size, tgt_seq_len, vocab_size)
        return output, attn_weights

# Quick test
_, trg_batch = next(iter(train_loader))
decoder = ConvDecoder(len(vocab_trg), embed_size=256, hidden_size=256, num_layers=4, kernel_size=3, dropout=0.1).to(device)
dec_out, attn_w = decoder(trg_batch.to(device), enc_out)
print("Decoder output shape:", dec_out.shape)
print("Attention weights shape:", attn_w.shape)


Decoder output shape: torch.Size([128, 73, 10000])
Attention weights shape: torch.Size([128, 73, 112])


In [11]:
# %% [code] Cell 9: Define the Full ConvSeq2Seq Model
class ConvSeq2Seq(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_size, hidden_size, num_layers, kernel_size=3, dropout=0.1):
        super(ConvSeq2Seq, self).__init__()
        self.encoder = ConvEncoder(src_vocab_size, embed_size, hidden_size, num_layers, kernel_size, dropout)
        self.decoder = ConvDecoder(tgt_vocab_size, embed_size, hidden_size, num_layers, kernel_size, dropout)
    def forward(self, src, tgt):
        encoder_outputs = self.encoder(src)
        decoder_outputs, attn_weights = self.decoder(tgt, encoder_outputs)
        return decoder_outputs, attn_weights

INPUT_DIM = len(vocab_src)
OUTPUT_DIM = len(vocab_trg)
model = ConvSeq2Seq(INPUT_DIM, OUTPUT_DIM, embed_size=256, hidden_size=256, num_layers=4, kernel_size=3, dropout=0.1).to(device)
print(model)


ConvSeq2Seq(
  (encoder): ConvEncoder(
    (embedding): Embedding(10000, 256)
    (pos_embedding): PositionalEncoding()
    (fc): Linear(in_features=256, out_features=256, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (conv_layers): ModuleList(
      (0-3): 4 x Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(2,))
    )
  )
  (decoder): ConvDecoder(
    (embedding): Embedding(10000, 256)
    (pos_embedding): PositionalEncoding()
    (fc): Linear(in_features=256, out_features=256, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (conv_layers): ModuleList(
      (0-3): 4 x Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(2,))
    )
    (attn_linear): Linear(in_features=256, out_features=256, bias=True)
    (out): Linear(in_features=256, out_features=10000, bias=True)
  )
)


In [12]:
# %% [code] Cell 10: Define Optimizer and Loss Function
optimizer = optim.Adam(model.parameters(), lr=0.001)
PAD_IDX = vocab_trg["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)


In [15]:
# %% [code] Cell 11: Define Training and Evaluation Functions
def train_epoch(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for src, trg in iterator:
        optimizer.zero_grad()
        # Teacher forcing: input to decoder is trg[:, :-1], target is trg[:, 1:]
        output, _ = model(src, trg[:, :-1])
        output = output.contiguous().view(-1, output.shape[-1])
        trg_target = trg[:, 1:].contiguous().view(-1)
        loss = criterion(output, trg_target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def evaluate_epoch(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, trg in iterator:
            output, _ = model(src, trg[:, :-1])
            output = output.contiguous().view(-1, output.shape[-1])
            trg_target = trg[:, 1:].contiguous().view(-1)
            loss = criterion(output, trg_target)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def epoch_time(start_time, end_time):
    elapsed = end_time - start_time
    mins = int(elapsed / 60)
    secs = int(elapsed - mins * 60)
    return mins, secs


In [29]:
# %% [code] Cell 12: Full Training Loop
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    valid_loss = evaluate_epoch(model, valid_loader, criterion)
    
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'convseq2seq-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')



Epoch: 01 | Time: 66m 22s
	Train Loss: 4.595
	 Val. Loss: 4.689
Epoch: 02 | Time: 65m 36s
	Train Loss: 4.303
	 Val. Loss: 4.544
Epoch: 03 | Time: 65m 15s
	Train Loss: 4.108
	 Val. Loss: 4.432
Epoch: 04 | Time: 65m 10s
	Train Loss: 3.973
	 Val. Loss: 4.385
Epoch: 05 | Time: 64m 55s
	Train Loss: 3.875
	 Val. Loss: 4.340
Epoch: 06 | Time: 64m 50s
	Train Loss: 3.797
	 Val. Loss: 4.301
Epoch: 07 | Time: 64m 45s
	Train Loss: 3.733
	 Val. Loss: 4.267
Epoch: 08 | Time: 64m 40s
	Train Loss: 3.681
	 Val. Loss: 4.240
Epoch: 09 | Time: 64m 35s
	Train Loss: 3.642
	 Val. Loss: 4.218
Epoch: 10 | Time: 64m 30s
	Train Loss: 3.613
	 Val. Loss: 4.200


In [28]:
# %% [code] Cell 13: Greedy Decoding for Inference
def translate_sentence(sentence, src_vocab, trg_vocab, model, device, max_len=50):
    model.eval()
    # Tokenize the input sentence using the German tokenizer
    tokens = [token.lower() for token in tokenize_de(sentence)]
    # Add start and end tokens
    tokens = ["<sos>"] + tokens + ["<eos>"]
    # Convert tokens to indices (fallback to <pad> if token not found)
    src_indices = [src_vocab.get(token, src_vocab["<pad>"]) for token in tokens]
    # Create tensor and add batch dimension
    src_tensor = torch.LongTensor(src_indices).unsqueeze(0).to(device)  # shape: (1, src_seq_len)
    
    with torch.no_grad():
        encoder_outputs = model.encoder(src_tensor)
    
    # Initialize target sequence with <sos>
    trg_indices = [trg_vocab["<sos>"]]
    for i in range(max_len - 1):
        trg_tensor = torch.LongTensor(trg_indices).unsqueeze(0).to(device)  # shape: (1, current_len)
        with torch.no_grad():
            output, _ = model.decoder(trg_tensor, encoder_outputs)
        # Get the token with the highest probability from the last time step
        next_token = output[:, -1, :].argmax(dim=-1).item()
        trg_indices.append(next_token)
        if next_token == trg_vocab["<eos>"]:
            break
    # Convert indices back to tokens using the inverse target vocabulary
    trg_tokens = [inv_vocab_trg[i] for i in trg_indices]
    # Return the decoded tokens, excluding the initial <sos>
    return trg_tokens[1:]

# Test translation on a sample German sentence
example_sentence = "Ein kleines Haus mit einem Garten ."
translation = translate_sentence(example_sentence, vocab_src, vocab_trg, model, device)
print("Translated:", " ".join(translation))


Translated: A small house with a garden . <eos>
