# Training an EN→ES Transformer (Europarl)
A complete notebook: data loading, BPE tokenization, DataLoaders, training, and greedy decoding.

In [None]:
# Install dependencies
%pip install -q datasets tokenizers sacrebleu sentencepiece tqdm torch torchvision torchaudio



Note: you may need to restart the kernel to use updated packages.


In [33]:
import random
from pathlib import Path

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.processors import TemplateProcessing

from transformer import Transformer
from config import TransformerConfig
from masks import create_padding_mask, create_decoder_mask



In [48]:
# General config
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

MAX_LEN = 128
BATCH_SIZE = 64
EPOCHS = 3
LR = 3e-4
VOCAB_SIZE = 16000
MAX_TRAIN_SAMPLES = 50000  # reduce para experimentos rápidos; usa None para todo el corpus
VAL_FRACTION = 0.01
tokenizer_path = Path("bpe_enes.json")



Device: cpu


In [35]:
# 1) Load dataset
raw_ds = load_dataset("Helsinki-NLP/europarl", "en-es")
train_valid = raw_ds["train"].train_test_split(test_size=VAL_FRACTION, seed=SEED)

if MAX_TRAIN_SAMPLES is not None:
    max_n = min(MAX_TRAIN_SAMPLES, len(train_valid["train"]))
    train_valid["train"] = train_valid["train"].shuffle(seed=SEED).select(range(max_n))

print(train_valid)



DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 20091
    })
})


## BPE Tokenizer
We train (or load) a Byte-Pair Encoding tokenizer with the special tokens `<pad>`, `<s>`, `</s>`, `<unk>`. It is saved as `bpe_enes.json` for reuse.

In [36]:
SPECIAL_TOKENS = ["<pad>", "<s>", "</s>", "<unk>"]

if tokenizer_path.exists():
    tokenizer = Tokenizer.from_file(str(tokenizer_path))
else:
    tokenizer = Tokenizer(BPE(unk_token="<unk>"))
    tokenizer.pre_tokenizer = ByteLevel()
    trainer = BpeTrainer(
        vocab_size=VOCAB_SIZE,
        min_frequency=2,
        special_tokens=SPECIAL_TOKENS,
    )

    def text_iterator(ds):
        for ex in ds:
            yield ex["translation"]["en"]
            yield ex["translation"]["es"]

    tokenizer.train_from_iterator(text_iterator(train_valid["train"]), trainer=trainer)
    tokenizer.decoder = ByteLevelDecoder()
    tokenizer.post_processor = TemplateProcessing(
        single="<s> $A </s>",
        pair="<s> $A </s> </s> $B </s>",
        special_tokens=[("<s>", tokenizer.token_to_id("<s>")), ("</s>", tokenizer.token_to_id("</s>"))],
    )
    tokenizer.save(str(tokenizer_path))

# Aseguramos decoder/post-procesador también cuando cargamos el fichero
tokenizer.decoder = ByteLevelDecoder()
tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> </s> $B </s>",
    special_tokens=[("<s>", tokenizer.token_to_id("<s>")), ("</s>", tokenizer.token_to_id("</s>"))],
)

PAD_ID = tokenizer.token_to_id("<pad>")
BOS_ID = tokenizer.token_to_id("<s>")
EOS_ID = tokenizer.token_to_id("</s>")
VOCAB_SIZE = tokenizer.get_vocab_size()

print("Vocab size:", VOCAB_SIZE)
print("PAD/BOS/EOS:", PAD_ID, BOS_ID, EOS_ID)



Vocab size: 16000
PAD/BOS/EOS: 0 1 2


## Preprocessing and DataLoaders
We encode each EN→ES sentence pair into IDs, trim sequences to `MAX_LEN`, and construct tensors with dynamic padding.

In [37]:
def encode_example(example):
    en_text = example["translation"]["en"]
    es_text = example["translation"]["es"]

    src_ids = tokenizer.encode(en_text).ids[: MAX_LEN - 1] + [EOS_ID]
    tgt_ids = tokenizer.encode(es_text).ids[: MAX_LEN - 2]
    tgt_in = [BOS_ID] + tgt_ids
    tgt_out = tgt_ids + [EOS_ID]
    return {
        "src": src_ids,
        "tgt_in": tgt_in,
        "tgt_out": tgt_out,
    }

processed = train_valid.map(
    encode_example,
    remove_columns=["translation"],
    num_proc=4,
).with_format("python")

print(processed)



DatasetDict({
    train: Dataset({
        features: ['src', 'tgt_in', 'tgt_out'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['src', 'tgt_in', 'tgt_out'],
        num_rows: 20091
    })
})


In [38]:
def pad_sequences(seqs, pad_id=PAD_ID):
    max_len = max(len(s) for s in seqs)
    return torch.tensor([s + [pad_id] * (max_len - len(s)) for s in seqs], dtype=torch.long)

def collate_fn(batch):
    return {
        "src": pad_sequences([b["src"] for b in batch]),
        "tgt_in": pad_sequences([b["tgt_in"] for b in batch]),
        "tgt_out": pad_sequences([b["tgt_out"] for b in batch]),
    }

train_loader = DataLoader(
    processed["train"],
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    processed["test"],
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

batch = next(iter(train_loader))
for key, tensor in batch.items():
    print(key, tensor.shape)



src torch.Size([64, 79])
tgt_in torch.Size([64, 86])
tgt_out torch.Size([64, 86])


## Transformer Model

We instantiate the transformer defined in this repository and configure the optimizer and loss function with ignore_index=PAD_ID.

In [39]:
config = TransformerConfig(
    vocab_size=VOCAB_SIZE,
    d_model=256,
    n_heads=4,
    num_encoder_layers=4,
    num_decoder_layers=4,
    d_ff=1024,
    dropout=0.1,
    pad_id=PAD_ID,
    max_len=MAX_LEN,
)

model = Transformer(config).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR, betas=(0.9, 0.98))
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_ID)



## Training

Simple training/validation loop with gradient clipping and checkpoint saving.

In [49]:
from tqdm.auto import tqdm

def run_epoch(loader, train=True):
    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    steps = 0
    for batch in tqdm(loader, leave=False):
        src = batch["src"].to(device)
        tgt_in = batch["tgt_in"].to(device)
        tgt_out = batch["tgt_out"].to(device)

        if train:
            optimizer.zero_grad()

        with torch.set_grad_enabled(train):
            logits = model(src, tgt_in)
            loss = criterion(logits.view(-1, VOCAB_SIZE), tgt_out.view(-1))

        if train:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        total_loss += loss.item()
        steps += 1

    return total_loss / max(steps, 1)

train_history, val_history = [], []
for epoch in range(1, EPOCHS + 1):
    train_loss = run_epoch(train_loader, train=True)
    val_loss = run_epoch(val_loader, train=False)
    train_history.append(train_loss)
    val_history.append(val_loss)
    print(f"Epoch {epoch}: train_loss={train_loss:.4f} val_loss={val_loss:.4f}")

torch.save(model.state_dict(), "transformer_europarl.pt")
print("Modelo guardado en transformer_europarl.pt")



                                                 

Epoch 1: train_loss=3.3602 val_loss=3.4819


                                                 

Epoch 2: train_loss=3.3129 val_loss=3.3622


                                                 

Epoch 3: train_loss=3.1739 val_loss=3.3077
Modelo guardado en transformer_europarl.pt




## Greedy decoding
Step-by-step translation using the encoder+decoder part of the current model.

In [46]:
model.load_state_dict(torch.load("transformer_europarl_1.pt"))

def translate_sentence(model, text, max_new_tokens=MAX_LEN):
    model.eval()
    with torch.no_grad():
        src_ids = tokenizer.encode(text).ids[: MAX_LEN - 1] + [EOS_ID]
        src = torch.tensor(src_ids, device=device).unsqueeze(0)
        src_mask = create_padding_mask(src, PAD_ID)
        memory = model.encode(src, src_mask)

        ys = torch.tensor([[BOS_ID]], device=device)
        for step in range(max_new_tokens):
            tgt_mask = create_decoder_mask(ys, PAD_ID, device=device)
            out = model.decode(ys, memory, tgt_mask, src_mask)
            logits = model.generator(out[:, -1])
            # Evita que el modelo genere PAD/BOS como token siguiente;
            # y fuerza al menos 1-2 tokens antes de permitir EOS.
            logits[:, [PAD_ID, BOS_ID]] = -1e9
            if step < 1:
                logits[:, EOS_ID] = -1e9
            next_id = int(logits.argmax(dim=-1).item())
            ys = torch.cat([ys, torch.tensor([[next_id]], device=device)], dim=1)
            if next_id == EOS_ID:
                break

    decoded_ids = [i for i in ys[0, 1:].tolist() if i not in {PAD_ID, BOS_ID, EOS_ID}]
    if decoded_ids:
        return tokenizer.decode(decoded_ids, skip_special_tokens=True).strip()
    # Fallback: decodifica todo salvo BOS y PAD por si solo hubo EOS temprano
    fallback_ids = [i for i in ys[0, 1:].tolist() if i not in {PAD_ID, BOS_ID}]
    return tokenizer.decode(fallback_ids, skip_special_tokens=True).strip()

samples = [train_valid["test"][i]["translation"]["en"] for i in range(3)]
for s in samples:
    print("EN:", s)
    pred = translate_sentence(model, s)
    print("ES pred:", pred if pred else "<vacío>")
    print()


EN: That is what I wanted to say in conclusion.
ES pred: Eso es lo que he dicho.

EN: End of quote.
ES pred: Las empresas.

EN: Young people are unquestionably a key resource in whom we must invest in order to revitalise the European Union economy as a whole.
ES pred: Los ciudadanos de la Unión Europea deben ser un gran número de personas.



In [47]:
model.load_state_dict(torch.load("transformer_europarl_2.pt"))

def translate_sentence(model, text, max_new_tokens=MAX_LEN):
    model.eval()
    with torch.no_grad():
        src_ids = tokenizer.encode(text).ids[: MAX_LEN - 1] + [EOS_ID]
        src = torch.tensor(src_ids, device=device).unsqueeze(0)
        src_mask = create_padding_mask(src, PAD_ID)
        memory = model.encode(src, src_mask)

        ys = torch.tensor([[BOS_ID]], device=device)
        for step in range(max_new_tokens):
            tgt_mask = create_decoder_mask(ys, PAD_ID, device=device)
            out = model.decode(ys, memory, tgt_mask, src_mask)
            logits = model.generator(out[:, -1])
            # Evita que el modelo genere PAD/BOS como token siguiente;
            # y fuerza al menos 1-2 tokens antes de permitir EOS.
            logits[:, [PAD_ID, BOS_ID]] = -1e9
            if step < 1:
                logits[:, EOS_ID] = -1e9
            next_id = int(logits.argmax(dim=-1).item())
            ys = torch.cat([ys, torch.tensor([[next_id]], device=device)], dim=1)
            if next_id == EOS_ID:
                break

    decoded_ids = [i for i in ys[0, 1:].tolist() if i not in {PAD_ID, BOS_ID, EOS_ID}]
    if decoded_ids:
        return tokenizer.decode(decoded_ids, skip_special_tokens=True).strip()
    # Fallback: decodifica todo salvo BOS y PAD por si solo hubo EOS temprano
    fallback_ids = [i for i in ys[0, 1:].tolist() if i not in {PAD_ID, BOS_ID}]
    return tokenizer.decode(fallback_ids, skip_special_tokens=True).strip()

samples = [train_valid["test"][i]["translation"]["en"] for i in range(3)]
for s in samples:
    print("EN:", s)
    pred = translate_sentence(model, s)
    print("ES pred:", pred if pred else "<vacío>")
    print()


EN: That is what I wanted to say in conclusion.
ES pred: Eso es lo que me parece.

EN: End of quote.
ES pred: Las empresas.

EN: Young people are unquestionably a key resource in whom we must invest in order to revitalise the European Union economy as a whole.
ES pred: La Unión Europea debe ser una economía europea en el ámbito de la economía.

