In [None]:
from data.load_data import * 
from data.evaluate_data import *
from model.transformer import * 
from model.training.training_loop import *
from model.transformer_utils import *

In [None]:
# Dataset / tokenizer settings for the EN -> ES OPUS-100 translation task.
mt_data_config = dict(
    dataset_name="Helsinki-NLP/opus-100",
    config_name="en-es",
    src_lang="en",
    tgt_lang="es",
    vocab_size=16000,
    max_len=128,
    train_subset=50000,
    batch_size=64,
)

# Returns the tokenizer, the per-split loaders, an info object, and the raw and
# tokenized datasets (names taken from how the results are used further down).
sp, loaders, info, ds_raw, ds_tok = build_mt_dataloaders(**mt_data_config)

# Pull the individual split loaders out of the mapping by split name.
train_loader, val_loader, test_loader = (
    loaders[split_name] for split_name in ("train", "validation", "test")
)

In [None]:

# Sanity-check the loaded data; per the recorded output below this prints the
# corpus split sizes, the training subset used, an example pair, and average lengths.
# NOTE(review): subset_size presumably has to match the train_subset=50000 passed
# to build_mt_dataloaders — confirm, and keep the two values in sync.
run_all_checks(ds_raw, ds_tok, sp, loaders, subset_size=50000)

Tamaño del corpus OPUS-100 (EN↔ES):
  test         → 2,000 pares de oraciones
  train        → 1,000,000 pares de oraciones
  validation   → 2,000 pares de oraciones

Subconjunto de entrenamiento usado: 50,000 pares (≈5 %)

Ejemplo de par (en → es):
EN: What measures exist to prevent their forgery etc?
ES: ¿Qué medidas existen para evitar su falsificación, etc.?

Longitud promedio (EN): 6.2 palabras
Longitud promedio (ES): 6.3 palabras


## Definir modelo y entrenar


In [None]:
# --- Transformer hyperparameters -------------------------------------------
# Size of the sub-word vocabulary produced by the SentencePiece model `sp`.
vocab_size = sp.get_piece_size()
# Dimension of the embedding space.
d_model = 256
# Number of stacked encoder blocks and stacked decoder blocks.
num_layers = 4
# Number of attention heads per block.
num_heads = 4
# Width of the feed-forward network inside each layer
# (the two linear layers that follow attention).
d_ff = 1024
# Maximum sequence length for which positional encodings are precomputed.
max_pos = 256

# Source and target share one vocabulary size and one positional-encoding limit.
model = Transformer(
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    d_model=d_model,
    d_ff=d_ff,
    num_layers=num_layers,
    num_heads=num_heads,
    max_pos_src=max_pos,
    max_pos_tgt=max_pos,
    dropout=0.1,
    layernorm_eps=1e-6,
    weight_tying=True,
)

# Parameter report: overall counts, approximate storage size, and breakdowns.
total_trainable, total_all = count_params(model)
print(f"Total trainable params : {total_trainable:,}")
print(f"Total (incl. frozen)   : {total_all:,}")

# Size the model from the full parameter count: frozen parameters still have to
# be stored, so using only the trainable count would under-report the size as
# soon as any layer is frozen (4 bytes per FP32 value, 2 bytes per FP16 value).
print("Approx model size (FP32):", bytes_human(total_all * 4))
print("Approx model size (FP16):", bytes_human(total_all * 2))

print("\nTop-10 submodules by params:")
for name, n in breakdown_by_child(model, topk=10):
    print(f"  {name:30s} {n:>12,}")

print("\nBy module type (direct params only):")
for t, n in breakdown_by_type(model):
    # Skip module types that own no parameters directly.
    if n > 0:
        print(f"  {t:20s} {n:>12,}")

# NOTE(review): in the recorded output the per-prefix totals are larger than the
# whole model's parameter count, so breakdown_by_prefix looks like it counts
# some parameters more than once — confirm in its implementation.
print("\nBy common prefixes (if present):")
for pref, n in breakdown_by_prefix(model):
    print(f"  {pref:20s} {n:>12,}")

Total trainable params : 15,580,800
Total (incl. frozen)   : 15,580,800
Approx model size (FP32): 59.4 MB
Approx model size (FP16): 29.7 MB

Top-10 submodules by params:
  decoder                           8,309,760
  encoder                           7,255,040
  final_linear                      4,112,000

By module type (direct params only):
  Linear                 11,474,560
  Embedding               8,192,000
  LayerNorm                  10,240

By common prefixes (if present):
  decoder                29,254,656
  encoder                23,983,104


In [None]:
# Train the model on the training split only; validation is switched off here
# (val_loader=None), which matches the "noeval" checkpoint file name.
history = train_transformer_mt(
    model=model,
    train_loader=train_loader,
    val_loader=None,
    # Reuse the d_model the model was built with instead of repeating the
    # literal, so changing the hyperparameter cannot leave this call stale.
    d_model=d_model,
    epochs=70,
    base_lr=1.0,
    # About 2x steps_per_epoch: 50k sentences / batch 64 is ~782 steps per epoch.
    warmup=1600,
    label_smoothing=0.1,
    grad_clip=1.0,
    # NOTE(review): hard-coded to CUDA; presumably this cell cannot run on a
    # CPU-only machine — confirm whether a fallback is wanted.
    device="cuda",
    ckpt_path="transformer_noeval.pt",
    log_every=300,
    preview_every=300,
    id2tok_fn=sp.decode_ids,
)

## See the **training Logs.txt** files for the history


In [None]:
import torch

# Save the trained weights together with the epoch count they correspond to.
# The epoch value must stay in sync with `epochs=70` passed to
# train_transformer_mt above; the previous hard-coded 30 did not match it.
# NOTE(review): if training was deliberately stopped at epoch 30, record that
# actual value instead.
torch.save(
    {
        'epoch': 70,
        'model_state_dict': model.state_dict(),
    },
    'checkpoint.pth',
)