In [None]:
from data.load_cifrar100 import *


# Build the CIFAR-100 dataloaders. The call returns three loaders, unpacked
# here as train / validation / test. All arguments are passed by keyword.
train_loader, val_loader, test_loader = get_cifar100_dataloaders(
    data_dir="./data",
    img_size=32,
    batch_size=64,
    num_workers=4,
    val_split=0.1,
    seed=77,
)


---

In [None]:
import torch

from src.Model_A_OutGridNet import *
from src.stage_config import *
from src.training.one_epoch_train import *
from src.training.train_full_model import *

# ---------------------------------------------------------
# MODEL 2: NO OUTLOOKER (Grid + MBConv only)
# ---------------------------------------------------------
def configs_no_outlooker():
    """Return the four stage configs for the "no Outlooker" ablation.

    Every stage is built with ``outlook_heads=0`` and ``drop_path=0.0``;
    ``dim``, ``depth``, ``num_heads`` and ``grid_size`` vary per stage.

    Returns:
        list: four ``StageCfg`` objects, in stage order.
    """
    # One row per stage: (dim, depth, num_heads, grid_size).
    layout = (
        (64, 2, 2, 8),
        (128, 2, 4, 8),
        (256, 3, 8, 4),
        (384, 1, 6, 2),
    )
    stage_cfgs = []
    for width, n_blocks, heads, grid in layout:
        stage_cfgs.append(
            StageCfg(
                dim=width,
                depth=n_blocks,
                num_heads=heads,
                grid_size=grid,
                outlook_heads=0,
                drop_path=0.0,
            )
        )
    return stage_cfgs

# ---------------------------------------------------------
# MODEL 3: NO GRID (Outlooker + MBConv only)
# ---------------------------------------------------------
def configs_no_grid():
    """Return the four stage configs for the "no Grid" ablation.

    Every stage is built with ``num_heads=0`` and ``drop_path=0.0``;
    ``dim``, ``depth``, ``grid_size`` and ``outlook_heads`` vary per stage.

    Returns:
        list: four ``StageCfg`` objects, in stage order.
    """
    # One row per stage: (dim, depth, grid_size, outlook_heads).
    rows = [
        (64, 2, 8, 2),
        (128, 2, 8, 4),
        (256, 3, 4, 8),
        (384, 1, 2, 6),
    ]
    return [
        StageCfg(
            dim=d,
            depth=n,
            num_heads=0,
            grid_size=g,
            outlook_heads=o,
            drop_path=0.0,
        )
        for d, n, g, o in rows
    ]

# ---------------------------------------------------------
# MODEL 4: PLAIN (MBConv only)
# ---------------------------------------------------------
def configs_plain():
    """Return the four stage configs for the plain (MBConv-only) baseline.

    Every stage is built with ``num_heads=0``, ``outlook_heads=0`` and
    ``drop_path=0.0``; only ``dim``, ``depth`` and ``grid_size`` vary.

    Returns:
        list: four ``StageCfg`` objects, in stage order.
    """
    # Keyword arguments that are identical for all four stages.
    common = dict(num_heads=0, outlook_heads=0, drop_path=0.0)
    return [
        StageCfg(dim=64, depth=2, grid_size=8, **common),
        StageCfg(dim=128, depth=2, grid_size=8, **common),
        StageCfg(dim=256, depth=3, grid_size=4, **common),
        StageCfg(dim=384, depth=1, grid_size=2, **common),
    ]

# Runs to train, in order. Each entry holds:
#   name      - run identifier (also used to build the checkpoint file names)
#   stages_fn - zero-argument function that builds the list of stage configs
#   dpr       - drop-path rate for the run
#   desc      - human-readable description printed when the run starts
experiments = [
    dict(
        name="No_Outlooker",
        stages_fn=configs_no_outlooker,
        dpr=0.05,
        desc="Ablation: Sin Outlooker",
    ),
    dict(
        name="No_Grid",
        stages_fn=configs_no_grid,
        dpr=0.05,
        desc="Ablation: Sin Grid",
    ),
    dict(
        name="Plain_MBConv",
        stages_fn=configs_plain,
        dpr=0.0,
        desc="Baseline: Solo Convolucional",
    ),
]

In [None]:
def count_trainable_parameters(model):
    """Count the scalar elements of all parameters that require gradients.

    Args:
        model: object exposing ``parameters()``; each yielded parameter must
            provide ``requires_grad`` and ``numel()``.

    Returns:
        int: sum of ``numel()`` over parameters with ``requires_grad`` set
        (0 when there are none).
    """
    total = 0
    for param in model.parameters():
        # Parameters that do not require gradients are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed applied to torch's RNG before every run so that each ablation builds
# its model from the same RNG state. Same value as the `seed` passed to
# get_cifar100_dataloaders.
EXPERIMENT_SEED = 77

# Mixed-precision settings do not change between runs, so compute them once:
# bf16 autocast with AMP on CUDA, plain fp32 otherwise.
on_cuda = device == "cuda"
autocast_dtype = "bf16" if on_cuda else "fp32"

# Results of every run keyed by experiment name, so a later run in the loop
# does not overwrite an earlier one.
# NOTE(review): assumes `history` / the evaluation result hold plain Python
# numbers; if they hold CUDA tensors, keeping them here retains GPU memory
# after free_all_cuda — confirm.
experiment_results = {}

# Training loop: one independent run per entry in `experiments`.
for exp in experiments:
    print(f"\n{'='*40}")
    print(f"Iniciando: {exp['name']} ({exp['desc']})")
    print(f"Drop Path Rate: {exp['dpr']}")
    print(f"{'='*40}")

    # Reset the RNG so weight initialisation is reproducible per run.
    torch.manual_seed(EXPERIMENT_SEED)

    # Instantiate a fresh model for this run.
    stages = exp["stages_fn"]()
    model = MaxOutNet(
        num_classes=100,
        stages=stages,
        stem_dim=64,
        dpr_max=exp["dpr"]).to(device)

    n_params = count_trainable_parameters(model)
    print(f"Trainable parameters: {n_params:,}")

    # Train for up to 100 epochs.
    history, _ = train_model(
        model=model,
        train_loader=train_loader,
        epochs=100,
        val_loader=val_loader,
        device=device,

        lr=5e-4,
        weight_decay=0.05,

        # Mixed precision
        autocast_dtype=autocast_dtype,
        use_amp=on_cuda,
        grad_clip_norm=1.0,

        warmup_ratio=0.05,
        min_lr=1e-6,

        label_smoothing=0.1,

        print_every=600,
        save_path=f"best_{exp['name']}.pt",
        last_path=f"last_{exp['name']}.pt",

        mix_prob=0.5,
        mixup_alpha=0.8,
        cutmix_alpha=1.0,

        num_classes=100,
        channels_last=True)

    print()
    print('Test del Modelo')
    # NOTE(review): this evaluates `model` as train_model left it; whether
    # that is the best or the last checkpoint depends on train_model — confirm.
    test_metrics = evaluate_one_epoch(model=model,dataloader=test_loader)

    # Keep this run's outputs before the next iteration rebinds the names.
    experiment_results[exp["name"]] = {
        "n_params": n_params,
        "history": history,
        "test": test_metrics,
    }

    free_all_cuda("model", "optimizer", "scaler", "scheduler", "batch", "loss", "outputs", "logits")


Iniciando: No_Outlooker (Ablation: Sin Outlooker)
Drop Path Rate: 0.05
Trainable parameters: 12,187,300
=== Run config ===
device=cuda | amp=True | autocast_dtype=bf16 | channels_last=True
epochs=100 | steps/epoch=704 | total_steps=70400 | warmup_steps=3520
batch_size=64 | input_shape=(64, 3, 32, 32) | num_classes=100
opt=AdamW | lr=0.0005 | wd=0.05 | grad_clip_norm=1.0
aug: mix_prob=0.5 | mixup_alpha=0.8 | cutmix_alpha=1.0 | label_smoothing=0.1
early_stop=True | metric=top1 | patience=12 | min_delta=0.05

=== Epoch 1/100 ===
[train step 600/704] loss 4.3293 | top1 5.42% | top3 12.91% | top5 18.62% | 1037.2 img/s | lr 8.52e-05 | gnorm 5.446 | clip 100.0% | oflow 0 | nonfinite 0 | scale 1.0
[train step 704/704] loss 4.2894 | top1 5.86% | top3 13.96% | top5 20.07% | 1040.2 img/s | lr 1.00e-04 | gnorm 5.391 | clip 100.0% | oflow 0 | nonfinite 0 | scale 1.0
[Train] loss 4.2894 | top1 5.86% | top3 13.96% | top5 20.07% | lr 1.00e-04 | grad_norm 5.391 | clip 100.0% | amp_overflows 0 | nonfin