In [None]:
import sys, os

# Make the parent of the current working directory importable so that
# project packages can be resolved from this notebook. The entry is
# appended only once, so re-running the cell does not duplicate it.
root_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(root_path) == 0:
    sys.path.append(root_path)

In [None]:
from data.load_cifrar100 import *

# Build the CIFAR-100 train / validation / test loaders used by every
# experiment below. All settings are passed by keyword, one per line, so a
# change to a single hyperparameter shows up as a one-line diff.
train_loader, val_loader, test_loader = get_cifar100_dataloaders(
    batch_size=64,
    data_dir="./data",
    num_workers=2,
    val_split=0.1,
    img_size=32,
    seed=7,
)

In [11]:
def count_trainable_parameters(model):
    """Return the total number of scalar elements in the trainable parameters.

    Args:
        model: Any object exposing ``parameters()`` that yields items with a
            ``requires_grad`` attribute and a ``numel()`` method
            (e.g. a ``torch.nn.Module``).

    Returns:
        The sum of ``numel()`` over every parameter whose ``requires_grad``
        is truthy; ``0`` when there are none.
    """
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad disabled do not count as trainable.
        if param.requires_grad:
            total += param.numel()
    return total

---

In [None]:
# `torch` is imported explicitly: it is referenced below for device
# selection, and `import torch.nn as nn` only binds the name `nn`. Without
# this line the cell works only if a wildcard import happens to re-export
# `torch`.
import torch
import torch.nn as nn
import timm
from src.training.train_full_model import *
from src.training.metrics import *
from src.training.eval_one_epoch_logs import *

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# DeiT-Tiny architecture trained from scratch (no pretrained weights),
# reconfigured for 32x32 inputs with 4x4 patches and a 100-class head.
model_vit1 = timm.create_model(
    "deit_tiny_patch16_224",
    pretrained=False,
    num_classes=100,
    img_size=32,
    patch_size=4,
).to(device)

n_params = count_trainable_parameters(model_vit1)
print(f"Trainable parameters: {n_params:,}")

# Full training run. `history` holds what train_model returns first; the
# second return value is discarded.
history, _ = train_model(
    model=model_vit1,
    train_loader=train_loader,
    epochs=100,
    val_loader=val_loader,
    device=device,

    # Optimizer
    lr=5e-4,
    weight_decay=0.05,

    # Mixed precision: only enabled on CUDA; plain fp32 on CPU.
    autocast_dtype="bf16" if device == "cuda" else "fp32",
    use_amp=(device == "cuda"),
    grad_clip_norm=1.0,

    # Learning-rate schedule
    warmup_ratio=0.05,
    min_lr=1e-6,

    # Regularization and checkpoint paths
    label_smoothing=0.1,
    save_path="best_model_vit1.pt",
    last_path="last_model_vit1.pt",

    # Logging cadence and MixUp / CutMix augmentation settings
    print_every=400,
    mix_prob=0.5,
    mixup_alpha=0.8,
    cutmix_alpha=1.0,

    num_classes=100,
    channels_last=True,
)

Trainable parameters: 5,380,132
=== Run config ===
device=cuda | amp=True | autocast_dtype=bf16 | channels_last=True
epochs=100 | steps/epoch=704 | total_steps=70400 | warmup_steps=3520
batch_size=64 | input_shape=(64, 3, 32, 32) | num_classes=100
opt=AdamW | lr=0.0005 | wd=0.05 | grad_clip_norm=1.0
aug: mix_prob=0.5 | mixup_alpha=0.8 | cutmix_alpha=1.0 | label_smoothing=0.1

=== Epoch 1/100 ===
[train step 400/704] loss 4.5175 | top1 2.88% | top3 7.80% | top5 11.96% | 727.3 img/s | lr 5.68e-05 | gnorm 2.509 | clip 100.0% | oflow 0 | nonfinite 0 | scale 1.0
[train step 704/704] loss 4.4364 | top1 4.03% | top3 10.24% | top5 15.24% | 722.7 img/s | lr 1.00e-04 | gnorm 2.583 | clip 100.0% | oflow 0 | nonfinite 0 | scale 1.0
[Train] loss 4.4364 | top1 4.03% | top3 10.24% | top5 15.24% | lr 1.00e-04 | grad_norm 2.583 | clip 100.0% | amp_overflows 0 | nonfinite_loss 0 | scale 1.0
[Train] mem_peak alloc 1.57 GiB | reserved 2.49 GiB
[Val]   loss 4.1936 | top1 6.34% | top3 16.30% | top5 22.82%
[

---

In [None]:
# Evaluate the trained model on the test split.
# The device and AMP switch reuse the `device` chosen in the training cell
# instead of hard-coding "cuda", so this cell also runs on a CPU-only machine.
avg_loss, metrics = evaluate_one_epoch_logs(
    model=model_vit1,
    dataloader=test_loader,
    device=device,
    use_amp=(device == "cuda"),
    # NOTE(review): presumably only consulted when use_amp is True — confirm.
    autocast_dtype="fp16",
    channels_last=False,
    measure_flops=True,
    flops_warmup_batches=1)

# Accuracy summary
print(f"loss: {avg_loss:.4f}")
print(f"top1: {metrics['top1']:.2f} | top3: {metrics['top3']:.2f} | top5: {metrics['top5']:.2f}")

# Speed, memory and model-size summary
print(f"throughput: {metrics['imgs_per_sec']:.1f} imgs/s | epoch: {metrics['epoch_time_sec']:.2f}s | ms/batch: {metrics['ms_per_batch']:.2f}")
print(f"GPU mem: alloc={metrics['gpu_mem_allocated_mib']:.0f} MiB | reserved={metrics['gpu_mem_reserved_mib']:.0f} MiB | peak={metrics['gpu_mem_peak_allocated_mib']:.0f} MiB")
print(f"model: params={int(metrics['model_params']):,} | param_size={metrics['model_param_size_mib']:.1f} MiB")

# FLOPs/MACs may come out as nan if fvcore/thop is not installed or if the profiler fails
print(f"FLOPs/forward: {metrics['flops_per_forward']:.3e} | MACs/forward: {metrics['macs_per_forward']:.3e}")

loss: 1.4651
top1: 63.77 | top3: 80.55 | top5: 85.93
throughput: 5077.2 imgs/s | epoch: 1.97s | ms/batch: 10.60
GPU mem: alloc=1176 MiB | reserved=2550 MiB | peak=1211 MiB
model: params=5,380,132 | param_size=20.5 MiB
FLOPs/forward: nan | MACs/forward: nan
