In [None]:
import sys, os

# Put the parent of the current working directory on sys.path (once) so that
# packages living next to the notebook folder can be imported.
# NOTE(review): presumably the parent directory is the repository root — confirm.
root_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(root_path) == 0:
    sys.path.append(root_path)

In [None]:
from data.load_cifrar100 import *

# Build the train / validation / test loaders for CIFAR-100 at 32x32 resolution.
# NOTE(review): val_split=0.1 presumably holds out 10% of the training set for
# validation, and seed=7 presumably fixes that split — confirm in the loader.
train_loader, val_loader, test_loader = get_cifar100_dataloaders(
    data_dir="./data",
    img_size=32,
    batch_size=64,
    val_split=0.1,
    num_workers=2,
    seed=7,
)

In [11]:
def count_trainable_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Args:
        model: Any object exposing ``parameters()`` that yields items with a
            ``requires_grad`` flag and a ``numel()`` method.

    Returns:
        The sum of ``numel()`` over parameters whose ``requires_grad`` is
        truthy; 0 when there are none.
    """
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad falsy) are skipped.
        if param.requires_grad:
            total += param.numel()
    return total

---

In [None]:

import timm
import torch
import torch.nn as nn

# Star imports are kept in their original order so name shadowing is unchanged.
from src.training.train_full_model import *
from src.training.metrics import *
from src.training.eval_one_epoch_logs import *

# Prefer the GPU, but fall back to CPU so the notebook still runs on machines
# without CUDA (the training cell already switches its AMP settings on
# `device == "cuda"`).
device = "cuda" if torch.cuda.is_available() else "cpu"

# EfficientNetV2-S built without pretrained weights, with a 100-class head.
model_v2_s = timm.create_model("efficientnetv2_s", pretrained=False, num_classes=100)

# Swap the stem for a stride-1 3x3 convolution with the same number of output
# channels as the original stem.
# NOTE(review): presumably done so the 32x32 inputs are not downsampled by the
# stem — confirm this was the intent.
out_channels = model_v2_s.conv_stem.out_channels
model_v2_s.conv_stem = nn.Conv2d(
    in_channels=3,
    out_channels=out_channels,
    kernel_size=3,
    stride=1,
    padding=1,
    bias=False,
)

# Move the model only after the stem swap so the new layer lands on `device` too.
model_v2_s = model_v2_s.to(device)

# Reuse the notebook's helper instead of duplicating its counting expression.
n_params = count_trainable_parameters(model_v2_s)
print(f"Trainable parameters: {n_params:,}")

Trainable parameters: 20,305,588


In [23]:
# Train EfficientNetV2-S for 100 epochs; only the first return value (history)
# is kept.
#
# Checkpoints are named after the model being trained. They were previously
# written to "*_vit1.pt", which mislabels this EfficientNetV2-S run and could
# overwrite checkpoints produced by a ViT experiment.
#
# NOTE(review): the captured log for this run reports `gnorm inf` and
# `clip 100.0%` under fp16 AMP — confirm that train_model unscales gradients
# before computing/clipping the norm.
history, _ = train_model(
    model=model_v2_s,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=100,
    device=device,
    num_classes=100,

    # Optimisation and learning-rate schedule
    lr=5e-4,
    weight_decay=0.05,
    warmup_ratio=0.05,
    min_lr=1e-6,
    grad_clip_norm=1.0,

    # Mixed precision is enabled only when running on CUDA
    use_amp=(device == "cuda"),
    autocast_dtype="fp16" if device == "cuda" else "fp32",
    channels_last=True,

    # Regularisation / augmentation
    label_smoothing=0.1,
    mix_prob=0.5,
    mixup_alpha=0.8,
    cutmix_alpha=1.0,

    # Checkpoint files
    save_path="best_model_effnetv2_s.pt",
    last_path="last_model_effnetv2_s.pt",

    print_every=400,
)

=== Run config ===
device=cuda | amp=True | autocast_dtype=fp16 | channels_last=True
epochs=100 | steps/epoch=704 | total_steps=70400 | warmup_steps=3520
batch_size=64 | input_shape=(64, 3, 32, 32) | num_classes=100
opt=AdamW | lr=0.0005 | wd=0.05 | grad_clip_norm=1.0
aug: mix_prob=0.5 | mixup_alpha=0.8 | cutmix_alpha=1.0 | label_smoothing=0.1

=== Epoch 1/100 ===
[train step 400/704] loss 4.8129 | top1 1.23% | top3 3.40% | top5 5.61% | 698.9 img/s | lr 5.68e-05 | gnorm inf | clip 100.0% | oflow 4 | nonfinite 0 | scale 4096.0
[train step 704/704] loss 4.7567 | top1 1.32% | top3 3.70% | top5 5.99% | 699.6 img/s | lr 1.00e-04 | gnorm inf | clip 100.0% | oflow 4 | nonfinite 0 | scale 4096.0
[Train] loss 4.7567 | top1 1.32% | top3 3.70% | top5 5.99% | lr 1.00e-04 | grad_norm inf | clip 100.0% | amp_overflows 4 | nonfinite_loss 0 | scale 4096.0
[Train] mem_peak alloc 1.53 GiB | reserved 3.16 GiB
[Val]   loss 7.2097 | top1 1.08% | top3 4.02% | top5 6.66%
[Val]   mem_peak alloc 0.97 GiB | res

In [24]:
# Baseline evaluation on the test loader; every argument other than the model
# and the loader is left at evaluate_one_epoch's defaults. The returned value is
# displayed as the cell output.
evaluate_one_epoch(
    model=model_v2_s,
    dataloader=test_loader,
)

(1.3429618577957154, {'top1': 64.66, 'top3': 82.78, 'top5': 88.05})

---

In [26]:
# Detailed test-set evaluation: loss, top-k accuracy, throughput, GPU memory,
# model size and FLOPs/MACs. The device and AMP settings follow the notebook's
# `device` variable, as the training cell does, instead of hard-coding "cuda".
avg_loss, metrics = evaluate_one_epoch(
    model=model_v2_s,
    dataloader=test_loader,
    device=device,
    use_amp=(device == "cuda"),
    autocast_dtype="fp16" if device == "cuda" else "fp32",
    channels_last=False,
    measure_flops=True,
    flops_warmup_batches=1)

print(f"loss: {avg_loss:.4f}")
print(f"top1: {metrics['top1']:.2f} | top3: {metrics['top3']:.2f} | top5: {metrics['top5']:.2f}")

print(f"throughput: {metrics['imgs_per_sec']:.1f} imgs/s | epoch: {metrics['epoch_time_sec']:.2f}s | ms/batch: {metrics['ms_per_batch']:.2f}")
print(f"GPU mem: alloc={metrics['gpu_mem_allocated_mib']:.0f} MiB | reserved={metrics['gpu_mem_reserved_mib']:.0f} MiB | peak={metrics['gpu_mem_peak_allocated_mib']:.0f} MiB")
print(f"model: params={int(metrics['model_params']):,} | param_size={metrics['model_param_size_mib']:.1f} MiB")

# FLOPs/MACs may come out as nan if fvcore/thop is not installed or if the profiler fails
print(f"FLOPs/forward: {metrics['flops_per_forward']:.3e} | MACs/forward: {metrics['macs_per_forward']:.3e}")

loss: 1.3438
top1: 64.62 | top3: 82.81 | top5: 88.10
throughput: 2099.3 imgs/s | epoch: 4.76s | ms/batch: 28.75
GPU mem: alloc=956 MiB | reserved=3232 MiB | peak=1006 MiB
model: params=20,305,588 | param_size=77.5 MiB
FLOPs/forward: nan | MACs/forward: nan
