In [None]:
import sys, os

# Make the repository root (the parent of the current working directory)
# importable so that `src.*` packages resolve when the notebook is run from a
# sub-folder. The membership check keeps re-running this cell from adding the
# same entry twice.
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
root_path = os.path.abspath(parent_of_cwd)
if root_path not in sys.path:
    sys.path.append(root_path)

In [None]:
from src.data.load_svhn import *

# Build the train / validation / test loaders with the project helper that the
# wildcard import above brings into scope.
train_loader, val_loader, test_loader = get_svhn_dataloaders(
    # Where the data lives and how it is shaped.
    data_dir="./data",  # relative path; presumably resolved against the CWD - TODO confirm
    img_size=32,
    val_split=0.1,      # presumably the fraction held out for validation - confirm in load_svhn
    # Loader mechanics.
    batch_size=64,
    num_workers=4,
)

In [None]:

from src.data.data_utils import *


# Print a diagnostic summary of the training loader. The second argument is the
# label used in the report header; `max_batches_for_stats=50` bounds how many
# batches are sampled for the quick statistics (the report itself says
# "Quick stats over up to 50 batches").
describe_loader(train_loader, "train_loader", max_batches_for_stats=50)


TRAIN_LOADER SUMMARY
Dataset type        : Subset
  ↳ Wrapped dataset  : SVHNLabelFix (Subset-like)
  ↳ Subset size      : 65932
Num samples         : 65932
Batch size          : 64
Num workers         : 4
Pin memory          : True
Drop last           : False
Sampler             : RandomSampler
len(loader) (#batches): 1031 (≈ ceil(65932/64) = 1031)

First batch:
  x.shape           : (64, 3, 32, 32)
  y.shape           : (64,)
  x.dtype           : torch.float32
  y.dtype           : torch.int64
  x.min/max         : -4.0890 / 3.9359
  y.min/max         : 0 / 9
  unique labels (batch): 10

Quick stats over up to 50 batches:
  Approx mean        : -0.209160
  Approx std         : 1.359469
  Seen label counts  : 10 classes (in sampled batches)
  Top-5 labels       : [(1, 582), (2, 471), (3, 369), (4, 319), (5, 311)]

Full dataset label distribution: (couldn't find targets/labels attribute)


---


In [None]:

from stage_config import * 
from Model_A_OutGridNet import * 
from Model_B_OutGridNet import *

def svhn_stages_t4(drop_path=0.08):
    """Return the four-stage `StageCfg` list used for the SVHN model.

    Args:
        drop_path: value forwarded unchanged as `drop_path` to every stage.

    Returns:
        A list of four `StageCfg` objects, ordered from the first stage
        (dim=64) to the last (dim=384).
    """
    # Original author's note: resolutions 64 -> 32 -> 16 -> 8.
    # NOTE(review): the loaders in this notebook use img_size=32, so confirm
    # what these resolutions refer to.
    # One row per stage: (dim, depth, num_heads, grid_size, outlook_heads).
    stage_table = (
        (64,  2, 2, 8, 2),
        (128, 2, 4, 8, 4),
        (256, 3, 8, 4, 8),
        (384, 1, 6, 2, 6),
    )
    return [
        StageCfg(
            dim=dim,
            depth=depth,
            num_heads=heads,
            grid_size=grid,
            outlook_heads=outlook,
            drop_path=drop_path,
        )
        for dim, depth, heads, grid, outlook in stage_table
    ]

# `torch` is used below (and again when seeding in a later cell) but the
# notebook never imports it explicitly: it only reaches this namespace through
# one of the wildcard imports. Import it here so the cell does not depend on
# what those project modules happen to re-export.
import torch

# Four-stage configuration; drop_path=0.08 is passed to every stage.
stages = svhn_stages_t4(drop_path=0.08)

# 10-class model built from the stage list.
# NOTE(review): `MaxOutNet` comes from the wildcard imports of
# Model_A_OutGridNet / Model_B_OutGridNet; if both modules define it, the one
# imported last wins - confirm which variant is intended.
model = MaxOutNet(
    num_classes=10,
    stages=stages,
    stem_dim=64,
    dpr_max=0.1)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# The model is not moved here; `device` is handed to the training call later.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [29]:
def count_trainable_parameters(model):
    """Count the scalar elements of all parameters that require gradients.

    Args:
        model: any object exposing `parameters()` that yields items with a
            `requires_grad` flag and a `numel()` method.

    Returns:
        The total element count over parameters whose `requires_grad` is
        truthy; 0 when there are none.
    """
    total = 0
    for param in model.parameters():
        # Parameters that do not require gradients are excluded from the count.
        if param.requires_grad:
            total += param.numel()
    return total

# Report the size of the model built above; the `:,` format spec inserts
# thousands separators (e.g. 14,564,548).
n_params = count_trainable_parameters(model)
print(f"Trainable parameters: {n_params:,}")

Trainable parameters: 14,564,548


In [None]:
import random, numpy as np
from src.training.train_full_model import *


# Seed every random number generator this notebook touches with one value.
# NOTE(review): `model` and the data loaders are created in earlier cells, i.e.
# before this seeding runs. Unless their constructors seed internally, the
# initial weights and the train/val split are not governed by this seed -
# TODO confirm, or move the seeding to the top of the notebook.
seed = 7

# Python and NumPy generators.
random.seed(seed)
np.random.seed(seed)

# PyTorch CPU generator, then every CUDA device.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Let cuDNN auto-tune its algorithms. This favours speed over strict
# run-to-run determinism (see the PyTorch reproducibility notes).
torch.backends.cudnn.benchmark = True


# Mixed precision is only switched on when running on CUDA; on CPU the run
# uses "fp32" with AMP disabled.
on_gpu = device == "cuda"

# Train for 50 epochs. `model` is rebound to whatever `train_model` returns, so
# re-running this cell continues from the already-trained object rather than
# from a fresh model.
history, model = train_model(
    # What to train and on which data.
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    num_classes=10,
    epochs=50,
    device=device,

    # Optimisation.
    lr=5e-4,
    weight_decay=0.05,
    grad_clip_norm=1.0,
    label_smoothing=0.0,

    # Learning-rate schedule.
    warmup_ratio=0.05,
    min_lr=1e-6,

    # Precision and memory layout.
    use_amp=on_gpu,
    autocast_dtype="fp16" if on_gpu else "fp32",
    channels_last=True,

    # Augmentations. With mixup_alpha=0.0 and cutmix_alpha=1.0 this presumably
    # means CutMix only, applied with probability 0.5 - confirm in train_full_model.
    mix_prob=0.5,
    mixup_alpha=0.0,
    cutmix_alpha=1.0,

    # Logging and checkpoints.
    print_every=100,
    save_path="best_maxout_medium.pt",
    last_path="last_maxout_medium.pt",
    resume_path=None,
)

=== Run config ===
device=cuda | amp=True | autocast_dtype=fp16 | channels_last=True
epochs=50 | steps/epoch=1031 | total_steps=51550 | warmup_steps=2577
batch_size=64 | input_shape=(64, 3, 32, 32) | num_classes=10
opt=AdamW | lr=0.0005 | wd=0.05 | grad_clip_norm=1.0
aug: mix_prob=0.5 | mixup_alpha=0.0 | cutmix_alpha=1.0 | label_smoothing=0.0

=== Epoch 1/50 ===
[train step 100/1031] loss 2.3447 | top1 15.83% | top3 36.95% | top5 56.61% | 719.1 img/s | lr 1.94e-05 | gnorm 6.707 | clip 100.0% | oflow 0 | nonfinite 0 | scale 65536.0
[train step 200/1031] loss 2.2703 | top1 18.80% | top3 41.45% | top5 60.47% | 730.0 img/s | lr 3.88e-05 | gnorm 6.660 | clip 100.0% | oflow 0 | nonfinite 0 | scale 65536.0
[train step 300/1031] loss 2.2095 | top1 21.82% | top3 45.62% | top5 63.96% | 733.5 img/s | lr 5.82e-05 | gnorm 6.468 | clip 100.0% | oflow 0 | nonfinite 0 | scale 65536.0
[train step 400/1031] loss 2.1642 | top1 23.98% | top3 48.93% | top5 66.73% | 734.4 img/s | lr 7.76e-05 | gnorm 6.099 |

In [35]:
# Evaluate the trained model on the held-out test loader. The cell displays the
# returned pair: a float followed by a dict with 'top1' / 'top3' / 'top5'
# entries (the float is presumably the mean loss - confirm in the helper).
# NOTE(review): no `device` is passed here, unlike the training call - confirm
# the helper infers it from the model.
evaluate_one_epoch(
    model=model,
    dataloader=test_loader,
)

(0.14163255964914714,
 {'top1': 97.57606023355869,
  'top3': 99.39689612784265,
  'top5': 99.68884449907806})