In [1]:
# --- Dataset registry ---------------------------------------------------------
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

def get_loaders(dataset_name, batch_size=128, test_batch_size=1000, data_root='./data',
                num_workers=2, pin_memory=True):
    """
    Build train/test DataLoaders for a named torchvision dataset.

    Args:
        dataset_name: Case-insensitive key. One of 'mnist', 'fashionmnist', 'kmnist',
            'emnist' / 'emnist_balanced', 'qmnist', 'svhn', 'cifar10', 'cifar100',
            'stl10' / 'stl10_32'.
        batch_size: Batch size of the shuffled training loader.
        test_batch_size: Batch size of the unshuffled test loader.
        data_root: Directory the datasets are downloaded to / read from.
        num_workers: Forwarded to both DataLoaders (0 loads in the calling process,
            which is the easiest setting to debug inside a notebook).
        pin_memory: Forwarded to both DataLoaders; set False on CPU-only machines.

    Returns: train_loader, test_loader, input_size, num_classes, meta (dict)
        input_size is the flattened per-sample size (28*28 or 32*32*3 here);
        meta may carry a free-text 'note' about the dataset.

    Raises:
        ValueError: if dataset_name is not one of the keys listed above.
    """
    name = dataset_name.lower()
    meta = {}

    # Generic normalizations (safe defaults). If you want canonical stats, compute them once.
    NORM_1C = transforms.Normalize((0.5,), (0.5,))
    NORM_3C = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

    if name == 'mnist':
        # MNIST is the only entry with dataset-specific normalization constants.
        tfm = transforms.Compose([transforms.ToTensor(),
                                  transforms.Normalize((0.1307,), (0.3081,))])
        train = datasets.MNIST(data_root, train=True, download=True, transform=tfm)
        test  = datasets.MNIST(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 28*28, 10

    elif name == 'fashionmnist':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_1C])
        train = datasets.FashionMNIST(data_root, train=True, download=True, transform=tfm)
        test  = datasets.FashionMNIST(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 28*28, 10

    elif name == 'kmnist':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_1C])
        train = datasets.KMNIST(data_root, train=True, download=True, transform=tfm)
        test  = datasets.KMNIST(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 28*28, 10

    elif name in ('emnist_balanced', 'emnist'):
        # EMNIST Balanced has 47 classes. If digits look rotated, add a Rotate(90) or permute.
        tfm = transforms.Compose([transforms.ToTensor(), NORM_1C])
        train = datasets.EMNIST(data_root, split='balanced', train=True, download=True, transform=tfm)
        test  = datasets.EMNIST(data_root, split='balanced', train=False, download=True, transform=tfm)
        inp, ncls = 28*28, 47
        meta['note'] = 'EMNIST images can appear rotated; for visualization add a 90-degree rotate.'

    elif name == 'qmnist':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_1C])
        train = datasets.QMNIST(data_root, what='train', download=True, transform=tfm)
        test  = datasets.QMNIST(data_root, what='test',  download=True, transform=tfm)
        inp, ncls = 28*28, 10

    elif name == 'svhn':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_3C])
        train = datasets.SVHN(data_root, split='train', download=True, transform=tfm)
        test  = datasets.SVHN(data_root, split='test',  download=True, transform=tfm)
        inp, ncls = 32*32*3, 10

    elif name == 'cifar10':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_3C])
        train = datasets.CIFAR10(data_root, train=True,  download=True, transform=tfm)
        test  = datasets.CIFAR10(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 32*32*3, 10

    elif name == 'cifar100':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_3C])
        train = datasets.CIFAR100(data_root, train=True,  download=True, transform=tfm)
        test  = datasets.CIFAR100(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 32*32*3, 100

    elif name in ('stl10', 'stl10_32'):
        # Downsample to 32x32 to keep input dim manageable for MLPs.
        tfm = transforms.Compose([transforms.Resize((32,32)),
                                  transforms.ToTensor(), NORM_3C])
        train = datasets.STL10(data_root, split='train', download=True, transform=tfm)
        test  = datasets.STL10(data_root, split='test',  download=True, transform=tfm)
        inp, ncls = 32*32*3, 10
        meta['note'] = 'Original STL10 is 96x96; here we resize to 32x32 for MLPs.'

    else:
        raise ValueError(f"Unknown dataset: {dataset_name}")

    # Worker count and memory pinning are shared by both loaders; only shuffling
    # and batch size differ between train and test.
    loader_kwargs = dict(num_workers=num_workers, pin_memory=pin_memory)
    train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, **loader_kwargs)
    test_loader  = DataLoader(test,  batch_size=test_batch_size, shuffle=False, **loader_kwargs)
    return train_loader, test_loader, inp, ncls, meta


In [9]:
import os
import json
import math
from pathlib import Path
from typing import List, Dict, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

# --------------------------------------------------------------------------
# Requires `get_loaders(dataset_name, batch_size, test_batch_size, data_root)`
# to be in scope. In this notebook it is defined in the first cell, so run
# that cell before this one. If you move it into a module instead, uncomment
# the import below and point it at that module.
# --------------------------------------------------------------------------
# from your_dataloader_module import get_loaders   # <- uncomment if needed

# --------------------------
# Repro & device utilities
# --------------------------
def seed_everything(seed: int = 1337, deterministic: bool = False):
    """
    Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Seed applied to every generator.
        deterministic: When False (default) cuDNN autotuning is enabled and
            cuDNN determinism is off, exactly as before: fast, but runs may
            differ slightly. When True the two cuDNN flags are flipped so
            repeated runs are as repeatable as cuDNN allows, at some speed cost.
    """
    import random
    import numpy as np

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Determinism can slow training; enable only if you need exact repeatability.
    deterministic = bool(deterministic)
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic

def get_device():
    """Return the preferred torch device: CUDA first, then Apple MPS, else CPU."""
    if torch.cuda.is_available():
        backend = "cuda"
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        # The hasattr check covers torch builds that have no `mps` backend module.
        backend = "mps"
    else:
        backend = "cpu"
    return torch.device(backend)

# --------------------------
# Model zoo: 5 CNN variants
# --------------------------
# We define channel layouts that mirror the spirit of your MLP configs:
# - Underfit & brittle
# - Deep-Narrow
# - Balanced (baseline)
# - Balanced-Deep
# - Wide
#
# These are intentionally modest so they run on a single GPU/CPU reliably.

# One entry per CNN variant. Keys read downstream:
#   channels   - output width of each conv block, in order
#   lr         - optimizer learning rate
#   epochs     - number of training epochs
#   dropout    - channel-dropout probability inside each conv block (0 disables)
#   pool_every - insert a 2x2 max-pool after every k-th conv block
cnn_model_configs: Dict[str, Dict] = {
    # Underfit & brittle: one conv block, then GAP + FC
    "Tiny_Underfit": dict(channels=[16], lr=3e-4, epochs=10, dropout=0.0, pool_every=1),
    # Deep-narrow (depth sensitivity): eight 16-channel blocks
    "Deep_Narrow": dict(channels=[16] * 8, lr=3e-4, epochs=15, dropout=0.2, pool_every=2),
    # Well-trained baseline
    "Balanced": dict(channels=[32, 64, 128], lr=3e-4, epochs=15, dropout=0.2, pool_every=1),
    # Deep but still robust
    "Balanced_Deep": dict(channels=[32, 64, 128, 128], lr=3e-4, epochs=20, dropout=0.3, pool_every=1),
    # Overparameterized
    "Wide": dict(channels=[64, 128, 256], lr=1e-3, epochs=30, dropout=0.0, pool_every=1),
}

# --------------------------
# Flexible CNN building blocks
# --------------------------
class ConvBlock(nn.Module):
    """3x3 convolution -> BatchNorm -> ReLU -> optional channel dropout."""

    def __init__(self, in_ch, out_ch, dropout=0.0):
        super().__init__()
        # kernel 3 with padding 1 keeps the spatial size; the conv has no bias
        # term (bias=False), the BatchNorm that follows has its own affine shift.
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1, bias=False)
        self.bn = nn.BatchNorm2d(out_ch)
        # A non-positive rate means "no dropout": use a pass-through module so
        # forward() stays branch-free.
        if dropout > 0:
            self.do = nn.Dropout2d(dropout)
        else:
            self.do = nn.Identity()

    def forward(self, x):
        """Apply conv, batch-norm, ReLU and (if configured) dropout to `x`."""
        activated = F.relu(self.bn(self.conv(x)), inplace=True)
        return self.do(activated)

class FlexibleCNN(nn.Module):
    """
    Stack of ConvBlocks with periodic 2x2 max-pooling, followed by global
    average pooling and one linear classifier.

    Args:
        in_ch: Number of input image channels.
        num_classes: Size of the output logit vector.
        channels: Output width of each conv block, in order (at least one).
        dropout: Dropout rate handed to every ConvBlock.
        pool_every: Add a max-pool after every `pool_every`-th block;
            a value <= 0 disables pooling entirely.
    """

    def __init__(self, in_ch: int, num_classes: int, channels: List[int],
                 dropout: float = 0.0, pool_every: int = 1):
        super().__init__()
        assert len(channels) >= 1, "Need at least one conv stage"

        # Each block consumes the previous block's width; the first consumes in_ch.
        input_widths = [in_ch] + list(channels[:-1])
        layers = []
        for stage, (width_in, width_out) in enumerate(zip(input_widths, channels), start=1):
            layers.append(ConvBlock(width_in, width_out, dropout=dropout))
            # `pool_every > 0` is tested first so a zero never reaches the modulo.
            wants_pool = pool_every > 0 and stage % pool_every == 0
            if wants_pool:
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))

        self.features = nn.Sequential(*layers)
        self.gap = nn.AdaptiveAvgPool2d(1)  # Global Average Pooling
        self.classifier = nn.Linear(channels[-1], num_classes)

        # Visit every submodule once and re-initialise its parameters.
        self.apply(self._init_weights)

    @staticmethod
    def _init_weights(m):
        """Kaiming-normal for convs, ones/zeros for BatchNorm, Xavier-uniform + zero bias for Linear."""
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
            return
        if isinstance(m, nn.BatchNorm2d):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)
            return
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            nn.init.zeros_(m.bias)

    def forward(self, x):
        """Return class logits of shape (B, num_classes) for an image batch `x`."""
        pooled = self.gap(self.features(x))      # (B, C, 1, 1)
        flat = torch.flatten(pooled, start_dim=1)  # (B, C)
        return self.classifier(flat)

# --------------------------
# Neff pruning utilities (plug-in; default OFF in training)
# --------------------------
@torch.no_grad()
def _neff_from_abs_vector(v_abs: torch.Tensor) -> int:
    # v_abs: 1-D absolute values
    s = v_abs.sum()
    if s <= 0:
        return 0
    p = v_abs / s
    denom = (p * p).sum()
    if denom <= 0:
        return 0
    neff = int(math.floor((1.0 / denom).item())) if isinstance(denom, torch.Tensor) else int(math.floor(1.0 / denom))
    # Clamp to [1, N]
    neff = max(1, min(neff, v_abs.numel()))
    return neff

@torch.no_grad()
def _keep_topk_by_abs(original: torch.Tensor, k: int) -> torch.Tensor:
    if k <= 0 or original.numel() == 0:
        return torch.zeros_like(original)
    if k >= original.numel():
        return original.clone()
    v_abs = original.abs().flatten()
    topk = torch.topk(v_abs, k, largest=True, sorted=False).indices
    mask = torch.zeros_like(v_abs, dtype=torch.bool)
    mask[topk] = True
    mask = mask.view_as(original)
    pruned = torch.where(mask, original, torch.zeros_like(original))
    return pruned

@torch.no_grad()
def apply_neff_pruning(model: nn.Module,
                       granularity: str = "per_filter",   # "per_filter" or "per_tensor"
                       modules: Tuple = (nn.Conv2d, nn.Linear)) -> Dict[str, Dict]:
    """
    Apply Neff pruning in-place to Conv/Linear weights.
    - For Conv2d ("per_filter"): treat each out-channel kernel [in_c, kh, kw] as a series.
    - For Conv2d ("per_tensor"): treat entire weight tensor as a single series.
    - For Linear: treat entire weight matrix as a single series (always reported
      as "per_tensor", whatever `granularity` says).

    For every series, Neff is computed from the absolute weights and only the
    Neff largest-magnitude entries are kept; the rest are set to zero.
    BatchNorm parameters and biases are never touched.

    Args:
        model: Module whose submodules are scanned; its weights are modified in place.
        granularity: "per_filter" or "per_tensor" (affects Conv2d only).
        modules: Module types to consider. Types other than Conv2d / Linear have
            no pruning rule here and are skipped without a stats entry.

    Returns:
        stats dict keyed by "<module name>.weight", each value holding:
        "granularity", "shape", "N" (total entries), "neff" (int for per_tensor,
        list with one int per filter for per_filter), "nonzeros_before",
        "nonzeros" (after pruning) and "sparsity" (fraction of zeros after pruning).
    """
    assert granularity in {"per_filter", "per_tensor"}
    stats = {}
    for name, module in model.named_modules():
        if not isinstance(module, modules):
            continue
        if not isinstance(module, (nn.Conv2d, nn.Linear)):
            continue

        W = module.weight.data  # writes below go straight into the model
        N_total = W.numel()
        before_nnz = (W != 0).sum().item()

        if isinstance(module, nn.Conv2d) and granularity == "per_filter":
            used_granularity = "per_filter"
            neff = []
            for f in range(W.shape[0]):
                w = W[f]  # (in_c, kh, kw)
                k = _neff_from_abs_vector(w.abs().flatten())
                W[f] = _keep_topk_by_abs(w, k)
                neff.append(k)
        else:
            # Conv2d in per_tensor mode, and every Linear layer.
            used_granularity = "per_tensor"
            neff = _neff_from_abs_vector(W.abs().flatten())
            W[:] = _keep_topk_by_abs(W, neff)

        after_nnz = (W != 0).sum().item()
        stats[f"{name}.weight"] = {
            "granularity": used_granularity,
            "shape": tuple(W.shape),
            "N": N_total,
            "neff": neff,
            "nonzeros_before": before_nnz,
            "nonzeros": after_nnz,
            # An empty weight tensor has nothing to prune; report 0 instead of dividing by zero.
            "sparsity": 1.0 - (after_nnz / N_total) if N_total else 0.0,
        }
        # We do not prune BN or biases here by design.
    return stats

# --------------------------
# Training / evaluation
# --------------------------
def accuracy(logits: torch.Tensor, targets: torch.Tensor) -> float:
    """Fraction of rows in `logits` whose arg-max over dim 1 equals `targets`."""
    hits = torch.eq(logits.argmax(dim=1), targets)
    return hits.to(torch.float32).mean().item()

def train_one_epoch(model, optimizer, scaler, train_loader, device):
    """
    Run one optimisation pass over `train_loader`.

    Mixed precision (autocast + gradient scaling) is used only when `device`
    is CUDA; on other devices the step is a plain fp32 backward/step.
    Gradients are clipped to a global norm of 1.0 in both paths.

    Args:
        model: Network to train (put into train mode here).
        optimizer: Optimizer over the model's parameters.
        scaler: Gradient scaler; only used on CUDA.
        train_loader: Iterable of (inputs, targets) batches.
        device: torch.device the batches are moved to.

    Returns:
        (mean_loss, mean_accuracy), both weighted by batch size.
    """
    model.train()
    use_amp = device.type == "cuda"
    # torch.autocast replaces the deprecated torch.cuda.amp.autocast. A disabled
    # "cpu" context is used off-GPU so no unsupported device type is ever passed.
    amp_device = "cuda" if use_amp else "cpu"

    total_loss, total_acc, total = 0.0, 0.0, 0
    for x, y in train_loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        with torch.autocast(device_type=amp_device, enabled=use_amp):
            logits = model(x)
            loss = F.cross_entropy(logits, y)
        if use_amp:
            scaler.scale(loss).backward()
            # Unscale first so the clip threshold applies to the true gradient norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        bs = y.size(0)
        total += bs
        total_loss += loss.item() * bs
        total_acc  += accuracy(logits.detach(), y) * bs
    return total_loss / total, total_acc / total

@torch.no_grad()
def evaluate(model, test_loader, device):
    """
    Score `model` on every batch of `test_loader` without tracking gradients.

    Returns:
        (mean_loss, mean_accuracy), both weighted by batch size.
    """
    model.eval()
    loss_sum = 0.0
    acc_sum = 0.0
    n_seen = 0
    for inputs, labels in test_loader:
        inputs = inputs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        outputs = model(inputs)
        batch_loss = F.cross_entropy(outputs, labels)

        # Weight per-batch means by batch size so a short last batch is not over-counted.
        batch_n = labels.size(0)
        n_seen += batch_n
        loss_sum += batch_loss.item() * batch_n
        acc_sum += accuracy(outputs, labels) * batch_n
    return loss_sum / n_seen, acc_sum / n_seen

# --------------------------
# Orchestration
# --------------------------
def save_json(obj, path: Path):
    """Write `obj` to `path` as 2-space-indented JSON, creating parent folders and overwriting any existing file."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w") as handle:
        json.dump(obj, handle, indent=2)

def append_jsonl(obj, path: Path):
    """Append `obj` to `path` as one JSON line, creating parent folders and the file if needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a") as handle:
        handle.write(f"{json.dumps(obj)}\n")

def build_model(in_ch: int, num_classes: int, cfg: Dict) -> nn.Module:
    """
    Instantiate a FlexibleCNN from one config entry.

    `cfg` must contain "channels"; "dropout" defaults to 0.0 and
    "pool_every" to 1 when absent. Other keys are ignored here.
    """
    arch_kwargs = {
        "channels": cfg["channels"],
        "dropout": cfg.get("dropout", 0.0),
        "pool_every": cfg.get("pool_every", 1),
    }
    return FlexibleCNN(in_ch=in_ch, num_classes=num_classes, **arch_kwargs)

def train_models_on_dataset(
    dataset_name: str,
    configs: Dict[str, Dict],
    batch_size: int = 128,
    test_batch_size: int = 1000,
    data_root: str = "./data",
    out_dir: str = "./checkpoints_cnn_neff",
    do_prune_after_train: bool = False,          # default OFF; enable when ready
    prune_granularity: str = "per_filter"        # "per_filter" or "per_tensor"
):
    """
    Train every model in `configs` on one dataset and optionally Neff-prune it.

    For each config a run folder `<out_dir>/<dataset_name>/<config name>/` receives:
      - config.json                      : dataset name + the config entry
      - metrics.jsonl                    : one row per epoch (reset at the start of the run)
      - best_preprune.pt                 : pickled model at the best test accuracy
      - best_preprune_state_dict.pt      : state_dict of that same model
    and, when `do_prune_after_train` is True:
      - neff_prune_<granularity>.json    : per-layer pruning stats + post-prune metrics
      - best_postprune_neff_<granularity>.pt / ..._state_dict.pt

    NOTE(review): the "best" checkpoint is selected on the test split, so the
    reported best accuracy is optimistic. Confirm whether a validation split is wanted.

    Args:
        dataset_name: Key understood by `get_loaders`.
        configs: Mapping of run name -> config with "channels", "lr", "epochs"
            and optionally "dropout" / "pool_every".
        batch_size, test_batch_size, data_root: Forwarded to `get_loaders`.
        out_dir: Root folder for all run folders.
        do_prune_after_train: Prune the best checkpoint after training.
        prune_granularity: "per_filter" or "per_tensor".
    """
    train_loader, test_loader, _inp, ncls, _meta = get_loaders(dataset_name, batch_size, test_batch_size, data_root)

    # Infer input channels safely from first sample
    sample_x, _ = next(iter(train_loader))
    in_ch = sample_x.shape[1]

    device = get_device()
    print(f"[{dataset_name}] device: {device}")
    use_amp = device.type == "cuda"

    # torch.amp.GradScaler supersedes the deprecated torch.cuda.amp.GradScaler;
    # fall back to the old location on torch builds that predate it.
    grad_scaler_cls = getattr(getattr(torch, "amp", None), "GradScaler", None)

    for name, cfg in configs.items():
        print(f"\n=== Training {name} on {dataset_name} ===")
        model = build_model(in_ch, ncls, cfg).to(device)
        opt = torch.optim.AdamW(model.parameters(), lr=cfg["lr"], weight_decay=1e-4)
        if grad_scaler_cls is not None:
            scaler = grad_scaler_cls("cuda", enabled=use_amp)
        else:
            scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

        # Output folders & bookkeeping
        run_dir = Path(out_dir) / dataset_name / name
        run_dir.mkdir(parents=True, exist_ok=True)
        save_json({"dataset": dataset_name, **cfg}, run_dir / "config.json")

        # Checkpoints and config are overwritten on a rerun, so start the metrics
        # log fresh too; otherwise rows from earlier runs pile up in the same file.
        metrics_path = run_dir / "metrics.jsonl"
        if metrics_path.exists():
            metrics_path.unlink()

        best_acc = -1.0
        best_path_full = run_dir / "best_preprune.pt"
        best_path_sd   = run_dir / "best_preprune_state_dict.pt"

        for epoch in range(1, cfg["epochs"] + 1):
            tr_loss, tr_acc = train_one_epoch(model, opt, scaler, train_loader, device)
            te_loss, te_acc = evaluate(model, test_loader, device)

            log_row = {
                "epoch": epoch,
                "train_loss": round(tr_loss, 6),
                "train_acc": round(tr_acc, 6),
                "test_loss": round(te_loss, 6),
                "test_acc": round(te_acc, 6),
            }
            append_jsonl(log_row, metrics_path)

            print(f"[{dataset_name}][{name}] "
                  f"Epoch {epoch:03d}/{cfg['epochs']:03d} | "
                  f"train_acc={tr_acc:.4f} test_acc={te_acc:.4f}")

            if te_acc > best_acc:
                best_acc = te_acc
                torch.save(model, best_path_full)
                torch.save(model.state_dict(), best_path_sd)

        print(f"Best test acc (pre-prune): {best_acc:.4f}  -> saved to {best_path_full}")

        # Optional: apply Neff pruning AFTER training and save a pruned copy
        if do_prune_after_train:
            print(f"Applying Neff pruning ({prune_granularity}) to {name} on {dataset_name} ...")
            # Restore the best pre-prune weights from the state_dict rather than
            # un-pickling the whole model: recent PyTorch defaults torch.load to
            # weights_only=True, which rejects arbitrary pickled classes.
            # Skipped when no checkpoint was written during this run (e.g. 0 epochs).
            if best_acc >= 0.0:
                model.load_state_dict(torch.load(best_path_sd, map_location=device))
            model.eval()
            stats = apply_neff_pruning(model, granularity=prune_granularity)
            pruned_te_loss, pruned_te_acc = evaluate(model, test_loader, device)

            prune_report = {
                "granularity": prune_granularity,
                "layers": stats,
                "post_prune_test_acc": pruned_te_acc,
                "post_prune_test_loss": pruned_te_loss,
            }
            save_json(prune_report, run_dir / f"neff_prune_{prune_granularity}.json")
            torch.save(model, run_dir / f"best_postprune_neff_{prune_granularity}.pt")
            torch.save(model.state_dict(), run_dir / f"best_postprune_neff_{prune_granularity}_state_dict.pt")
            print(f"Post-prune acc: {pruned_te_acc:.4f}  -> pruned model saved.")

# --------------------------
# Main
# --------------------------
def main(datasets_to_run=("fashionmnist", "cifar10"),
         out_dir="./checkpoints_cnn_neff",
         do_prune_after_train=False,
         prune_granularity="per_filter",
         seed=1337):
    """
    Seed the RNGs, then train every config in `cnn_model_configs` on each dataset.

    Calling `main()` with no arguments behaves as before (training only, no pruning).

    Args:
        datasets_to_run: Dataset keys to iterate over, in order.
        out_dir: Root folder for checkpoints and logs.
        do_prune_after_train: Also apply Neff pruning after each model is trained.
        prune_granularity: "per_filter" or "per_tensor".
        seed: Seed passed to `seed_everything`.
    """
    seed_everything(seed)

    for dset in datasets_to_run:
        train_models_on_dataset(
            dataset_name=dset,
            configs=cnn_model_configs,
            batch_size=128,
            test_batch_size=1000,
            data_root="./data",
            out_dir=out_dir,
            do_prune_after_train=do_prune_after_train,
            prune_granularity=prune_granularity
        )

if __name__ == "__main__":
    main()


[fashionmnist] device: cuda

=== Training Tiny_Underfit on fashionmnist ===


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))
  with torch.cuda.amp.autocast(enabled=(device.type == "cuda")):


[fashionmnist][Tiny_Underfit] Epoch 001/010 | train_acc=0.1274 test_acc=0.1853
[fashionmnist][Tiny_Underfit] Epoch 002/010 | train_acc=0.2533 test_acc=0.3552
[fashionmnist][Tiny_Underfit] Epoch 003/010 | train_acc=0.3599 test_acc=0.3692
[fashionmnist][Tiny_Underfit] Epoch 004/010 | train_acc=0.3815 test_acc=0.3941
[fashionmnist][Tiny_Underfit] Epoch 005/010 | train_acc=0.4006 test_acc=0.4175
[fashionmnist][Tiny_Underfit] Epoch 006/010 | train_acc=0.4188 test_acc=0.4310
[fashionmnist][Tiny_Underfit] Epoch 007/010 | train_acc=0.4380 test_acc=0.4574
[fashionmnist][Tiny_Underfit] Epoch 008/010 | train_acc=0.4571 test_acc=0.4776
[fashionmnist][Tiny_Underfit] Epoch 009/010 | train_acc=0.4766 test_acc=0.4913
[fashionmnist][Tiny_Underfit] Epoch 010/010 | train_acc=0.4924 test_acc=0.4959
Best test acc (pre-prune): 0.4959  -> saved to checkpoints_cnn_neff\fashionmnist\Tiny_Underfit\best_preprune.pt

=== Training Deep_Narrow on fashionmnist ===
[fashionmnist][Deep_Narrow] Epoch 001/015 | train_ac

In [None]:
def main(datasets_to_run=("fashionmnist", "cifar10"),
         out_dir="./checkpoints_cnn_neff",
         do_prune_after_train=True,
         prune_granularity="per_filter",
         seed=1337):
    """
    Seed the RNGs, then train every config in `cnn_model_configs` on each
    dataset and Neff-prune the result.

    This cell's defaults are pruning ON with per-filter granularity; calling
    `main()` with no arguments behaves as before.

    Args:
        datasets_to_run: Dataset keys to iterate over, in order.
        out_dir: Root folder for checkpoints and logs.
        do_prune_after_train: Apply Neff pruning after each model is trained.
        prune_granularity: "per_filter" or "per_tensor".
        seed: Seed passed to `seed_everything`.
    """
    seed_everything(seed)

    for dset in datasets_to_run:
        train_models_on_dataset(
            dataset_name=dset,
            configs=cnn_model_configs,
            batch_size=128,
            test_batch_size=1000,
            data_root="./data",
            out_dir=out_dir,
            do_prune_after_train=do_prune_after_train,
            prune_granularity=prune_granularity
        )

if __name__ == "__main__":
    main()


[fashionmnist] device: cuda

=== Training Tiny_Underfit on fashionmnist ===


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))
  with torch.cuda.amp.autocast(enabled=(device.type == "cuda")):


[fashionmnist][Tiny_Underfit] Epoch 001/010 | train_acc=0.1274 test_acc=0.1853
[fashionmnist][Tiny_Underfit] Epoch 002/010 | train_acc=0.2533 test_acc=0.3552
[fashionmnist][Tiny_Underfit] Epoch 003/010 | train_acc=0.3599 test_acc=0.3692
[fashionmnist][Tiny_Underfit] Epoch 004/010 | train_acc=0.3815 test_acc=0.3941
[fashionmnist][Tiny_Underfit] Epoch 005/010 | train_acc=0.4006 test_acc=0.4175
[fashionmnist][Tiny_Underfit] Epoch 006/010 | train_acc=0.4188 test_acc=0.4310
[fashionmnist][Tiny_Underfit] Epoch 007/010 | train_acc=0.4380 test_acc=0.4574
[fashionmnist][Tiny_Underfit] Epoch 008/010 | train_acc=0.4571 test_acc=0.4776
[fashionmnist][Tiny_Underfit] Epoch 009/010 | train_acc=0.4766 test_acc=0.4913
[fashionmnist][Tiny_Underfit] Epoch 010/010 | train_acc=0.4924 test_acc=0.4959
Best test acc (pre-prune): 0.4959  -> saved to checkpoints_cnn_neff\fashionmnist\Tiny_Underfit\best_preprune.pt
Applying Neff pruning (per_filter) to Tiny_Underfit on fashionmnist ...
Post-prune acc: 0.3474  ->

In [None]:
def main(datasets_to_run=("fashionmnist", "cifar10"),
         out_dir="./checkpoints_cnn_neff",
         do_prune_after_train=True,
         prune_granularity="per_tensor",
         seed=1337):
    """
    Seed the RNGs, then train every config in `cnn_model_configs` on each
    dataset and Neff-prune the result.

    This cell's defaults are pruning ON with per-tensor granularity; calling
    `main()` with no arguments behaves as before.

    Args:
        datasets_to_run: Dataset keys to iterate over, in order.
        out_dir: Root folder for checkpoints and logs.
        do_prune_after_train: Apply Neff pruning after each model is trained.
        prune_granularity: "per_filter" or "per_tensor".
        seed: Seed passed to `seed_everything`.
    """
    seed_everything(seed)

    for dset in datasets_to_run:
        train_models_on_dataset(
            dataset_name=dset,
            configs=cnn_model_configs,
            batch_size=128,
            test_batch_size=1000,
            data_root="./data",
            out_dir=out_dir,
            do_prune_after_train=do_prune_after_train,
            prune_granularity=prune_granularity
        )

if __name__ == "__main__":
    main()


[fashionmnist] device: cuda

=== Training Tiny_Underfit on fashionmnist ===


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))
  with torch.cuda.amp.autocast(enabled=(device.type == "cuda")):


[fashionmnist][Tiny_Underfit] Epoch 001/010 | train_acc=0.1274 test_acc=0.1853
[fashionmnist][Tiny_Underfit] Epoch 002/010 | train_acc=0.2533 test_acc=0.3552
[fashionmnist][Tiny_Underfit] Epoch 003/010 | train_acc=0.3599 test_acc=0.3692
[fashionmnist][Tiny_Underfit] Epoch 004/010 | train_acc=0.3815 test_acc=0.3941
[fashionmnist][Tiny_Underfit] Epoch 005/010 | train_acc=0.4006 test_acc=0.4175
[fashionmnist][Tiny_Underfit] Epoch 006/010 | train_acc=0.4188 test_acc=0.4310
[fashionmnist][Tiny_Underfit] Epoch 007/010 | train_acc=0.4380 test_acc=0.4574
[fashionmnist][Tiny_Underfit] Epoch 008/010 | train_acc=0.4571 test_acc=0.4776
[fashionmnist][Tiny_Underfit] Epoch 009/010 | train_acc=0.4766 test_acc=0.4913
[fashionmnist][Tiny_Underfit] Epoch 010/010 | train_acc=0.4924 test_acc=0.4959
Best test acc (pre-prune): 0.4959  -> saved to checkpoints_cnn_neff\fashionmnist\Tiny_Underfit\best_preprune.pt
Applying Neff pruning (per_tensor) to Tiny_Underfit on fashionmnist ...
Post-prune acc: 0.3736  ->