Libraries (don't change)

In [None]:
!pip -q install torchinfo

from dataclasses import dataclass
from typing import List, Callable, Optional, Tuple

import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
from torchinfo import summary

from sklearn.metrics import confusion_matrix, classification_report

import matplotlib.pyplot as plt
%matplotlib inline


Device (don't change)

In [None]:

# Query CUDA once: mixed precision (AMP) is switched on exactly when the
# GPU is the selected device, otherwise everything runs on the CPU.
use_amp = torch.cuda.is_available()
DEVICE = "cuda" if use_amp else "cpu"

print(f"Using device: {DEVICE}")
print(f"Mixed precision (AMP): {use_amp}")

Data (don't change)

In [None]:

class DataManager:
    """Download a torchvision-style dataset and expose train/val/test loaders.

    ``dataset_class`` is called as
    ``dataset_class(root=..., train=..., download=True, transform=...)``.
    The validation set is carved out of the training split with a seeded
    ``random_split`` so the partition is the same on every call.
    """

    def __init__(self, dataset_class, root: str = "./data", val_fraction: float = 0.1,
                 batch_size: int = 32, seed: int = 42):
        self.dataset_class = dataset_class
        self.root = root
        self.val_fraction = val_fraction
        self.batch_size = batch_size
        self.seed = seed

        # Fixed single-channel normalisation constants
        # (presumably the dataset's mean/std -- TODO confirm).
        to_tensor = transforms.ToTensor()
        normalize = transforms.Normalize((0.1918,), (0.3483,))
        self.transform = transforms.Compose([to_tensor, normalize])

    def get_loaders(self) -> Tuple[DataLoader, DataLoader, DataLoader]:
        """Return ``(train_loader, val_loader, test_loader)``.

        Only the training loader shuffles; all three share ``batch_size``,
        two worker processes and pinned memory.
        """
        # Instantiate the train split first, then the test split.
        splits = {
            is_train: self.dataset_class(root=self.root, train=is_train,
                                         download=True, transform=self.transform)
            for is_train in (True, False)
        }
        full_train, test_ds = splits[True], splits[False]

        n_total = len(full_train)
        n_val = int(n_total * self.val_fraction)
        rng = torch.Generator().manual_seed(self.seed)
        train_ds, val_ds = random_split(full_train, [n_total - n_val, n_val], generator=rng)

        def make_loader(dataset, shuffle: bool) -> DataLoader:
            return DataLoader(dataset, batch_size=self.batch_size,
                              shuffle=shuffle, num_workers=2, pin_memory=True)

        train_loader = make_loader(train_ds, True)
        val_loader = make_loader(val_ds, False)
        test_loader = make_loader(test_ds, False)

        print(f"Train: {len(train_ds)} | Val: {len(val_ds)} | Test: {len(test_ds)}")
        return train_loader, val_loader, test_loader

Configurations (don't change)

In [None]:

@dataclass
class LayerSpec:
    """Configuration of one hidden layer of the MLP built by MLPFromConfig."""
    # Output width of this layer's nn.Linear.
    out_dim: int
    # Activation applied at the end of the layer. The run cells pass an
    # nn.Module class (e.g. nn.ReLU); the default is the functional F.relu.
    activation: Callable[[torch.Tensor], torch.Tensor] = F.relu
    # Dropout probability; an nn.Dropout is only inserted when this is > 0.
    dropout: float = 0.0
    # Whether an nn.BatchNorm1d follows the linear layer.
    batch_norm: bool = True
    # Weight decay the Trainer assigns to this layer's optimizer param group.
    weight_decay: float = 0.0

@dataclass
class ModelConfig:
    """Architecture description consumed by MLPFromConfig."""
    # The three entries are multiplied together to get the flattened input
    # size (presumably channels, height, width -- TODO confirm).
    input_dim: Tuple[int, int, int] = (1, 28, 28)
    # Output width of the final classifier layer.
    num_classes: int = 10
    # Hidden layers in order; defaults to None, so callers normally pass a list.
    layers: Optional[List[LayerSpec]] = None

@dataclass
class TrainConfig:
    """Training hyperparameters shared by the run cells, DataManager and Trainer."""
    # Forwarded to DataManager by the run cell; the Trainer does not read it.
    batch_size: int = 64
    # Upper bound on the number of epochs run by Trainer.fit.
    epochs: int = 100
    # Learning rate of the SGD optimizer.
    lr: float = 1e-4
    # Early stopping: epochs without sufficient improvement before stopping.
    patience: int = 15
    # Early stopping: minimum decrease in validation loss that counts as improvement.
    min_delta: float = 1e-4
    # Fraction of the training split held out for validation (used by DataManager).
    val_fraction: float = 0.1
    # Seed forwarded to DataManager for the train/validation split.
    seed: int = 42


Model

In [None]:
    
class _FunctionalActivation(nn.Module):
    """Adapt a plain tensor function (e.g. ``F.relu``) to the ``nn.Module`` API."""

    def __init__(self, fn: Callable[[torch.Tensor], torch.Tensor]):
        super().__init__()
        self.fn = fn

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fn(x)

    def extra_repr(self) -> str:
        return getattr(self.fn, "__name__", repr(self.fn))


class MLPFromConfig(nn.Module):
    """Multi-layer perceptron assembled from a ``ModelConfig``.

    Every ``LayerSpec`` in ``config.layers`` contributes, in this order:
    ``nn.Linear`` -> optional ``nn.BatchNorm1d`` -> optional ``nn.Dropout``
    -> activation. A final ``nn.Linear(prev_dim, config.num_classes)`` is
    always appended after the configured layers, so the specs describe the
    hidden layers only.

    Attributes:
        layers_specs: The list of layer specs the network was built from
            (an empty list when ``config.layers`` is ``None``).
        final_linear: The classifier layer (also the last entry of ``net``).
        net: The ``nn.Sequential`` holding all layers.
    """

    def __init__(self, config: "ModelConfig"):
        super().__init__()
        flat_dim = config.input_dim[0] * config.input_dim[1] * config.input_dim[2]
        # ModelConfig.layers defaults to None; treat that as "no hidden layers"
        # instead of crashing while iterating.
        self.layers_specs = config.layers if config.layers is not None else []

        layers = []
        prev_dim = flat_dim
        for spec in self.layers_specs:
            layers.append(nn.Linear(prev_dim, spec.out_dim))
            if spec.batch_norm:
                layers.append(nn.BatchNorm1d(spec.out_dim))
            if spec.dropout > 0:
                layers.append(nn.Dropout(spec.dropout))
            layers.append(self._make_activation(spec.activation))
            prev_dim = spec.out_dim

        # Final classifier layer
        self.final_linear = nn.Linear(prev_dim, config.num_classes)
        layers.append(self.final_linear)

        self.net = nn.Sequential(*layers)

    @staticmethod
    def _make_activation(activation) -> nn.Module:
        """Turn a ``LayerSpec.activation`` value into a module.

        Accepted forms: an ``nn.Module`` instance, a class or zero-argument
        factory producing one (e.g. ``nn.ReLU``), or a tensor function such
        as ``F.relu`` (the ``LayerSpec`` default), which is wrapped.

        Raises:
            TypeError: If the value is not callable, or is a zero-argument
                callable that does not return an ``nn.Module``.
        """
        if isinstance(activation, nn.Module):
            return activation
        if isinstance(activation, type):
            # Module class: instantiate and let constructor errors surface.
            return activation()
        if not callable(activation):
            raise TypeError(f"activation must be callable, got {type(activation).__name__}")
        try:
            built = activation()  # zero-argument factory
        except TypeError:
            # Needs a tensor argument, so it is a functional activation.
            return _FunctionalActivation(activation)
        if not isinstance(built, nn.Module):
            raise TypeError("activation factory must return an nn.Module")
        return built

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Flatten everything after the first (batch) dimension and apply the net."""
        # reshape (unlike view) also accepts non-contiguous inputs.
        x = x.reshape(x.size(0), -1)
        return self.net(x)

    def get_layer_params(self):
        """Return the list of layer specs this model was built from."""
        return self.layers_specs

Early Stopping (don't change)

In [None]:

class EarlyStopping:
    """Signal a stop once the validation loss has stalled for ``patience`` calls.

    A call counts as an improvement only when the loss drops below the best
    value seen so far by more than ``min_delta``. Once the stop flag has been
    raised it stays raised.
    """

    def __init__(self, patience: int = 10, min_delta: float = 1e-4):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')
        self.should_stop = False

    def __call__(self, val_loss: float) -> bool:
        """Record ``val_loss`` and return whether training should stop."""
        threshold = self.best_loss - self.min_delta
        if val_loss < threshold:
            # Sufficient improvement: remember it and restart the countdown.
            self.best_loss, self.counter = val_loss, 0
            return self.should_stop

        self.counter += 1
        if self.counter >= self.patience:
            self.should_stop = True
        return self.should_stop

Trainer (don't change)

In [None]:

class Trainer:
    """Train, evaluate and checkpoint a model with SGD and early stopping.

    The model is moved to the module-level ``DEVICE``; autocast and the
    gradient scaler are enabled by the module-level ``use_amp`` flag.
    ``history`` collects per-epoch train/validation loss and accuracy.
    """

    def __init__(self, model: nn.Module, config: "TrainConfig"):
        self.model = model.to(DEVICE)
        self.config = config
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = self._build_optimizer()
        self.scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
        self.early_stopping = EarlyStopping(patience=config.patience,
                                            min_delta=config.min_delta)

        self.history = {"train_loss": [], "train_acc": [],
                        "val_loss": [], "val_acc": []}

    def _build_optimizer(self):
        """Build Nesterov SGD with one parameter group per linear layer.

        Hidden linear layers take the ``weight_decay`` of their spec in
        ``model.layers_specs``; the last linear layer uses no weight decay.
        Every remaining parameter (e.g. BatchNorm scale/shift) is collected
        in a trailing group without weight decay, so that it is optimised
        too instead of staying at its initial value.
        """
        # Linear layers in the order they appear in the model.
        linear_layers = [module for module in self.model.modules()
                         if isinstance(module, nn.Linear)]

        param_groups = []
        claimed = set()  # ids of parameters already assigned to a group

        def add_group(params, weight_decay):
            params = list(params)
            claimed.update(id(p) for p in params)
            param_groups.append({'params': params, 'weight_decay': weight_decay})

        for i, spec in enumerate(self.model.layers_specs):
            add_group(linear_layers[i].parameters(), spec.weight_decay)

        add_group(linear_layers[-1].parameters(), 0.0)

        leftover = [p for p in self.model.parameters() if id(p) not in claimed]
        if leftover:
            param_groups.append({'params': leftover, 'weight_decay': 0.0})

        return torch.optim.SGD(param_groups, momentum=0.9, nesterov=True, lr=self.config.lr)

    def _train_epoch(self, loader: DataLoader):
        """Run one optimisation pass over ``loader``.

        Returns:
            ``(mean loss per sample, accuracy)`` measured on the training
            batches while the model is in train mode.
        """
        self.model.train()
        total_loss = 0.0
        correct = 0
        total = 0

        for data, target in loader:
            data, target = data.to(DEVICE), target.to(DEVICE)

            self.optimizer.zero_grad()
            with torch.cuda.amp.autocast(enabled=use_amp):
                output = self.model(data)
                loss = self.criterion(output, target)

            # The scaler handles loss scaling for AMP and is a pass-through
            # when it is disabled.
            self.scaler.scale(loss).backward()
            self.scaler.step(self.optimizer)
            self.scaler.update()

            # criterion returns the batch mean; weight it by the batch size
            # so the epoch value is a per-sample mean.
            total_loss += loss.item() * data.size(0)
            correct += (output.argmax(1) == target).sum().item()
            total += data.size(0)

        return total_loss / total, correct / total

    @torch.no_grad()
    def _eval_epoch(self, loader: DataLoader):
        """Return ``(mean loss per sample, accuracy)`` on ``loader`` in eval mode."""
        self.model.eval()
        total_loss = 0.0
        correct = 0
        total = 0

        for data, target in loader:
            data, target = data.to(DEVICE), target.to(DEVICE)
            with torch.cuda.amp.autocast(enabled=use_amp):
                output = self.model(data)
                loss = self.criterion(output, target)

            total_loss += loss.item() * data.size(0)
            correct += (output.argmax(1) == target).sum().item()
            total += data.size(0)

        return total_loss / total, correct / total

    def fit(self, train_loader: DataLoader, val_loader: DataLoader):
        """Train for up to ``config.epochs`` epochs, stopping early on stalled val loss.

        Metrics of every epoch are appended to ``self.history`` and printed.
        """
        print("üöÄ Starting training...\n")
        for epoch in range(1, self.config.epochs + 1):
            train_loss, train_acc = self._train_epoch(train_loader)
            val_loss, val_acc     = self._eval_epoch(val_loader)

            self.history["train_loss"].append(train_loss)
            self.history["train_acc"].append(train_acc)
            self.history["val_loss"].append(val_loss)
            self.history["val_acc"].append(val_acc)

            print(f"Epoch {epoch:3d} | "
                  f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | "
                  f"Val Loss: {val_loss:.4f} Acc: {val_acc:.4f}")

            if self.early_stopping(val_loss):
                print(f"\nüõë Early stopping triggered at epoch {epoch}")
                break

        print("\n‚úÖ Training complete!")

    @torch.no_grad()
    def evaluate(self, loader: DataLoader):
        """Return ``(mean loss per sample, accuracy)`` on ``loader``."""
        return self._eval_epoch(loader)

    @torch.no_grad()
    def predict_all(self, loader: DataLoader):
        """Return ``(predictions, targets)`` for ``loader`` as NumPy arrays.

        Both arrays are gathered from the same batches, so they stay aligned
        even when the loader shuffles.
        """
        self.model.eval()
        all_preds, all_targets = [], []
        for x, y in loader:
            x = x.to(DEVICE, non_blocking=True)
            logits = self.model(x)
            preds = logits.argmax(dim=1).cpu().numpy()
            all_preds.append(preds)
            all_targets.append(y.numpy())
        return np.concatenate(all_preds), np.concatenate(all_targets)

    def save(self, path: str = "mlp_best.pt"):
        """Write the model's current ``state_dict`` to ``path``."""
        torch.save(self.model.state_dict(), path)
        print(f"üíæ Model saved to {path}")



Run (do change)

In [None]:
# Baseline run: one configuration trained end to end and saved to disk.
# Fields not listed here (min_delta, seed) keep their TrainConfig defaults.
train_cfg = TrainConfig(
    batch_size=128,
    epochs=100,
    lr=1e-4,
    patience=5,
    val_fraction=0.1
)

# The loaders are created once and shared by everything below in this cell.
data_mgr = DataManager(
    dataset_class=datasets.KMNIST,
    val_fraction=train_cfg.val_fraction,
    batch_size=train_cfg.batch_size,
    seed=train_cfg.seed
)
train_loader, val_loader, test_loader = data_mgr.get_loaders()

model_cfg = ModelConfig(
    layers=[
        # Hidden layers go here as LayerSpec entries; MLPFromConfig appends the
        # final classifier (num_classes outputs) itself. For example:
        LayerSpec(out_dim=10,  dropout=0.1, batch_norm=False, activation = nn.ReLU, weight_decay=5e-1),
    ]
)


# Build model
model = MLPFromConfig(model_cfg)
# The model flattens everything after the first dimension, so (1, 28, 28)
# is summarised as a single sample with 784 input features.
print(summary(model, input_size=(1, 28, 28)))

trainer = Trainer(model, train_cfg)
trainer.fit(train_loader, val_loader)
# Saves the weights as they are when fit() returns.
trainer.save("mlp_colab_best.pt")


try:
    import mlflow
except ImportError:
    !pip -q install mlflow
    import mlflow

import pandas as pd
from sklearn.metrics import classification_report, accuracy_score

def compute_metrics(y_true, y_pred, num_classes: int = 10):
    """Compute accuracy plus per-class and averaged precision/recall/F1.

    Args:
        y_true: Ground-truth class indices.
        y_pred: Predicted class indices.
        num_classes: Number of classes; classes are assumed to be labelled
            ``0 .. num_classes - 1``.

    Returns:
        A flat ``dict`` of floats with the keys ``accuracy``,
        ``precision_c{c}`` / ``recall_c{c}`` / ``f1_c{c}`` for every class,
        and ``{precision,recall,f1}_{macro,weighted}``.
    """
    labels = list(range(num_classes))
    # Passing the labels explicitly guarantees an entry for every class, even
    # one that occurs in neither y_true nor y_pred (it scores 0 through
    # zero_division=0 instead of raising KeyError below).
    rep = classification_report(y_true, y_pred, labels=labels,
                                output_dict=True, zero_division=0)
    acc = accuracy_score(y_true, y_pred)

    out = {"accuracy": float(acc)}

    # Per-class metrics
    for c in labels:
        per_class = rep[str(c)]
        out[f"precision_c{c}"] = float(per_class["precision"])
        out[f"recall_c{c}"]    = float(per_class["recall"])
        out[f"f1_c{c}"]        = float(per_class["f1-score"])

    # Averages
    for report_key, suffix in (("macro avg", "macro"), ("weighted avg", "weighted")):
        averaged = rep[report_key]
        out[f"precision_{suffix}"] = float(averaged["precision"])
        out[f"recall_{suffix}"]    = float(averaged["recall"])
        out[f"f1_{suffix}"]        = float(averaged["f1-score"])

    return out

# All runs of the sweep below are grouped under this MLflow experiment.
mlflow.set_experiment("KMNIST_ex2")


# Sweep definition: each entry is (run name, learning rate, list of LayerSpec).
# NOTE(review): MLPFromConfig appends its own nn.Linear(prev_dim, num_classes)
# after the listed specs, so the trailing LayerSpec(out_dim=10, ...) of every
# entry becomes a 10-unit hidden layer with ReLU followed by a further 10 -> 10
# classifier -- confirm that this bottleneck is intended.
experiments = [
    ("exp01_256_128_lr1e3", 1e-3, [
        LayerSpec(out_dim=256, dropout=0.1, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=128, dropout=0.1, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=10,  dropout=0.0, batch_norm=False, activation=nn.ReLU, weight_decay=0.0),
    ]),
    ("exp02_512_256_lr1e3", 1e-3, [
        LayerSpec(out_dim=512, dropout=0.2, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=256, dropout=0.2, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=10,  dropout=0.0, batch_norm=False, activation=nn.ReLU, weight_decay=0.0),
    ]),
    ("exp03_512_256_128_lr1e3", 1e-3, [
        LayerSpec(out_dim=512, dropout=0.3, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=256, dropout=0.3, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=128, dropout=0.2, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=10,  dropout=0.0, batch_norm=False, activation=nn.ReLU, weight_decay=0.0),
    ]),
    ("exp04_256_128_noBN_lr1e3", 1e-3, [
        LayerSpec(out_dim=256, dropout=0.1, batch_norm=False, activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=128, dropout=0.1, batch_norm=False, activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=10,  dropout=0.0, batch_norm=False, activation=nn.ReLU, weight_decay=0.0),
    ]),
    ("exp05_512_256_lr3e4", 3e-4, [
        LayerSpec(out_dim=512, dropout=0.2, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=256, dropout=0.2, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=10,  dropout=0.0, batch_norm=False, activation=nn.ReLU, weight_decay=0.0),
    ]),
    ("exp06_512_256_lr2e3", 2e-3, [
        LayerSpec(out_dim=512, dropout=0.2, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=256, dropout=0.2, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=10,  dropout=0.0, batch_norm=False, activation=nn.ReLU, weight_decay=0.0),
    ]),
    ("exp07_1024_256_lr1e3", 1e-3, [
        LayerSpec(out_dim=1024, dropout=0.3, batch_norm=True, activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=256,  dropout=0.3, batch_norm=True, activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=10,   dropout=0.0, batch_norm=False, activation=nn.ReLU, weight_decay=0.0),
    ]),
    ("exp08_256_256_256_lr1e3", 1e-3, [
        LayerSpec(out_dim=256, dropout=0.2, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=256, dropout=0.2, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=256, dropout=0.2, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=10,  dropout=0.0, batch_norm=False, activation=nn.ReLU, weight_decay=0.0),
    ]),
    ("exp09_512_512_lr1e3", 1e-3, [
        LayerSpec(out_dim=512, dropout=0.4, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=512, dropout=0.4, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-4),
        LayerSpec(out_dim=10,  dropout=0.0, batch_norm=False, activation=nn.ReLU, weight_decay=0.0),
    ]),
    ("exp10_512_256_wd1e3_lr1e3", 1e-3, [
        LayerSpec(out_dim=512, dropout=0.2, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-3),
        LayerSpec(out_dim=256, dropout=0.2, batch_norm=True,  activation=nn.ReLU, weight_decay=1e-3),
        LayerSpec(out_dim=10,  dropout=0.0, batch_norm=False, activation=nn.ReLU, weight_decay=0.0),
    ]),
]

# Sweep driver: train every entry of `experiments`, log it to MLflow, keep the
# best checkpoint and collect one summary row per run for the Excel export.
results_rows = []
best_test_acc = -1.0
# Same file the baseline run saved to; it is overwritten by the best sweep model.
best_model_path = "mlp_colab_best.pt"

for exp_name, lr, layers in experiments:
    # Per-experiment train config: identical to train_cfg except for the
    # learning rate (train_cfg itself is left untouched).
    train_cfg_exp = TrainConfig(
        batch_size=train_cfg.batch_size,
        epochs=train_cfg.epochs,
        lr=lr,
        patience=train_cfg.patience,
        min_delta=train_cfg.min_delta,
        val_fraction=train_cfg.val_fraction,
        seed=train_cfg.seed,
    )

    model_cfg_exp = ModelConfig(layers=layers)

    # Re-seed before every run so weight initialisation, dropout masks and
    # batch shuffling start from the same state; otherwise runs are neither
    # reproducible nor comparable with each other.
    torch.manual_seed(train_cfg_exp.seed)

    with mlflow.start_run(run_name=exp_name) as run:
        # Log the training setup and the architecture, so that runs can be
        # told apart from their parameters and not only from their name.
        mlflow.log_params({
            "exp_name": exp_name,
            "lr": lr,
            "epochs": train_cfg_exp.epochs,
            "batch_size": train_cfg_exp.batch_size,
            "patience": train_cfg_exp.patience,
            "val_fraction": train_cfg_exp.val_fraction,
            "seed": train_cfg_exp.seed,
            "layer_out_dims": str([spec.out_dim for spec in layers]),
            "layer_dropout": str([spec.dropout for spec in layers]),
            "layer_batch_norm": str([spec.batch_norm for spec in layers]),
            "layer_weight_decay": str([spec.weight_decay for spec in layers]),
        })

        # build + train
        model_exp = MLPFromConfig(model_cfg_exp)
        trainer_exp = Trainer(model_exp, train_cfg_exp)
        trainer_exp.fit(train_loader, val_loader)

        # evaluate TRAIN + TEST (required)
        preds_tr, targets_tr = trainer_exp.predict_all(train_loader)
        preds_te, targets_te = trainer_exp.predict_all(test_loader)

        train_metrics = compute_metrics(targets_tr, preds_tr)
        test_metrics  = compute_metrics(targets_te, preds_te)

        mlflow.log_metrics({f"train_{k}": float(v) for k, v in train_metrics.items()})
        mlflow.log_metrics({f"test_{k}": float(v) for k, v in test_metrics.items()})

        # keep best
        # NOTE(review): the checkpoint is selected on test accuracy, so the
        # reported best test score is optimistically biased; selecting on the
        # validation split would avoid that.
        if test_metrics["accuracy"] > best_test_acc:
            best_test_acc = test_metrics["accuracy"]
            trainer_exp.save(best_model_path)

        # row for Excel
        row = {
            "exp_name": exp_name,
            "run_id": run.info.run_id,
            "lr": lr,
            "train_accuracy": train_metrics["accuracy"],
            "test_accuracy": test_metrics["accuracy"],
        }
        for split, metrics in (("train", train_metrics), ("test", test_metrics)):
            for stat in ("precision", "recall", "f1"):
                row[f"{split}_{stat}_macro"] = metrics[f"{stat}_macro"]
        for c in range(10):
            for split, metrics in (("train", train_metrics), ("test", test_metrics)):
                for stat in ("precision", "recall", "f1"):
                    row[f"{split}_{stat}_c{c}"] = metrics[f"{stat}_c{c}"]

        results_rows.append(row)

print(f"‚úÖ Best TEST accuracy: {best_test_acc:.4f}")
print(f"üíæ Best model saved as: {best_model_path}")

# Excel output
df = pd.DataFrame(results_rows).sort_values(by="test_accuracy", ascending=False)
excel_path = "ex2_results.xlsx"
df.to_excel(excel_path, index=False)
print(f"üìÑ Excel saved: {excel_path}")

df.head(10)


Visualize the training


In [None]:

# Training curves of `trainer`: loss on the left panel, accuracy on the right,
# each with the train and validation series over the epochs.
history = trainer.history

plt.figure(figsize=(12, 4))

# (panel title, history key suffix, legend label suffix)
panels = (
    ("Loss", "loss", "Loss"),
    ("Accuracy", "acc", "Acc"),
)
for position, (title, key, short) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    for split in ("train", "val"):
        plt.plot(history[f"{split}_{key}"], label=f"{split.capitalize()} {short}")
    plt.title(title)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()




Visualize the test results (do change)

In [None]:
# Final report for `trainer` (the baseline trainer fitted in the run cell,
# not one of the per-experiment trainers created inside the sweep loop).
# Loss and accuracy over the entire test set.
test_loss, test_acc = trainer.evaluate(test_loader)
print(f"üèÜ Test Loss: {test_loss:.4f} | Test Accuracy: {test_acc:.4f}")

# predictions (a second pass over the test set, with the model in eval mode)
preds, targets = trainer.predict_all(test_loader)

# confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(targets, preds)
plt.figure()
plt.imshow(cm)
plt.title("Confusion Matrix (Test)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.colorbar()
plt.show()


# per-class report
print("Classification report (Test):")
print(classification_report(targets, preds, digits=4))

# ===== TRAIN (required for overfitting check) =====
# Evaluated in eval mode, so these numbers can differ from the per-epoch
# training metrics, which are measured in train mode.
train_loss, train_acc = trainer.evaluate(train_loader)
print(f"\nüèãÔ∏è Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f}")

# train_loader shuffles, but predictions and targets are collected from the
# same batches, so the two arrays stay aligned.
preds_train, targets_train = trainer.predict_all(train_loader)

print("\nClassification report (Train):")
print(classification_report(targets_train, preds_train, digits=4))
