#**ResNet18 Teacher Model - TSER for KD**

#**Setup**

---
**Install Libraries**

In [1]:
!pip install snntorch dagshub mlflow pynvml --quiet

---
**GitHub Code**

In [2]:
from google.colab import userdata
import os

# Sets environ variables for GitHub
os.environ['GITHUB_TOKEN'] = userdata.get('GITHUB_TOKEN')
os.environ['USER'] = userdata.get('USER')

# Clones the repo and changes dir
!git clone -b dev https://${GITHUB_TOKEN}@github.com/${USER}/tser-kd.git
%cd tser-kd/

fatal: destination path 'tser-kd' already exists and is not an empty directory.
/content/tser-kd


---
**Set Seed for Experiment**

In [3]:
from tser_kd.utils import setup_seed

# Seed handed to the project helper that seeds the experiment
seed = 42
setup_seed(seed)

Random seed: 42


---
**Device Selection**

In [4]:
import torch

# Uses the GPU when CUDA is available, otherwise falls back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

---
**MLFlow Setup**

In [5]:
import mlflow
from mlflow import MlflowClient
import dagshub

# Address of the MLFlow tracking server hosted on DagsHub
TRACKING_URI = "https://dagshub.com/matteogianferrari/tser-kd.mlflow"

# MLFlow credentials, taken from the Colab secrets and passed through environment variables
os.environ['MLFLOW_TRACKING_USERNAME'] = userdata.get('USER')
os.environ['MLFLOW_TRACKING_PASSWORD'] = userdata.get('MLFLOW_TRACKING_PASSWORD')

# Initializes DagsHub for the repository with the MLFlow integration turned on
dagshub.init(repo_owner='matteogianferrari', repo_name='tser-kd', mlflow=True)

# Points MLFlow at the tracking URI
mlflow.set_tracking_uri(TRACKING_URI)

# Name of the MLFlow experiment selected later by the training cell
experiment_name = "TSER-KD Teacher"

#**Hyperparameters**

In [6]:
# Hyperparameter dictionary
h_dict = dict(
    # Training
    MAX_EPOCHS=5,
    BATCH_SIZE=64,
    # LR
    LR_SCHEDULER="CosineAnnealingLR",
    BASE_LR=5e-4,
    # Optimizer
    OPTIMIZER="AdamW",
    WEIGHT_DECAY=5e-4,
    # GPU
    HARDWARE="L4",
)

#**MNIST Dataset**

---
**Data Loaders Creation**

In [7]:
from tser_kd.dataset import load_mnist_data
from torch.utils.data import DataLoader


# Training dataset, validation dataset and number of classes returned by the project loader
train_dataset, val_dataset, num_classes = load_mnist_data()

# Settings shared by the two loaders
loader_kwargs = {"batch_size": h_dict['BATCH_SIZE'], "num_workers": 2}

# Creates the train and validation DataLoaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

#**Teacher ResNet-18**


---
**ResNet-18**

In [8]:
from tser_kd.model.teacher import make_teacher_model


# ANN teacher: ResNet-18 architecture, single input channel, one output per dataset class
teacher_kwargs = dict(arch='resnet-18', in_channels=1, num_classes=num_classes, device=device)
t_model = make_teacher_model(**teacher_kwargs)

# **Training**

---
**Objects Creation**

In [9]:
import torch.optim as optim
import torch.nn as nn
from tser_kd.utils import AccuracyMonitor


# Optimizer, selected by name from the hyperparameters.
# NOTE: the 'SGD' branch also reads h_dict["MOMENTUM"]; add that key to h_dict before selecting it.
if h_dict["OPTIMIZER"] == 'AdamW':
    optimizer = optim.AdamW(t_model.parameters(), lr=h_dict['BASE_LR'], weight_decay=h_dict['WEIGHT_DECAY'])
elif h_dict["OPTIMIZER"] == 'Adam':
    optimizer = optim.Adam(t_model.parameters(), lr=h_dict['BASE_LR'], weight_decay=h_dict['WEIGHT_DECAY'])
elif h_dict["OPTIMIZER"] == 'SGD':
    optimizer = optim.SGD(t_model.parameters(), lr=h_dict['BASE_LR'], momentum=h_dict["MOMENTUM"], weight_decay=h_dict['WEIGHT_DECAY'])
else:
    # Fails here with a clear message instead of a NameError on `optimizer` further down
    raise ValueError(f"Unsupported OPTIMIZER {h_dict['OPTIMIZER']!r}: expected 'AdamW', 'Adam' or 'SGD'")

# LR scheduler, selected by name from the hyperparameters.
# NOTE: the 'ReduceLROnPlateau' branch also reads h_dict["LR_PATIENCE"] and h_dict["LR_FACTOR"];
# add those keys to h_dict before selecting it.
if h_dict["LR_SCHEDULER"] == 'ReduceLROnPlateau':
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=h_dict["LR_PATIENCE"], factor=h_dict["LR_FACTOR"])
elif h_dict["LR_SCHEDULER"] == 'CosineAnnealingLR':
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=h_dict["MAX_EPOCHS"])
else:
    # Fails here with a clear message instead of a NameError on `scheduler` in the training loop
    raise ValueError(
        f"Unsupported LR_SCHEDULER {h_dict['LR_SCHEDULER']!r}: expected 'ReduceLROnPlateau' or 'CosineAnnealingLR'"
    )

# Loss
criterion = nn.CrossEntropyLoss()

# Accuracy monitor (configured with the checkpoint path that the training cell reloads)
acc_monitor = AccuracyMonitor(path="best_ckpt.pth")

# Gradient scaler: follows the selected device instead of hard-coding 'cuda',
# and is only enabled when the experiment actually runs on CUDA
scaler = torch.amp.GradScaler(device=device.type, enabled=(device.type == "cuda"))

---
**Training Loop**

In [10]:
import pynvml
from tser_kd.training import run_train
from tser_kd.eval import run_eval


# Sets the MLFlow experiment
mlflow.set_experiment(experiment_name)

# LR used by the upcoming epoch; it is reported together with that epoch's metrics
curr_lr = optimizer.param_groups[0]["lr"]

# Train the model and log with MLFlow
with mlflow.start_run(log_system_metrics=True):
    # Log hyperparameters first, so that they are recorded even if training is interrupted
    mlflow.log_params(h_dict)

    for epoch_i in range(h_dict["MAX_EPOCHS"]):
        train_loss, train_acc, epoch_time, train_batch_time = run_train(
            epoch_i, train_loader, t_model, criterion, optimizer, device, scaler
        )

        val_loss, val_acc1, val_acc5, val_batch_time = run_eval(val_loader, t_model, criterion, device)

        # Logging
        print(
            f"Time: {epoch_time:.1f}s | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
            f"Val Loss: {val_loss:.4f} | Val Acc1: {val_acc1:.2f}% | Val Acc5: {val_acc5:.2f}% | LR: {curr_lr:.6f}"
        )

        mlflow.log_metrics({
            "learning_rate": curr_lr, "train_loss": train_loss, "train_acc": train_acc, "val_loss": val_loss,
            "val_acc1": val_acc1, "val_acc5": val_acc5, "epoch_time": epoch_time,
            "train_batch_time": train_batch_time, "val_batch_time": val_batch_time
        }, step=epoch_i)

        # Updates the LR (ReduceLROnPlateau needs the monitored metric, the other schedulers do not)
        if h_dict["LR_SCHEDULER"] == 'ReduceLROnPlateau':
            scheduler.step(val_loss)
        else:
            scheduler.step()

        curr_lr = optimizer.param_groups[0]["lr"]

        # Accuracy monitor
        acc_monitor(val_acc1, epoch_i, t_model)

    # Log test performance.
    # The checkpoint is mapped onto the current device so that it loads regardless of where it was saved.
    # NOTE: the "test_*" metrics are computed on val_loader, the only held-out loader created in this notebook.
    t_model.load_state_dict(torch.load("best_ckpt.pth", map_location=device))
    test_loss, test_acc1, test_acc5, _ = run_eval(val_loader, t_model, criterion, device)
    mlflow.log_metrics({"test_loss": test_loss, "test_acc1": test_acc1, "test_acc5": test_acc5})

2025/07/31 14:21:48 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
Epoch 1: 100%|██████████| Batch 938/938 , acc=97.93%, loss=0.0681


Time: 17.0s | Train Loss: 0.0681 | Train Acc: 97.93% | Val Loss: 0.0356 | Val Acc1: 98.90% | Val Acc5: 99.97% | LR: 0.000500


Epoch 2: 100%|██████████| Batch 938/938 , acc=99.11%, loss=0.0309


Time: 16.2s | Train Loss: 0.0309 | Train Acc: 99.11% | Val Loss: 0.0275 | Val Acc1: 99.16% | Val Acc5: 99.99% | LR: 0.000452


Epoch 3: 100%|██████████| Batch 938/938 , acc=99.48%, loss=0.0188


Time: 15.9s | Train Loss: 0.0188 | Train Acc: 99.48% | Val Loss: 0.0201 | Val Acc1: 99.29% | Val Acc5: 100.00% | LR: 0.000327


Epoch 4: 100%|██████████| Batch 938/938 , acc=99.75%, loss=0.0086


Time: 16.7s | Train Loss: 0.0086 | Train Acc: 99.75% | Val Loss: 0.0179 | Val Acc1: 99.50% | Val Acc5: 99.99% | LR: 0.000173


Epoch 5: 100%|██████████| Batch 938/938 , acc=99.90%, loss=0.0038


Time: 16.2s | Train Loss: 0.0038 | Train Acc: 99.90% | Val Loss: 0.0143 | Val Acc1: 99.58% | Val Acc5: 100.00% | LR: 0.000048
🏃 View run handsome-worm-286 at: https://dagshub.com/matteogianferrari/tser-kd.mlflow/#/experiments/0/runs/63803e05ad0f4c0eaeb295f1c94795dd
🧪 View experiment at: https://dagshub.com/matteogianferrari/tser-kd.mlflow/#/experiments/0


2025/07/31 14:23:24 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/07/31 14:23:25 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [11]:
# Restores the weights stored in "best_ckpt.pth" and evaluates them on the validation loader
best_state = torch.load("best_ckpt.pth")
t_model.load_state_dict(best_state)
run_eval(val_loader, t_model, criterion, device)

(0.01427319314479828, 99.58, 100.0, 0.00444012994219543)