Add libraries

In [3]:
import logging
import random
from datetime import datetime
from pathlib import Path

import mlflow
import torch
import torch.optim as optim
from mads_datasets import DatasetFactoryProvider, DatasetType
from mltrainer import imagemodels, Trainer, TrainerSettings, ReportTypes, metrics
from mltrainer.preprocessors import BasePreprocessor
from tomlserializer import TOMLSerializer
from torch import nn


Prepare train and test datastreamers

In [4]:
# Build the Fashion dataset streamers that the training cells consume.
batchsize = 64
preprocessor = BasePreprocessor()
fashionfactory = DatasetFactoryProvider.create_factory(DatasetType.FASHION)
streamers = fashionfactory.create_datastreamer(
    batchsize=batchsize,
    preprocessor=preprocessor,
)

# Keep the streamer objects themselves: a later cell calls .stream() on them again.
train, valid = streamers["train"], streamers["valid"]
trainstreamer, validstreamer = train.stream(), valid.stream()

[32m2025-10-05 08:44:20.363[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at C:\Users\tycoh\.cache\mads_datasets\fashionmnist[0m
[32m2025-10-05 08:44:20.370[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m124[0m - [1mFile already exists at C:\Users\tycoh\.cache\mads_datasets\fashionmnist\fashionmnist.pt[0m


Performance metric

In [5]:
# Accuracy metric instance; passed to TrainerSettings(metrics=[...]) in the training cells.
accuracy = metrics.Accuracy()

Define the network architecture. We want to experiment with extra convolution layers, dropout layers, and normalization layers.

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier with two hidden layers and dropout before the output layer.

    The input is flattened and must contain 28 * 28 features per sample.

    Args:
        num_classes: Number of output logits.
        units1: Width of the first hidden layer.
        units2: Width of the second hidden layer.
        dropout: Drop probability of the dropout layer placed before the
            output layer. Defaults to 0.0 (dropout disabled) so call sites
            that only pass the layer sizes keep working.
    """

    def __init__(
        self,
        num_classes: int,
        units1: int,
        units2: int,
        dropout: float = 0.0,
    ) -> None:
        super().__init__()
        # Hyperparameters are kept as attributes so they can be logged later.
        self.num_classes = num_classes
        self.units1 = units1
        self.units2 = units2
        self.dropout = dropout
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, units1),
            nn.ReLU(),
            nn.Linear(units1, units2),
            nn.ReLU(),
            nn.Dropout(p=self.dropout),  # regularization before the output layer
            nn.Linear(units2, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Flatten ``x`` and return the unnormalized class logits."""
        x = self.flatten(x)
        return self.linear_relu_stack(x)

Adding MLflow

In [13]:
# Absolute path of the folder for saved models; create it the first time it is needed.
modeldir = Path("models").resolve()
needs_creation = not modeldir.exists()
if needs_creation:
    modeldir.mkdir()
    print(f"Created {modeldir}")

In [14]:
# MLflow experiment name; passed to mlflow.set_experiment in the next cell.
experiment = 'adding_dropout'

In [15]:
# NOTE(review): mlflow is already imported in the first cell; this repeated import is redundant.
import mlflow
# Track runs in a SQLite database; `sqlite:///mlflow.db` is a relative-path URI.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Make the experiment named by `experiment` the active one for subsequent runs.
mlflow.set_experiment(experiment)

<Experiment: artifact_location='file:c:/Users/tycoh/Desktop/MADS-ML-Tyco/2-hypertuning-mlflow/mlruns/2', creation_time=1759646653266, experiment_id='2', last_update_time=1759646653266, lifecycle_stage='active', name='adding_dropout', tags={}>

In [None]:
# Hyperparameters for the dropout experiment.
units1 = 64
units2 = 64
dropout = 0.2
loss_fn = nn.CrossEntropyLoss()

model = NeuralNetwork(num_classes=10, units1=units1, units2=units2, dropout=dropout)

with mlflow.start_run():
    settings = TrainerSettings(
        epochs=3,
        metrics=[accuracy],
        logdir="modellogs",
        train_steps=128,
        valid_steps=128,
        reporttypes=[ReportTypes.MLFLOW, ReportTypes.TOML],
    )
    # Log every hyperparameter of the run, including the dropout rate this
    # experiment is about, so runs can be compared in the MLflow UI.
    mlflow.log_params({
        "epochs": settings.epochs,
        "train_steps": settings.train_steps,
        "valid_steps": settings.valid_steps,
        "units1": model.units1,
        "units2": model.units2,
        "dropout": model.dropout,
        "batchsize": batchsize,
        "logdir": settings.logdir,
        "reporttypes": [r.name for r in settings.reporttypes],
    })
    # The optimizer and scheduler are handed over as classes, not instances.
    trainer = Trainer(
        model=model,
        settings=settings,
        loss_fn=loss_fn,
        optimizer=torch.optim.Adam,
        traindataloader=trainstreamer,
        validdataloader=validstreamer,
        scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau
    )

    trainer.loop()
    


[32m2025-10-05 08:46:25.161[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to modellogs\20251005-084625[0m
[32m2025-10-05 08:46:25.162[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m68[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m
100%|[38;2;30;71;6m██████████[0m| 128/128 [00:01<00:00, 101.85it/s]
[32m2025-10-05 08:46:26.945[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 1.1877 test 0.7636 metric ['0.7070'][0m
100%|[38;2;30;71;6m██████████[0m| 128/128 [00:00<00:00, 134.13it/s]
[32m2025-10-05 08:46:28.404[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.6976 test 0.6273 metric ['0.7737'][0m
100%|[38;2;30;71;6m██████████[0m| 128/128 [00:01<00:00, 97.62it/s]
[32m2025-10-05 08:46:30.301[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:

Try adding convolution layers

In [8]:
class CNN(nn.Module):
    """Convolutional classifier: three conv stages, average pooling, and a dense head.

    Args:
        filters: Number of output channels of every convolution.
        units1: Width of the first dense layer.
        units2: Width of the second dense layer.
        input_size: Shape of the dummy batch used to measure the activation
            map produced by the convolutions; index 1 is taken as the number
            of input channels.
    """

    def __init__(self, filters, units1, units2, input_size=(32, 1, 28, 28)):
        super().__init__()
        self.in_channels = input_size[1]
        self.input_size = input_size
        self.filters = filters
        self.units1 = units1
        self.units2 = units2

        self.convolutions = nn.Sequential(
            nn.Conv2d(self.in_channels, filters, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(filters),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(filters, filters, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(filters, filters, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.MaxPool2d(kernel_size=2),
        )

        # Measure the spatial size left after the convolutions so the average
        # pool below can collapse it to a single value per filter.
        activation_map_size = self._conv_test(input_size)
        # Standard-library logging: no `logger` object is defined in this notebook.
        logging.getLogger(__name__).info(
            "Aggregating activationmap with size %s", activation_map_size
        )
        self.agg = nn.AvgPool2d(activation_map_size)

        self.dense = nn.Sequential(
            nn.Flatten(),
            nn.Linear(filters, units1),
            nn.ReLU(),
            nn.Linear(units1, units2),
            nn.ReLU(),
            nn.Linear(units2, 10),
        )

    def _conv_test(self, input_size=(32, 1, 28, 28)):
        """Return the (height, width) of the convolution output for ``input_size``.

        The probe runs in eval mode without gradient tracking so the dummy
        batch does not update the BatchNorm running statistics; the previous
        train/eval mode is restored afterwards.
        """
        was_training = self.convolutions.training
        self.convolutions.eval()
        try:
            with torch.no_grad():
                probe = self.convolutions(torch.ones(input_size))
        finally:
            self.convolutions.train(was_training)
        return probe.shape[-2:]

    def forward(self, x):
        """Return class logits for a batch of images ``x``."""
        x = self.convolutions(x)
        x = self.agg(x)
        logits = self.dense(x)
        return logits


Set up the loss function, trainer settings, and MLflow logging, then train models with randomly sampled hidden-unit sizes.

In [16]:
# Same model-folder setup as the earlier cell: resolve the path and create it if missing.
modeldir = Path("models").resolve()
needs_creation = not modeldir.exists()
if needs_creation:
    modeldir.mkdir()
    print(f"Created {modeldir}")

In [10]:
# MLflow experiment name for the runs below; passed to mlflow.set_experiment in the next cell.
experiment = 'adding_dropout_batchnorm'

In [11]:
# NOTE(review): mlflow is already imported in the first cell; this repeated import is redundant.
import mlflow
# Same tracking store as the earlier MLflow cell; `sqlite:///mlflow.db` is a relative-path URI.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Switch the active experiment to the one named by `experiment`.
mlflow.set_experiment(experiment)

2025/09/26 14:32:13 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/09/26 14:32:13 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='file:c:/Users/tycoh/Desktop/MADS-ML-Tyco/2-hypertuning-mlflow/mlruns/1', creation_time=1758889935286, experiment_id='1', last_update_time=1758889935286, lifecycle_stage='active', name='adding_dropout_batchnorm', tags={}>

In [21]:
loss_fn = torch.nn.CrossEntropyLoss()

settings = TrainerSettings(
    epochs=5,
    metrics=[accuracy],
    logdir="modellogs",
    train_steps=128,
    valid_steps=128,
    reporttypes=[ReportTypes.MLFLOW, ReportTypes.TOML],
)

# Candidate hidden-layer widths; each run samples one value per layer at random.
units = [256, 128, 64]
# NeuralNetwork requires a dropout rate; defined here so the cell does not
# depend on a variable left over from another experiment's cell.
dropout = 0.2

for _ in range(5):
    # The context manager ends the run on exit, so no explicit end_run() is needed.
    with mlflow.start_run() as run:
        # Fresh streams for every run.
        trainstreamer = train.stream()
        validstreamer = valid.stream()
        unit1 = random.choice(units)
        unit2 = random.choice(units)
        # NOTE(review): the tag says "convnet + batchnorm" but the model built
        # below is the dense NeuralNetwork - confirm which one is intended.
        mlflow.set_tag("model", "convnet + batchnorm")
        mlflow.set_tag("dev", "tyco")
        mlflow.log_params({
            "epochs": settings.epochs,
            "train_steps": settings.train_steps,
            "valid_steps": settings.valid_steps,
            "logdir": settings.logdir,
            "reporttypes": [r.name for r in settings.reporttypes],
        })
        mlflow.log_param("units1", unit1)
        mlflow.log_param("units2", unit2)
        mlflow.log_param("dropout", dropout)
        mlflow.log_param("batchsize", f"{batchsize}")
        model = NeuralNetwork(num_classes=10, units1=unit1, units2=unit2, dropout=dropout)
        trainer = Trainer(
            model=model,
            settings=settings,
            loss_fn=loss_fn,
            optimizer=optim.Adam,
            traindataloader=trainstreamer,
            validdataloader=validstreamer,
            scheduler=optim.lr_scheduler.ReduceLROnPlateau
        )
        trainer.loop()
        # Second-resolution timestamp plus the MLflow run id: runs that finish
        # within the same minute no longer overwrite each other's checkpoint.
        tag = datetime.now().strftime("%Y%m%d-%H%M%S")
        modelpath = modeldir / f"{tag}-{run.info.run_id}-model.pt"
        torch.save(model, modelpath)
        mlflow.log_artifact(local_path=str(modelpath), artifact_path="pytorch_models")

[32m2025-09-26 15:28:59.042[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to modellogs\20250926-152859[0m
[32m2025-09-26 15:28:59.042[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m68[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m
100%|[38;2;30;71;6m██████████[0m| 128/128 [00:00<00:00, 128.47it/s]
[32m2025-09-26 15:29:00.635[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.9076 test 0.6129 metric ['0.7755'][0m
100%|[38;2;30;71;6m██████████[0m| 128/128 [00:00<00:00, 147.57it/s]
[32m2025-09-26 15:29:02.003[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.5490 test 0.5435 metric ['0.7985'][0m
100%|[38;2;30;71;6m██████████[0m| 128/128 [00:01<00:00, 107.22it/s]
[32m2025-09-26 15:29:03.850[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:

Save model settings

In [None]:
# Write the trainer settings and the model to TOML files in the working directory.
# NOTE(review): `settings` and `model` are whatever the most recently executed
# training cell left behind (both names are assigned in more than one cell above).
tomlserializer = TOMLSerializer()
tomlserializer.save(settings, "settings.toml")
tomlserializer.save(model, "model.toml")