In [1]:
from pathlib import Path
import numpy as np
import torch
from typing import List
from torch.nn.utils.rnn import pad_sequence
from mltrainer import rnn_models, Trainer
from torch import optim

from mads_datasets import datatools
import mltrainer
mltrainer.__version__

'0.2.5'

In [2]:
from mads_datasets import DatasetFactoryProvider, DatasetType
from mltrainer.preprocessors import PaddedPreprocessor
# Preprocessor instance that is handed to the datastreamers below.
preprocessor = PaddedPreprocessor()

# Ask the provider for the gestures factory, then let it build the streamers
# with a batch size of 64.
gesturesdatasetfactory = DatasetFactoryProvider.create_factory(DatasetType.GESTURES)
streamers = gesturesdatasetfactory.create_datastreamer(
    batchsize=64,
    preprocessor=preprocessor,
)

# Short names for the two splits; later cells refer to `train` and `valid`.
train, valid = (streamers[split] for split in ("train", "valid"))

[32m2025-10-07 19:20:15.567[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at C:\Users\mwien\.cache\mads_datasets\gestures[0m
100%|[38;2;30;71;6m██████████[0m| 2600/2600 [00:01<00:00, 1670.64it/s]
100%|[38;2;30;71;6m██████████[0m| 651/651 [00:00<00:00, 1530.37it/s]


In [3]:
# One stream per split; both are handed to the Trainer further down.
trainstreamer, validstreamer = train.stream(), valid.stream()

# Take a single item from the training stream and unpack it into x and y
# so it can be inspected interactively.
first_batch = next(iter(trainstreamer))
x, y = first_batch

In [4]:
from mltrainer import TrainerSettings, ReportTypes
from mltrainer.metrics import Accuracy

# Metric instance; it is passed to TrainerSettings via `metrics=[accuracy]`.
accuracy = Accuracy()


In [5]:
# Loss handed to the Trainer as `loss_fn`. The model returns the raw output of
# its final linear layer, one value per class.
loss_fn = torch.nn.CrossEntropyLoss()

In [6]:
import torch
# Find the best accelerator PyTorch reports as usable on this machine.
# Device names are kept as plain strings in every branch for consistency.
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    available_device = "mps"
elif torch.cuda.is_available():
    available_device = "cuda:0"
else:
    available_device = "cpu"

# On my mac, at least for the BaseRNN model, mps does not speed up training,
# probably because the overhead of copying the data to the GPU is too high,
# so training is pinned to the cpu. An accelerator might still speed up
# training for larger models with more parameters: set FORCE_CPU = False to
# use `available_device` instead.
FORCE_CPU = True
device = "cpu" if FORCE_CPU else available_device

# Report the device that is really used, not merely the one that was detected.
if device == available_device:
    print(f"using {device}")
else:
    print(f"using {device} ({available_device} is available but disabled by FORCE_CPU)")

using cpu


Set up the settings for the trainer and the different types of logging you want

In [7]:
# Early stopping options, kept separate so they are easy to tweak.
earlystop_options = {
    # save every best model, and restore the best one
    "save": True,
    "verbose": True,
    # number of epochs with no improvement after which training will be stopped
    "patience": 5,
    # minimum change to be considered an improvement
    "delta": 0.0,
}

# Options forwarded to the learning-rate scheduler chosen when the Trainer is built.
lr_scheduler_options = {"factor": 0.5, "patience": 5}

# Where the run gets reported: TOML, TensorBoard and MLflow.
report_targets = [ReportTypes.TOML, ReportTypes.TENSORBOARD, ReportTypes.MLFLOW]

settings = TrainerSettings(
    epochs=16,  # increase this to about 100 for training
    metrics=[accuracy],
    logdir=Path("gestures"),
    # step counts are taken from the length of each streamer
    train_steps=len(train),
    valid_steps=len(valid),
    reporttypes=report_targets,
    scheduler_kwargs=lr_scheduler_options,
    earlystop_kwargs=earlystop_options,
)
settings

epochs: 16
metrics: [Accuracy]
logdir: gestures
train_steps: 40
valid_steps: 10
reporttypes: [<ReportTypes.TOML: 'TOML'>, <ReportTypes.TENSORBOARD: 'TENSORBOARD'>, <ReportTypes.MLFLOW: 'MLFLOW'>]
optimizer_kwargs: {'lr': 0.001, 'weight_decay': 1e-05}
scheduler_kwargs: {'factor': 0.5, 'patience': 5}
earlystop_kwargs: {'save': True, 'verbose': True, 'patience': 5, 'delta': 0.0}

In [8]:
import torch.nn as nn
import torch
from torch import Tensor
from dataclasses import dataclass

@dataclass
class ModelConfig:
    """Hyperparameters read by ``GRUmodel`` when it builds its layers."""

    input_size: int  # `input_size` of the nn.GRU
    hidden_size: int  # `hidden_size` of the nn.GRU and input width of the linear layer
    num_layers: int  # `num_layers` of the nn.GRU
    output_size: int  # output width of the final linear layer
    dropout: float = 0.0  # dropout value for the nn.GRU; defaults to no dropout

class GRUmodel(nn.Module):
    """GRU followed by a linear layer applied to the last time step.

    Args:
        config: object exposing ``input_size``, ``hidden_size``, ``num_layers``,
            ``output_size`` and ``dropout`` (for example a ``ModelConfig``).
    """

    def __init__(
        self,
        config: "ModelConfig",
    ) -> None:
        super().__init__()
        self.config = config
        # nn.GRU applies dropout only between stacked recurrent layers. With a
        # single layer a non-zero value has no effect and makes PyTorch emit a
        # UserWarning, so it is only forwarded when there is more than one layer.
        inter_layer_dropout = config.dropout if config.num_layers > 1 else 0.0
        self.rnn = nn.GRU(
            input_size=config.input_size,
            hidden_size=config.hidden_size,
            dropout=inter_layer_dropout,
            batch_first=True,
            num_layers=config.num_layers,
        )
        self.linear = nn.Linear(config.hidden_size, config.output_size)

    def forward(self, x: Tensor) -> Tensor:
        """Run the GRU over ``x`` and project its last time step.

        Args:
            x: batch-first input, indexed as (batch, time, feature).

        Returns:
            Tensor with ``config.output_size`` values per batch item.
        """
        x, _ = self.rnn(x)
        # Only the output at the final time index is used.
        # NOTE(review): if batches are padded at the end, this index may hold
        # padding for the shorter sequences - confirm this is intended.
        last_step = x[:, -1, :]
        yhat = self.linear(last_step)
        return yhat

In [9]:
import mlflow
from datetime import datetime

# Store MLflow runs in a local SQLite database, under the "gestures" experiment.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("gestures")

# Directory for the manual model dump at the end of each repeat (only used
# when early stopping does not already save checkpoints).
modeldir = Path("gestures").resolve()
modeldir.mkdir(parents=True, exist_ok=True)

n_repeats = 3

# The architecture is the same for every repeat, so the config is built once.
config = ModelConfig(
    input_size=3,
    hidden_size=128,
    num_layers=1,
    output_size=20,
    dropout=0.5,
)

for repeat in range(n_repeats):
    print(f"Repeat {repeat+1}/{n_repeats}")

    with mlflow.start_run():
        # Set MLflow tags to record metadata about the model and developer.
        # The tag is derived from the live settings/config so it cannot drift
        # from what is actually trained. "64batch" is still a literal: keep it
        # in sync with the `batchsize` used when the datastreamers are created.
        mlflow.set_tag(
            "model",
            f"{repeat}_GRU_{settings.epochs}epochs_{config.dropout}drop_"
            f"{config.hidden_size}hidsize_64batch",
        )
        mlflow.set_tag("dev", "Marcello")

        # Log hyperparameters to MLflow
        mlflow.log_param("epochs", settings.epochs)
        mlflow.log_param("learning_rate", settings.optimizer_kwargs.get("lr", None))

        # A fresh model per repeat, so every run starts from new weights.
        model = GRUmodel(
            config=config,
        )

        trainer = Trainer(
            model=model,
            settings=settings,
            loss_fn=loss_fn,
            optimizer=optim.Adam,
            traindataloader=trainstreamer,
            validdataloader=validstreamer,
            scheduler=optim.lr_scheduler.ReduceLROnPlateau,
            device=device,
        )
        trainer.loop()

        # `earlystop_kwargs` can be None when early stopping is switched off;
        # treat that (or a missing "save" key) as "nothing was saved".
        earlystop_kwargs = settings.earlystop_kwargs or {}
        if not earlystop_kwargs.get("save", False):
            # Seconds plus the repeat index keep the file names unique; with
            # minute resolution, repeats finishing in the same minute would
            # overwrite each other.
            tag = datetime.now().strftime("%Y%m%d-%H%M%S-")
            modelpath = modeldir / f"{tag}repeat{repeat}-model.pt"
            # This pickles the whole module, so loading the file later
            # requires the GRUmodel class to be importable.
            torch.save(model, modelpath)

2025/10/07 19:20:18 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/10/07 19:20:18 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


Repeat 1/3


[32m2025-10-07 19:20:19.377[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to gestures\20251007-192019[0m
[32m2025-10-07 19:20:20.519[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m68[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m
100%|[38;2;30;71;6m██████████[0m| 40/40 [00:00<00:00, 45.20it/s]
[32m2025-10-07 19:20:21.660[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 2.9364 test 2.7348 metric ['0.1266'][0m
[32m2025-10-07 19:20:21.661[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36msave_checkpoint[0m:[36m268[0m - [1mValidation loss (2.7348 --> 2.7348).Saving gestures\20251007-192019\checkpoint.pt ...[0m
100%|[38;2;30;71;6m██████████[0m| 40/40 [00:00<00:00, 46.55it/s]
[32m2025-10-07 19:20:22.646[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [

Repeat 2/3


100%|[38;2;30;71;6m██████████[0m| 40/40 [00:00<00:00, 44.68it/s]
[32m2025-10-07 19:20:39.475[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 2.9013 test 2.5071 metric ['0.1219'][0m
[32m2025-10-07 19:20:39.476[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36msave_checkpoint[0m:[36m268[0m - [1mValidation loss (2.5071 --> 2.5071).Saving gestures\20251007-192038\checkpoint.pt ...[0m
100%|[38;2;30;71;6m██████████[0m| 40/40 [00:00<00:00, 43.96it/s]
[32m2025-10-07 19:20:40.518[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 2.3913 test 2.3204 metric ['0.1313'][0m
[32m2025-10-07 19:20:40.519[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36msave_checkpoint[0m:[36m268[0m - [1mValidation loss (2.5071 --> 2.3204).Saving gestures\20251007-192038\checkpoint.pt ...[0m
100%|[38;2;30;71;6m██████████[0m| 40/40 [00:00<00:00, 45.14it/s]
[32m2025-10-07 19:20:41.53

Repeat 3/3


100%|[38;2;30;71;6m██████████[0m| 40/40 [00:01<00:00, 39.77it/s]
[32m2025-10-07 19:20:56.426[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 2.9733 test 2.8859 metric ['0.1062'][0m
[32m2025-10-07 19:20:56.427[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36msave_checkpoint[0m:[36m268[0m - [1mValidation loss (2.8859 --> 2.8859).Saving gestures\20251007-192055\checkpoint.pt ...[0m
100%|[38;2;30;71;6m██████████[0m| 40/40 [00:00<00:00, 40.72it/s]
[32m2025-10-07 19:20:57.531[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 2.6172 test 2.3510 metric ['0.1672'][0m
[32m2025-10-07 19:20:57.531[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36msave_checkpoint[0m:[36m268[0m - [1mValidation loss (2.8859 --> 2.3510).Saving gestures\20251007-192055\checkpoint.pt ...[0m
100%|[38;2;30;71;6m██████████[0m| 40/40 [00:00<00:00, 45.87it/s]
[32m2025-10-07 19:20:58.53

In [10]:
# Ends whichever MLflow run is still active. The training cell opens its runs
# with `with mlflow.start_run():`, which presumably closes them already, so
# this looks like a safety net only - TODO confirm it is still needed.
mlflow.end_run()

## Open MLFlow
"""
```bash
mlflow server \
    --backend-store-uri sqlite:///mlflow.db \
    --host 127.0.0.1 \
    --port 5000
        
mlflow server --backend-store-uri sqlite:///3-hypertuning-rnn/mlflow.db --host 127.0.0.1 --port 5000
```
"""