In [10]:
from pathlib import Path
import numpy as np
import torch
from typing import List
from torch.nn.utils.rnn import pad_sequence
from mltrainer import rnn_models, Trainer
from torch import optim

from mads_datasets import datatools
import mltrainer
# Display the installed mltrainer version so the notebook records which release produced these results.
mltrainer.__version__

'0.2.5'

In [11]:
from mads_datasets import DatasetFactoryProvider, DatasetType
from mltrainer.preprocessors import PaddedPreprocessor
# Preprocessor handed to the datastreamers (presumably pads the variable-length
# gesture sequences in a batch to a common length -- confirm in mltrainer).
preprocessor = PaddedPreprocessor()

# Build the gestures dataset factory and request batched streamers from it.
gesturesdatasetfactory = DatasetFactoryProvider.create_factory(DatasetType.GESTURES)
streamers = gesturesdatasetfactory.create_datastreamer(
    batchsize=32,
    preprocessor=preprocessor,
)

# The factory returns a mapping keyed by split name.
train, valid = streamers["train"], streamers["valid"]

[32m2025-10-04 23:01:10.999[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at C:\Users\mwien\.cache\mads_datasets\gestures[0m
100%|[38;2;30;71;6m██████████[0m| 2600/2600 [00:00<00:00, 3394.40it/s]
100%|[38;2;30;71;6m██████████[0m| 651/651 [00:00<00:00, 3371.71it/s]


In [12]:
# Open one batch stream per split; these are the objects later handed to the Trainer.
trainstreamer, validstreamer = train.stream(), valid.stream()

# Pull a single training batch for inspection (presumably x = features, y = labels).
# Note that this consumes one batch from trainstreamer.
x, y = next(iter(trainstreamer))

In [13]:
from mltrainer import TrainerSettings, ReportTypes
from mltrainer.metrics import Accuracy

# Metric instance that is passed to TrainerSettings(metrics=[...]) further down.
accuracy = Accuracy()


In [14]:
# Cross-entropy loss for the classifier; GRUmodel.forward returns the raw output
# of its linear layer (no softmax applied), which is what this loss is fed.
loss_fn = torch.nn.CrossEntropyLoss()

In [15]:
import torch
# Device selection.
#
# On my Mac, at least for the BaseRNN model, MPS does not speed up training,
# probably because the overhead of copying the data to the GPU is too high,
# so the CPU is forced by default. An accelerator might still speed up training
# for larger models with more parameters: set FORCE_CPU to False to use it.
FORCE_CPU = True

# The override is checked first so that the printed message always names the
# device that is actually used (previously the cell could report MPS/CUDA and
# then silently fall back to the CPU).
if FORCE_CPU:
    device = "cpu"
    print("using cpu")
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print("Using MPS")
elif torch.cuda.is_available():
    device = "cuda:0"
    print("using cuda")
else:
    device = "cpu"
    print("using cpu")

using cpu


Set up the trainer settings and choose which report types (TOML, TensorBoard, MLflow) the training run should log to.

In [16]:
# Trainer configuration: run length, tracked metric, log destinations, and the
# keyword-argument sets for the optimizer, LR scheduler and early stopping.
settings = TrainerSettings(
    epochs=16,  # short run; increase this to about 100 for training
    metrics=[accuracy],
    logdir=Path("gestures"),
    # one epoch = one full pass over each streamer
    train_steps=len(train),
    valid_steps=len(valid),
    reporttypes=[ReportTypes.TOML, ReportTypes.TENSORBOARD, ReportTypes.MLFLOW],
    optimizer_kwargs=dict(lr=0.0001, weight_decay=1e-5),
    scheduler_kwargs=dict(factor=0.5, patience=5),
    earlystop_kwargs=dict(
        save=True,  # save every best model, and restore the best one
        verbose=True,
        patience=5,  # epochs without improvement before training is stopped
        delta=0.0,  # minimum change that counts as an improvement
    ),
)
settings

epochs: 16
metrics: [Accuracy]
logdir: gestures
train_steps: 81
valid_steps: 20
reporttypes: [<ReportTypes.TOML: 'TOML'>, <ReportTypes.TENSORBOARD: 'TENSORBOARD'>, <ReportTypes.MLFLOW: 'MLFLOW'>]
optimizer_kwargs: {'lr': 0.0001, 'weight_decay': 1e-05}
scheduler_kwargs: {'factor': 0.5, 'patience': 5}
earlystop_kwargs: {'save': True, 'verbose': True, 'patience': 5, 'delta': 0.0}

In [17]:
import torch.nn as nn
import torch
from torch import Tensor
from dataclasses import dataclass

@dataclass
class ModelConfig:
    """Hyperparameters that GRUmodel reads when it builds its GRU and linear layers."""
    # forwarded to nn.GRU(input_size=...)
    input_size: int
    # forwarded to nn.GRU(hidden_size=...); also the in-features of the linear head
    hidden_size: int
    # forwarded to nn.GRU(num_layers=...)
    num_layers: int
    # out-features of the linear head
    output_size: int
    # forwarded to nn.GRU(dropout=...); defaults to no dropout
    dropout: float = 0.0

class GRUmodel(nn.Module):
    """GRU sequence classifier: a (stacked) GRU followed by a linear head.

    The head is applied to the GRU output of the final time step only.

    Args:
        config: object exposing ``input_size``, ``hidden_size``, ``num_layers``,
            ``output_size`` and ``dropout`` (e.g. a ``ModelConfig``).
    """

    def __init__(
        self,
        config: "ModelConfig",
    ) -> None:
        super().__init__()
        self.config = config
        # nn.GRU applies dropout only *between* stacked layers. With a single
        # layer the value has no effect and PyTorch emits a UserWarning on every
        # construction, so it is only passed on when it can actually be applied.
        rnn_dropout = config.dropout if config.num_layers > 1 else 0.0
        self.rnn = nn.GRU(
            input_size=config.input_size,
            hidden_size=config.hidden_size,
            dropout=rnn_dropout,
            batch_first=True,
            num_layers=config.num_layers,
        )
        self.linear = nn.Linear(config.hidden_size, config.output_size)

    def forward(self, x: Tensor) -> Tensor:
        """Return one row of ``output_size`` raw scores per sequence in the batch.

        Args:
            x: batch-first input, indexed as (batch, time, feature).

        Returns:
            Tensor of shape (batch, output_size); no softmax is applied.
        """
        x, _ = self.rnn(x)
        # Keep only the output at the last time index.
        # NOTE(review): if batches are padded at the end, this index may be a
        # padded step for the shorter sequences -- confirm against the preprocessor.
        last_step = x[:, -1, :]
        yhat = self.linear(last_step)
        return yhat

In [18]:
import mlflow
from datetime import datetime

# Train the same GRU architecture `n_repeats` times, one MLflow run per repeat.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("gestures")

# Directory for the fallback checkpoint written at the end of each repeat.
# exist_ok makes the call idempotent, so no separate exists() check is needed.
modeldir = Path("gestures").resolve()
modeldir.mkdir(parents=True, exist_ok=True)

n_repeats = 3

# The architecture is identical for every repeat, so the config is built once.
# NOTE: nn.GRU applies dropout only between stacked layers, so with
# num_layers=1 the dropout value below has no effect on training.
config = ModelConfig(
    input_size=3,
    hidden_size=128,
    num_layers=1,
    output_size=20,
    dropout=0.5,
)
learning_rate = settings.optimizer_kwargs.get("lr", None)
# The trainer allows earlystop_kwargs to be None (early stopping disabled);
# treat that, and a missing "save" key, as "the trainer does not save the model".
earlystop_kwargs = settings.earlystop_kwargs or {}

for repeat in range(n_repeats):
    print(f"Repeat {repeat+1}/{n_repeats}")

    with mlflow.start_run():
        # Set MLflow tags to record metadata about the model and developer.
        # The tag is derived from the actual hyperparameters so that it cannot
        # drift from the values used for training.
        mlflow.set_tag(
            "model",
            f"{repeat}_GRU_{settings.epochs}epochs_{config.dropout}drop_"
            f"{config.hidden_size}hidsize_lr-{learning_rate}",
        )
        mlflow.set_tag("dev", "Marcello")

        # Log hyperparameters to MLflow
        mlflow.log_param("epochs", settings.epochs)
        mlflow.log_param("learning_rate", learning_rate)

        # A fresh model per repeat; the data streamers are shared across repeats.
        model = GRUmodel(
            config=config,
        )

        trainer = Trainer(
            model=model,
            settings=settings,
            loss_fn=loss_fn,
            optimizer=optim.Adam,
            traindataloader=trainstreamer,
            validdataloader=validstreamer,
            scheduler=optim.lr_scheduler.ReduceLROnPlateau,
            device=device,
        )
        trainer.loop()

        # Only write a checkpoint here when early stopping has not saved one.
        if not earlystop_kwargs.get("save", False):
            # Second resolution plus the repeat index keep the file names unique;
            # a minute-resolution stamp let successive repeats overwrite each other.
            tag = datetime.now().strftime("%Y%m%d-%H%M%S")
            modelpath = modeldir / f"{tag}-repeat{repeat}-model.pt"
            torch.save(model, modelpath)

[32m2025-10-04 23:01:12.500[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to gestures\20251004-230112[0m
[32m2025-10-04 23:01:12.501[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m68[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m


Repeat 1/3


100%|[38;2;30;71;6m██████████[0m| 81/81 [00:01<00:00, 58.94it/s]
[32m2025-10-04 23:01:14.036[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 2.9896 test 2.9770 metric ['0.0781'][0m
[32m2025-10-04 23:01:14.036[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36msave_checkpoint[0m:[36m268[0m - [1mValidation loss (2.9770 --> 2.9770).Saving gestures\20251004-230112\checkpoint.pt ...[0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:01<00:00, 61.41it/s]
[32m2025-10-04 23:01:15.521[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 2.9582 test 2.9522 metric ['0.0875'][0m
[32m2025-10-04 23:01:15.522[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36msave_checkpoint[0m:[36m268[0m - [1mValidation loss (2.9770 --> 2.9522).Saving gestures\20251004-230112\checkpoint.pt ...[0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:01<00:00, 60.82it/s]
[32m2025-10-04 23:01:17.01

Repeat 2/3


100%|[38;2;30;71;6m██████████[0m| 81/81 [00:01<00:00, 62.48it/s]
[32m2025-10-04 23:01:38.925[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 2.9874 test 2.9800 metric ['0.0547'][0m
[32m2025-10-04 23:01:38.925[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36msave_checkpoint[0m:[36m268[0m - [1mValidation loss (2.9800 --> 2.9800).Saving gestures\20251004-230137\checkpoint.pt ...[0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:01<00:00, 61.37it/s]
[32m2025-10-04 23:01:40.402[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 2.9601 test 2.9545 metric ['0.0625'][0m
[32m2025-10-04 23:01:40.403[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36msave_checkpoint[0m:[36m268[0m - [1mValidation loss (2.9800 --> 2.9545).Saving gestures\20251004-230137\checkpoint.pt ...[0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:01<00:00, 60.65it/s]
[32m2025-10-04 23:01:41.90

Repeat 3/3


100%|[38;2;30;71;6m██████████[0m| 81/81 [00:01<00:00, 61.72it/s]
[32m2025-10-04 23:02:03.354[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 2.9900 test 2.9806 metric ['0.0688'][0m
[32m2025-10-04 23:02:03.355[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36msave_checkpoint[0m:[36m268[0m - [1mValidation loss (2.9806 --> 2.9806).Saving gestures\20251004-230201\checkpoint.pt ...[0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:01<00:00, 60.20it/s]
[32m2025-10-04 23:02:04.897[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 2.9574 test 2.9551 metric ['0.0969'][0m
[32m2025-10-04 23:02:04.898[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36msave_checkpoint[0m:[36m268[0m - [1mValidation loss (2.9806 --> 2.9551).Saving gestures\20251004-230201\checkpoint.pt ...[0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:01<00:00, 57.29it/s]
[32m2025-10-04 23:02:06.48

In [19]:
# Safety net: the runs above are opened with `with mlflow.start_run():`, which
# closes them on exit; this call only ends a run that was left active
# (e.g. after an interrupted cell).
mlflow.end_run()

## Open MLflow
"""
```bash
mlflow server \
    --backend-store-uri sqlite:///mlflow.db \
    --host 127.0.0.1 \
    --port 5000
        
mlflow server --backend-store-uri sqlite:///3-hypertuning-rnn/mlflow.db --host 127.0.0.1 --port 5000
```
"""