In [1]:
#to load libs from root
import sys
from pathlib import Path
sys.path.insert(0, str(Path("..").resolve()))

In [None]:
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "setuptools", "wheel"])

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


[0m

Collecting setuptools
  Downloading setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)


[0m

Downloading setuptools-80.9.0-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 60.2.0
    Uninstalling setuptools-60.2.0:
      Successfully uninstalled setuptools-60.2.0


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
openxlab 0.1.3 requires setuptools~=60.2.0, but you have setuptools 80.9.0 which is incompatible.
tensorflow 2.17.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.27.3 which is incompatible.
tensorboard 2.17.0 requires protobuf!=4.24.0,<5.0.0,>=3.19.6, but you have protobuf 5.27.3 which is incompatible.[0m[31m
[0m

Successfully installed setuptools-80.9.0


# Train basic Mask R-CNN 

Trains **one baseline model** (Mask R-CNN + R50-FPN, ImageNet backbone) on **Dataset B (instance masks)*


In [2]:
import time
from dataclasses import dataclass
from pathlib import Path

import torch

from datasets import cfg
from datasets.loader import DataModule, DataConfig
from models.models import build_model
from train.trainer_v2 import Trainer, TrainConfig
from train.eval import Evaluator


  param_schemas = callee.param_schemas()
  param_schemas = callee.param_schemas()


In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MODEL_NAME = "maskrcnn_r50_fpn"
NUM_CLASSES = int(cfg.num_classes)   # 1 + 24

TRACKING_URI = "file:///media/sdb1/mlflow"
EXPERIMENT = "B_basic_"

WEIGHTS_DIR = Path("../weights")
WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)

print("device:", DEVICE)
print("num_classes:", NUM_CLASSES)


device: cuda
num_classes: 25


In [4]:
dm = DataModule(DataConfig(val_frac=0.1,batch_size=4,num_workers=4,))
b_train, b_val = dm.make_loaders_b()
b_test = dm.make_loader_b_test()

print("B train:", len(dm.ds_b_train), "| B val:", len(dm.ds_b_val), "| B test:", len(dm.ds_b_test))


B train: 475 | B val: 52 | B test: 67


In [5]:
def build_basic_model():
    return build_model2(MODEL_NAME,NUM_CLASSES, 
                        weights_backbone=True, #imagenet pretrain
                        trainable_backbone_layers=3, 
    ).to(DEVICE)


In [6]:
import numpy as np, json, tempfile, os
import torch, mlflow

def _as_scalar(x):
    if isinstance(x, (float, int)): return float(x)
    if torch.is_tensor(x): x = x.detach().cpu().numpy()
    if isinstance(x, np.ndarray):
        if x.size == 1: return float(x.reshape(-1)[0])
        return None
    if isinstance(x, (list, tuple)):
        if len(x) == 1 and isinstance(x[0], (float, int)): return float(x[0])
        return None
    try: return float(x)
    except Exception: return None

def mlflow_log_metrics_safe(d, prefix="", step=None, log_arrays_as_artifact=True):
    scalars, arrays = {}, {}
    for k, v in d.items():
        s = _as_scalar(v)
        if s is not None: scalars[f"{prefix}{k}"] = s
        else:
            if torch.is_tensor(v): vv = v.detach().cpu().tolist()
            elif hasattr(v, "tolist"): vv = v.tolist()
            else: vv = v
            arrays[k] = vv
            if isinstance(vv, list) and len(vv) and all(isinstance(t, (int, float)) for t in vv):
                scalars[f"{prefix}{k}_mean"] = float(np.mean(vv))
                scalars[f"{prefix}{k}_p50"] = float(np.median(vv))
    if scalars: mlflow.log_metrics(scalars, step=step)
    if arrays and log_arrays_as_artifact:
        if hasattr(mlflow, "log_dict"): mlflow.log_dict(arrays, "metrics_arrays.json")
        else:
            fd, path = tempfile.mkstemp(suffix=".json"); os.close(fd)
            with open(path, "w") as f: json.dump(arrays, f)
            mlflow.log_artifact(path); os.remove(path)

In [7]:
# Optuna search (10-epoch trials)
import sys, time, math
import optuna
import mlflow.pytorch

EPOCHS = 
10 #this is for search only 
N_TRIALS = 20
EXPERIMENT_NAME = "karyo_basic_optuna"
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

ev = Evaluator(device=DEVICE)

def _make_optimizer(model, opt_name, lr, weight_decay, momentum):
    params = [p for p in model.parameters() if p.requires_grad]
    if opt_name == "sgd": 
        return torch.optim.SGD(params, lr=float(lr), momentum=float(momentum), weight_decay=float(weight_decay))
    if opt_name == "adamw": 
        return torch.optim.AdamW(params, lr=float(lr), weight_decay=float(weight_decay))
    raise ValueError(f"unknown optimizer: {opt_name}")

def _get_aji(metrics_dict):
    for k, v in metrics_dict.items():
        if "aji" in str(k).lower(): return float(v)
    raise KeyError(f"AJI key not found; keys={list(metrics_dict.keys())}")

def objective(trial):
    opt_name = trial.suggest_categorical("optimizer", ["sgd", "adamw"])
    lr = trial.suggest_float("lr", 1e-3, 2e-2, log=True) if opt_name == "sgd" else trial.suggest_float("lr", 5e-5, 2e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)
    scheduler = trial.suggest_categorical("scheduler", ["none", "cosine"])
    warmup_iters = trial.suggest_categorical("warmup_iters", [0, 1000]) if scheduler != "none" else 0
    ema_decay = trial.suggest_categorical("ema_decay", [0.0, 0.999]) if scheduler != "none" else 0.0
    momentum = 0.9 if opt_name == "sgd" else 0.0

    model = build_model(MODEL_NAME, NUM_CLASSES).to(DEVICE)
    conf = TrainConfig(
        num_epochs=int(EPOCHS),
        batch_size=int(getattr(cfg, "batch_size", 4)) if "cfg" in globals() else 4, 
        num_workers=int(getattr(cfg, "num_workers", 4)) if "cfg" in globals() else 4, 
        lr=float(lr), weight_decay=float(weight_decay), momentum=float(momentum if opt_name == "sgd" else 0.9), 
        print_every=50, 
        tracking_uri=TRACKING_URI, amp=True, grad_clip=1.0, 
        ema_decay=float(ema_decay), 
        warmup_iters=int(warmup_iters), 
        scheduler=str(scheduler), min_lr=1e-6, 
        freeze_bn=True)

    trainer = Trainer(model, conf)
    trainer.optimizer = _make_optimizer(model, opt_name, lr, weight_decay, momentum)
    trainer._init_schedulers(steps_per_epoch=len(b_train))

    run_name = f"trial_{trial.number:03d}_{opt_name}_lr{lr:.2g}_wd{weight_decay:.1g}_{scheduler}_ema{ema_decay}"
    with mlflow.start_run(run_name=run_name):
        mlflow.log_params({"trial": int(trial.number), "optimizer": opt_name, "lr": float(lr), "weight_decay": float(weight_decay), "momentum": float(momentum), "scheduler": scheduler, "warmup_iters": int(warmup_iters), "ema_decay": float(ema_decay), "epochs": int(EPOCHS), "model_name": str(MODEL_NAME)})

        for epoch in range(int(EPOCHS)):
            t0 = time.perf_counter()
            train_loss = trainer.train_one_epoch(b_train, epoch)
            val_loss = trainer._eval_with_ema(b_val)
            lr_now = float(trainer.optimizer.param_groups[0]["lr"])
            trainer._sched_epoch_step()
            mlflow.log_metrics({"train_loss": float(train_loss), "val_loss": float(val_loss), "lr": float(lr_now), "epoch_sec": float(time.perf_counter() - t0)}, step=int(epoch))
            trial.report(-float(val_loss), step=int(epoch))
            if trial.should_prune(): raise optuna.TrialPruned()

        masks_b = ev.metrics_masks(model, b_test, num_classes=NUM_CLASSES)
        aji = _get_aji(masks_b)
        mlflow_log_metrics_safe(masks_b, prefix="b_test_", step=None)
        #mlflow.log_metrics({f"b_test_{k}": float(v) for k, v in masks_b.items()})
        mlflow.log_metric("objective_AJI", float(aji))
        mlflow.pytorch.log_model(model, "model")
        return float(aji)

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42), pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2, interval_steps=1))
study.optimize(objective, n_trials=int(N_TRIALS))

print("best_value(AJI)=", study.best_value)
print("best_params=", study.best_params)


  return FileStore(store_uri, store_uri)
[I 2025-12-25 15:37:49,613] A new study created in memory with name: no-name-fcb7d3e2-f113-4b5d-aa75-86ac49694f30
  self.scaler = torch.cuda.amp.GradScaler(enabled=bool(train_conf.amp and self.device.type == "cuda"))


[epoch 001/010] step 50/119 loss 2.3721
[epoch 001/010] step 100/119 loss 1.7528
[epoch 001/010] step 119/119 loss 1.7653
[epoch 002/010] step 50/119 loss 1.3551
[epoch 002/010] step 100/119 loss 1.2925
[epoch 002/010] step 119/119 loss 1.4147
[epoch 003/010] step 50/119 loss 1.4519
[epoch 003/010] step 100/119 loss 1.2412
[epoch 003/010] step 119/119 loss 1.3432
[epoch 004/010] step 50/119 loss 1.1074
[epoch 004/010] step 100/119 loss 1.1730
[epoch 004/010] step 119/119 loss 1.0041
[epoch 005/010] step 50/119 loss 1.0520
[epoch 005/010] step 100/119 loss 1.0774
[epoch 005/010] step 119/119 loss 1.1593
[epoch 006/010] step 50/119 loss 0.9516
[epoch 006/010] step 100/119 loss 1.1604
[epoch 006/010] step 119/119 loss 0.9429
[epoch 007/010] step 50/119 loss 0.9839
[epoch 007/010] step 100/119 loss 1.0365
[epoch 007/010] step 119/119 loss 1.0394
[epoch 008/010] step 50/119 loss 0.9611
[epoch 008/010] step 100/119 loss 0.9923
[epoch 008/010] step 119/119 loss 1.0653
[epoch 009/010] step 50/

[I 2025-12-25 15:46:56,216] Trial 0 finished with value: 0.3826360001053386 and parameters: {'optimizer': 'adamw', 'lr': 0.0007441632389160641, 'weight_decay': 6.251373574521755e-05, 'scheduler': 'none'}. Best is trial 0 with value: 0.3826360001053386.
  self.scaler = torch.cuda.amp.GradScaler(enabled=bool(train_conf.amp and self.device.type == "cuda"))


[epoch 001/010] step 50/119 loss 1.5905
[epoch 001/010] step 100/119 loss 1.3825
[epoch 001/010] step 119/119 loss 1.3198
[epoch 002/010] step 50/119 loss 1.1596
[epoch 002/010] step 100/119 loss 1.1180
[epoch 002/010] step 119/119 loss 0.9739
[epoch 003/010] step 50/119 loss 1.2946
[epoch 003/010] step 100/119 loss 1.1327
[epoch 003/010] step 119/119 loss 1.1002
[epoch 004/010] step 50/119 loss 1.0851
[epoch 004/010] step 100/119 loss 0.8740
[epoch 004/010] step 119/119 loss 1.0095
[epoch 005/010] step 50/119 loss 0.8741
[epoch 005/010] step 100/119 loss 1.0486
[epoch 005/010] step 119/119 loss 1.0549
[epoch 006/010] step 50/119 loss 1.1199
[epoch 006/010] step 100/119 loss 0.8156
[epoch 006/010] step 119/119 loss 0.9949
[epoch 007/010] step 50/119 loss 0.8328
[epoch 007/010] step 100/119 loss 0.7793
[epoch 007/010] step 119/119 loss 0.6326
[epoch 008/010] step 50/119 loss 0.6064
[epoch 008/010] step 100/119 loss 0.8620
[epoch 008/010] step 119/119 loss 0.5600
[epoch 009/010] step 50/

[I 2025-12-25 15:56:00,681] Trial 1 finished with value: 0.3821212564715687 and parameters: {'optimizer': 'adamw', 'lr': 0.0004591898870587331, 'weight_decay': 0.000133112160807369, 'scheduler': 'cosine', 'warmup_iters': 0, 'ema_decay': 0.999}. Best is trial 0 with value: 0.3826360001053386.


[epoch 001/010] step 50/119 loss 1.4904
[epoch 001/010] step 100/119 loss 1.1771
[epoch 001/010] step 119/119 loss 1.1921
[epoch 002/010] step 50/119 loss 1.3907
[epoch 002/010] step 100/119 loss 1.2247
[epoch 002/010] step 119/119 loss 1.2583
[epoch 003/010] step 50/119 loss 1.0627
[epoch 003/010] step 100/119 loss 1.0679
[epoch 003/010] step 119/119 loss 1.4234
[epoch 004/010] step 50/119 loss 1.0610
[epoch 004/010] step 100/119 loss 0.9086
[epoch 004/010] step 119/119 loss 1.0954
[epoch 005/010] step 50/119 loss 0.9871
[epoch 005/010] step 100/119 loss 0.8733
[epoch 005/010] step 119/119 loss 0.8704
[epoch 006/010] step 50/119 loss 0.7983
[epoch 006/010] step 100/119 loss 0.8162
[epoch 006/010] step 119/119 loss 0.7662
[epoch 007/010] step 50/119 loss 0.8433
[epoch 007/010] step 100/119 loss 0.7738
[epoch 007/010] step 119/119 loss 0.7774
[epoch 008/010] step 50/119 loss 0.6927
[epoch 008/010] step 100/119 loss 0.6597
[epoch 008/010] step 119/119 loss 0.6821
[epoch 009/010] step 50/

[I 2025-12-25 16:04:55,332] Trial 2 finished with value: 0.4035825230630872 and parameters: {'optimizer': 'adamw', 'lr': 0.0002460208061014163, 'weight_decay': 7.4763120622522945e-06, 'scheduler': 'none'}. Best is trial 2 with value: 0.4035825230630872.


[epoch 001/010] step 50/119 loss 1.4631
[epoch 001/010] step 100/119 loss 1.2561
[epoch 001/010] step 119/119 loss 1.3802
[epoch 002/010] step 50/119 loss 1.1909
[epoch 002/010] step 100/119 loss 1.1663
[epoch 002/010] step 119/119 loss 1.0146
[epoch 003/010] step 50/119 loss 1.1468
[epoch 003/010] step 100/119 loss 0.9409
[epoch 003/010] step 119/119 loss 1.0901
[epoch 004/010] step 50/119 loss 1.2061
[epoch 004/010] step 100/119 loss 0.7852
[epoch 004/010] step 119/119 loss 1.0534
[epoch 005/010] step 50/119 loss 0.9064
[epoch 005/010] step 100/119 loss 0.7901
[epoch 005/010] step 119/119 loss 0.7880
[epoch 006/010] step 50/119 loss 0.8379
[epoch 006/010] step 100/119 loss 0.8101
[epoch 006/010] step 119/119 loss 0.6621
[epoch 007/010] step 50/119 loss 0.7281
[epoch 007/010] step 100/119 loss 0.6386
[epoch 007/010] step 119/119 loss 0.8835
[epoch 008/010] step 50/119 loss 0.6132
[epoch 008/010] step 100/119 loss 0.5800
[epoch 008/010] step 119/119 loss 0.6473
[epoch 009/010] step 50/

[I 2025-12-25 16:13:52,017] Trial 3 finished with value: 0.3905406904471475 and parameters: {'optimizer': 'adamw', 'lr': 0.00026891899484442807, 'weight_decay': 0.0002267398652378039, 'scheduler': 'cosine', 'warmup_iters': 0, 'ema_decay': 0.0}. Best is trial 2 with value: 0.4035825230630872.


[epoch 001/010] step 50/119 loss 3.2759
[epoch 001/010] step 100/119 loss 2.6084
[epoch 001/010] step 119/119 loss 2.3519
[epoch 002/010] step 50/119 loss 2.2711
[epoch 002/010] step 100/119 loss 1.9929
[epoch 002/010] step 119/119 loss 1.6640
[epoch 003/010] step 50/119 loss 1.7741
[epoch 003/010] step 100/119 loss 1.8768
[epoch 003/010] step 119/119 loss 1.6977
[epoch 004/010] step 50/119 loss 1.6969
[epoch 004/010] step 100/119 loss 1.5086
[epoch 004/010] step 119/119 loss 1.4445
[epoch 005/010] step 50/119 loss 1.4400
[epoch 005/010] step 100/119 loss 1.2956
[epoch 005/010] step 119/119 loss 1.4740
[epoch 006/010] step 50/119 loss 1.3436
[epoch 006/010] step 100/119 loss 1.4818
[epoch 006/010] step 119/119 loss 1.4061
[epoch 007/010] step 50/119 loss 1.3850
[epoch 007/010] step 100/119 loss 1.7041
[epoch 007/010] step 119/119 loss 1.3558
[epoch 008/010] step 50/119 loss 1.5382
[epoch 008/010] step 100/119 loss 1.2831
[epoch 008/010] step 119/119 loss 1.5568
[epoch 009/010] step 50/

[I 2025-12-25 16:22:43,272] Trial 4 finished with value: 0.33506872165874363 and parameters: {'optimizer': 'adamw', 'lr': 0.0017618561667189323, 'weight_decay': 0.0002661901888489054, 'scheduler': 'none'}. Best is trial 2 with value: 0.4035825230630872.


[epoch 001/010] step 50/119 loss 6.0290
[epoch 001/010] step 100/119 loss 4.9099
[epoch 001/010] step 119/119 loss 4.3436
[epoch 002/010] step 50/119 loss 3.3767
[epoch 002/010] step 100/119 loss 3.0152
[epoch 002/010] step 119/119 loss 2.8712
[epoch 003/010] step 50/119 loss 2.9087
[epoch 003/010] step 100/119 loss 2.6403
[epoch 003/010] step 119/119 loss 2.5925


[I 2025-12-25 16:24:41,573] Trial 5 pruned. 


[epoch 001/010] step 50/119 loss 1.7509
[epoch 001/010] step 100/119 loss 1.3077
[epoch 001/010] step 119/119 loss 1.3144
[epoch 002/010] step 50/119 loss 1.1498
[epoch 002/010] step 100/119 loss 1.2004
[epoch 002/010] step 119/119 loss 1.1948
[epoch 003/010] step 50/119 loss 1.1347
[epoch 003/010] step 100/119 loss 1.1237
[epoch 003/010] step 119/119 loss 1.0477
[epoch 004/010] step 50/119 loss 0.9902
[epoch 004/010] step 100/119 loss 0.8476
[epoch 004/010] step 119/119 loss 0.8080
[epoch 005/010] step 50/119 loss 0.9992
[epoch 005/010] step 100/119 loss 1.0956
[epoch 005/010] step 119/119 loss 0.9784
[epoch 006/010] step 50/119 loss 0.9185
[epoch 006/010] step 100/119 loss 0.9805
[epoch 006/010] step 119/119 loss 0.9071
[epoch 007/010] step 50/119 loss 0.7091
[epoch 007/010] step 100/119 loss 0.7452
[epoch 007/010] step 119/119 loss 0.7298
[epoch 008/010] step 50/119 loss 0.8704
[epoch 008/010] step 100/119 loss 0.7657
[epoch 008/010] step 119/119 loss 0.8064
[epoch 009/010] step 50/

[I 2025-12-25 16:33:29,493] Trial 6 finished with value: 0.39255241529546736 and parameters: {'optimizer': 'sgd', 'lr': 0.01825823043920025, 'weight_decay': 0.00021154290797261214, 'scheduler': 'none'}. Best is trial 2 with value: 0.4035825230630872.


[epoch 001/010] step 50/119 loss 1.4586
[epoch 001/010] step 100/119 loss 1.4013
[epoch 001/010] step 119/119 loss 1.2614
[epoch 002/010] step 50/119 loss 1.1484
[epoch 002/010] step 100/119 loss 1.2132
[epoch 002/010] step 119/119 loss 1.3357
[epoch 003/010] step 50/119 loss 0.9937
[epoch 003/010] step 100/119 loss 1.0187
[epoch 003/010] step 119/119 loss 1.0542
[epoch 004/010] step 50/119 loss 0.8610
[epoch 004/010] step 100/119 loss 0.9149
[epoch 004/010] step 119/119 loss 0.8333
[epoch 005/010] step 50/119 loss 0.8542
[epoch 005/010] step 100/119 loss 0.8743
[epoch 005/010] step 119/119 loss 1.1418
[epoch 006/010] step 50/119 loss 0.7563
[epoch 006/010] step 100/119 loss 0.8133
[epoch 006/010] step 119/119 loss 0.7323
[epoch 007/010] step 50/119 loss 0.7902
[epoch 007/010] step 100/119 loss 0.7954
[epoch 007/010] step 119/119 loss 0.9311
[epoch 008/010] step 50/119 loss 0.9163
[epoch 008/010] step 100/119 loss 0.7932
[epoch 008/010] step 119/119 loss 0.7256
[epoch 009/010] step 50/

[I 2025-12-25 16:42:22,494] Trial 7 finished with value: 0.39311500146249423 and parameters: {'optimizer': 'adamw', 'lr': 6.930112765148073e-05, 'weight_decay': 3.87211803217458e-06, 'scheduler': 'cosine', 'warmup_iters': 0, 'ema_decay': 0.0}. Best is trial 2 with value: 0.4035825230630872.


[epoch 001/010] step 50/119 loss 1.5002
[epoch 001/010] step 100/119 loss 1.3649
[epoch 001/010] step 119/119 loss 1.1524
[epoch 002/010] step 50/119 loss 1.1306
[epoch 002/010] step 100/119 loss 0.9801
[epoch 002/010] step 119/119 loss 1.0913
[epoch 003/010] step 50/119 loss 1.0715
[epoch 003/010] step 100/119 loss 1.1599
[epoch 003/010] step 119/119 loss 0.8509


[I 2025-12-25 16:44:20,141] Trial 8 pruned. 


[epoch 001/010] step 50/119 loss 2.7440
[epoch 001/010] step 100/119 loss 2.0355
[epoch 001/010] step 119/119 loss 1.8517
[epoch 002/010] step 50/119 loss 1.6787
[epoch 002/010] step 100/119 loss 1.5201
[epoch 002/010] step 119/119 loss 1.3180
[epoch 003/010] step 50/119 loss 1.4025
[epoch 003/010] step 100/119 loss 1.4837
[epoch 003/010] step 119/119 loss 1.1829


[I 2025-12-25 16:46:16,375] Trial 9 pruned. 


[epoch 001/010] step 50/119 loss 2.4778
[epoch 001/010] step 100/119 loss 1.8320
[epoch 001/010] step 119/119 loss 1.4574
[epoch 002/010] step 50/119 loss 1.5947
[epoch 002/010] step 100/119 loss 1.1885
[epoch 002/010] step 119/119 loss 1.1096
[epoch 003/010] step 50/119 loss 1.3191
[epoch 003/010] step 100/119 loss 1.2692
[epoch 003/010] step 119/119 loss 0.9960


[I 2025-12-25 16:48:12,052] Trial 10 pruned. 


[epoch 001/010] step 50/119 loss 4.4185
[epoch 001/010] step 100/119 loss 2.6798
[epoch 001/010] step 119/119 loss 2.5610
[epoch 002/010] step 50/119 loss 2.0319
[epoch 002/010] step 100/119 loss 1.8234
[epoch 002/010] step 119/119 loss 1.7691
[epoch 003/010] step 50/119 loss 1.7399
[epoch 003/010] step 100/119 loss 1.2743
[epoch 003/010] step 119/119 loss 1.3048


[I 2025-12-25 16:50:09,050] Trial 11 pruned. 


[epoch 001/010] step 50/119 loss 1.3806
[epoch 001/010] step 100/119 loss 1.3837
[epoch 001/010] step 119/119 loss 1.1585
[epoch 002/010] step 50/119 loss 1.1053
[epoch 002/010] step 100/119 loss 1.0575
[epoch 002/010] step 119/119 loss 1.2151
[epoch 003/010] step 50/119 loss 1.0699
[epoch 003/010] step 100/119 loss 1.0195
[epoch 003/010] step 119/119 loss 1.0005
[epoch 004/010] step 50/119 loss 0.9435
[epoch 004/010] step 100/119 loss 1.0374
[epoch 004/010] step 119/119 loss 0.7979
[epoch 005/010] step 50/119 loss 0.7776
[epoch 005/010] step 100/119 loss 0.8599
[epoch 005/010] step 119/119 loss 0.5874
[epoch 006/010] step 50/119 loss 0.8446
[epoch 006/010] step 100/119 loss 0.8130
[epoch 006/010] step 119/119 loss 0.7697
[epoch 007/010] step 50/119 loss 0.6390
[epoch 007/010] step 100/119 loss 0.9655
[epoch 007/010] step 119/119 loss 0.6951
[epoch 008/010] step 50/119 loss 0.5661
[epoch 008/010] step 100/119 loss 0.5509
[epoch 008/010] step 119/119 loss 0.8986
[epoch 009/010] step 50/

[I 2025-12-25 16:59:03,362] Trial 12 finished with value: 0.39755849516839487 and parameters: {'optimizer': 'adamw', 'lr': 0.0001634795977671513, 'weight_decay': 7.203821205189785e-06, 'scheduler': 'cosine', 'warmup_iters': 0, 'ema_decay': 0.0}. Best is trial 2 with value: 0.4035825230630872.


[epoch 001/010] step 50/119 loss 1.3843
[epoch 001/010] step 100/119 loss 1.2635
[epoch 001/010] step 119/119 loss 1.2498
[epoch 002/010] step 50/119 loss 1.0521
[epoch 002/010] step 100/119 loss 1.1498
[epoch 002/010] step 119/119 loss 1.1341
[epoch 003/010] step 50/119 loss 1.0424
[epoch 003/010] step 100/119 loss 1.1175
[epoch 003/010] step 119/119 loss 0.9751
[epoch 004/010] step 50/119 loss 1.1446
[epoch 004/010] step 100/119 loss 0.9563
[epoch 004/010] step 119/119 loss 0.7523
[epoch 005/010] step 50/119 loss 0.9754
[epoch 005/010] step 100/119 loss 0.9162
[epoch 005/010] step 119/119 loss 0.7876
[epoch 006/010] step 50/119 loss 0.7608
[epoch 006/010] step 100/119 loss 0.9139
[epoch 006/010] step 119/119 loss 0.9096
[epoch 007/010] step 50/119 loss 0.8037
[epoch 007/010] step 100/119 loss 0.7067
[epoch 007/010] step 119/119 loss 0.7814
[epoch 008/010] step 50/119 loss 0.6148
[epoch 008/010] step 100/119 loss 0.6515
[epoch 008/010] step 119/119 loss 0.7593
[epoch 009/010] step 50/

[I 2025-12-25 17:07:57,431] Trial 13 finished with value: 0.4038958720252052 and parameters: {'optimizer': 'adamw', 'lr': 0.0001615751321784009, 'weight_decay': 1.341899467024904e-05, 'scheduler': 'none'}. Best is trial 13 with value: 0.4038958720252052.


[epoch 001/010] step 50/119 loss 2.3725
[epoch 001/010] step 100/119 loss 2.0371
[epoch 001/010] step 119/119 loss 1.9319
[epoch 002/010] step 50/119 loss 1.6612
[epoch 002/010] step 100/119 loss 1.3922
[epoch 002/010] step 119/119 loss 1.4936
[epoch 003/010] step 50/119 loss 1.5091
[epoch 003/010] step 100/119 loss 1.2831
[epoch 003/010] step 119/119 loss 1.3152


[I 2025-12-25 17:09:52,663] Trial 14 pruned. 


[epoch 001/010] step 50/119 loss 1.4948
[epoch 001/010] step 100/119 loss 1.2137
[epoch 001/010] step 119/119 loss 1.0703
[epoch 002/010] step 50/119 loss 1.2544
[epoch 002/010] step 100/119 loss 1.1132
[epoch 002/010] step 119/119 loss 1.0096
[epoch 003/010] step 50/119 loss 1.0580
[epoch 003/010] step 100/119 loss 1.0173
[epoch 003/010] step 119/119 loss 1.1806
[epoch 004/010] step 50/119 loss 1.1569
[epoch 004/010] step 100/119 loss 0.8121
[epoch 004/010] step 119/119 loss 1.0076
[epoch 005/010] step 50/119 loss 0.8589
[epoch 005/010] step 100/119 loss 0.9320
[epoch 005/010] step 119/119 loss 0.8230
[epoch 006/010] step 50/119 loss 0.8962
[epoch 006/010] step 100/119 loss 0.7757
[epoch 006/010] step 119/119 loss 0.6635
[epoch 007/010] step 50/119 loss 0.9018
[epoch 007/010] step 100/119 loss 0.7855
[epoch 007/010] step 119/119 loss 0.7445
[epoch 008/010] step 50/119 loss 0.8863
[epoch 008/010] step 100/119 loss 0.5810
[epoch 008/010] step 119/119 loss 0.7625
[epoch 009/010] step 50/

[I 2025-12-25 17:18:43,109] Trial 15 finished with value: 0.4031790329098913 and parameters: {'optimizer': 'adamw', 'lr': 0.00015230664888236258, 'weight_decay': 2.0115570409364703e-05, 'scheduler': 'none'}. Best is trial 13 with value: 0.4038958720252052.


[epoch 001/010] step 50/119 loss 1.4162
[epoch 001/010] step 100/119 loss 1.2481
[epoch 001/010] step 119/119 loss 1.1899
[epoch 002/010] step 50/119 loss 1.3947
[epoch 002/010] step 100/119 loss 1.2336
[epoch 002/010] step 119/119 loss 1.2775
[epoch 003/010] step 50/119 loss 1.1048
[epoch 003/010] step 100/119 loss 1.0767
[epoch 003/010] step 119/119 loss 0.9053
[epoch 004/010] step 50/119 loss 0.8872
[epoch 004/010] step 100/119 loss 0.7165
[epoch 004/010] step 119/119 loss 0.8585
[epoch 005/010] step 50/119 loss 0.9946
[epoch 005/010] step 100/119 loss 0.8834
[epoch 005/010] step 119/119 loss 0.8209
[epoch 006/010] step 50/119 loss 0.7779
[epoch 006/010] step 100/119 loss 0.8284
[epoch 006/010] step 119/119 loss 0.7842
[epoch 007/010] step 50/119 loss 0.7959
[epoch 007/010] step 100/119 loss 0.7670
[epoch 007/010] step 119/119 loss 0.8676
[epoch 008/010] step 50/119 loss 0.6513
[epoch 008/010] step 100/119 loss 0.7059
[epoch 008/010] step 119/119 loss 0.9668
[epoch 009/010] step 50/

[I 2025-12-25 17:27:34,447] Trial 16 finished with value: 0.41107649962940734 and parameters: {'optimizer': 'adamw', 'lr': 0.00016401195178940758, 'weight_decay': 1.0719427741859864e-06, 'scheduler': 'none'}. Best is trial 16 with value: 0.41107649962940734.


[epoch 001/010] step 50/119 loss 1.4462
[epoch 001/010] step 100/119 loss 1.4563
[epoch 001/010] step 119/119 loss 1.5130
[epoch 002/010] step 50/119 loss 1.2899
[epoch 002/010] step 100/119 loss 1.0445
[epoch 002/010] step 119/119 loss 1.2512
[epoch 003/010] step 50/119 loss 1.1332
[epoch 003/010] step 100/119 loss 1.0655
[epoch 003/010] step 119/119 loss 0.9526
[epoch 004/010] step 50/119 loss 0.8921
[epoch 004/010] step 100/119 loss 0.8884
[epoch 004/010] step 119/119 loss 0.7525
[epoch 005/010] step 50/119 loss 0.8766
[epoch 005/010] step 100/119 loss 0.8597
[epoch 005/010] step 119/119 loss 0.8147
[epoch 006/010] step 50/119 loss 0.8098
[epoch 006/010] step 100/119 loss 0.8790
[epoch 006/010] step 119/119 loss 0.6886
[epoch 007/010] step 50/119 loss 0.6548
[epoch 007/010] step 100/119 loss 0.7286
[epoch 007/010] step 119/119 loss 0.9080
[epoch 008/010] step 50/119 loss 0.6557
[epoch 008/010] step 100/119 loss 0.9337
[epoch 008/010] step 119/119 loss 0.8850


[I 2025-12-25 17:32:46,835] Trial 17 pruned. 


[epoch 001/010] step 50/119 loss 2.4474
[epoch 001/010] step 100/119 loss 1.9317
[epoch 001/010] step 119/119 loss 1.8297
[epoch 002/010] step 50/119 loss 1.5097
[epoch 002/010] step 100/119 loss 1.2604
[epoch 002/010] step 119/119 loss 1.6049
[epoch 003/010] step 50/119 loss 1.3750
[epoch 003/010] step 100/119 loss 1.2916
[epoch 003/010] step 119/119 loss 1.1712


[I 2025-12-25 17:34:42,096] Trial 18 pruned. 


[epoch 001/010] step 50/119 loss 1.6429
[epoch 001/010] step 100/119 loss 1.3642
[epoch 001/010] step 119/119 loss 1.2726
[epoch 002/010] step 50/119 loss 1.1934
[epoch 002/010] step 100/119 loss 0.9797
[epoch 002/010] step 119/119 loss 1.1162
[epoch 003/010] step 50/119 loss 1.3150
[epoch 003/010] step 100/119 loss 1.1078
[epoch 003/010] step 119/119 loss 1.0718
[epoch 004/010] step 50/119 loss 1.0127
[epoch 004/010] step 100/119 loss 0.8183
[epoch 004/010] step 119/119 loss 0.9948
[epoch 005/010] step 50/119 loss 1.0414
[epoch 005/010] step 100/119 loss 0.9222
[epoch 005/010] step 119/119 loss 0.9529
[epoch 006/010] step 50/119 loss 0.7814
[epoch 006/010] step 100/119 loss 0.8675
[epoch 006/010] step 119/119 loss 1.0225
[epoch 007/010] step 50/119 loss 0.7282
[epoch 007/010] step 100/119 loss 0.9318
[epoch 007/010] step 119/119 loss 0.8351
[epoch 008/010] step 50/119 loss 0.7528
[epoch 008/010] step 100/119 loss 0.7258
[epoch 008/010] step 119/119 loss 1.0024


[I 2025-12-25 17:39:53,076] Trial 19 pruned. 


best_value(AJI)= 0.41107649962940734
best_params= {'optimizer': 'adamw', 'lr': 0.00016401195178940758, 'weight_decay': 1.0719427741859864e-06, 'scheduler': 'none'}


In [15]:
import pandas as pd
import mlflow

mlflow.set_tracking_uri(TRACKING_URI)

exp = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if exp is None: raise RuntimeError(f"MLflow experiment not found: {EXPERIMENT_NAME}")

runs = mlflow.search_runs([exp.experiment_id], max_results=2000)

name_col = next((c for c in ["run_name", "tags.mlflow.runName", "tags.mlflow.run_name"] if c in runs.columns), None)
if name_col is None: name_col = "run_id"

cols = [name_col, "metrics.b_test_mAP50", "params.optimizer", "params.lr", "params.weight_decay", "params.momentum"]
cols = [c for c in cols if c in runs.columns]
df = runs[cols].copy()

df = df[df["metrics.b_test_mAP50"].notna()].copy()
df["metrics.b_test_mAP50"] = pd.to_numeric(df["metrics.b_test_mAP50"], errors="coerce")
for c in ["params.lr", "params.weight_decay", "params.momentum"]:
    if c in df.columns: df[c] = pd.to_numeric(df[c], errors="coerce")

out = pd.DataFrame({
    "mAP50": df["metrics.b_test_mAP50"].astype(float),
    "lr": df.get("params.lr", pd.Series([None]*len(df))),
    "wd": df.get("params.weight_decay", pd.Series([None]*len(df))),
    "optimizer": df.get("params.optimizer", pd.Series([""]*len(df))).astype(str),
    "momentum": df.get("params.momentum", pd.Series([None]*len(df))),
})

out.loc[out["optimizer"].str.lower() != "sgd", "momentum"] = ""
out["lr"] = out["lr"].map(lambda x: f"{float(x):.3g}" if x is not None and x != "" and pd.notna(x) else "")
out["wd"] = out["wd"].map(lambda x: f"{float(x):.3g}" if x is not None and x != "" and pd.notna(x) else "")
out["momentum"] = out["momentum"].map(lambda x: f"{float(x):.3g}" if x is not None and x != "" and pd.notna(x) else "")

out = out.sort_values("mAP50", ascending=False).reset_index(drop=True)
display(out.head(50))






  out.loc[out["optimizer"].str.lower() != "sgd", "momentum"] = ""


Unnamed: 0,mAP50,lr,wd,optimizer,momentum
0,0.866164,0.000246,7.48e-06,adamw,
1,0.81536,0.000164,1.07e-06,adamw,
2,0.813263,0.000152,2.01e-05,adamw,
3,0.80308,0.000163,7.2e-06,adamw,
4,0.797136,0.000162,1.34e-05,adamw,
5,0.790847,0.000269,0.000227,adamw,
6,0.741104,0.0183,0.000212,sgd,0.9
7,0.704362,0.000459,0.000133,adamw,
8,0.690413,6.93e-05,3.87e-06,adamw,
9,0.679184,0.000744,6.25e-05,adamw,


In [18]:
FINAL_EPOCHS = 40

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

exp = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
runs = mlflow.search_runs([exp.experiment_id], order_by=["metrics.b_test_mAP50 DESC"], max_results=2000)
runs = runs[runs["metrics.b_test_mAP50"].notna()].copy()
best = runs.iloc[0]

opt = str(best.get("params.optimizer", "sgd")).lower()
lr = float(best.get("params.lr"))
wd = float(best.get("params.weight_decay"))
mom = float(best.get("params.momentum", 0.9)) if opt == "sgd" else 0.0

b_train = b_train if "b_train" in globals() else dm.make_loader_b_train()
b_val = b_val if "b_val" in globals() else dm.make_loader_b_val()
b_test = b_test if "b_test" in globals() else dm.make_loader_b_test()
ev = ev if "ev" in globals() else Evaluator(device=DEVICE)

model = build_model(MODEL_NAME, NUM_CLASSES).to(DEVICE)
conf = TrainConfig(num_epochs=FINAL_EPOCHS, batch_size=4, num_workers=4, lr=lr, weight_decay=wd, momentum=mom if opt == "sgd" else 0.9, print_every=50, 
                   tracking_uri=TRACKING_URI, amp=True, grad_clip=1.0, ema_decay=0.0, warmup_iters=0, scheduler="none", min_lr=1e-6, freeze_bn=True)

trainer = Trainer(model, conf)
params = [p for p in model.parameters() if p.requires_grad]
trainer.optimizer = torch.optim.SGD(params, lr=lr, momentum=mom, weight_decay=wd) if opt == "sgd" else torch.optim.AdamW(params, lr=lr, weight_decay=wd)
trainer._init_schedulers(steps_per_epoch=len(b_train))

run_name = f"final_best_map50_{opt}_lr{lr:.3g}_wd{wd:.3g}_ep{FINAL_EPOCHS}"
with mlflow.start_run(run_name=run_name):
    mlflow.log_params({"source": "best_by_b_test_mAP50", "optimizer": opt, "lr": lr, "weight_decay": wd, "momentum": mom, "epochs": FINAL_EPOCHS, "model_name": str(MODEL_NAME), "best_trial_run_id": str(best.get("run_id", "")), "best_trial_b_test_mAP50": float(best["metrics.b_test_mAP50"])})
    for epoch in range(FINAL_EPOCHS):
        train_loss = trainer.train_one_epoch(b_train, epoch)
        val_loss = trainer._eval_with_ema(b_val)
        mlflow.log_metrics({"train_loss": float(train_loss), "val_loss": float(val_loss)}, step=epoch)
        print(f"[epoch {epoch + 1:03d}/{FINAL_EPOCHS:03d}] train={float(train_loss):.4f} val={float(val_loss):.4f}")
    m = ev.metrics_masks(model, b_test, num_classes=NUM_CLASSES)
    mlflow.log_metric("b_test_mAP50", float(m["mAP50"]))
    mlflow.log_metric("b_test_AJI", float(m["AJI"]))
    mlflow.pytorch.log_model(model, "model")

print("done:", run_name)


  self.scaler = torch.cuda.amp.GradScaler(enabled=bool(train_conf.amp and self.device.type == "cuda"))


[epoch 001/040] step 50/119 loss 1.4433
[epoch 001/040] step 100/119 loss 1.1993
[epoch 001/040] step 119/119 loss 1.4526
[epoch 001/040] train=1.6869 val=1.2408
[epoch 002/040] step 50/119 loss 1.1471
[epoch 002/040] step 100/119 loss 1.0001
[epoch 002/040] step 119/119 loss 1.2928
[epoch 002/040] train=1.1918 val=1.1336
[epoch 003/040] step 50/119 loss 1.0857
[epoch 003/040] step 100/119 loss 1.0921
[epoch 003/040] step 119/119 loss 1.0237
[epoch 003/040] train=1.0596 val=1.0067
[epoch 004/040] step 50/119 loss 0.9541
[epoch 004/040] step 100/119 loss 0.9192
[epoch 004/040] step 119/119 loss 0.9223
[epoch 004/040] train=0.9818 val=0.9620
[epoch 005/040] step 50/119 loss 1.0336
[epoch 005/040] step 100/119 loss 0.7592
[epoch 005/040] step 119/119 loss 0.9446
[epoch 005/040] train=0.9058 val=0.8871
[epoch 006/040] step 50/119 loss 0.8981
[epoch 006/040] step 100/119 loss 0.7994
[epoch 006/040] step 119/119 loss 0.8711
[epoch 006/040] train=0.8400 val=0.8538
[epoch 007/040] step 50/119 



done: final_best_map50_adamw_lr0.000246_wd7.48e-06_ep40


In [19]:
ckpt_a ="_optuna.pth"
torch.save(model.state_dict(), ckpt_a)
print("saved:", ckpt_a) #mlflow saves checkpoints also

saved: _optuna.pth


In [10]:
import re, pandas as pd
import mlflow

mlflow.set_tracking_uri(TRACKING_URI)

exp = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if exp is None: raise RuntimeError(f"MLflow experiment not found: {EXPERIMENT_NAME}")

runs = mlflow.search_runs([exp.experiment_id], order_by=["metrics.objective_AJI DESC"], max_results=500)

# find candidate mAP50 metric columns that were logged from metrics_masks()
metric_cols = [c for c in runs.columns if c.startswith("metrics.")]
cand = [c for c in metric_cols if ("map50" in c.lower()) or (re.search(r"\bap50\b", c.lower()) is not None)]
print("mAP50/AP50 candidates:", cand)

# pick the best-looking one (prefer b_test_*)
map_col = next((c for c in cand if "b_test_" in c.lower()), None) or (cand[0] if cand else None)
if map_col is None: raise RuntimeError("No mAP50/AP50 metric found in MLflow. (It may not be produced by metrics_masks().)")

show_cols = [c for c in ["run_name", "metrics.objective_AJI", "metrics.b_test_AJI", map_col] if c in runs.columns]
df = runs[show_cols].copy()
df.rename(columns={map_col: "metrics.b_test_mAP50_or_AP50"}, inplace=True)

display(df.head(20))
print("best trial mAP50/AP50 =", float(df.iloc[0]["metrics.b_test_mAP50_or_AP50"]))


mAP50/AP50 candidates: ['metrics.b_test_mAP50']


Unnamed: 0,metrics.objective_AJI,metrics.b_test_AJI,metrics.b_test_mAP50_or_AP50
0,0.411076,0.411076,0.81536
1,0.403896,0.403896,0.797136
2,0.403583,0.403583,0.866164
3,0.403179,0.403179,0.813263
4,0.397558,0.397558,0.80308
5,0.393115,0.393115,0.690413
6,0.392552,0.392552,0.741104
7,0.390541,0.390541,0.790847
8,0.382636,0.382636,0.679184
9,0.382121,0.382121,0.704362


best trial mAP50/AP50 = 0.8153602480888367
