# 1 CASE

Uses Optuna to tune the hyperparameters of the stage-2 model. Every trial Optuna runs is logged to MLflow as its own run, and the model retrained with the best parameters is finally registered in the MLflow Model Registry.

This case does not use nested runs; they can be added if you plan to group multiple runs under a single parent run.

### 1.Imports & config 

In [5]:
# --- core
import os, io, math, json
import numpy as np
import pandas as pd
from io import BytesIO

# minio
from minio import Minio

# --- mlflow
import mlflow
from mlflow.tracking import MlflowClient

# --- torch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# --- metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- hpo
import optuna
from optuna.pruners import MedianPruner, PercentilePruner

# --- plotting
import matplotlib.pyplot as plt

# ====== Update these for your cluster ======
TRACKING_URI = "http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080"
EXPERIMENT_NAME = "k8s-cpu-forecasting"
REGISTERED_MODEL_NAME = "cpu-pct-hpo"   # name used when registering the final model

# Point MLflow at the tracking server and select the experiment all runs go to
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
client = MlflowClient()

# MinIO client.
# Endpoint and credentials can be overridden with the MINIO_ENDPOINT,
# MINIO_ACCESS_KEY and MINIO_SECRET_KEY environment variables so that real
# secrets do not have to live in the notebook; the fallbacks are the values
# that were previously hard-coded here.
minio_client = Minio(
    os.environ.get("MINIO_ENDPOINT", "minio-service.kubeflow.svc.cluster.local:9000"),
    access_key=os.environ.get("MINIO_ACCESS_KEY", "minio"),
    secret_key=os.environ.get("MINIO_SECRET_KEY", "minio123"),
    secure=False,  # plain HTTP
)

### 2 — MinIO helpers (reuse your existing ones if you already have them)

In [6]:
def download_numpy_from_minio(minio_client, bucket_name, object_name):
    """Fetch ``object_name`` from ``bucket_name`` and load it as a NumPy array.

    The object is read fully into memory and parsed with ``np.load``; the
    response is closed and its connection released whether or not loading
    succeeds.
    """
    response = minio_client.get_object(bucket_name, object_name)
    try:
        payload = BytesIO(response.read())
        return np.load(payload)
    finally:
        # Always hand the HTTP connection back, even if reading/parsing failed
        response.close()
        response.release_conn()

# Where the preprocessed arrays live in MinIO.
# Note that the "validation" arrays are read from the X_test / y_test objects.
bucket_name = "k8s-resources-forecast"
object_names = dict(
    X_train="data/k8s-preprocessed/node-1-X_train/X_train.npy",
    y_train="data/k8s-preprocessed/node-1-y_train/y_train.npy",
    X_val="data/k8s-preprocessed/node-1-X_test/X_test.npy",
    y_val="data/k8s-preprocessed/node-1-y_test/y_test.npy",
)

# Download the four arrays, in the same order as before
X_train, y_train, X_val, y_val = [
    download_numpy_from_minio(minio_client, bucket_name, object_names[key])
    for key in ("X_train", "y_train", "X_val", "y_val")
]

# Torch loaders
def make_loader(X, y, batch_size, shuffle):
    """Wrap ``X`` and ``y`` as float32 tensors in a ``DataLoader``.

    ``batch_size`` and ``shuffle`` are forwarded to the ``DataLoader`` unchanged.
    """
    inputs = torch.tensor(X, dtype=torch.float32)
    targets = torch.tensor(y, dtype=torch.float32)
    dataset = TensorDataset(inputs, targets)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Sanity check: print the shapes of the four downloaded arrays
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val:  ", X_val.shape,   "y_val:  ", y_val.shape)


X_train: (8238, 5, 1) y_train: (8238, 1, 1)
X_val:   (2060, 5, 1) y_val:   (2060, 1, 1)


### 3 — Model, train loop, metrics & plots

In [9]:
class LSTMForecaster(nn.Module):
    """LSTM followed by a linear head that maps the last time step to ``horizon`` outputs."""

    def __init__(self, input_size=1, hidden_size=64, num_layers=2, dropout=0.0, horizon=1):
        super().__init__()
        # Dropout is only passed to the LSTM when there is more than one layer
        lstm_dropout = 0.0 if num_layers <= 1 else dropout
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )
        self.fc = nn.Linear(hidden_size, horizon)  # one output per horizon step

    def forward(self, x):
        """Map ``x`` of shape [B, T, F] to predictions of shape [B, horizon, 1]."""
        sequence_out, _ = self.lstm(x)            # [B, T, H]
        last_step = sequence_out[:, -1, :]        # [B, H]
        forecast = self.fc(last_step)             # [B, horizon]
        # Trailing singleton axis so the output lines up with y
        return forecast.unsqueeze(-1)

def train_one_model(X_train, y_train, X_val, y_val,
                    hidden_size, num_layers, dropout,
                    lr, batch_size, epochs, patience, window_size, horizon,
                    run_name=None):
    """Train one LSTMForecaster inside a new MLflow run and return it with its metrics.

    The run receives the hyperparameters, per-epoch ``train_loss``/``val_loss``,
    final validation MAE/RMSE/R2, three diagnostic plots and the model itself.

    Args:
        X_train, y_train: training inputs/targets, passed to ``make_loader``.
        X_val, y_val: validation inputs/targets; used for early stopping and
            for the final metrics.
        hidden_size, num_layers, dropout: ``LSTMForecaster`` settings.
        lr: learning rate for Adam.
        batch_size: batch size of both loaders.
        epochs: maximum number of training epochs.
        patience: consecutive non-improving epochs tolerated before stopping.
        window_size: only logged as a run parameter; not used for training.
        horizon: number of outputs of the model head.
        run_name: optional name of the MLflow run.

    Returns:
        ``(model, metrics, run_id)``. ``model`` carries the weights of the best
        validation epoch (if any epoch improved). ``metrics`` has the keys
        ``"val_loss"`` (best batch-averaged validation MSE), ``"mae"``,
        ``"rmse"`` and ``"r2"``.
    """
    device = torch.device("cpu")

    train_loader = make_loader(X_train, y_train, batch_size=batch_size, shuffle=True)
    val_loader   = make_loader(X_val,   y_val,   batch_size=batch_size, shuffle=False)

    model = LSTMForecaster(input_size=X_train.shape[-1],
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           dropout=dropout,
                           horizon=horizon).to(device)

    criterion = nn.MSELoss()
    opt = optim.Adam(model.parameters(), lr=lr)

    best_val = float("inf")
    # Stays None when no epoch ever improves on best_val (e.g. epochs == 0);
    # previously this name was unbound in that case and the restore step crashed.
    best_state = None
    patience_left = patience
    train_curve, val_curve = [], []

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params({
            "hidden_size": hidden_size,
            "num_layers": num_layers,
            "dropout": dropout,
            "lr": lr,
            "batch_size": batch_size,
            "epochs": epochs,
            "patience": patience,
            "window_size": window_size,
            "horizon": horizon
        })

        for ep in range(1, epochs + 1):
            # --- train ---
            model.train()
            running = 0.0
            for xb, yb in train_loader:
                opt.zero_grad()
                pred = model(xb)
                loss = criterion(pred, yb)
                loss.backward()
                opt.step()
                running += loss.item()
            # Mean of the per-batch losses (max() guards an empty loader)
            train_loss = running / max(1, len(train_loader))

            # --- validate ---
            model.eval()
            v_running = 0.0
            with torch.no_grad():
                for xb, yb in val_loader:
                    v_running += criterion(model(xb), yb).item()
            val_loss = v_running / max(1, len(val_loader))
            train_curve.append(train_loss)
            val_curve.append(val_loss)

            # Per-epoch losses go to MLflow only; nothing is reported to Optuna here
            mlflow.log_metric("train_loss", train_loss, step=ep)
            mlflow.log_metric("val_loss",   val_loss,   step=ep)

            # early stopping: keep a copy of the best weights seen so far
            if val_loss < best_val - 1e-8:
                best_val = val_loss
                patience_left = patience
                best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            else:
                patience_left -= 1
                if patience_left <= 0:
                    print("Early stopping!")
                    break

        # restore best weights, if any epoch produced an improvement
        if best_state is not None:
            model.load_state_dict(best_state)

        # Final val metrics (MAE/RMSE/R2), computed in eval mode without autograd
        model.eval()
        with torch.no_grad():
            preds = np.concatenate([model(xb).numpy() for xb, _ in val_loader], axis=0).reshape(-1)
        targs = np.concatenate([yb.numpy() for _, yb in val_loader], axis=0).reshape(-1)
        mae  = mean_absolute_error(targs, preds)
        mse  = mean_squared_error(targs, preds)
        rmse = math.sqrt(mse)
        r2   = r2_score(targs, preds)

        mlflow.log_metric("val_mae", mae)
        mlflow.log_metric("val_rmse", rmse)
        mlflow.log_metric("val_r2", r2)

        # plots: learning curve, residuals, true vs pred
        # (each PNG is written to the working directory, then logged as an artifact)
        # 1) Learning curve
        plt.figure(figsize=(6,3))
        plt.plot(train_curve, label="train")
        plt.plot(val_curve, label="val")
        plt.legend(); plt.title("Learning Curve"); plt.tight_layout()
        plt.savefig("learning_curve.png"); plt.close()
        mlflow.log_artifact("learning_curve.png")

        # 2) Residuals over time
        plt.figure(figsize=(6,3))
        plt.plot(preds - targs, label="residual")
        plt.axhline(0, color="black", linewidth=0.8)
        plt.legend(); plt.title("Residuals (val)"); plt.tight_layout()
        plt.savefig("residuals.png"); plt.close()
        mlflow.log_artifact("residuals.png")

        # 3) True vs Pred
        plt.figure(figsize=(6,3))
        plt.plot(targs, label="true")
        plt.plot(preds, label="pred")
        plt.legend(); plt.title("True vs Pred (val)"); plt.tight_layout()
        plt.savefig("val_true_vs_pred.png"); plt.close()
        mlflow.log_artifact("val_true_vs_pred.png")

        # Save model to MLflow
        mlflow.pytorch.log_model(model, "model")

        # Capture the run id while the run is still active
        run_id = mlflow.active_run().info.run_id
    return model, {"val_loss": best_val, "mae": mae, "rmse": rmse, "r2": r2}, run_id


### 4 — Optuna objective & study

In [12]:
# Window length and forecast horizon are taken from axis 1 of the arrays
# (5 and 1 for the shapes printed above); both are passed to every training run.
WINDOW_SIZE, HORIZON = X_train.shape[1], y_train.shape[1]

def objective(trial: optuna.Trial):
    """Optuna objective: sample hyperparameters, train one model, return its best val loss.

    Each call creates one MLflow run named ``optuna-trial-<number>`` via
    ``train_one_model``. The run id is stored on the trial as the
    ``mlflow_run_id`` user attribute.

    Raises:
        optuna.exceptions.TrialPruned: when the study's pruner rejects the
            reported validation loss.
    """
    # Hyperparameter search space
    hidden_size = trial.suggest_int("hidden_size", 16, 128, step=16)
    num_layers  = trial.suggest_int("num_layers", 1, 3)
    dropout     = trial.suggest_float("dropout", 0.0, 0.4)
    lr          = trial.suggest_float("lr", 1e-4, 5e-3, log=True)
    batch_size  = trial.suggest_categorical("batch_size", [16, 32, 64])
    epochs      = trial.suggest_int("epochs", 15, 50)
    patience    = trial.suggest_int("patience", 3, 10)

    # Train & get val metrics
    _, metrics, run_id = train_one_model(
        X_train, y_train, X_val, y_val,
        hidden_size, num_layers, dropout,
        lr, batch_size, epochs, patience,
        WINDOW_SIZE, HORIZON,
        run_name=f"optuna-trial-{trial.number}"
    )

    # Attach the MLflow run_id BEFORE the pruning check: a pruned trial has
    # already produced an MLflow run and should remain traceable to it.
    trial.set_user_attr("mlflow_run_id", run_id)

    # NOTE: the value is reported once, after training has finished, so a
    # "pruned" verdict only labels the trial; it does not save any compute.
    trial.report(metrics["val_loss"], step=epochs)
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()

    return metrics["val_loss"]

# Create an in-memory study that minimizes the value returned by `objective`
# (the best validation loss of each trial). A percentile pruner at the 50th
# percentile is attached; n_startup_trials=3 — presumably no trial is pruned
# before three trials have completed (confirm against the Optuna docs).
pruner = PercentilePruner(percentile=50, n_startup_trials=3, n_warmup_steps=0)
study = optuna.create_study(direction="minimize", pruner=pruner, study_name="cpu_pct_lstm")

# Run optimization: every trial trains a model and logs its own MLflow run
N_TRIALS = 12
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)

# Summary of the winning trial
print("Best trial:", study.best_trial.number)
print("Best value (val_loss):", study.best_value)
print("Best params:", study.best_params)


[I 2025-09-05 12:28:02,592] A new study created in memory with name: cpu_pct_lstm


Early stopping!


[I 2025-09-05 12:28:54,476] Trial 0 finished with value: 0.0007080256194643345 and parameters: {'hidden_size': 128, 'num_layers': 2, 'dropout': 0.17196126146370536, 'lr': 0.003127900843950967, 'batch_size': 16, 'epochs': 46, 'patience': 4}. Best is trial 0 with value: 0.0007080256194643345.


🏃 View run optuna-trial-0 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/a42ea83f26e74755a7d569e6084f497e
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 12:29:32,313] Trial 1 finished with value: 0.000550303586130736 and parameters: {'hidden_size': 64, 'num_layers': 2, 'dropout': 0.20657423557430274, 'lr': 0.0004770825259355593, 'batch_size': 64, 'epochs': 49, 'patience': 8}. Best is trial 1 with value: 0.000550303586130736.


🏃 View run optuna-trial-1 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/e47e07df73994507b31761ed942c11eb
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 12:30:03,494] Trial 2 finished with value: 0.0005593707155919849 and parameters: {'hidden_size': 96, 'num_layers': 1, 'dropout': 0.3801576987716789, 'lr': 0.0003339794978128798, 'batch_size': 32, 'epochs': 49, 'patience': 7}. Best is trial 1 with value: 0.000550303586130736.


🏃 View run optuna-trial-2 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/e4e2e057d74a4816b375f76af064e1b8
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 12:31:17,868] Trial 3 finished with value: 0.0005508742245952957 and parameters: {'hidden_size': 80, 'num_layers': 3, 'dropout': 0.27795352066203954, 'lr': 0.0004255807011022233, 'batch_size': 32, 'epochs': 44, 'patience': 10}. Best is trial 1 with value: 0.000550303586130736.


🏃 View run optuna-trial-3 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/ddffc0c2ac4e43edb7579d2879a99c07
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 12:31:55,993] Trial 4 finished with value: 0.0008058777809289262 and parameters: {'hidden_size': 128, 'num_layers': 2, 'dropout': 0.1170797146384416, 'lr': 0.0018520909012747725, 'batch_size': 16, 'epochs': 48, 'patience': 3}. Best is trial 1 with value: 0.000550303586130736.


🏃 View run optuna-trial-4 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/cb05852583664ab6b2f5a9ba6b3e72ca
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 12:32:13,189] Trial 5 finished with value: 0.0005694188520115298 and parameters: {'hidden_size': 80, 'num_layers': 1, 'dropout': 0.11710905649009723, 'lr': 0.004394327634441767, 'batch_size': 32, 'epochs': 18, 'patience': 8}. Best is trial 1 with value: 0.000550303586130736.


🏃 View run optuna-trial-5 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/8c8e38e0fb134642a2ddab236c6207b4
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 12:32:46,790] Trial 6 finished with value: 0.0005730214435562072 and parameters: {'hidden_size': 16, 'num_layers': 2, 'dropout': 0.026416992261014818, 'lr': 0.0012471602055676436, 'batch_size': 16, 'epochs': 45, 'patience': 4}. Best is trial 1 with value: 0.000550303586130736.


🏃 View run optuna-trial-6 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/12e219e69e9b4e479ea67d95cf1f527f
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 12:33:04,632] Trial 7 pruned. 


🏃 View run optuna-trial-7 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/68da9be44b144977a62c211eae8dc877
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 12:33:58,717] Trial 8 finished with value: 0.0005629137305041068 and parameters: {'hidden_size': 32, 'num_layers': 1, 'dropout': 0.32868231920620256, 'lr': 0.0002442776272584786, 'batch_size': 16, 'epochs': 46, 'patience': 10}. Best is trial 1 with value: 0.000550303586130736.


🏃 View run optuna-trial-8 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/c0d749030e9346968107dd66d417cd9b
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 12:34:34,240] Trial 9 finished with value: 0.0005474657589343913 and parameters: {'hidden_size': 48, 'num_layers': 2, 'dropout': 0.3125423900604701, 'lr': 0.0002844212723489982, 'batch_size': 64, 'epochs': 46, 'patience': 6}. Best is trial 9 with value: 0.0005474657589343913.


🏃 View run optuna-trial-9 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/7a6172ba96ec43a387b1f34dcb944cf4
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 12:35:16,683] Trial 10 finished with value: 0.000710926936748861 and parameters: {'hidden_size': 16, 'num_layers': 3, 'dropout': 0.2614810083185243, 'lr': 0.00018991236345418773, 'batch_size': 64, 'epochs': 34, 'patience': 6}. Best is trial 9 with value: 0.0005474657589343913.


🏃 View run optuna-trial-10 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/6c596918b57140a38601a9095aeb98a1
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 12:36:11,871] Trial 11 finished with value: 0.0005757764191719506 and parameters: {'hidden_size': 48, 'num_layers': 3, 'dropout': 0.21740591526050054, 'lr': 0.00012412149478993346, 'batch_size': 64, 'epochs': 35, 'patience': 8}. Best is trial 9 with value: 0.0005474657589343913.


🏃 View run optuna-trial-11 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/6707703efcfa4435a33dfdef8ac329a9
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Best trial: 9
Best value (val_loss): 0.0005474657589343913
Best params: {'hidden_size': 48, 'num_layers': 2, 'dropout': 0.3125423900604701, 'lr': 0.0002844212723489982, 'batch_size': 64, 'epochs': 46, 'patience': 6}


### 5 — Retrain the final model with the best params, evaluate on the validation set, log & register

In [13]:
# No separate held-out test set is loaded here: the final model is trained on
# the training arrays and evaluated on the same validation arrays used for HPO.
best = study.best_params

# Train once more with the best hyperparameters; this creates a new MLflow run
# named "final-best" (train and val are NOT combined for this fit).
final_model, final_metrics, run_id = train_one_model(
    X_train, y_train, X_val, y_val,
    hidden_size=best["hidden_size"],
    num_layers=best["num_layers"],
    dropout=best["dropout"],
    lr=best["lr"],
    batch_size=best["batch_size"],
    epochs=best["epochs"],
    patience=best["patience"],
    window_size=WINDOW_SIZE,
    horizon=HORIZON,
    run_name="final-best",
)

# Register the model logged by that run in the MLflow Model Registry
model_uri = f"runs:/{run_id}/model"
result = mlflow.register_model(model_uri, REGISTERED_MODEL_NAME)
print("✅ Registered model:", result.name, "version:", result.version)


Early stopping!


Successfully registered model 'cpu-pct-hpo'.
2025/09/05 12:36:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: cpu-pct-hpo, version 1


🏃 View run final-best at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/dd7985f8fbc04803b71b23852e6c045d
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
✅ Registered model: cpu-pct-hpo version: 1


Created version '1' of model 'cpu-pct-hpo'.


# 2 CASE


In [45]:
# ----- Core -----
import os, io, json, math, random
from io import BytesIO
import numpy as np
import pandas as pd

# ----- MLflow -----
import mlflow
from mlflow.tracking import MlflowClient

# ----- Torch -----
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# ----- Metrics -----
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ----- HPO -----
import optuna
from optuna.pruners import PercentilePruner

# ----- Plotting -----
import matplotlib.pyplot as plt

# ----- Repro -----
def set_seed(seed: int = 42):
    """Seed Python's ``random``, NumPy and PyTorch with the same ``seed``."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
set_seed(42)

# ===== MLflow tracking =====
# Tracking server, experiment and registry name used by every cell below.
TRACKING_URI = "http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080"
EXPERIMENT_NAME = "k8s-cpu-forecasting"
REGISTERED_MODEL_NAME = "cpu-pct"     # name for the MLflow Model Registry

# Point MLflow at the tracking server and select the experiment
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
ml_client = MlflowClient()

# Echo the configuration so the cell output shows where runs will be logged
print("MLflow:", TRACKING_URI, "Experiment:", EXPERIMENT_NAME)


MLflow: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080 Experiment: k8s-cpu-forecasting


In [46]:
from minio import Minio

# If you already have `minio_client`, you can skip this cell.
# Endpoint and credentials are taken from the MINIO_ENDPOINT, MINIO_ACCESS_KEY
# and MINIO_SECRET_KEY environment variables when they are set, so real secrets
# need not be committed with the notebook. The fallbacks are the values that
# were previously hard-coded here; adjust them to your cluster.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "minio-service.kubeflow.svc.cluster.local:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minio")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minio123")

minio_client = Minio(
    endpoint=MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    secure=False  # plain HTTP
)

def download_numpy_from_minio(minio_client, bucket_name: str, object_name: str) -> np.ndarray:
    """Read a ``.npy`` object from MinIO into a NumPy array, entirely in memory.

    The response is closed and its connection released even when reading or
    parsing raises.
    """
    obj = minio_client.get_object(bucket_name, object_name)
    try:
        raw_bytes = obj.read()
        return np.load(BytesIO(raw_bytes))
    finally:
        obj.close()
        obj.release_conn()

# ---- Object paths of the prepared arrays ----
# The "validation" arrays are read from the X_test / y_test objects.
bucket_name = "k8s-resources-forecast"
object_names = dict(
    X_train="data/k8s-preprocessed/node-1-X_train/X_train.npy",
    y_train="data/k8s-preprocessed/node-1-y_train/y_train.npy",
    X_val="data/k8s-preprocessed/node-1-X_test/X_test.npy",
    y_val="data/k8s-preprocessed/node-1-y_test/y_test.npy",
)

# ---- Download all four arrays (same order as the keys below) ----
X_train, y_train, X_val, y_val = [
    download_numpy_from_minio(minio_client, bucket_name, object_names[key])
    for key in ("X_train", "y_train", "X_val", "y_val")
]

# Shape check of what was downloaded
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val:  ", X_val.shape,   "y_val:  ", y_val.shape)

def make_loader(X, y, batch_size=32, shuffle=True):
    """Build a ``DataLoader`` over float32 tensor copies of ``X`` and ``y``."""
    features, targets = (torch.tensor(arr, dtype=torch.float32) for arr in (X, y))
    dataset = TensorDataset(features, targets)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


X_train: (8238, 5, 1) y_train: (8238, 1, 1)
X_val:   (2060, 5, 1) y_val:   (2060, 1, 1)


In [47]:
class LSTMForecaster(nn.Module):
    """LSTM encoder plus a linear head producing ``horizon`` values per sample."""

    def __init__(self, input_size=1, hidden_size=64, num_layers=2, dropout=0.1, horizon=1):
        super().__init__()
        # Only hand dropout to the LSTM when it has more than one layer
        effective_dropout = dropout if num_layers > 1 else 0.0
        lstm_kwargs = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=effective_dropout,
        )
        self.lstm = nn.LSTM(**lstm_kwargs)
        self.fc = nn.Linear(hidden_size, horizon)

    def forward(self, x):
        """Return predictions of shape [B, horizon, 1] for ``x`` of shape [B, T, F]."""
        states, _ = self.lstm(x)          # [B, T, H]
        final_state = states[:, -1, :]    # [B, H], last time step only
        prediction = self.fc(final_state)  # [B, horizon]
        return prediction.unsqueeze(-1)   # add trailing axis to match y

def compute_metrics(loader, model):
    """Evaluate ``model`` on every batch of ``loader``.

    Returns ``((mae, rmse, r2), preds, targs)`` where ``preds`` and ``targs``
    are the flattened predictions and targets. The model is left in eval mode.
    """
    model.eval()
    pred_chunks, target_chunks = [], []
    with torch.no_grad():
        for features, targets in loader:
            pred_chunks.append(model(features).numpy())
            target_chunks.append(targets.numpy())

    # Stitch the batches back together and flatten to 1-D
    preds = np.concatenate(pred_chunks, axis=0).reshape(-1)
    targs = np.concatenate(target_chunks, axis=0).reshape(-1)

    mae = mean_absolute_error(targs, preds)
    rmse = math.sqrt(mean_squared_error(targs, preds))
    r2 = r2_score(targs, preds)
    scores = (mae, rmse, r2)
    return scores, preds, targs

def plot_and_log_learning_curves(train_curve, val_curve, filename="learning_curve.png"):
    """Plot both loss curves, save the figure to ``filename`` and log it as an MLflow artifact."""
    plt.figure(figsize=(7,3))
    for series, series_label in ((train_curve, "train"), (val_curve, "val")):
        plt.plot(series, label=series_label)
    plt.legend()
    plt.title("Learning Curve")
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()
    mlflow.log_artifact(filename)

def plot_and_log_residuals(preds, targs, filename="residuals.png"):
    """Plot ``preds - targs`` with a zero line, save to ``filename`` and log it as an MLflow artifact."""
    residuals = preds - targs
    plt.figure(figsize=(7,3))
    plt.plot(residuals, label="residual")
    plt.axhline(0, color="black", linewidth=0.8)  # zero-error reference
    plt.legend()
    plt.title("Residuals (val)")
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()
    mlflow.log_artifact(filename)

def plot_and_log_true_vs_pred(targs, preds, filename="val_true_vs_pred.png"):
    """Overlay targets and predictions, save to ``filename`` and log it as an MLflow artifact."""
    plt.figure(figsize=(7,3))
    for series, series_label in ((targs, "true"), (preds, "pred")):
        plt.plot(series, label=series_label)
    plt.legend()
    plt.title("True vs Pred (val)")
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()
    mlflow.log_artifact(filename)


In [48]:
def train_one_model_nested(
    X_train, y_train, X_val, y_val,
    hidden_size=64, num_layers=2, dropout=0.1,
    lr=1e-3, batch_size=32,
    epochs=35, patience=5,
    window_size=None, horizon=None,
    run_name="trial",
    nested=True,
):
    """
    Train a single LSTMForecaster and log everything to one MLflow run.

    The run is opened with ``nested=nested``. Hyperparameters, per-epoch
    train/val loss, final validation MAE/RMSE/R2, three plots and the model
    are logged. ``window_size`` and ``horizon`` default to axis 1 of
    ``X_train`` and ``y_train`` respectively.

    Returns (best_val_loss, run_id, metrics_dict), where metrics_dict has the
    keys "val_loss", "val_mae", "val_rmse" and "val_r2".
    """
    cpu = torch.device("cpu")
    train_batches = make_loader(X_train, y_train, batch_size=batch_size, shuffle=True)
    val_batches = make_loader(X_val, y_val, batch_size=batch_size, shuffle=False)

    # Derive anything the caller left unspecified from the array shapes
    n_features = X_train.shape[-1]
    window_size = X_train.shape[1] if window_size is None else window_size
    horizon = y_train.shape[1] if horizon is None else horizon

    net = LSTMForecaster(
        input_size=n_features,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
        horizon=horizon,
    ).to(cpu)

    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=lr)

    best_val = float("inf")
    best_weights = None
    epochs_without_gain_left = patience
    train_curve, val_curve = [], []

    with mlflow.start_run(run_name=run_name, nested=nested) as active_run:
        # Hyperparameters are logged once, up front
        mlflow.log_params(dict(
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            lr=lr,
            batch_size=batch_size,
            epochs=epochs,
            patience=patience,
            window_size=window_size,
            horizon=horizon,
        ))

        for epoch in range(1, epochs + 1):
            # ---- training pass ----
            net.train()
            train_total = 0.0
            for features, targets in train_batches:
                optimizer.zero_grad()
                batch_loss = loss_fn(net(features), targets)
                batch_loss.backward()
                optimizer.step()
                train_total += batch_loss.item()
            # Mean over batches; max() avoids dividing by zero on an empty loader
            train_loss = train_total / max(1, len(train_batches))

            # ---- validation pass ----
            net.eval()
            val_total = 0.0
            with torch.no_grad():
                for features, targets in val_batches:
                    val_total += loss_fn(net(features), targets).item()
            val_loss = val_total / max(1, len(val_batches))

            train_curve.append(train_loss)
            val_curve.append(val_loss)

            # ---- per-epoch logging ----
            mlflow.log_metric("train_loss", train_loss, step=epoch)
            mlflow.log_metric("val_loss", val_loss, step=epoch)

            # ---- early stopping ----
            improved = val_loss < best_val - 1e-8
            if improved:
                best_val = val_loss
                epochs_without_gain_left = patience
                # Snapshot the weights of the best epoch so far
                best_weights = {
                    name: tensor.cpu().clone()
                    for name, tensor in net.state_dict().items()
                }
                continue

            epochs_without_gain_left -= 1
            if epochs_without_gain_left <= 0:
                print("Early stopping!")
                break

        # Go back to the best snapshot (if there is one) before the final evaluation
        if best_weights is not None:
            net.load_state_dict(best_weights)
        (mae, rmse, r2), preds, targs = compute_metrics(val_batches, net)

        # Final validation metrics
        mlflow.log_metric("val_mae", mae)
        mlflow.log_metric("val_rmse", rmse)
        mlflow.log_metric("val_r2", r2)

        # Diagnostic plots
        plot_and_log_learning_curves(train_curve, val_curve)
        plot_and_log_residuals(preds, targs)
        plot_and_log_true_vs_pred(targs, preds)

        # Persist the model in the run
        mlflow.pytorch.log_model(net, "model")

        run_id = active_run.info.run_id

    metrics = dict(val_loss=best_val, val_mae=mae, val_rmse=rmse, val_r2=r2)
    return best_val, run_id, metrics


In [49]:
# Window length and horizon come from axis 1 of the arrays
# (expected to be 5 and 1 for the shapes printed above).
WINDOW_SIZE, HORIZON = X_train.shape[1], y_train.shape[1]

def objective(trial: optuna.Trial):
    """Optuna objective: sample hyperparameters, train one nested MLflow run, return its val loss.

    The MLflow run id is stored on the trial as ``mlflow_run_id`` before the
    loss is reported, so it is kept even if the trial is then pruned.
    """
    # Search space (sampled in this order)
    sampled = {
        "hidden_size": trial.suggest_int("hidden_size", 16, 128, step=16),
        "num_layers": trial.suggest_int("num_layers", 1, 3),
        "dropout": trial.suggest_float("dropout", 0.0, 0.4),
        "lr": trial.suggest_float("lr", 1e-4, 5e-3, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [16, 32, 64]),
        "epochs": trial.suggest_int("epochs", 15, 50),
        "patience": trial.suggest_int("patience", 3, 10),
    }

    # One trial == one NESTED MLflow run
    val_loss, run_id, _ = train_one_model_nested(
        X_train, y_train, X_val, y_val,
        window_size=WINDOW_SIZE,
        horizon=HORIZON,
        run_name=f"trial-{trial.number}",
        nested=True,
        **sampled,
    )

    # Keep the link from the Optuna trial to its MLflow run
    trial.set_user_attr("mlflow_run_id", run_id)

    # Report the loss and let the pruner decide
    trial.report(val_loss, step=sampled["epochs"])
    if not trial.should_prune():
        return val_loss
    raise optuna.exceptions.TrialPruned()


In [50]:
def run_hpo_and_final_training(
    n_trials=8,
    pruner=None,            # e.g., PercentilePruner(percentile=50, n_startup_trials=3)
    study_name="cpu_pct_lstm",
    parent_run_name="session",
    register_model=True,
):
    """
    Run an Optuna search plus a final training pass inside one parent MLflow run.

    Steps, all executed while the parent run is active:
    - Runs ``n_trials`` Optuna trials through ``objective``.
    - Logs the best parameters/value and a JSON summary of all trials to the parent.
    - Retrains with the best parameters via ``train_one_model_nested``
      (run name ``final-best``).
    - Optionally registers the final model under ``REGISTERED_MODEL_NAME``.

    Args:
        n_trials: Number of Optuna trials to run.
        pruner: Optuna pruner handed to the study. When ``None``, a
            ``PercentilePruner(percentile=50, n_startup_trials=3)`` is used.
        study_name: Name given to the Optuna study.
        parent_run_name: Name (and ``session`` tag) of the parent MLflow run.
        register_model: Whether to register the final model in the MLflow
            model registry.

    Returns:
        dict with the parent run id, best trial number/params/loss, per-trial
        info (including each trial's MLflow run id), the final run id and
        metrics, the registered model name/version (``None`` when not
        registered) and a URL to the parent run in the MLflow UI.
    """
    import tempfile  # local import: only needed to stage the summary artifact

    # Default pruner if not provided (the keyword argument is `percentile`)
    if pruner is None:
        pruner = PercentilePruner(percentile=50.0, n_startup_trials=3)

    summary = {}

    with mlflow.start_run(run_name=parent_run_name, nested=False) as parent_run:
        parent_run_id = parent_run.info.run_id
        # The run already knows which experiment it belongs to, so no extra
        # lookup by experiment name is needed to build the UI link.
        experiment_id = parent_run.info.experiment_id
        mlflow.set_tag("session", parent_run_name)

        # --- HPO (trials) ---
        study = optuna.create_study(direction="minimize", pruner=pruner, study_name=study_name)
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

        best_params = study.best_params
        best_value  = study.best_value
        best_trial  = study.best_trial.number

        # collect trial metadata (trial -> run_id)
        trial_infos = [
            {
                "trial_number": t.number,
                "value": t.value,
                "params": t.params,
                "state": str(t.state),
                "mlflow_run_id": t.user_attrs.get("mlflow_run_id", None),
            }
            for t in study.trials
        ]

        # log to parent
        mlflow.log_param("n_trials", n_trials)
        mlflow.log_param("study_name", study_name)
        mlflow.log_metric("best_val_loss", best_value)
        mlflow.set_tag("best_trial_number", best_trial)
        for k, v in best_params.items():
            mlflow.log_param(f"best_{k}", v)

        # Save trial summary JSON as artifact. It is staged in a temporary
        # directory so no stray file is left in the working directory; the
        # artifact keeps the same file name.
        with tempfile.TemporaryDirectory() as tmp_dir:
            summary_path = os.path.join(tmp_dir, "optuna_trials_summary.json")
            with open(summary_path, "w", encoding="utf-8") as f:
                json.dump(trial_infos, f, indent=2)
            mlflow.log_artifact(summary_path)

        # --- Final training (nested) ---
        final_val_loss, final_run_id, final_metrics = train_one_model_nested(
            X_train, y_train, X_val, y_val,
            hidden_size=best_params["hidden_size"],
            num_layers=best_params["num_layers"],
            dropout=best_params["dropout"],
            lr=best_params["lr"],
            batch_size=best_params["batch_size"],
            epochs=best_params["epochs"],
            patience=best_params["patience"],
            window_size=WINDOW_SIZE,
            horizon=HORIZON,
            run_name="final-best",
            nested=True
        )

        # Optionally register best model from the final nested run
        registered_version = None
        if register_model:
            model_uri = f"runs:/{final_run_id}/model"
            registered = mlflow.register_model(model_uri, REGISTERED_MODEL_NAME)
            registered_version = registered.version
            mlflow.set_tag("registered_model_name", REGISTERED_MODEL_NAME)
            mlflow.set_tag("registered_model_version", registered_version)

        # build return summary
        summary = {
            "parent_run_id": parent_run_id,
            "best_trial_number": best_trial,
            "best_params": best_params,
            "best_val_loss": best_value,
            "trial_infos": trial_infos,
            "final_run_id": final_run_id,
            "final_metrics": final_metrics,
            "registered_model_name": REGISTERED_MODEL_NAME if register_model else None,
            "registered_model_version": registered_version,
            "mlflow_ui_session": f"{TRACKING_URI}/#/experiments/{experiment_id}/runs/{parent_run_id}"
        }

    return summary


In [62]:
# Launch the whole session: 10 Optuna trials under one parent MLflow run,
# followed by the final training with the best parameters.
session_pruner = PercentilePruner(percentile=50, n_startup_trials=3)

summary = run_hpo_and_final_training(
    n_trials=10,
    pruner=session_pruner,
    study_name="cpu_pct_lstm",
    parent_run_name="cpu-pct-session",
    register_model=True,  # set False if you don't want to register now
)

# Dump the returned summary, then point at the parent run in the MLflow UI.
print(json.dumps(summary, indent=2))
print("\n👀 Open MLflow session run:", summary["mlflow_ui_session"])


[I 2025-09-05 13:40:06,445] A new study created in memory with name: cpu_pct_lstm


Early stopping!


[I 2025-09-05 13:40:54,193] Trial 0 finished with value: 0.0005527776212507295 and parameters: {'hidden_size': 64, 'num_layers': 3, 'dropout': 0.21594938963242113, 'lr': 0.001035867811094136, 'batch_size': 16, 'epochs': 27, 'patience': 4}. Best is trial 0 with value: 0.0005527776212507295.


🏃 View run trial-0 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/f1eece9b992d46228d4fc12b0726b558
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 13:41:37,635] Trial 1 finished with value: 0.0005516582446244473 and parameters: {'hidden_size': 80, 'num_layers': 2, 'dropout': 0.13146413348549732, 'lr': 0.0002057123347351767, 'batch_size': 32, 'epochs': 16, 'patience': 5}. Best is trial 1 with value: 0.0005516582446244473.


🏃 View run trial-1 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/c91fe35b9c07423397a3932958d482e0
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 13:44:32,253] Trial 2 finished with value: 0.0005883132114324199 and parameters: {'hidden_size': 80, 'num_layers': 3, 'dropout': 0.026820471100354393, 'lr': 0.0034939485416015234, 'batch_size': 16, 'epochs': 26, 'patience': 8}. Best is trial 1 with value: 0.0005516582446244473.


🏃 View run trial-2 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/7aff3d53f7e14ad48e059d4f89d59965
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 13:46:55,344] Trial 3 finished with value: 0.0005895734918953548 and parameters: {'hidden_size': 48, 'num_layers': 3, 'dropout': 0.12974596361687527, 'lr': 0.0018237788732962288, 'batch_size': 16, 'epochs': 24, 'patience': 10}. Best is trial 1 with value: 0.0005516582446244473.


🏃 View run trial-3 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/2c8e8991476245a38f0f39e401b374e1
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 13:47:28,299] Trial 4 finished with value: 0.0007164971448360155 and parameters: {'hidden_size': 48, 'num_layers': 3, 'dropout': 0.32268152115910104, 'lr': 0.003671774149292361, 'batch_size': 16, 'epochs': 47, 'patience': 3}. Best is trial 1 with value: 0.0005516582446244473.


🏃 View run trial-4 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/7262ab5bd42347c09c86d0be01320a3f
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 13:47:51,455] Trial 5 finished with value: 0.0006404745867117667 and parameters: {'hidden_size': 64, 'num_layers': 2, 'dropout': 0.09033688524389612, 'lr': 0.003103161141244499, 'batch_size': 32, 'epochs': 19, 'patience': 7}. Best is trial 1 with value: 0.0005516582446244473.


🏃 View run trial-5 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/193f41038c5548389bb3291949bb5fa3
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 13:49:38,013] Trial 6 finished with value: 0.0005471934855449945 and parameters: {'hidden_size': 80, 'num_layers': 3, 'dropout': 0.13959564423989423, 'lr': 0.00017799899768228584, 'batch_size': 32, 'epochs': 35, 'patience': 10}. Best is trial 6 with value: 0.0005471934855449945.


🏃 View run trial-6 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/879b706c9ac346aea35e33058f35abe5
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 13:49:52,492] Trial 7 finished with value: 0.0006421489524655044 and parameters: {'hidden_size': 80, 'num_layers': 1, 'dropout': 0.2159554306213737, 'lr': 0.00080749013512319, 'batch_size': 16, 'epochs': 18, 'patience': 3}. Best is trial 6 with value: 0.0005471934855449945.


🏃 View run trial-7 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/8d6305fa26c04494b5d650f230c5840c
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 13:50:28,870] Trial 8 finished with value: 0.0005548046221604657 and parameters: {'hidden_size': 80, 'num_layers': 1, 'dropout': 0.30481101953835205, 'lr': 0.0006210091911101955, 'batch_size': 16, 'epochs': 33, 'patience': 9}. Best is trial 6 with value: 0.0005471934855449945.


🏃 View run trial-8 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/e55882d7a8ab4e28aeb964e2723b7f3c
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 13:50:53,853] Trial 9 finished with value: 0.0005775818791544924 and parameters: {'hidden_size': 80, 'num_layers': 2, 'dropout': 0.3263315805624927, 'lr': 0.0020367448517796026, 'batch_size': 32, 'epochs': 32, 'patience': 7}. Best is trial 6 with value: 0.0005471934855449945.


🏃 View run trial-9 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/f88b97ad22024922b3dde2b08166c49b
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


Successfully registered model 'cpu-pct'.
2025/09/05 13:52:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: cpu-pct, version 1


🏃 View run final-best at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/f047961788c94ce29f5a17499d224ba7
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
🏃 View run cpu-pct-session at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/bc43b1ae967d4139b5ad7db110776452
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
{
  "parent_run_id": "bc43b1ae967d4139b5ad7db110776452",
  "best_trial_number": 6,
  "best_params": {
    "hidden_size": 80,
    "num_layers": 3,
    "dropout": 0.13959564423989423,
    "lr": 0.00017799899768228584,
    "batch_size": 32,
    "epochs": 35,
    "patience": 10
  },
  "best_val_loss": 0.0005471934855449945,
  "trial_infos": [
    {
      "trial_number": 0,
      "value": 0.0005527776212507295,
      "params": {
        "hidden_size": 64,
        "num_layers": 3,
        "dropout": 0.2159493896

Created version '1' of model 'cpu-pct'.
