### 1. Imports & global config

In [1]:
# ----- Core -----
import os, io, json, math, random
from io import BytesIO
import numpy as np
import pandas as pd

# ----- MLflow -----
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.models import infer_signature 

# ----- Torch -----
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# ----- Metrics -----
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ----- HPO -----
import optuna
from optuna.pruners import PercentilePruner

# ----- Plotting -----
import matplotlib.pyplot as plt

# ----- Repro -----
def set_seed(seed: int = 42):
    """Seed the stdlib `random`, NumPy and PyTorch generators with one value.

    Args:
        seed: Integer handed unchanged to each library's seeding function.
    """
    # Seeding order is kept as: stdlib random, NumPy, then Torch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


set_seed(42)

# ===== MLflow tracking =====
# URL of the MLflow tracking server that every run in this notebook logs to.
TRACKING_URI = "http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080"
# Experiment under which the parent and nested runs are recorded.
EXPERIMENT_NAME = "k8s-cpu-forecasting"
REGISTERED_MODEL_NAME = "cpu-pct"     # model registry name (optional but nice)

# Point the fluent API at the server and select the experiment before any run starts.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
# Client object for direct tracking-API calls; created after the tracking URI is set.
ml_client = MlflowClient()

# Echo the configuration so the cell output records which server/experiment was used.
print("MLflow:", TRACKING_URI, "Experiment:", EXPERIMENT_NAME)


MLflow: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080 Experiment: k8s-cpu-forecasting


### 2. MinIO client & data loaders (update credentials if needed)

In [2]:
from minio import Minio

# If you already have `minio_client`, you can skip this cell.
# Connection settings are read from environment variables so that real
# credentials never have to be typed into (and saved with) the notebook.
# The fallbacks are the values that were previously hard-coded here, so the
# cell behaves exactly as before when the variables are not set.
MINIO_ENDPOINT = os.environ.get(
    "MINIO_ENDPOINT", "minio-service.kubeflow.svc.cluster.local:9000"
)
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minio")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minio123")

# Client used by the download helper below to fetch the prepared arrays.
minio_client = Minio(
    endpoint=MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    secure=False,  # TLS is disabled for this endpoint
)

def download_numpy_from_minio(minio_client, bucket_name: str, object_name: str) -> np.ndarray:
    """Fetch a ``.npy`` object and decode it into an array entirely in memory.

    Args:
        minio_client: Client exposing ``get_object(bucket, name)``; the returned
            response must provide ``read()``, ``close()`` and ``release_conn()``.
        bucket_name: Bucket that holds the object.
        object_name: Key of the ``.npy`` object inside the bucket.

    Returns:
        The array decoded by ``np.load`` from the downloaded bytes.
    """
    response = minio_client.get_object(bucket_name, object_name)
    try:
        # Read the whole payload first, then parse it from an in-memory buffer.
        buffer = BytesIO(response.read())
        return np.load(buffer)
    finally:
        # Always hand the connection back, even when reading or parsing fails.
        response.close()
        response.release_conn()

# ---- Bucket and object keys of the prepared arrays ----
# Note: the "val" entries point at the *_test objects.
bucket_name = "k8s-resources-forecast"
object_names = {
    "X_train": "data/k8s-preprocessed/node-1-X_train/X_train.npy",
    "y_train": "data/k8s-preprocessed/node-1-y_train/y_train.npy",
    "X_val":   "data/k8s-preprocessed/node-1-X_test/X_test.npy",
    "y_val":   "data/k8s-preprocessed/node-1-y_test/y_test.npy",
}

# ---- Pull arrays ----
# One download per key, issued in the order train features, train targets,
# validation features, validation targets.
X_train, y_train, X_val, y_val = (
    download_numpy_from_minio(minio_client, bucket_name, object_names[split_key])
    for split_key in ("X_train", "y_train", "X_val", "y_val")
)

# Show the shapes so the cell output documents what was loaded.
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val:  ", X_val.shape,   "y_val:  ", y_val.shape)

def make_loader(X, y, batch_size=32, shuffle=True):
    """Wrap feature/target arrays in a DataLoader yielding float32 tensor pairs.

    Args:
        X: Array-like of inputs; converted to a float32 tensor.
        y: Array-like of targets; converted to a float32 tensor.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples on each pass.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of ``(X, y)``.
    """
    features = torch.tensor(X, dtype=torch.float32)
    targets = torch.tensor(y, dtype=torch.float32)
    dataset = TensorDataset(features, targets)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


X_train: (8238, 5, 1) y_train: (8238, 1, 1)
X_val:   (2060, 5, 1) y_val:   (2060, 1, 1)


### 3. Model & training utils (plots, metrics, early stopping)

In [3]:
class LSTMForecaster(nn.Module):
    """LSTM encoder followed by a linear head that emits `horizon` values per sequence."""

    def __init__(self, input_size=1, hidden_size=64, num_layers=2, dropout=0.1, horizon=1):
        """Build the recurrent stack and the output projection.

        Args:
            input_size: Number of features per time step.
            hidden_size: Width of each LSTM layer.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout given to the LSTM; replaced by 0.0 for a single layer.
            horizon: Number of outputs produced by the linear head.
        """
        super().__init__()
        # Dropout is only forwarded to the LSTM when it has more than one layer.
        if num_layers > 1:
            lstm_dropout = dropout
        else:
            lstm_dropout = 0.0
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )
        self.fc = nn.Linear(hidden_size, horizon)

    def forward(self, x):
        """Map a batch ``x`` of shape [B, T, F] to predictions of shape [B, horizon, 1]."""
        sequence_out, _ = self.lstm(x)        # [B, T, H]
        last_step = sequence_out[:, -1, :]    # [B, H], hidden output at the final step
        forecast = self.fc(last_step)         # [B, horizon]
        # A trailing singleton axis is appended so the output lines up with y.
        return forecast.unsqueeze(-1)

def compute_metrics(loader, model):
    """Evaluate `model` over `loader` and score its predictions.

    The model is put in eval mode and run without gradient tracking. Predictions
    and targets from every batch are concatenated and flattened to 1-D.

    Returns:
        ``((mae, rmse, r2), preds, targs)`` where ``preds`` and ``targs`` are the
        flattened arrays, suitable for plotting.
    """
    model.eval()
    pred_batches, target_batches = [], []
    with torch.no_grad():
        for inputs, targets in loader:
            pred_batches.append(model(inputs).numpy())
            target_batches.append(targets.numpy())

    flat_preds = np.concatenate(pred_batches, axis=0).reshape(-1)
    flat_targs = np.concatenate(target_batches, axis=0).reshape(-1)

    mae = mean_absolute_error(flat_targs, flat_preds)
    # RMSE is the square root of the MSE.
    rmse = math.sqrt(mean_squared_error(flat_targs, flat_preds))
    r2 = r2_score(flat_targs, flat_preds)
    return (mae, rmse, r2), flat_preds, flat_targs
# Local scratch folder where figures are written before being uploaded to MLflow.
# NOTE(review): the original note suggests using a folder under /tmp when this
# runs inside KFP — confirm before reusing the code in a pipeline component.
default_local_path = "/home/jovyan/kpf_sunrise/notebooks-lstm/tmp/"


def _save_and_log_current_figure(filename):
    """Save the current pyplot figure under `default_local_path`, close it, log it to MLflow."""
    # Create the scratch folder on demand so savefig does not fail on a missing directory.
    os.makedirs(default_local_path, exist_ok=True)
    local_file = os.path.join(default_local_path, filename)
    try:
        plt.savefig(local_file)
    finally:
        # Close the figure even if saving fails, so figures do not pile up in memory.
        plt.close()
    mlflow.log_artifact(local_file)


def plot_and_log_learning_curves(train_curve, val_curve, filename="learning_curve.png"):
    """Plot the train and val curves on one axis and log the image to the active MLflow run.

    Args:
        train_curve: Sequence of values drawn with the label "train".
        val_curve: Sequence of values drawn with the label "val".
        filename: File name of the image inside `default_local_path`.
    """
    plt.figure(figsize=(7, 3))
    plt.plot(train_curve, label="train")
    plt.plot(val_curve, label="val")
    plt.legend(); plt.title("Learning Curve"); plt.tight_layout()
    _save_and_log_current_figure(filename)


def plot_and_log_residuals(preds, targs, filename="residuals.png"):
    """Plot `preds - targs` with a zero reference line and log the image to MLflow.

    Args:
        preds: Predictions; must support elementwise subtraction with `targs`.
        targs: Targets matching `preds`.
        filename: File name of the image inside `default_local_path`.
    """
    plt.figure(figsize=(7, 3))
    plt.plot(preds - targs, label="residual")
    plt.axhline(0, color="black", linewidth=0.8)
    plt.legend(); plt.title("Residuals (val)"); plt.tight_layout()
    _save_and_log_current_figure(filename)


def plot_and_log_true_vs_pred(targs, preds, filename="val_true_vs_pred.png"):
    """Overlay targets and predictions on one axis and log the image to MLflow.

    Args:
        targs: Sequence drawn with the label "true".
        preds: Sequence drawn with the label "pred".
        filename: File name of the image inside `default_local_path`.
    """
    plt.figure(figsize=(7, 3))
    plt.plot(targs, label="true")
    plt.plot(preds, label="pred")
    plt.legend(); plt.title("True vs Pred (val)"); plt.tight_layout()
    _save_and_log_current_figure(filename)


### 4 — Single-run trainer (used by both trials & final), nested run aware

In [5]:
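# ---------------------------------------------------------------------------
# Version 2 (modified): adds an input/output schema to the logged model
# (MLflow signature + input example).
# ---------------------------------------------------------------------------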

def train_one_model_nested(
    X_train, y_train, X_val, y_val,
    hidden_size=64, num_layers=2, dropout=0.1,
    lr=1e-3, batch_size=32,
    epochs=35, patience=5,
    window_size=None, horizon=None,
    run_name="trial",
    nested=True,
):
    """
    Train one LSTMForecaster with early stopping and log everything to MLflow.

    A new MLflow run is opened (nested under the active run when nested=True).
    Hyperparameters, per-epoch train/val loss, final validation metrics, three
    diagnostic plots and the PyTorch model (with signature and input example)
    are logged to that run.

    Args:
        X_train, y_train: Training inputs/targets; read via `.shape[-1]` and
            `.shape[1]`, and converted to float32 tensors by `make_loader`.
        X_val, y_val: Validation inputs/targets, used for early stopping, the
            final metrics and the model's input example.
        hidden_size, num_layers, dropout: Forwarded to `LSTMForecaster`.
        lr: Learning rate of the Adam optimizer.
        batch_size: Batch size for both the train and the validation loader.
        epochs: Maximum number of training epochs.
        patience: Number of consecutive non-improving epochs tolerated before
            training stops early.
        window_size: Logged as a parameter only; defaults to `X_train.shape[1]`.
        horizon: Number of model outputs; defaults to `y_train.shape[1]`.
        run_name: Name of the MLflow run.
        nested: Passed to `mlflow.start_run`.

    Returns:
        (best_val_loss, run_id, metrics_dict), where metrics_dict has the keys
        "val_loss", "val_mae", "val_rmse" and "val_r2". best_val_loss stays
        float("inf") if no epoch is run.
    """
    # Training is pinned to CPU; batches are not moved to `device` explicitly.
    device = torch.device("cpu")
    train_loader = make_loader(X_train, y_train, batch_size=batch_size, shuffle=True)
    val_loader   = make_loader(X_val,   y_val,   batch_size=batch_size, shuffle=False)

    # Derive the model geometry from the arrays unless the caller supplied it.
    input_size = X_train.shape[-1]
    if window_size is None: window_size = X_train.shape[1]
    if horizon is None:     horizon = y_train.shape[1]

    model = LSTMForecaster(
        input_size=input_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
        horizon=horizon,
    ).to(device)

    criterion = nn.MSELoss()
    opt = optim.Adam(model.parameters(), lr=lr)

    # Early-stopping state: best validation loss so far, remaining patience,
    # and a snapshot of the weights that achieved the best loss.
    best_val = float("inf")
    patience_left = patience
    best_state = None
    train_curve, val_curve = [], []

    with mlflow.start_run(run_name=run_name, nested=nested) as active_run:
        # Log params once
        mlflow.log_params({
            "hidden_size": hidden_size,
            "num_layers": num_layers,
            "dropout": dropout,
            "lr": lr,
            "batch_size": batch_size,
            "epochs": epochs,
            "patience": patience,
            "window_size": window_size,
            "horizon": horizon,
        })

        for ep in range(1, epochs + 1):
            # Train
            model.train()
            running = 0.0
            for xb, yb in train_loader:
                opt.zero_grad()
                pred = model(xb)
                loss = criterion(pred, yb)
                loss.backward()
                opt.step()
                running += loss.item()
            # Epoch loss is the mean of the per-batch mean losses.
            # NOTE(review): a smaller final batch therefore carries the same
            # weight as a full one — confirm this is acceptable when comparing
            # runs with different batch sizes.
            train_loss = running / max(1, len(train_loader))

            # Validate
            model.eval()
            v_running = 0.0
            with torch.no_grad():
                for xb, yb in val_loader:
                    pr = model(xb)
                    vloss = criterion(pr, yb)
                    v_running += vloss.item()
            val_loss = v_running / max(1, len(val_loader))

            train_curve.append(train_loss)
            val_curve.append(val_loss)

            # Log per-epoch
            mlflow.log_metric("train_loss", train_loss, step=ep)
            mlflow.log_metric("val_loss", val_loss, step=ep)

            # Early stopping: an epoch counts as an improvement only if it beats
            # the best loss by more than 1e-8.
            if val_loss < best_val - 1e-8:
                best_val = val_loss
                patience_left = patience
                # Clone the tensors so later optimizer steps cannot alter the snapshot.
                best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            else:
                patience_left -= 1
                if patience_left <= 0:
                    print("Early stopping!")
                    break

        # Restore best weights & compute final metrics on val
        if best_state is not None:
            model.load_state_dict(best_state)
        (mae, rmse, r2), preds, targs = compute_metrics(val_loader, model)

        # Log final metrics & plots
        mlflow.log_metric("val_mae", mae)
        mlflow.log_metric("val_rmse", rmse)
        mlflow.log_metric("val_r2", r2)

        plot_and_log_learning_curves(train_curve, val_curve)
        plot_and_log_residuals(preds, targs)
        plot_and_log_true_vs_pred(targs, preds)

        # --------- Build input_example + signature and log the model ---------
        # Take the first validation sample, keeping the batch axis: X_val[:1].
        sample_input_t = torch.tensor(X_val[:1], dtype=torch.float32)     # torch tensor
        with torch.no_grad():
            sample_output_np = model(sample_input_t).detach().cpu().numpy()  # numpy output for the sample

        # The example is handed to MLflow as a numpy array rather than a torch.Tensor.
        input_example_np = sample_input_t.cpu().numpy()

        # Infer signature from numpy arrays
        signature = infer_signature(
            input_example_np,   # inputs:  [batch, window, feature]
            sample_output_np    # outputs: [batch, horizon, 1]
        )

        # Log PyTorch model WITH signature + input_example
        mlflow.pytorch.log_model(
            model,
            artifact_path="model",
            input_example=input_example_np,   # <-- numpy, not torch
            signature=signature
        )

        # Capture the id while the run is still open.
        run_id = active_run.info.run_id

    # "val_loss" is the best validation loss seen during training.
    metrics = {"val_loss": best_val, "val_mae": mae, "val_rmse": rmse, "val_r2": r2}
    return best_val, run_id, metrics


### 5 — Optuna objective (each trial becomes a nested run)

In [6]:
# Sequence geometry read from the loaded arrays (axis 1 of X and of y).
WINDOW_SIZE = X_train.shape[1]  # should be 5
HORIZON     = y_train.shape[1]  # should be 1

def objective(trial: optuna.Trial):
    """Optuna objective: sample hyperparameters, train one nested MLflow run, return its val loss.

    The MLflow run id is stored on the trial as the user attribute
    "mlflow_run_id". The best validation loss is reported once, at step
    `epochs`, after training has finished.

    NOTE(review): because the only report happens after training completes,
    pruning here cannot shorten a trial; it can only mark a finished one as pruned.

    Raises:
        optuna.exceptions.TrialPruned: If the pruner rejects the reported value.
    """
    # Search space (suggestions are drawn in this order).
    hparams = {
        "hidden_size": trial.suggest_int("hidden_size", 16, 128, step=16),
        "num_layers": trial.suggest_int("num_layers", 1, 3),
        "dropout": trial.suggest_float("dropout", 0.0, 0.4),
        "lr": trial.suggest_float("lr", 1e-4, 5e-3, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [16, 32, 64]),
        "epochs": trial.suggest_int("epochs", 15, 50),
        "patience": trial.suggest_int("patience", 3, 10),
    }

    # Each trial is trained as a NESTED MLflow run named after the trial number.
    val_loss, run_id, _ = train_one_model_nested(
        X_train, y_train, X_val, y_val,
        window_size=WINDOW_SIZE, horizon=HORIZON,
        run_name=f"trial-{trial.number}",
        nested=True,
        **hparams,
    )

    # Keep the MLflow run id on the trial for traceability.
    trial.set_user_attr("mlflow_run_id", run_id)

    # Report the final value and let the pruner decide.
    trial.report(val_loss, step=hparams["epochs"])
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()

    return val_loss



### 6 — Orchestrator: Parent run → nested trials → nested final

In [9]:
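# ---------------------------------------------------------------------------
# Version 2 (modified): the final model is logged with an input/output schema
# (MLflow signature + input example).
# ---------------------------------------------------------------------------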
def run_hpo_and_final_training(
    n_trials=8,
    pruner=None,            # e.g., PercentilePruner(percentile=50, n_startup_trials=3)
    study_name="cpu_pct_lstm",
    parent_run_name="session",
    register_model=True,
):
    """
    Run a full HPO session under one PARENT MLflow run.

    - Runs Optuna trials inside the parent; each trial is a nested run.
    - Logs the best parameters/value and a JSON summary of all trials to the parent.
    - Trains a final model with the best parameters as another nested run.

    Args:
        n_trials: Number of Optuna trials to run.
        pruner: Optuna pruner; when None, a PercentilePruner with percentile 50
            and 3 startup trials is used.
        study_name: Name given to the in-memory Optuna study.
        parent_run_name: Name of the parent MLflow run (also set as the "session" tag).
        register_model: Only controls whether the returned summary reports
            REGISTERED_MODEL_NAME or None.
            NOTE(review): no registry call is made in this function — confirm
            whether model registration is expected to happen here.

    Returns:
        dict with the parent run id, best trial number/params/value, per-trial
        info (including each trial's MLflow run id), the final run id and
        metrics, the registered model name (or None) and a URL to the parent
        run in the MLflow UI.

    Raises:
        ValueError: Raised by Optuna when reading the best trial if no trial
            completed successfully.
    """
    # Default pruner if not provided. The keyword is `percentile`; the former
    # `percent=` spelling raised TypeError whenever the default was used.
    if pruner is None:
        pruner = PercentilePruner(percentile=50.0, n_startup_trials=3)

    with mlflow.start_run(run_name=parent_run_name, nested=False) as parent_run:
        parent_run_id = parent_run.info.run_id
        mlflow.set_tag("session", parent_run_name)

        # --- HPO (trials) ---
        study = optuna.create_study(direction="minimize", pruner=pruner, study_name=study_name)
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

        best_params = study.best_params
        best_value  = study.best_value
        best_trial  = study.best_trial.number

        # Collect trial metadata (trial -> MLflow run id) for the summary artifact.
        trial_infos = [
            {
                "trial_number": t.number,
                "value": t.value,
                "params": t.params,
                "state": str(t.state),
                "mlflow_run_id": t.user_attrs.get("mlflow_run_id", None),
            }
            for t in study.trials
        ]

        # Log the session-level results to the parent run.
        mlflow.log_param("n_trials", n_trials)
        mlflow.log_param("study_name", study_name)
        mlflow.log_metric("best_val_loss", best_value)
        mlflow.set_tag("best_trial_number", best_trial)
        for k, v in best_params.items():
            mlflow.log_param(f"best_{k}", v)

        # Save trial summary JSON as artifact
        with open("optuna_trials_summary.json", "w") as f:
            json.dump(trial_infos, f, indent=2)
        mlflow.log_artifact("optuna_trials_summary.json")

        # --- Final training (nested) ---
        final_val_loss, final_run_id, final_metrics = train_one_model_nested(
            X_train, y_train, X_val, y_val,
            hidden_size=best_params["hidden_size"],
            num_layers=best_params["num_layers"],
            dropout=best_params["dropout"],
            lr=best_params["lr"],
            batch_size=best_params["batch_size"],
            epochs=best_params["epochs"],
            patience=best_params["patience"],
            window_size=WINDOW_SIZE,
            horizon=HORIZON,
            run_name="final-best",
            nested=True
        )

        # The experiment id is taken from the parent run itself, which avoids an
        # extra lookup by name (one that could return None) and always matches
        # the experiment the run was actually recorded in.
        experiment_id = parent_run.info.experiment_id

        # build return summary
        summary = {
            "parent_run_id": parent_run_id,
            "best_trial_number": best_trial,
            "best_params": best_params,
            "best_val_loss": best_value,
            "trial_infos": trial_infos,
            "final_run_id": final_run_id,
            "final_metrics": final_metrics,
            "registered_model_name": REGISTERED_MODEL_NAME if register_model else None,
            "mlflow_ui_session": f"{TRACKING_URI}/#/experiments/{experiment_id}/runs/{parent_run_id}"
        }

    return summary


In [10]:
# Run one complete session: the Optuna trials plus the final training,
# all grouped under a single parent MLflow run.
summary = run_hpo_and_final_training(
    n_trials=3,
    # NOTE(review): n_startup_trials equals n_trials here, so the pruner
    # presumably never becomes active in this session — confirm against the
    # Optuna PercentilePruner documentation.
    pruner=PercentilePruner(percentile=50, n_startup_trials=3),
    study_name="cpu_pct_lstm",
    parent_run_name="cpu-pct-session",
    register_model=True,  # when False, the summary reports registered_model_name as None
)

# Print the whole summary as JSON, then the direct link to the parent run.
print(json.dumps(summary, indent=2))
print("\n👀 Open MLflow session run:", summary["mlflow_ui_session"])


[I 2025-09-05 16:47:59,015] A new study created in memory with name: cpu_pct_lstm


Early stopping!


[I 2025-09-05 16:49:38,650] Trial 0 finished with value: 0.0005486616978734053 and parameters: {'hidden_size': 112, 'num_layers': 3, 'dropout': 0.32534968343747217, 'lr': 0.0010371359966564948, 'batch_size': 16, 'epochs': 24, 'patience': 10}. Best is trial 0 with value: 0.0005486616978734053.


🏃 View run trial-0 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/074e46f14693406d9a278d71793167f6
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 16:50:01,154] Trial 1 finished with value: 0.0005531848141937658 and parameters: {'hidden_size': 112, 'num_layers': 1, 'dropout': 0.09965210336877456, 'lr': 0.00017556132890107854, 'batch_size': 64, 'epochs': 41, 'patience': 6}. Best is trial 0 with value: 0.0005486616978734053.


🏃 View run trial-1 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/4b9bd0d68e4f4c919d89674d4b68bfa4
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!


[I 2025-09-05 16:50:14,401] Trial 2 finished with value: 0.0005567306467458944 and parameters: {'hidden_size': 48, 'num_layers': 1, 'dropout': 0.38727895335211304, 'lr': 0.00440839664884538, 'batch_size': 64, 'epochs': 19, 'patience': 10}. Best is trial 0 with value: 0.0005486616978734053.


🏃 View run trial-2 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/d9883ce92320421398478f770c148993
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Early stopping!




🏃 View run final-best at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/2632cf6a405544868e2e65cdc1a113e1
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
🏃 View run cpu-pct-session at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/b0a43ddd8d8c4309946be096182c80e1
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
{
  "parent_run_id": "b0a43ddd8d8c4309946be096182c80e1",
  "best_trial_number": 0,
  "best_params": {
    "hidden_size": 112,
    "num_layers": 3,
    "dropout": 0.32534968343747217,
    "lr": 0.0010371359966564948,
    "batch_size": 16,
    "epochs": 24,
    "patience": 10
  },
  "best_val_loss": 0.0005486616978734053,
  "trial_infos": [
    {
      "trial_number": 0,
      "value": 0.0005486616978734053,
      "params": {
        "hidden_size": 112,
        "num_layers": 3,
        "dropout": 0.325349683