# In [5]:
import kfp
from kfp import dsl
from typing import NamedTuple


############################################################################################################
#####################################---1. STAGE---#########################################################
############################################################################################################
@dsl.component(
    base_image="docker.io/jhofydu/pytorch-kfp:v1.0.0",
    packages_to_install=["minio"]
)
def get_data_batch()-> NamedTuple("Outputs", [("tracking_timestamp", str)]):
    import pandas as pd
    import numpy as np
    import torch
    import joblib
    import datetime
    from typing import NamedTuple
    from minio import Minio
    from io import BytesIO
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.model_selection import train_test_split

    
    # MinIO client for the object store that holds the raw CSV and receives the
    # scaler and the train/test arrays produced by this stage.
    # secure=False means the connection is made over plain HTTP.
    # NOTE(review): the access/secret keys are hard-coded in the component source;
    # consider injecting them from a Kubernetes secret / environment variables.
    minio_client = Minio(
        "minio-service.kubeflow.svc.cluster.local:9000",
        access_key="minio",
        secret_key="minio123",
        secure=False,
    )

    def read_csv_from_minio(minio_client, bucket, object_name):
        """Load a CSV object from MinIO into a time-indexed DataFrame.

        Args:
            minio_client: client exposing ``get_object(bucket, object_name)``.
            bucket: name of the bucket that holds the CSV.
            object_name: key of the CSV object inside the bucket.

        Returns:
            DataFrame whose ``timestamp`` column has been parsed to datetimes,
            moved into the index, and sorted in ascending order.
        """
        response = minio_client.get_object(bucket, object_name)
        try:
            df = pd.read_csv(BytesIO(response.read()))
        finally:
            # Closing alone does not return the pooled HTTP connection;
            # the MinIO SDK asks for both calls once the body has been consumed.
            response.close()
            response.release_conn()
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df.set_index('timestamp').sort_index()

    bucket_name = "k8s-resources-forecast"
    object_name = "data/k8s-preprocessed/prec-pct-k8s-dataset.csv"
    df = read_csv_from_minio(minio_client, bucket_name, object_name)

    selected_feature = 'ac3-node-1-vm_cpu_pct'
    df_node_1_cpu_feature = df[[selected_feature]]

    def normalize_with_sklearn(df):
        """Scale every column of ``df`` with a freshly fitted ``MinMaxScaler``.

        Returns:
            A pair ``(scaled_frame, scaler)``: the scaled values wrapped in a
            DataFrame that keeps the columns and index of ``df``, and the
            fitted scaler so the transformation can be inverted later.

        NOTE(review): the caller passes the full series and only splits into
        train/test afterwards, so test-period values take part in the fit —
        confirm this is intended.
        """
        fitted_scaler = MinMaxScaler()
        scaled_values = fitted_scaler.fit_transform(df)
        scaled_frame = pd.DataFrame(scaled_values, columns=df.columns, index=df.index)
        return scaled_frame, fitted_scaler

    df_node_1_cpu_feature_scaled, scaler = normalize_with_sklearn(df_node_1_cpu_feature)

    def save_scaler_to_minio(minio_client, scaler, bucket, object_name):
        """Serialize ``scaler`` with joblib and upload it to MinIO.

        The object key gets a ``.pkl`` suffix appended when it does not
        already end with one. The upload happens from memory.
        """
        has_pickle_suffix = object_name.endswith('.pkl')
        if not has_pickle_suffix:
            object_name = object_name + '.pkl'

        stream = BytesIO()
        joblib.dump(scaler, stream)
        payload_size = len(stream.getvalue())
        stream.seek(0)  # rewind so put_object reads from the first byte

        minio_client.put_object(
            bucket_name=bucket,
            object_name=object_name,
            data=stream,
            length=payload_size,
            content_type='application/octet-stream'
        )
        print(f"Scaler saved to s3://{bucket}/{object_name}")

    object_name_scaler = "data/k8s-preprocessed/node-1-cpu-scaler/node-1-scaler"
    save_scaler_to_minio(minio_client, scaler, bucket_name, object_name_scaler)

    def create_sequences(data, window_size=5, horizon=1):
        """Slice a time-ordered series into supervised (window, target) pairs.

        Args:
            data: observations in time order (list, 1-D array, or a
                single-column DataFrame/array).
            window_size: number of consecutive observations in each input.
            horizon: number of observations right after the window that
                form the target.

        Returns:
            ``(X, y)`` with shapes ``(n, window_size, 1)`` and
            ``(n, horizon, 1)``, where
            ``n = len(data) - window_size - horizon + 1`` (zero samples when
            the series is too short).
        """
        # Convert once: slicing a DataFrame per window is far slower than
        # slicing the underlying ndarray and yields the same values.
        values = np.asarray(data)
        n_samples = len(values) - window_size - horizon + 1
        X = [values[i:i + window_size] for i in range(n_samples)]
        y = [values[i + window_size:i + window_size + horizon] for i in range(n_samples)]
        X = np.array(X).reshape(-1, window_size, 1)
        y = np.array(y).reshape(-1, horizon, 1)
        return X, y

    def split_time_series(X, y, test_size=0.2):
        """Split X and y into train/test parts without shuffling.

        Delegates to ``train_test_split(..., shuffle=False)`` and returns its
        result unchanged, i.e. ``X_train, X_test, y_train, y_test``.
        """
        from sklearn.model_selection import train_test_split
        ordered_parts = train_test_split(
            X,
            y,
            test_size=test_size,
            shuffle=False,
        )
        return ordered_parts

    X, y = create_sequences(df_node_1_cpu_feature_scaled)
    X_train, X_test, y_train, y_test = split_time_series(X, y)

    def upload_numpy_to_minio(client, array, bucket_name, object_name):
        """Serialize ``array`` in ``.npy`` format and upload it from memory.

        Args:
            client: object exposing ``put_object`` with the keyword arguments
                used below.
            array: numpy array to store.
            bucket_name: destination bucket.
            object_name: destination key inside the bucket.
        """
        payload = BytesIO()
        np.save(payload, array)
        size_in_bytes = payload.getbuffer().nbytes
        payload.seek(0)  # rewind so the upload starts at the first byte
        client.put_object(
            bucket_name=bucket_name,
            object_name=object_name,
            data=payload,
            length=size_in_bytes,
            content_type="application/octet-stream",
        )
        print(f"Uploaded to minio://{bucket_name}/{object_name} ({array.shape})")

    # Destination keys (inside bucket_name) for the four arrays produced by the split.
    object_names = {
        "X_train": "data/k8s-preprocessed/node-1-X_train/X_train.npy",
        "y_train": "data/k8s-preprocessed/node-1-y_train/y_train.npy",
        "X_test":  "data/k8s-preprocessed/node-1-X_test/X_test.npy",
        "y_test":  "data/k8s-preprocessed/node-1-y_test/y_test.npy",
    }

    # Upload each array under its own key.
    upload_numpy_to_minio(minio_client, X_train, bucket_name, object_names["X_train"])
    upload_numpy_to_minio(minio_client, y_train, bucket_name, object_names["y_train"])
    upload_numpy_to_minio(minio_client, X_test,  bucket_name, object_names["X_test"])
    upload_numpy_to_minio(minio_client, y_test,  bucket_name, object_names["y_test"])

    # Timestamp returned as this component's output for the later stages.
    # It ends with "-" because those stages prepend it to their MLflow run names.
    # datetime.now() is called without a timezone, so this is naive local time.
    tracking_timestamp =  datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S-") 
    return (tracking_timestamp,)

############################################################################################################
#####################################---2. STAGE---#########################################################
############################################################################################################

@dsl.component(
    base_image="docker.io/jhofydu/kpf-kserve:V1.0.0",
)
def model_building(tracking_timestamp: str ):

    # 1. Imports, MinIO client creation and download function
    
    import numpy as np
    import io
    import torch
    from torch.utils.data import TensorDataset, DataLoader
    import torch.nn as nn
    import mlflow
    import matplotlib.pyplot as plt
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from minio import Minio
    import datetime
    

    
    # Function to download numpy arrays from MinIO
    def download_numpy_from_minio(minio_client, bucket, object_name):
        """Fetch a ``.npy`` object from MinIO and return it as a numpy array.

        Args:
            minio_client: client exposing ``get_object(bucket, object_name)``.
            bucket: bucket that holds the array.
            object_name: key of the ``.npy`` object.

        Returns:
            The decoded numpy array.

        Raises:
            Any exception raised by the client or by ``np.load``. The error is
            printed and then re-raised so the pipeline step fails at the real
            cause instead of continuing with ``None``.
        """
        try:
            response = minio_client.get_object(bucket, object_name)
            try:
                arr = np.load(io.BytesIO(response.read()))
            finally:
                # Both calls are needed to hand the HTTP connection back to the pool.
                response.close()
                response.release_conn()
        except Exception as e:
            print(f"Error: {e}")
            raise
        print(f"Downloaded: s3://{bucket}/{object_name} shape={arr.shape}")
        return arr
    # Minio client
    minio_client = Minio(
        "minio-service.kubeflow.svc.cluster.local:9000",
        access_key="minio",
        secret_key="minio123",
        secure=False,
    )
    # SET MLflow URI in the k8s cluster
    # This line must be placed before any mlflow.start_run()
    mlflow.set_tracking_uri("http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080")
    mlflow.set_experiment("k8s-cpu-forecasting")

    
    # 2: Load Train/Val Sets From MinIO
    bucket_name = "k8s-resources-forecast"
    object_names = {
        "X_train": "data/k8s-preprocessed/node-1-X_train/X_train.npy",
        "y_train": "data/k8s-preprocessed/node-1-y_train/y_train.npy",
        "X_val":   "data/k8s-preprocessed/node-1-X_test/X_test.npy",
        "y_val":   "data/k8s-preprocessed/node-1-y_test/y_test.npy",
    }
    
    X_train = download_numpy_from_minio(minio_client, bucket_name, object_names["X_train"])
    y_train = download_numpy_from_minio(minio_client, bucket_name, object_names["y_train"])
    X_val   = download_numpy_from_minio(minio_client, bucket_name, object_names["X_val"])
    y_val   = download_numpy_from_minio(minio_client, bucket_name, object_names["y_val"])
    
    print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
    print("X_val shape:", X_val.shape, "y_val shape:", y_val.shape)

    
    ## 3: Build PyTorch DataLoaders
    BATCH_SIZE = 32
    
    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                                  torch.tensor(y_train, dtype=torch.float32))
    val_dataset   = TensorDataset(torch.tensor(X_val,   dtype=torch.float32),
                                  torch.tensor(y_val,   dtype=torch.float32))
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False)


    # 4: Define LSTM Model
    class LSTMForecaster(nn.Module):
        """LSTM stack followed by a linear head applied to the last time step.

        ``forward`` takes a batch-first input and returns a tensor of shape
        ``(batch, 1, output_size)``.
        """

        def __init__(self, input_size=1, hidden_size=64, num_layers=2, output_size=1, dropout=0.0):
            super().__init__()
            self.lstm = nn.LSTM(
                input_size,
                hidden_size,
                num_layers,
                batch_first=True,
                dropout=dropout,
            )
            self.fc = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            sequence_features, _state = self.lstm(x)
            last_step = sequence_features[:, -1, :]
            head_out = self.fc(last_step)
            # Insert a length-1 axis after the batch axis.
            return head_out.unsqueeze(1)

    # 5: Training Loop With Early Stopping and MLflow Logging

    def train_model_with_early_stopping(
        train_loader, val_loader, input_size=1, hidden_size=64, num_layers=2,
        lr=0.001, epochs=35, patience=5, dropout=0.0, model_name="node-1-cpu-pct-forecast", run_name=None
    ):
        """Train an ``LSTMForecaster`` with early stopping and log the run to MLflow.

        Relies on names from the enclosing component scope: ``LSTMForecaster``,
        ``BATCH_SIZE`` (logged as a parameter), ``X_val`` (source of the model
        signature example), ``mlflow``, ``plt`` and the sklearn metric functions.

        Args:
            train_loader: DataLoader yielding ``(inputs, targets)`` training batches.
            val_loader: DataLoader yielding ``(inputs, targets)`` validation batches.
            input_size: number of features per time step.
            hidden_size: LSTM hidden size.
            num_layers: number of stacked LSTM layers.
            lr: Adam learning rate.
            epochs: maximum number of epochs.
            patience: epochs without validation improvement before stopping.
            dropout: dropout passed to the LSTM.
            model_name: accepted for backward compatibility; not used by the body.
            run_name: name of the MLflow run.

        Returns:
            ``(model, (mae, rmse, r2))`` where the model carries the weights of
            the epoch with the lowest validation loss and the metrics are
            computed on the validation loader.

        Raises:
            ValueError: if either loader yields no batches.
        """
        if len(train_loader) == 0 or len(val_loader) == 0:
            raise ValueError("train_loader and val_loader must each yield at least one batch.")

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = LSTMForecaster(input_size, hidden_size, num_layers, output_size=1, dropout=dropout).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        criterion = nn.MSELoss()
        best_val_loss = float('inf')
        best_model = None
        wait = 0
        epochs_run = 0
        train_losses, val_losses = [], []

        with mlflow.start_run(run_name=run_name):
            mlflow.log_params({
                "input_size": input_size, "hidden_size": hidden_size,
                "num_layers": num_layers, "lr": lr, "epochs": epochs,
                "batch_size": BATCH_SIZE, "dropout": dropout, "patience": patience
            })

            for epoch in range(epochs):
                epochs_run = epoch + 1

                # --- train ---
                model.train()
                running_loss = 0
                for xb, yb in train_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    optimizer.zero_grad()
                    out = model(xb)
                    loss = criterion(out, yb)
                    loss.backward()
                    optimizer.step()
                    running_loss += loss.item()
                train_loss = running_loss / len(train_loader)
                train_losses.append(train_loss)

                # --- validate ---
                model.eval()
                val_running_loss = 0
                with torch.no_grad():
                    for xb, yb in val_loader:
                        xb, yb = xb.to(device), yb.to(device)
                        val_running_loss += criterion(model(xb), yb).item()
                val_loss = val_running_loss / len(val_loader)
                val_losses.append(val_loss)

                print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.5f} | Val Loss: {val_loss:.5f}")
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    # state_dict() returns references to the live parameters, which
                    # keep changing in later epochs; clone them to keep a real snapshot.
                    best_model = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
                    wait = 0
                else:
                    wait += 1
                    if wait >= patience:
                        print("Early stopping triggered!")
                        break

            # Number of epochs actually executed, whether or not early stopping fired.
            mlflow.log_metric("epoch_actual", epochs_run)

            # Restore the weights of the best epoch.
            if best_model is not None:
                model.load_state_dict(best_model)

            # Final metrics on the validation loader
            model.eval()
            preds, targets = [], []
            with torch.no_grad():
                for xb, yb in val_loader:
                    out = model(xb.to(device)).cpu().numpy()
                    preds.append(out)
                    targets.append(yb.cpu().numpy())
            preds = np.concatenate(preds).reshape(-1)
            targets = np.concatenate(targets).reshape(-1)

            mae  = mean_absolute_error(targets, preds)
            # sqrt of MSE works regardless of the installed scikit-learn version
            rmse = np.sqrt(mean_squared_error(targets, preds))
            r2   = r2_score(targets, preds)
            mlflow.log_metric("val_mae", mae)
            mlflow.log_metric("val_rmse", rmse)
            mlflow.log_metric("val_r2", r2)

            # --- Plots (written under /tmp, then uploaded as MLflow artifacts)
            plt.figure(figsize=(10,4))
            plt.plot(targets, label="True")
            plt.plot(preds, label="Predicted")
            plt.legend(); plt.title("True vs. Predicted CPU% (Validation)")
            plt.tight_layout(); plt.savefig("/tmp/true_vs_pred.png"); plt.close()
            mlflow.log_artifact("/tmp/true_vs_pred.png")

            plt.figure(figsize=(10,4))
            plt.plot(preds - targets)
            plt.title("Residuals Over Time"); plt.xlabel("Time"); plt.ylabel("Residual (Pred - True)")
            plt.tight_layout(); plt.savefig("/tmp/residuals.png"); plt.close()
            mlflow.log_artifact("/tmp/residuals.png")

            plt.figure()
            plt.plot(train_losses, label="Train Loss")
            plt.plot(val_losses, label="Val Loss")
            plt.xlabel("Epoch"); plt.ylabel("Loss"); plt.title("Learning Curve")
            plt.legend(); plt.tight_layout(); plt.savefig("/tmp/learning_curve.png"); plt.close()
            mlflow.log_artifact("/tmp/learning_curve.png")

            # --- Infer signature and log the model ---
            # The sample must live on the same device as the model, otherwise
            # the forward pass fails when CUDA is in use.
            sample_input_t = torch.tensor(X_val[:1], dtype=torch.float32).to(device)
            with torch.no_grad():
                sample_output_np = model(sample_input_t).detach().cpu().numpy()
            input_example_np = sample_input_t.cpu().numpy()

            from mlflow.models import infer_signature
            signature = infer_signature(input_example_np, sample_output_np)

            mlflow.pytorch.log_model(
                model,
                artifact_path="model",
                input_example=input_example_np,
                signature=signature,
            )
            print("Model + artifacts logged in MLflow (with input_example and signature).")

        return model, (mae, rmse, r2)

    # 6: Run the training and MLflow logging
    EPOCHS = 35
    PATIENCE = 5
    # tracking_timestamp is this component's input parameter
    # (presumably wired to the data stage's output — confirm in the pipeline definition).
    # The run name must keep the "cpu-node-1-forecast" suffix: the validation
    # stage's find_runs() looks up the single-model run by that suffix.
    model, metrics = train_model_with_early_stopping(
        train_loader, val_loader,
        input_size=X_train.shape[-1],
        hidden_size=64,
        num_layers=2,
        lr=0.001,
        epochs=EPOCHS,
        patience=PATIENCE,
        dropout=0.1,
        model_name= tracking_timestamp + "cpu-node-1-pct-model",
        run_name= tracking_timestamp + "cpu-node-1-forecast" 
    )
    
    # metrics is the (mae, rmse, r2) tuple returned by the trainer
    print(f"Final MAE: {metrics[0]:.4f} | RMSE: {metrics[1]:.4f} | R2: {metrics[2]:.4f}")
    

############################################################################################################
#####################################---3. STAGE---#########################################################
############################################################################################################

@dsl.component(
    base_image="docker.io/jhofydu/kpf-kserve:V1.0.0"
)
def hpo_model_building(tracking_timestamp: str):

    #1. Imports & global config
        # ----- Core -----
    import os, io, json, math, random
    from io import BytesIO
    import numpy as np
    import pandas as pd
    
    # ----- MLflow -----
    import mlflow
    from mlflow.tracking import MlflowClient
    from mlflow.models import infer_signature 
    
    # ----- Torch -----
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import TensorDataset, DataLoader
    
    # ----- Metrics -----
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    
    # ----- HPO -----
    import optuna
    from optuna.pruners import PercentilePruner
    
    # ----- Plotting -----
    import matplotlib.pyplot as plt

    # from minio import Minio
    from minio import Minio    
    
    # ----- Repro -----
    def set_seed(seed: int = 42):
        """Seed Python's ``random``, NumPy and PyTorch with the same value."""
        for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
            seed_fn(seed)
    set_seed(42)
    
    # ===== MLflow tracking =====
    TRACKING_URI = "http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080"
    EXPERIMENT_NAME = "k8s-cpu-forecasting"
    REGISTERED_MODEL_NAME = "cpu-pct"     # model registry name (optional but nice)
    
    mlflow.set_tracking_uri(TRACKING_URI)
    mlflow.set_experiment(EXPERIMENT_NAME)
    ml_client = MlflowClient()
    

    # 2 MinIO client & data loaders (update creds if needed)    
    # If you already have `minio_client`, you can skip this cell.
    # Adjust endpoint/creds to your cluster.
    MINIO_ENDPOINT = "minio-service.kubeflow.svc.cluster.local:9000"
    MINIO_ACCESS_KEY = "minio"
    MINIO_SECRET_KEY = "minio123"
    
    minio_client = Minio(
        endpoint=MINIO_ENDPOINT,
        access_key=MINIO_ACCESS_KEY,
        secret_key=MINIO_SECRET_KEY,
        secure=False
    )
    
    def download_numpy_from_minio(minio_client, bucket_name: str, object_name: str) -> np.ndarray:
        """Read a ``.npy`` object from MinIO into memory and decode it (no temp file)."""
        response = minio_client.get_object(bucket_name, object_name)
        try:
            return np.load(BytesIO(response.read()))
        finally:
            # Runs on success and on failure: close the stream, then release
            # the underlying connection.
            response.close()
            response.release_conn()
    
    # ---- Paths for your prepared arrays ----
    bucket_name = "k8s-resources-forecast"
    object_names = {
        "X_train": "data/k8s-preprocessed/node-1-X_train/X_train.npy",
        "y_train": "data/k8s-preprocessed/node-1-y_train/y_train.npy",
        "X_val":   "data/k8s-preprocessed/node-1-X_test/X_test.npy",
        "y_val":   "data/k8s-preprocessed/node-1-y_test/y_test.npy",
    }
    
    # ---- Pull arrays ----
    X_train = download_numpy_from_minio(minio_client, bucket_name, object_names["X_train"])
    y_train = download_numpy_from_minio(minio_client, bucket_name, object_names["y_train"])
    X_val   = download_numpy_from_minio(minio_client, bucket_name, object_names["X_val"])
    y_val   = download_numpy_from_minio(minio_client, bucket_name, object_names["y_val"])
    
    print("X_train:", X_train.shape, "y_train:", y_train.shape)
    print("X_val:  ", X_val.shape,   "y_val:  ", y_val.shape)
    
    def make_loader(X, y, batch_size=32, shuffle=True):
        """Wrap ``X`` and ``y`` as float32 tensors and return a DataLoader over them."""
        features = torch.tensor(X, dtype=torch.float32)
        targets = torch.tensor(y, dtype=torch.float32)
        paired = TensorDataset(features, targets)
        return DataLoader(paired, batch_size=batch_size, shuffle=shuffle)

    #  3 Model & training utils (plots, metrics, early stop)
    class LSTMForecaster(nn.Module):
        """LSTM stack whose last-step output is mapped to ``horizon`` values.

        ``forward`` takes a batch-first input ``[B, T, F]`` and returns
        ``[B, horizon, 1]``.
        """

        def __init__(self, input_size=1, hidden_size=64, num_layers=2, dropout=0.1, horizon=1):
            super().__init__()
            # Dropout is passed through only when layers are stacked;
            # a single-layer LSTM gets 0.0.
            inter_layer_dropout = 0.0 if num_layers <= 1 else dropout
            self.lstm = nn.LSTM(
                input_size=input_size,
                hidden_size=hidden_size,
                num_layers=num_layers,
                batch_first=True,
                dropout=inter_layer_dropout,
            )
            self.fc = nn.Linear(hidden_size, horizon)

        def forward(self, x):
            hidden_seq, _state = self.lstm(x)        # [B, T, H]
            last_hidden = hidden_seq[:, -1, :]       # [B, H]
            forecast = self.fc(last_hidden)          # [B, horizon]
            return forecast.unsqueeze(-1)            # [B, horizon, 1]
    
    def compute_metrics(loader, model):
        """Evaluate ``model`` over ``loader``.

        Returns:
            ``((mae, rmse, r2), preds, targs)`` where ``preds`` and ``targs``
            are the flattened predictions and targets, usable for plotting.
        """
        model.eval()
        pred_batches, target_batches = [], []
        with torch.no_grad():
            for inputs, labels in loader:
                pred_batches.append(model(inputs).numpy())
                target_batches.append(labels.numpy())

        preds = np.concatenate(pred_batches, axis=0).reshape(-1)
        targs = np.concatenate(target_batches, axis=0).reshape(-1)

        mae = mean_absolute_error(targs, preds)
        rmse = math.sqrt(mean_squared_error(targs, preds))
        r2 = r2_score(targs, preds)
        return (mae, rmse, r2), preds, targs
    # Local directory where plots are written before being uploaded as MLflow artifacts.
    default_local_path = "/tmp/"

    def _save_close_and_log(filename):
        """Save the current figure under default_local_path, close it, and log it to MLflow."""
        local_file = default_local_path + filename
        try:
            plt.savefig(local_file)
        finally:
            # Close even if saving fails so figures do not pile up across trials.
            plt.close()
        mlflow.log_artifact(local_file)

    def plot_and_log_learning_curves(train_curve, val_curve, filename="learning_curve.png"):
        """Plot per-epoch train/val losses and log the image to the active MLflow run."""
        plt.figure(figsize=(7,3))
        plt.plot(train_curve, label="train")
        plt.plot(val_curve, label="val")
        plt.legend(); plt.title("Learning Curve"); plt.tight_layout()
        _save_close_and_log(filename)

    def plot_and_log_residuals(preds, targs, filename="residuals.png"):
        """Plot ``preds - targs`` with a zero reference line and log the image to MLflow."""
        plt.figure(figsize=(7,3))
        plt.plot(preds - targs, label="residual")
        plt.axhline(0, color="black", linewidth=0.8)
        plt.legend(); plt.title("Residuals (val)"); plt.tight_layout()
        _save_close_and_log(filename)

    def plot_and_log_true_vs_pred(targs, preds, filename="val_true_vs_pred.png"):
        """Overlay targets and predictions and log the image to MLflow."""
        plt.figure(figsize=(7,3))
        plt.plot(targs, label="true")
        plt.plot(preds, label="pred")
        plt.legend(); plt.title("True vs Pred (val)"); plt.tight_layout()
        _save_close_and_log(filename)
    
    # 4 — Single-run trainer (used by both the trials and the final model); nested-run aware
    
    def train_one_model_nested(
        X_train, y_train, X_val, y_val,
        hidden_size=64, num_layers=2, dropout=0.1,
        lr=1e-3, batch_size=32,
        epochs=35, patience=5,
        window_size=None, horizon=None,
        run_name="trial",
        nested=True,
    ):
        """
        Train one LSTMForecaster with early stopping and log everything to MLflow.

        The run is opened with ``nested=nested``, so it becomes a child of the
        currently active MLflow run when ``nested`` is True.

        Args:
            X_train, y_train: training arrays; ``X_train.shape[-1]`` is used as
                the model input size.
            X_val, y_val: validation arrays; also the source of the one-sample
                input example logged with the model.
            hidden_size, num_layers, dropout: LSTMForecaster hyperparameters.
            lr: Adam learning rate.
            batch_size: batch size for both loaders.
            epochs: maximum number of epochs.
            patience: epochs without validation improvement before stopping.
            window_size: logged as a parameter; defaults to ``X_train.shape[1]``.
            horizon: model output length; defaults to ``y_train.shape[1]``.
            run_name: MLflow run name.
            nested: passed to ``mlflow.start_run``.

        Returns:
            ``(best_val_loss, run_id, metrics_dict)`` where ``metrics_dict`` has
            the keys ``val_loss``, ``val_mae``, ``val_rmse`` and ``val_r2``.
        """
        # Training runs on CPU; batches are never moved to another device.
        device = torch.device("cpu")
        train_loader = make_loader(X_train, y_train, batch_size=batch_size, shuffle=True)
        val_loader   = make_loader(X_val,   y_val,   batch_size=batch_size, shuffle=False)
    
        input_size = X_train.shape[-1]
        if window_size is None: window_size = X_train.shape[1]
        if horizon is None:     horizon = y_train.shape[1]
    
        model = LSTMForecaster(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            horizon=horizon,
        ).to(device)
    
        criterion = nn.MSELoss()
        opt = optim.Adam(model.parameters(), lr=lr)
    
        # Early-stopping state: best validation loss so far, remaining patience,
        # and a snapshot of the weights that produced the best loss.
        best_val = float("inf")
        patience_left = patience
        best_state = None
        train_curve, val_curve = [], []
    
        with mlflow.start_run(run_name=run_name, nested=nested) as active_run:
            # Log hyperparameters once per run
            mlflow.log_params({
                "hidden_size": hidden_size,
                "num_layers": num_layers,
                "dropout": dropout,
                "lr": lr,
                "batch_size": batch_size,
                "epochs": epochs,
                "patience": patience,
                "window_size": window_size,
                "horizon": horizon,
            })
    
            for ep in range(1, epochs + 1):
                # Train for one epoch
                model.train()
                running = 0.0
                for xb, yb in train_loader:
                    opt.zero_grad()
                    pred = model(xb)
                    loss = criterion(pred, yb)
                    loss.backward()
                    opt.step()
                    running += loss.item()
                # max(1, ...) avoids dividing by zero when a loader has no batches
                train_loss = running / max(1, len(train_loader))
    
                # Validate
                model.eval()
                v_running = 0.0
                with torch.no_grad():
                    for xb, yb in val_loader:
                        pr = model(xb)
                        vloss = criterion(pr, yb)
                        v_running += vloss.item()
                val_loss = v_running / max(1, len(val_loader))
    
                train_curve.append(train_loss)
                val_curve.append(val_loss)
    
                # Log per-epoch losses (step = 1-based epoch number)
                mlflow.log_metric("train_loss", train_loss, step=ep)
                mlflow.log_metric("val_loss", val_loss, step=ep)
    
                # Early stopping: an improvement must beat the best loss by more than 1e-8
                if val_loss < best_val - 1e-8:
                    best_val = val_loss
                    patience_left = patience
                    # Clone the tensors so later optimizer steps cannot alter the snapshot
                    best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
                else:
                    patience_left -= 1
                    if patience_left <= 0:
                        print("Early stopping!")
                        break
    
            # Restore best weights & compute final metrics on val
            if best_state is not None:
                model.load_state_dict(best_state)
            (mae, rmse, r2), preds, targs = compute_metrics(val_loader, model)
    
            # Log final metrics & plots
            mlflow.log_metric("val_mae", mae)
            mlflow.log_metric("val_rmse", rmse)
            mlflow.log_metric("val_r2", r2)
    
            plot_and_log_learning_curves(train_curve, val_curve)
            plot_and_log_residuals(preds, targs)
            plot_and_log_true_vs_pred(targs, preds)
    
            # --------- Build input_example + signature and log the model ---------
            # Take the first validation sample, keeping the leading batch axis
            sample_input_t = torch.tensor(X_val[:1], dtype=torch.float32)     # torch tensor
            with torch.no_grad():
                sample_output_np = model(sample_input_t).detach().cpu().numpy()  # numpy array
            
            # MLflow requires numpy/pandas (not torch.Tensor) for input_example
            input_example_np = sample_input_t.cpu().numpy()
            
            # Infer signature from numpy arrays
            signature = infer_signature(
                input_example_np,   # inputs:  [batch, window, feature]
                sample_output_np    # outputs: [batch, horizon, 1]
            )
            
            # Log the PyTorch model together with its signature and input example
            mlflow.pytorch.log_model(
                model,
                artifact_path="model",
                input_example=input_example_np,   # numpy, not torch
                signature=signature
            )
    
            # Capture the run id while the run is still active
            run_id = active_run.info.run_id
    
        metrics = {"val_loss": best_val, "val_mae": mae, "val_rmse": rmse, "val_r2": r2}
        return best_val, run_id, metrics
            
    # 5 — Optuna objective (each trial becomes a nested run)        
    WINDOW_SIZE = X_train.shape[1]  # should be 5
    HORIZON     = y_train.shape[1]  # should be 1
    
    def objective(trial: optuna.Trial):
        """Optuna objective: train one nested MLflow run and return its best val loss.

        NOTE(review): the value is reported to the pruner only after the trial
        has finished training, so a prune decision cannot save training time —
        confirm whether per-epoch reporting was intended.
        """
        # Search space (the suggest calls are kept in their original order)
        sampled = {
            "hidden_size": trial.suggest_int("hidden_size", 16, 128, step=16),
            "num_layers": trial.suggest_int("num_layers", 1, 3),
            "dropout": trial.suggest_float("dropout", 0.0, 0.4),
            "lr": trial.suggest_float("lr", 1e-4, 5e-3, log=True),
            "batch_size": trial.suggest_categorical("batch_size", [16, 32, 64]),
            "epochs": trial.suggest_int("epochs", 15, 50),
            "patience": trial.suggest_int("patience", 3, 10),
        }

        # Each trial is trained as a NESTED MLflow run
        score, child_run_id, _child_metrics = train_one_model_nested(
            X_train, y_train, X_val, y_val,
            window_size=WINDOW_SIZE, horizon=HORIZON,
            run_name=f"trial-{trial.number}",
            nested=True,
            **sampled,
        )

        # Keep the MLflow run id on the trial for traceability
        trial.set_user_attr("mlflow_run_id", child_run_id)

        # Report for pruning, using the sampled epoch budget as the step
        trial.report(score, step=sampled["epochs"])
        if not trial.should_prune():
            return score
        raise optuna.exceptions.TrialPruned()
        
    # 6 — Orchestrator: Parent run → nested trials → nested final
    def run_hpo_and_final_training(
        n_trials=8,
        pruner=None,            # e.g., PercentilePruner(percentile=50, n_startup_trials=3)
        study_name="cpu_pct_lstm",
        parent_run_name="session",
        register_model=True,
    ):
        """
        Run an Optuna study and a final training inside one PARENT MLflow run.

        - Optuna trials run as nested MLflow runs (via ``objective``).
        - A final model is trained with the best parameters as another nested run.
        - A JSON summary of the trials is logged as an artifact of the parent run.

        Args:
            n_trials: number of Optuna trials.
            pruner: Optuna pruner; defaults to
                ``PercentilePruner(percentile=50, n_startup_trials=3)``.
            study_name: name given to the Optuna study.
            parent_run_name: MLflow name (and ``session`` tag) of the parent run.
            register_model: only controls whether ``registered_model_name`` is
                filled in the summary; this function does not register a model.

        Returns:
            dict with the parent run id, best trial number/params/value, the
            per-trial infos, the final run id and metrics, and the MLflow UI URL
            of the parent run.
        """
        # Default pruner if not provided. The keyword is `percentile`;
        # `percent` is not accepted by PercentilePruner and raised TypeError.
        if pruner is None:
            pruner = PercentilePruner(percentile=50, n_startup_trials=3)

        summary = {}
        trial_infos = []

        with mlflow.start_run(run_name=parent_run_name, nested=False) as parent_run:
            parent_run_id = parent_run.info.run_id
            mlflow.set_tag("session", parent_run_name)

            # --- HPO (trials) ---
            study = optuna.create_study(direction="minimize", pruner=pruner, study_name=study_name)
            study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

            best_params = study.best_params
            best_value  = study.best_value
            best_trial  = study.best_trial.number

            # collect trial metadata (trial -> run_id)
            for t in study.trials:
                trial_infos.append({
                    "trial_number": t.number,
                    "value": t.value,
                    "params": t.params,
                    "state": str(t.state),
                    "mlflow_run_id": t.user_attrs.get("mlflow_run_id", None),
                })

            # log to parent
            mlflow.log_param("n_trials", n_trials)
            mlflow.log_param("study_name", study_name)
            mlflow.log_metric("best_val_loss", best_value)
            mlflow.set_tag("best_trial_number", best_trial)
            for k, v in best_params.items():
                mlflow.log_param(f"best_{k}", v)

            # --- Final training (nested) ---
            final_val_loss, final_run_id, final_metrics = train_one_model_nested(
                X_train, y_train, X_val, y_val,
                hidden_size=best_params["hidden_size"],
                num_layers=best_params["num_layers"],
                dropout=best_params["dropout"],
                lr=best_params["lr"],
                batch_size=best_params["batch_size"],
                epochs=best_params["epochs"],
                patience=best_params["patience"],
                window_size=WINDOW_SIZE,
                horizon=HORIZON,
                run_name=  tracking_timestamp + "final-best-hpo-cpu-node-1-forecast",
                nested=True
            )

            # build return summary
            experiment_id = ml_client.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
            summary = {
                "parent_run_id": parent_run_id,
                "best_trial_number": best_trial,
                "best_params": best_params,
                "best_val_loss": best_value,
                "trial_infos": trial_infos,
                "final_run_id": final_run_id,
                "final_metrics": final_metrics,
                "registered_model_name": REGISTERED_MODEL_NAME if register_model else None,
                "mlflow_ui_session": f"{TRACKING_URI}/#/experiments/{experiment_id}/runs/{parent_run_id}"
            }

            # Flag the best trial, then append one extra entry describing the
            # final retrained model. That last entry has a different shape from
            # the per-trial entries, and summary["trial_infos"] is this same list.
            for t in trial_infos:
                t["is_best_trial"] = (t["trial_number"] == best_trial)
            trial_infos.append({
                "best_trial": {
                    "trial_number": best_trial,
                    "mlflow_run_id": final_run_id,
                    "metrics": final_metrics
                }
            })

            # Save trial summary JSON as an artifact of the parent run
            optuna_summary_path = "/tmp/optuna_trials_summary.json"
            with open(optuna_summary_path, "w") as f:
                json.dump(trial_infos, f, indent=2)
            mlflow.log_artifact(optuna_summary_path)

        return summary

    
    # 7 Run the HPO session; n_trials controls how many Optuna trials are executed.
    # The parent run name must keep the "hpo-cpu-node-1-forecast-session" suffix:
    # the validation stage's find_runs() looks up the HPO parent run by that suffix.
    # NOTE(review): with n_trials equal to the pruner's n_startup_trials, pruning
    # presumably never triggers — confirm against the Optuna pruner documentation.
    summary = run_hpo_and_final_training(
        n_trials=3,
        pruner=PercentilePruner(percentile=50, n_startup_trials=3),
        study_name="cpu_pct_lstm",
        parent_run_name= tracking_timestamp +"hpo-cpu-node-1-forecast-session",
        register_model=False,  # only affects the "registered_model_name" field of the summary
    )
    


############################################################################################################
#####################################---4. STAGE---#########################################################
############################################################################################################

@dsl.component(
    base_image="docker.io/jhofydu/kpf-kserve:V1.0.0",
    packages_to_install=["uv", "virtualenv"]
)
def model_validation(tracking_timestamp: str) -> NamedTuple("Outputs", [("single_model_uri",str),("cpu_node_1_forecast", str), ("hpo_model_uri",str), ("final_best_hpo_cpu_node_1_forecast", str)]):
    """Smoke-test the single and HPO models logged in MLflow.

    Finds the single-model run and the HPO parent run in the
    ``k8s-cpu-forecasting`` experiment, resolves the HPO "final/best" child
    run, loads both logged models as pyfunc models and feeds one example
    window through each of them.

    Args:
        tracking_timestamp: Run-name prefix of the current pipeline execution.
            Runs whose name starts with it are preferred; when none match,
            the newest run with the expected suffix is used instead.

    Returns:
        Tuple of (single model URI, single model prediction, HPO model URI,
        HPO model prediction). Predictions are floats rendered as strings.

    Raises:
        Exception: if the single run, the HPO parent run or the HPO child
            run cannot be found.
    """

    #imports and functions
    import mlflow
    from mlflow.tracking import MlflowClient
    import numpy as np

    def setup_mlflow(tracking_uri, experiment_name):
        """Point MLflow at the tracking server and return (client, experiment)."""
        mlflow.set_tracking_uri(tracking_uri)
        mlflow.set_experiment(experiment_name)
        client = MlflowClient()
        experiment = client.get_experiment_by_name(experiment_name)
        return client, experiment

    # `limit` bounds how many runs (nested runs included) are inspected.
    def find_runs(client, experiment_id, single_suffix="cpu-node-1-forecast", hpo_suffix="hpo-cpu-node-1-forecast-session", limit=50, name_prefix=""):
        """Return (single_run, hpo_parent_run), picking the newest match for each.

        Runs are listed newest first. A run whose name starts with
        ``name_prefix`` wins; otherwise the newest run with the right suffix is
        used. The first hit is kept, so an older run can never replace a newer
        one while the search for the other run is still in progress.
        """
        runs = client.search_runs(
            experiment_ids=[experiment_id],
            order_by=["attributes.start_time DESC"],
            max_results=limit
        )

        def newest_matching(is_match):
            fallback = None
            for run in runs:
                rn = run.data.tags.get("mlflow.runName", "")
                if not is_match(rn):
                    continue
                if not name_prefix or rn.startswith(name_prefix):
                    return run
                if fallback is None:
                    fallback = run
            return fallback

        single_run = newest_matching(lambda rn: rn.endswith(single_suffix) and "hpo" not in rn)
        hpo_parent_run = newest_matching(lambda rn: rn.endswith(hpo_suffix))
        if single_run is None or hpo_parent_run is None:
            raise Exception("Could not find both single and HPO runs.")
        return single_run, hpo_parent_run

    def find_final_hpo_child_run(client, experiment_id, parent_run, keyword="final-best"):
        """Return the child run of ``parent_run`` that holds the final model.

        Prefers a child whose name contains ``keyword``, "final" or "best";
        otherwise falls back to the newest child. Returns None when the parent
        has no child runs.
        """
        hpo_child_runs = client.search_runs(
            experiment_ids=[experiment_id],
            filter_string=f"tags.mlflow.parentRunId = '{parent_run.info.run_id}'",
            order_by=["attributes.start_time DESC"],
            max_results=10
        )
        for run in hpo_child_runs:
            rn = run.data.tags.get("mlflow.runName", "")
            if keyword in rn or "final" in rn or "best" in rn:
                return run
        return hpo_child_runs[0] if hpo_child_runs else None

    def build_model_uris(single_run, hpo_run):
        """Build the ``runs:/<run_id>/model`` URIs for both runs."""
        single_model_uri = f"runs:/{single_run.info.run_id}/model"
        hpo_model_uri    = f"runs:/{hpo_run.info.run_id}/model"
        print("Single model uri:", single_model_uri)
        print("HPO model uri:", hpo_model_uri)
        return single_model_uri, hpo_model_uri

    def validate_model(model_uri, example_input):
        """Load the model as a pyfunc model and return its prediction."""
        model = mlflow.pyfunc.load_model(model_uri)
        result = model.predict(example_input)
        print(f"Result for {model_uri}: {result}")
        return result

    # Calling
    TRACKING_URI = "http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080"
    EXPERIMENT_NAME = "k8s-cpu-forecasting"

    client, experiment = setup_mlflow(TRACKING_URI, EXPERIMENT_NAME)
    single_run, hpo_parent_run = find_runs(
        client, experiment.experiment_id, name_prefix=tracking_timestamp
    )
    hpo_best_run = find_final_hpo_child_run(client, experiment.experiment_id, hpo_parent_run)
    if hpo_best_run is None:
        # Fail with a clear message instead of an AttributeError further down.
        raise Exception(
            f"HPO parent run {hpo_parent_run.info.run_id} has no child runs to validate."
        )

    single_model_uri, hpo_model_uri = build_model_uris(single_run, hpo_best_run)

    # Example input (replace with your real validation input!)
    input_data = np.array([[
        [0.19565217],
        [0.22826087],
        [0.25],
        [0.20652173],
        [0.22826087]
    ]], dtype=np.float32)

    single_result = validate_model(single_model_uri, input_data)
    hpo_result = validate_model(hpo_model_uri, input_data)
    # Collapse the single-element prediction array to a plain Python float.
    single_result_scalar = float(np.array(single_result).squeeze())
    hpo_result_scalar = float(np.array(hpo_result).squeeze())
    print("Single model output prediction:", single_result_scalar)
    print("HPO model output:prediction", hpo_result_scalar)

    return (single_model_uri, str(single_result_scalar),hpo_model_uri, str(hpo_result_scalar))



############################################################################################################
#####################################---5. STAGE---#########################################################
############################################################################################################

@dsl.component(
    base_image="docker.io/jhofydu/kpf-kserve:V1.0.0",
    packages_to_install=["uv", "virtualenv"]
)
def best_model_registration(tracking_timestamp: str, single_model_uri: str, hpo_model_uri: str ) -> NamedTuple("Outputs", [("model_uri",str)]):
    """Register the better of two candidate models in the MLflow registry.

    Compares the validation metrics of both runs by majority vote
    (val_rmse lower, val_mae lower, val_r2 higher), registers the chosen one
    under ``REGISTERED_MODEL_NAME`` and maintains the ``winner`` /
    ``challenger`` / ``prev_challenger`` aliases.

    Args:
        tracking_timestamp: Identifier of the pipeline execution (currently
            not used by this step).
        single_model_uri: ``runs:/<run_id>/model`` URI of the single model.
        hpo_model_uri: ``runs:/<run_id>/model`` URI of the HPO model.

    Returns:
        One-element tuple with the URI of the model that was registered.
    """

    # Registering models

    import time
    import mlflow
    from mlflow import MlflowClient

    # ========= config =========
    TRACKING_URI = "http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080"
    REGISTERED_MODEL_NAME = "cpu-pct-test-5"   # <-- change to your model name

    mlflow.set_tracking_uri(TRACKING_URI)
    client = MlflowClient()

    # ---------- helpers ----------
    def _safe_get_alias_version(name: str, alias: str):
        """Return int version for alias or None if alias does not exist."""
        try:
            return int(client.get_model_version_by_alias(name, alias).version)
        except Exception:
            return None

    def _majority_is_better(m1: dict, m2: dict) -> int:
        """Return 1 if ``m1`` wins the three-metric vote, else 2 (full tie -> 1)."""
        votes_1 = 0; votes_2 = 0
        # rmse (↓)
        votes_1 += m1["val_rmse"] < m2["val_rmse"]
        votes_2 += m2["val_rmse"] < m1["val_rmse"]
        # mae (↓)
        votes_1 += m1["val_mae"] < m2["val_mae"]
        votes_2 += m2["val_mae"] < m1["val_mae"]
        # r2 (↑)
        votes_1 += m1["val_r2"] > m2["val_r2"]
        votes_2 += m2["val_r2"] > m1["val_r2"]

        if votes_1 > votes_2: return 1
        if votes_2 > votes_1: return 2
        # deterministic tie-breakers
        if m1["val_rmse"] != m2["val_rmse"]:
            return 1 if m1["val_rmse"] < m2["val_rmse"] else 2
        if m1["val_mae"] != m2["val_mae"]:
            return 1 if m1["val_mae"] < m2["val_mae"] else 2
        if m1["val_r2"] != m2["val_r2"]:
            return 1 if m1["val_r2"] > m2["val_r2"] else 2
        return 1

    def _wait_ready(model_name: str, version: int, timeout_s: int = 300):
        """Poll once per second until the model version is READY.

        Raises RuntimeError as soon as registration is reported as failed and
        TimeoutError when the version is still not READY after ``timeout_s``.
        """
        ver_s = str(version)
        t0 = time.time()
        while time.time() - t0 < timeout_s:
            mv = client.get_model_version(model_name, ver_s)
            if mv.status == "READY":
                return
            if mv.status == "FAILED_REGISTRATION":
                # No point in polling further: this state is terminal.
                raise RuntimeError(
                    f"ModelVersion v{ver_s} of {model_name} failed to register: "
                    f"{getattr(mv, 'status_message', None)}"
                )
            time.sleep(1)
        raise TimeoutError(f"ModelVersion v{ver_s} of {model_name} not READY in {timeout_s}s")

    def _register_with_tags(model_uri: str, metrics: dict, params: dict, role: str) -> int:
        """Register model, set description & tags. Return version int."""
        mv = mlflow.register_model(model_uri=model_uri, name=REGISTERED_MODEL_NAME)
        ver = int(mv.version)
        ver_s = str(ver)
        _wait_ready(REGISTERED_MODEL_NAME, ver)

        # short description
        desc = (
            ("Bootstrap winner. " if role == "winner" else "Auto-registered challenger. ")
            + "Rule: majority(val_rmse↓, val_mae↓, val_r2↑). "
            f"rmse={metrics['val_rmse']:.6f}, mae={metrics['val_mae']:.6f}, r2={metrics['val_r2']:.4f}. "
            f"window={params.get('window_size','?')}, horizon={params.get('horizon','?')}, "
            f"source_id={params.get('source_id','unknown')}."
        )
        client.update_model_version(REGISTERED_MODEL_NAME, ver_s, description=desc)

        # tags – only source_id and run_id added
        tags = {
            "owner": "ml-pipeline",
            "comparison_rule": "majority(val_rmse↓,val_mae↓,val_r2↑)",
            "val_rmse": f"{metrics['val_rmse']}",
            "val_mae":  f"{metrics['val_mae']}",
            "val_r2":   f"{metrics['val_r2']}",
            "window_size": f"{params.get('window_size', 'unknown')}",
            "horizon":     f"{params.get('horizon', 'unknown')}",
            "source_id":   f"{params.get('source_id', 'unknown')}",
            "run_id":      f"{params.get('run_id', 'unknown')}",
            "role": role,                                # "winner" or "challenger"
            "candidate": "true" if role == "challenger" else "false",
        }
        for k, v in tags.items():
            client.set_model_version_tag(REGISTERED_MODEL_NAME, ver_s, k, v)

        return ver

    def _parse_run_id_from_runs_uri(model_uri: str) -> str:
        """Extract ``<run_id>`` from a ``runs:/<run_id>/...`` URI."""
        prefix = "runs:/"
        if not model_uri.startswith(prefix):
            raise ValueError(f"Only runs:/ URIs are supported here, got: {model_uri}")
        rest = model_uri[len(prefix):]
        return rest.split("/", 1)[0]

    def _metrics_and_params_from_uri(model_uri: str):
        """Return (metrics, params) read from the run behind ``model_uri``."""
        run_id = _parse_run_id_from_runs_uri(model_uri)
        run = client.get_run(run_id)
        m = run.data.metrics
        p = run.data.params
        t = run.data.tags

        required = ("val_rmse", "val_mae", "val_r2")
        missing = [k for k in required if k not in m]
        if missing:
            # Same exception type as a plain lookup, but says which run is incomplete.
            raise KeyError(f"Run {run_id} is missing validation metrics: {missing}")
        metrics = {k: float(m[k]) for k in required}

        # Prefer explicit source_id (param or tag); otherwise use the run name (mlflow.runName)
        run_name = t.get("mlflow.runName")
        source_id = p.get("source_id") or t.get("source_id") or run_name or "unknown"

        params = {
            "window_size": p.get("window_size") or t.get("window_size"),
            "horizon":     p.get("horizon")     or t.get("horizon"),
            "source_id":   source_id,
            "run_id":      run_id,
        }
        return metrics, params

    def register_best_of_two_by_uri(uri_a: str, uri_b: str):
        """Register the better of A/B and update aliases; return a summary dict."""
        m_a, p_a = _metrics_and_params_from_uri(uri_a)
        m_b, p_b = _metrics_and_params_from_uri(uri_b)

        pick = _majority_is_better(m_a, m_b)   # 1 means A chosen, else B
        chosen_uri, chosen_m, chosen_p = (uri_a, m_a, p_a) if pick == 1 else (uri_b, m_b, p_b)

        current_winner_ver = _safe_get_alias_version(REGISTERED_MODEL_NAME, "winner")

        if current_winner_ver is None:
            # first time -> make winner
            new_ver = _register_with_tags(chosen_uri, chosen_m, chosen_p, role="winner")
            client.set_registered_model_alias(REGISTERED_MODEL_NAME, "winner", str(new_ver))
            # ensure others are absent (best effort: the alias may simply not exist)
            for al in ("challenger", "prev_challenger"):
                try:
                    client.delete_registered_model_alias(REGISTERED_MODEL_NAME, al)
                except Exception:
                    pass
            return {"picked": "A" if pick == 1 else "B",
                    "winner": new_ver, "challenger": None, "prev_challenger": None}

        # normal run -> challenger flow
        new_ver = _register_with_tags(chosen_uri, chosen_m, chosen_p, role="challenger")

        old_challenger_ver = _safe_get_alias_version(REGISTERED_MODEL_NAME, "challenger")
        if old_challenger_ver is not None:
            client.set_registered_model_alias(REGISTERED_MODEL_NAME, "prev_challenger", str(old_challenger_ver))
        client.set_registered_model_alias(REGISTERED_MODEL_NAME, "challenger", str(new_ver))

        return {"picked": "A" if pick == 1 else "B",
                "winner": current_winner_ver,
                "challenger": new_ver,
                "prev_challenger": _safe_get_alias_version(REGISTERED_MODEL_NAME, "prev_challenger")}

    # =======================
    # ======= CALL IT =======
    # =======================
    # A = HPO model, B = single model.
    result = register_best_of_two_by_uri(hpo_model_uri, single_model_uri)

    print("== Registration summary ==")
    print(f"Picked:       {'A (hpo)' if result['picked']=='A' else 'B (single)'}")
    chosen_uri = hpo_model_uri if result["picked"] == "A" else single_model_uri
    chosen_m, _ = _metrics_and_params_from_uri(chosen_uri)
    print(f"Metrics:      rmse={chosen_m['val_rmse']:.8f}  mae={chosen_m['val_mae']:.8f}  r2={chosen_m['val_r2']:.6f}")
    print(f"Aliases now:  winner -> v{_safe_get_alias_version(REGISTERED_MODEL_NAME,'winner')}, "
          f"challenger -> v{_safe_get_alias_version(REGISTERED_MODEL_NAME,'challenger')}, "
          f"prev_challenger -> v{_safe_get_alias_version(REGISTERED_MODEL_NAME,'prev_challenger')}")

    # Expose the registered model's URI instead of a placeholder string.
    return (chosen_uri, )


##########################################################################################################
#####################################---6 Stage---################################################
############################################################################################################

from kfp import dsl
from typing import NamedTuple

@dsl.component(
    base_image="docker.io/jhofydu/kpf-kserve:V1.0.0"
)
def promote_challenger_if_better() -> NamedTuple(
    "Outputs",
    [("decision", str), ("winner_version", str), ("challenger_version", str)]
):
    """
    Compares models aliased as @winner and @challenger for a registered model.
    If challenger is better, promote it to @winner and move old winner to @prev_winner.

    Returns:
        Tuple of (decision, winner_version, challenger_version) describing the
        alias state after this step. ``challenger_version`` is the string
        "None" whenever no @challenger alias is left (bootstrap, no-op and
        promotion cases).

    Raises:
        RuntimeError: if neither alias exists, or if the metrics of a
            compared model version cannot be determined.
    """
    import mlflow
    from mlflow import MlflowClient

    TRACKING_URI = "http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080"
    MODEL_NAME   = "cpu-pct-test-5"

    mlflow.set_tracking_uri(TRACKING_URI)
    client = MlflowClient()

    # ---------- helpers ----------
    def get_alias(alias: str):
        """Return (int version, model version) for the alias, or (None, None)."""
        try:
            mv = client.get_model_version_by_alias(MODEL_NAME, alias)
            return int(mv.version), mv
        except Exception:
            return None, None

    def to_float(x):
        """Convert a tag/metric value to float; None when it cannot be parsed."""
        try:
            return float(str(x))
        except Exception:
            return None

    def metrics_from_model_version(mv):
        """
        Prefer metrics from tags (you already set them when registering).
        Fallback to run metrics if tags are missing.
        """
        tags = mv.tags or {}
        rmse = to_float(tags.get("val_rmse"))
        mae  = to_float(tags.get("val_mae"))
        r2   = to_float(tags.get("val_r2"))
        if rmse is None or mae is None or r2 is None:
            # fallback: read the run’s metrics
            run_id = tags.get("run_id")
            if run_id:
                run = client.get_run(run_id)
                rmse = rmse if rmse is not None else to_float(run.data.metrics.get("val_rmse"))
                mae  = mae  if mae  is not None else to_float(run.data.metrics.get("val_mae"))
                r2   = r2   if r2   is not None else to_float(run.data.metrics.get("val_r2"))
        if rmse is None or mae is None or r2 is None:
            raise RuntimeError(f"Missing metrics for model version v{mv.version}.")
        return {"val_rmse": rmse, "val_mae": mae, "val_r2": r2}

    def challenger_better(m_c, m_w):
        """Return True when the challenger wins the three-metric majority vote."""
        # 3 votes
        votes_c = 0
        votes_w = 0
        # RMSE (lower better)
        votes_c += m_c["val_rmse"] < m_w["val_rmse"]
        votes_w += m_w["val_rmse"] < m_c["val_rmse"]
        # MAE (lower better)
        votes_c += m_c["val_mae"] < m_w["val_mae"]
        votes_w += m_w["val_mae"] < m_c["val_mae"]
        # R2 (higher better)
        votes_c += m_c["val_r2"] > m_w["val_r2"]
        votes_w += m_w["val_r2"] > m_c["val_r2"]

        if votes_c > votes_w:
            return True
        if votes_w > votes_c:
            return False
        # ties → RMSE → MAE → R² (identical metrics keep the current winner)
        if m_c["val_rmse"] != m_w["val_rmse"]:
            return m_c["val_rmse"] < m_w["val_rmse"]
        if m_c["val_mae"] != m_w["val_mae"]:
            return m_c["val_mae"] < m_w["val_mae"]
        return m_c["val_r2"] > m_w["val_r2"]

    # ---------- current aliases ----------
    winner_ver, winner_mv = get_alias("winner")
    challenger_ver, challenger_mv = get_alias("challenger")

    if winner_ver is None and challenger_ver is None:
        raise RuntimeError("No @winner or @challenger found.")

    if winner_ver is None and challenger_ver is not None:
        # bootstrap case: make challenger the first winner
        client.set_registered_model_alias(MODEL_NAME, "winner", str(challenger_ver))
        try:
            client.delete_registered_model_alias(MODEL_NAME, "challenger")
        except Exception:
            pass
        decision = f"bootstrap: promoted v{challenger_ver} to @winner"
        return (decision, str(challenger_ver), "None")

    if challenger_ver is None:
        decision = f"no-op: no @challenger present; current @winner is v{winner_ver}"
        return (decision, str(winner_ver), "None")

    # ---------- compare ----------
    m_w = metrics_from_model_version(winner_mv)
    m_c = metrics_from_model_version(challenger_mv)

    print(f"@winner  v{winner_ver}:  rmse={m_w['val_rmse']:.8f}, mae={m_w['val_mae']:.8f}, r2={m_w['val_r2']:.6f}")
    print(f"@challenger v{challenger_ver}: rmse={m_c['val_rmse']:.8f}, mae={m_c['val_mae']:.8f}, r2={m_c['val_r2']:.6f}")

    if challenger_better(m_c, m_w):
        # move winner -> prev_winner, promote challenger -> winner, clear challenger
        client.set_registered_model_alias(MODEL_NAME, "prev_winner", str(winner_ver))
        client.set_registered_model_alias(MODEL_NAME, "winner", str(challenger_ver))
        try:
            client.delete_registered_model_alias(MODEL_NAME, "challenger")
        except Exception:
            pass
        decision = f"promoted: v{challenger_ver} -> @winner; previous v{winner_ver} -> @prev_winner"
        # The @challenger alias was just cleared, so report "None" for it,
        # exactly like the bootstrap branch does.
        return (decision, str(challenger_ver), "None")
    else:
        decision = f"kept: v{winner_ver} remains @winner; v{challenger_ver} stays @challenger"
        return (decision, str(winner_ver), str(challenger_ver))





##########################################################################################################
#####################################---7 Stage---################################################
############################################################################################################
@dsl.component(
    base_image="docker.io/jhofydu/kpf-kserve:V1.0.0",
    packages_to_install=["kserve==0.15.0", "kubernetes"]
)
def deploying_model() -> NamedTuple("Outputs", [("info", str)]):
    """Deploy the model aliased ``winner`` as a KServe InferenceService.

    Resolves the registered model version behind the ``winner`` alias, derives
    the S3 location of its ``model`` artifact from the run's experiment id and
    run id, creates the InferenceService and waits until it is ready.

    Returns:
        One-element tuple with the text ``"Version: <model version>"``.

    Raises:
        RuntimeError: if the model version or its run cannot be resolved.
    """

    import mlflow
    from mlflow import MlflowClient
    import re
    from kubernetes import client
    from kserve import KServeClient
    from kserve import constants
    from kserve import V1beta1InferenceService, V1beta1InferenceServiceSpec, V1beta1PredictorSpec, V1beta1ModelSpec

    # ===== config =====
    TRACKING_URI = "http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080"
    REGISTERED_MODEL_NAME = "cpu-pct-test-5"
    ALIAS = "winner"

    namespace = "lstm-iqu"
    name = "inference-serving-kserve-cpu"
    service_account = "sa-private-mlflow"
    S3_PREFIX = "s3://mlflow"

    # ===== MLflow: Get model version + run ID =====
    mlflow.set_tracking_uri(TRACKING_URI)
    mlflow_client = MlflowClient()

    try:
        model_version = mlflow_client.get_model_version_by_alias(REGISTERED_MODEL_NAME, ALIAS)
        version = model_version.version
        # Prefer the explicit tag, then the run id recorded on the model
        # version itself, and only then parse it out of the source path.
        run_id = (model_version.tags or {}).get("run_id") or model_version.run_id

        if not run_id and model_version.source:
            match = re.search(r"/([0-9a-f]{32})/artifacts", model_version.source)
            if match:
                run_id = match.group(1)

        if not run_id:
            raise RuntimeError(" Could not extract run_id from alias.")

        # Fetch run info to get experiment_id
        run = mlflow_client.get_run(run_id)
        experiment_id = run.info.experiment_id

        # Storage URI is built from experiment_id and run_id under S3_PREFIX
        storage_uri = f"{S3_PREFIX}/{experiment_id}/{run_id}/artifacts/model"
        print(f"✅ Storage URI: {storage_uri}")

    except Exception as e:
        # Chain explicitly so the original traceback stays attached as the cause.
        raise RuntimeError(f" Failed to resolve MLflow model: {e}") from e

    # ===== KServe Deployment =====
    model_spec = V1beta1ModelSpec(
        model_format={"name": "mlflow"},
        storage_uri=storage_uri,
        runtime="kserve-mlserver"
    )

    predictor_spec = V1beta1PredictorSpec(
        service_account_name=service_account,
        model=model_spec,
    )

    isvc = V1beta1InferenceService(
        api_version="serving.kserve.io/v1beta1",
        kind="InferenceService",
        metadata=client.V1ObjectMeta(
            name=name,
            namespace=namespace
        ),
        spec=V1beta1InferenceServiceSpec(
            predictor=predictor_spec
        )
    )

    kserve_client = KServeClient()
    # NOTE(review): presumably create() fails when an InferenceService with this
    # name already exists (e.g. on a pipeline re-run) - confirm and decide
    # whether patch/replace is wanted here.
    kserve_client.create(isvc, namespace=namespace)

    # Optionally, wait for readiness
    kserve_client.wait_isvc_ready(name, namespace=namespace)
    # Format instead of concatenating so a non-str version cannot raise
    # TypeError after the deployment has already succeeded.
    return (f"Version: {version}", )


##########################################################################################################
#####################################---8 Stage---################################################
############################################################################################################
@dsl.component(
    base_image="docker.io/jhofydu/pytorch-kfp:v1.0.0",   # your image
    packages_to_install=["kafka-python", "requests"]     # essentials only
)
def forecasting_cpu(
    kafka_topic: str = "node-windows",
    kafka_broker: str = "kafka.apache-kafka.svc.cluster.local:9092",
    kserve_url: str = "http://inference-serving-kserve-cpu-predictor-00001-private.lstm-iqu.svc.cluster.local/v2/models/inference-serving-kserve-cpu/infer",
    target_node: str = "k3d-sunrise-agent-0:9100"
) -> NamedTuple("Outputs", [("last_prediction", float)]):
    """Run one inference on the first matching CPU window read from Kafka.

    Consumes messages from ``kafka_topic`` until one arrives for
    ``target_node`` with exactly 5 entries, sends the normalized window to the
    inference endpoint and stops after that single attempt.

    Args:
        kafka_topic: Topic carrying messages with ``node`` and ``window`` keys.
        kafka_broker: Bootstrap server address.
        kserve_url: Inference endpoint that receives the POST request.
        target_node: Only windows from this node are used.

    Returns:
        One-element tuple with the prediction, or 0.0 if the inference
        request failed.
    """

    from kafka import KafkaConsumer
    import requests, json

    # === helpers ===
    def normalize(cpu_percent):
        """Scale a percentage to a fraction by dividing by 100."""
        return float(cpu_percent) / 100.0

    def make_payload(cpu_window):
        """Build the request body with the window as a [1, len, 1] tensor."""
        # Nest the data exactly as the declared shape says (batch, steps, 1).
        return {
            "inputs": [{
                "name": "input-0",
                "shape": [1, len(cpu_window), 1],
                "datatype": "FP32",
                "data": [[[x] for x in cpu_window]]
            }]
        }

    # === Kafka setup ===
    consumer = KafkaConsumer(
        kafka_topic,
        bootstrap_servers=[kafka_broker],
        group_id="kserve-cpu-inference",
        auto_offset_reset="latest",
        enable_auto_commit=True,
        value_deserializer=lambda v: json.loads(v.decode("utf-8"))
    )

    session = requests.Session()
    last_seen_ts = None
    last_prediction = 0.0

    print(f"🚀 Listening for 5-step CPU windows from node: {target_node}\n")

    try:
        # NOTE(review): this loop blocks until a matching message arrives;
        # no consumer timeout is configured.
        for msg in consumer:
            data = msg.value
            node = data.get("node")
            window = data.get("window", [])

            if node != target_node or len(window) != 5:
                continue

            # deduplicate by timestamp
            current_last_ts = window[-1]["timestamp"]
            if current_last_ts == last_seen_ts:
                continue
            last_seen_ts = current_last_ts

            cpu_window = [normalize(e["cpu_percent"]) for e in window]

            print(f"\n📊 Inference Window from {node}")
            for e in window:
                print(f"   {e['timestamp']} | CPU: {e['cpu_percent']}%")

            payload = make_payload(cpu_window)
            try:
                resp = session.post(kserve_url, json=payload, timeout=10)
                resp.raise_for_status()   # an HTTP 403 was raised here originally
                # Coerce to float so the value matches the declared output type.
                last_prediction = float(resp.json()["outputs"][0]["data"][0])
                print(f"✅ Prediction: {last_prediction:.4f}")
            except Exception as e:
                print(f"❌ Inference failed: {e}")

            break   # one attempt per pipeline run, whether it succeeded or failed
    finally:
        # Release the Kafka connection and the HTTP connection pool.
        consumer.close()
        session.close()

    return (last_prediction,)


##########################################################################################################
#####################################---Connecting Stages---################################################
############################################################################################################

@dsl.pipeline(name="Sunrise", description="Generate models to forecast cpu% in k8s")
def sunrise():
    # Wires the stages into one chain: data -> (single model, HPO model) ->
    # validation -> registration -> promotion -> deployment -> forecasting.

    data_task = get_data_batch()
    run_stamp = data_task.outputs["tracking_timestamp"]

    # Both training stages start once the data stage has finished.
    single_task = model_building(tracking_timestamp=run_stamp).after(data_task)
    hpo_task = hpo_model_building(tracking_timestamp=run_stamp).after(data_task)

    validation_task = model_validation(tracking_timestamp=run_stamp).after(single_task, hpo_task)

    registration_task = best_model_registration(
        tracking_timestamp=run_stamp,
        single_model_uri=validation_task.outputs["single_model_uri"],
        hpo_model_uri=validation_task.outputs["hpo_model_uri"],
    ).after(validation_task)

    promotion_task = promote_challenger_if_better().after(registration_task)
    deploy_task = deploying_model().after(promotion_task)
    forecast_task = forecasting_cpu().after(deploy_task)

    # Caching is switched off for every stage.
    all_tasks = (
        data_task,
        single_task,
        hpo_task,
        validation_task,
        registration_task,
        promotion_task,
        deploy_task,
        forecast_task,
    )
    for task in all_tasks:
        task.set_caching_options(False)

    # NOTE(review): plain attribute assignment on the task object - confirm
    # that the KFP version in use actually honors it.
    forecast_task.service_account = "default-editor"
    




###########################################################################################
#####################################---Running Pipeline---################################
###########################################################################################
if __name__ == "__main__":
    import kfp

    # Submit the pipeline as a one-off run, with no pipeline arguments,
    # under the "cpu-forecasting" experiment.
    run_options = {
        "arguments": {},
        "experiment_name": "cpu-forecasting",
    }
    client = kfp.Client()
    client.create_run_from_pipeline_func(sunrise, **run_options)


ApiException: (500)
Reason: Internal Server Error
HTTP response headers: HTTPHeaderDict({'content-type': 'application/json', 'date': 'Tue, 16 Sep 2025 19:19:29 GMT', 'content-length': '623', 'x-envoy-upstream-service-time': '118', 'server': 'envoy'})
HTTP response body: {"error":"Failed to create a new run: Failed to create a run: InternalServerError: Failed to store run sunrise 2025-09-16 19-19-29 to table: Error 3988 (HY000): Conversion from collation utf8_general_ci into utf8mb4_0900_ai_ci impossible for parameter","code":13,"message":"Failed to create a new run: Failed to create a run: InternalServerError: Failed to store run sunrise 2025-09-16 19-19-29 to table: Error 3988 (HY000): Conversion from collation utf8_general_ci into utf8mb4_0900_ai_ci impossible for parameter","details":[{"@type":"type.googleapis.com/google.rpc.Status","code":13,"message":"Internal Server Error"}]}
