# In [7]: (Jupyter notebook cell marker left over from export)
import kfp
from kfp import dsl

###########################################################################################
#####################################---1. STAGE---########################################
###########################################################################################
@dsl.component(
    base_image="docker.io/jhofydu/pytorch-kfp:v1.0.0",
    packages_to_install=["minio"]
)
def get_data_batch():
    """Build scaled train/test windows for one CPU% column and upload them to MinIO.

    Steps performed:

    1. Download the preprocessed CSV from MinIO and index it by ``timestamp``.
    2. Keep only the ``ac3-node-1-vm_cpu_pct`` column.
    3. Fit a ``MinMaxScaler`` on the rows that training windows can reach
       (not on the whole series, which would leak test-period min/max into
       training), then transform the full series with it.
    4. Save the fitted scaler to MinIO as a joblib ``.pkl`` object.
    5. Cut the scaled series into sliding windows, split them chronologically
       (no shuffling) and upload ``X_train``/``y_train``/``X_test``/``y_test``
       as ``.npy`` objects.

    Raises:
        ValueError: If the series is too short to produce at least one
            training window and one test window.
    """
    import math

    import pandas as pd
    import numpy as np
    import joblib
    from minio import Minio
    from io import BytesIO
    from sklearn.preprocessing import MinMaxScaler

    # Windowing / split settings. They are defined once because the scaler fit
    # range and the train/test boundary must be derived from the same numbers.
    window_size = 5
    horizon = 1
    test_size = 0.2

    # Minio client
    # NOTE(review): endpoint and credentials are hard-coded here; consider
    # injecting them from a Kubernetes secret instead.
    minio_client = Minio(
        "minio-service.kubeflow.svc.cluster.local:9000",
        access_key="minio",
        secret_key="minio123",
        secure=False,
    )

    def read_csv_from_minio(minio_client, bucket, object_name):
        """Download a CSV object and return it as a DataFrame sorted by timestamp."""
        response = minio_client.get_object(bucket, object_name)
        try:
            df = pd.read_csv(BytesIO(response.read()))
        finally:
            # Close the stream AND hand the connection back to the pool.
            response.close()
            response.release_conn()
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df.set_index('timestamp', inplace=True)
        df.sort_index(inplace=True)
        return df

    def count_train_sequences(n_rows, window_size, horizon, test_size):
        """Return how many of the sliding windows belong to the training split.

        The test share is rounded up, which is intended to reproduce the
        boundary of the previous ``train_test_split(..., shuffle=False)`` call.
        """
        n_sequences = n_rows - window_size - horizon + 1
        if n_sequences < 1:
            raise ValueError(
                f"Need at least {window_size + horizon} rows to build one window, got {n_rows}"
            )
        n_test = math.ceil(test_size * n_sequences)
        n_train = n_sequences - n_test
        if n_train < 1:
            raise ValueError(
                f"Only {n_sequences} window(s) available; not enough for a train/test split"
            )
        return n_train

    def normalize_with_sklearn(df, fit_rows=None):
        """Min-max scale ``df``; fit on the first ``fit_rows`` rows only (all rows if None)."""
        scaler = MinMaxScaler()
        scaler.fit(df if fit_rows is None else df.iloc[:fit_rows])
        df_scaled = pd.DataFrame(
            scaler.transform(df),
            columns=df.columns,
            index=df.index
        )
        return df_scaled, scaler

    def save_scaler_to_minio(minio_client, scaler, bucket, object_name):
        """Serialize ``scaler`` with joblib and upload it (``.pkl`` suffix enforced)."""
        if not object_name.endswith('.pkl'):
            object_name += '.pkl'
        scaler_bytes = BytesIO()
        joblib.dump(scaler, scaler_bytes)
        scaler_bytes.seek(0)
        minio_client.put_object(
            bucket_name=bucket,
            object_name=object_name,
            data=scaler_bytes,
            length=scaler_bytes.getbuffer().nbytes,
            content_type='application/octet-stream'
        )
        print(f"Scaler saved to s3://{bucket}/{object_name}")

    def create_sequences(data, window_size=5, horizon=1):
        """Return (X, y) with shapes (n, window_size, 1) and (n, horizon, 1)."""
        # Slice a plain array rather than a DataFrame: same values, far cheaper.
        values = np.asarray(data)
        n_sequences = len(values) - window_size - horizon + 1
        X = [values[i:i + window_size] for i in range(n_sequences)]
        y = [values[i + window_size:i + window_size + horizon] for i in range(n_sequences)]
        X = np.array(X).reshape(-1, window_size, 1)
        y = np.array(y).reshape(-1, horizon, 1)
        return X, y

    def split_time_series(X, y, n_train):
        """Chronological split: the first ``n_train`` windows train, the rest test."""
        return X[:n_train], X[n_train:], y[:n_train], y[n_train:]

    def upload_numpy_to_minio(client, array, bucket_name, object_name):
        """Upload ``array`` in ``.npy`` format to the given bucket/object."""
        buffer = BytesIO()
        np.save(buffer, array)
        buffer.seek(0)
        client.put_object(
            bucket_name=bucket_name,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
            content_type="application/octet-stream"
        )
        print(f"Uploaded to minio://{bucket_name}/{object_name} ({array.shape})")

    bucket_name = "k8s-resources-forecast"
    object_name = "data/k8s-preprocessed/prec-pct-k8s-dataset.csv"
    df = read_csv_from_minio(minio_client, bucket_name, object_name)

    selected_feature = 'ac3-node-1-vm_cpu_pct'
    df_node_1_cpu_feature = df[[selected_feature]]

    # Decide the split boundary BEFORE scaling so the scaler never sees test rows.
    n_train = count_train_sequences(
        len(df_node_1_cpu_feature), window_size, horizon, test_size
    )
    # The last training window (index n_train - 1) reads rows up to
    # n_train - 1 + window_size + horizon - 1, hence this many rows in total.
    n_fit_rows = n_train + window_size + horizon - 1

    df_node_1_cpu_feature_scaled, scaler = normalize_with_sklearn(
        df_node_1_cpu_feature, fit_rows=n_fit_rows
    )

    object_name_scaler = "data/k8s-preprocessed/node-1-cpu-scaler/node-1-scaler"
    save_scaler_to_minio(minio_client, scaler, bucket_name, object_name_scaler)

    X, y = create_sequences(df_node_1_cpu_feature_scaled, window_size, horizon)
    X_train, X_test, y_train, y_test = split_time_series(X, y, n_train)

    object_names = {
        "X_train": "data/k8s-preprocessed/node-1-X_train/X_train.npy",
        "y_train": "data/k8s-preprocessed/node-1-y_train/y_train.npy",
        "X_test":  "data/k8s-preprocessed/node-1-X_test/X_test.npy",
        "y_test":  "data/k8s-preprocessed/node-1-y_test/y_test.npy",
    }

    upload_numpy_to_minio(minio_client, X_train, bucket_name, object_names["X_train"])
    upload_numpy_to_minio(minio_client, y_train, bucket_name, object_names["y_train"])
    upload_numpy_to_minio(minio_client, X_test,  bucket_name, object_names["X_test"])
    upload_numpy_to_minio(minio_client, y_test,  bucket_name, object_names["y_test"])


###########################################################################################
#####################################---2. STAGE---########################################
###########################################################################################

@dsl.component(
    base_image="docker.io/jhofydu/pytorch-kfp:v1.0.0",
    packages_to_install=["minio", "mlflow==2.21.3"]
)
def model_building():
    """Train an LSTM CPU% forecaster on the arrays stored in MinIO and log it to MLflow.

    The component downloads ``X_train``/``y_train`` and the test arrays (used
    here as the validation set) from MinIO, trains ``LSTMForecaster`` with
    early stopping on the validation loss, restores the best weights, and logs
    parameters, validation metrics (MAE, RMSE, R2), three diagnostic plots and
    the model itself to the configured MLflow tracking server.

    Any failure to download an input array is raised immediately instead of
    being swallowed, so the step fails with the real cause.
    """

    # 1. Imports, MinIO client creation and download function

    import copy
    import datetime
    import io

    import numpy as np
    import torch
    from torch.utils.data import TensorDataset, DataLoader
    import torch.nn as nn
    import mlflow
    import mlflow.pytorch  # imported explicitly because mlflow.pytorch.log_model is used below
    import matplotlib.pyplot as plt
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from minio import Minio

    def download_numpy_from_minio(minio_client, bucket, object_name):
        """Download a ``.npy`` object and return it as a NumPy array.

        Errors from MinIO or from ``np.load`` propagate to the caller.
        """
        response = minio_client.get_object(bucket, object_name)
        try:
            arr = np.load(io.BytesIO(response.read()))
        finally:
            # Close the stream AND hand the connection back to the pool.
            response.close()
            response.release_conn()
        print(f"Downloaded: s3://{bucket}/{object_name} shape={arr.shape}")
        return arr

    # Minio client
    # NOTE(review): endpoint and credentials are hard-coded here; consider
    # injecting them from a Kubernetes secret instead.
    minio_client = Minio(
        "minio-service.kubeflow.svc.cluster.local:9000",
        access_key="minio",
        secret_key="minio123",
        secure=False,
    )
    # Set the MLflow tracking URI inside the k8s cluster.
    # This must happen before any mlflow.start_run().
    mlflow.set_tracking_uri("http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080")
    mlflow.set_experiment("k8s-cpu-forecasting")

    # 2: Load train/validation sets from MinIO
    bucket_name = "k8s-resources-forecast"
    object_names = {
        "X_train": "data/k8s-preprocessed/node-1-X_train/X_train.npy",
        "y_train": "data/k8s-preprocessed/node-1-y_train/y_train.npy",
        "X_val":   "data/k8s-preprocessed/node-1-X_test/X_test.npy",
        "y_val":   "data/k8s-preprocessed/node-1-y_test/y_test.npy",
    }

    X_train = download_numpy_from_minio(minio_client, bucket_name, object_names["X_train"])
    y_train = download_numpy_from_minio(minio_client, bucket_name, object_names["y_train"])
    X_val   = download_numpy_from_minio(minio_client, bucket_name, object_names["X_val"])
    y_val   = download_numpy_from_minio(minio_client, bucket_name, object_names["y_val"])

    print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
    print("X_val shape:", X_val.shape, "y_val shape:", y_val.shape)

    # 3: Build PyTorch DataLoaders
    BATCH_SIZE = 32

    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                                  torch.tensor(y_train, dtype=torch.float32))
    val_dataset   = TensorDataset(torch.tensor(X_val,   dtype=torch.float32),
                                  torch.tensor(y_val,   dtype=torch.float32))

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False)

    # 4: Define LSTM model
    class LSTMForecaster(nn.Module):
        """LSTM followed by a linear head applied to the last time step."""

        def __init__(self, input_size=1, hidden_size=64, num_layers=2, output_size=1, dropout=0.0):
            super().__init__()
            self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
            self.fc = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            out, _ = self.lstm(x)
            out = self.fc(out[:, -1, :])  # keep only the last time step
            return out.unsqueeze(1)  # (batch, horizon, 1)

    # 5: Training loop with early stopping and MLflow logging

    def train_model_with_early_stopping(
        train_loader, val_loader, input_size=1, hidden_size=64, num_layers=2,
        lr=0.001, epochs=35, patience=5, dropout=0.0, model_name="cpu-pct", run_name=None
    ):
        """Train the forecaster inside one MLflow run.

        Training stops early once the validation loss has not improved for
        ``patience`` consecutive epochs. The weights of the best epoch are
        restored before the final metrics, plots and model are logged.

        Returns:
            ``(model, (mae, rmse, r2))`` computed on the validation loader.
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = LSTMForecaster(input_size, hidden_size, num_layers, output_size=1, dropout=dropout).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        criterion = nn.MSELoss()
        best_val_loss = float('inf')
        best_model = None
        wait = 0
        epochs_run = 0
        train_losses, val_losses = [], []

        with mlflow.start_run(run_name=run_name):
            mlflow.log_params({
                "input_size": input_size, "hidden_size": hidden_size,
                "num_layers": num_layers, "lr": lr, "epochs": epochs,
                "batch_size": BATCH_SIZE, "dropout": dropout, "patience": patience
            })

            for epoch in range(epochs):
                epochs_run = epoch + 1

                model.train()
                running_loss = 0
                for xb, yb in train_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    optimizer.zero_grad()
                    out = model(xb)
                    loss = criterion(out, yb)
                    loss.backward()
                    optimizer.step()
                    running_loss += loss.item()
                train_loss = running_loss / len(train_loader)
                train_losses.append(train_loss)

                model.eval()
                val_running_loss = 0
                with torch.no_grad():
                    for xb, yb in val_loader:
                        xb, yb = xb.to(device), yb.to(device)
                        out = model(xb)
                        loss = criterion(out, yb)
                        val_running_loss += loss.item()
                val_loss = val_running_loss / len(val_loader)
                val_losses.append(val_loss)

                print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.5f} | Val Loss: {val_loss:.5f}")
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    # state_dict() hands back references to the live tensors, so
                    # snapshot them; otherwise later epochs overwrite the "best".
                    best_model = copy.deepcopy(model.state_dict())
                    wait = 0
                else:
                    wait += 1
                    if wait >= patience:
                        print("Early stopping triggered!")
                        break

            # Number of epochs actually executed, with or without early stopping.
            mlflow.log_metric("epoch_actual", epochs_run)

            # Restore the best weights seen during training.
            if best_model is not None:
                model.load_state_dict(best_model)

            # Final metrics
            model.eval()
            preds, targets = [], []
            with torch.no_grad():
                for xb, yb in val_loader:
                    out = model(xb.to(device)).cpu().numpy()
                    preds.append(out)
                    targets.append(yb.cpu().numpy())
            preds = np.concatenate(preds).reshape(-1)
            targets = np.concatenate(targets).reshape(-1)

            mae  = mean_absolute_error(targets, preds)
            # RMSE via sqrt(MSE) so it does not depend on the `squared=` keyword.
            rmse = np.sqrt(mean_squared_error(targets, preds))
            r2   = r2_score(targets, preds)
            mlflow.log_metric("val_mae", mae)
            mlflow.log_metric("val_rmse", rmse)
            mlflow.log_metric("val_r2", r2)

            # --- Plots
            # Figures are written under /tmp, which is expected to be writable
            # inside the component container, then attached to the MLflow run.
            plt.figure(figsize=(10,4))
            plt.plot(targets, label="True")
            plt.plot(preds, label="Predicted")
            plt.legend()
            plt.title("True vs. Predicted CPU% (Validation)")
            plt.tight_layout()
            plt.savefig("/tmp/true_vs_pred.png")
            plt.close()
            mlflow.log_artifact("/tmp/true_vs_pred.png")

            plt.figure(figsize=(10,4))
            plt.plot(preds - targets)
            plt.title("Residuals Over Time")
            plt.xlabel("Time")
            plt.ylabel("Residual (Pred - True)")
            plt.tight_layout()
            plt.savefig("/tmp/residuals.png")
            plt.close()
            mlflow.log_artifact("/tmp/residuals.png")

            plt.figure()
            plt.plot(train_losses, label="Train Loss")
            plt.plot(val_losses, label="Val Loss")
            plt.xlabel("Epoch")
            plt.ylabel("Loss")
            plt.title("Learning Curve")
            plt.legend()
            plt.tight_layout()
            plt.savefig("/tmp/learning_curve.png")
            plt.close()
            mlflow.log_artifact("/tmp/learning_curve.png")

            mlflow.pytorch.log_model(model, model_name)
            print("Model + artifacts logged in MLflow.")

        return model, (mae, rmse, r2)

    # 6: Run the training & logging
    EPOCHS = 35
    PATIENCE = 5

    model, metrics = train_model_with_early_stopping(
        train_loader, val_loader,
        input_size=X_train.shape[-1],
        hidden_size=64,
        num_layers=2,
        lr=0.001,
        epochs=EPOCHS,
        patience=PATIENCE,
        dropout=0.1,
        model_name="cpu-node-1-pct-model",
        run_name=datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S-") + "cpu-node-1-forecast"
    )

    print(f"Final MAE: {metrics[0]:.4f} | RMSE: {metrics[1]:.4f} | R2: {metrics[2]:.4f}")






###########################################################################################
#####################################---Connecting Stages---###############################
###########################################################################################

@dsl.pipeline(name="Sunrise", description="Generate models to forecast cpu% in k8s")
def sunrise():
    # Stage 1: data preparation; its result may be reused from the KFP cache.
    prepare_data = get_data_batch()
    prepare_data.set_caching_options(True)

    # Stage 2: model training; ordered after stage 1 and never served from cache.
    train_model = model_building()
    train_model.after(prepare_data)
    train_model.set_caching_options(False)






###########################################################################################
#####################################---Running Pipeline---################################
###########################################################################################
if __name__ == '__main__':
    # Submit the `sunrise` pipeline as a one-off run when executed as a script.
    import kfp

    # Client() is created with default settings; the pipeline takes no arguments.
    kfp.Client().create_run_from_pipeline_func(
        sunrise,
        experiment_name="cpu-forecasting",
        arguments={},
    )
