# In [None]:
import kfp
from kfp import dsl
from typing import NamedTuple
import kfp.components as comp

################################################################################################################################################################################
@dsl.component(
    base_image="docker.io/jhofydu/pytorch-kfp:v1.0.0",
    packages_to_install=["minio"]
)
def get_data_batch():
    """Prepare the node-1 CPU% training data and publish the artifacts to MinIO.

    Steps performed:
      1. Read the preprocessed CSV from MinIO and index it by ``timestamp``.
      2. Keep only the ``ac3-node-1-vm_cpu_pct`` column and min-max scale it.
      3. Save the fitted scaler to MinIO as a joblib ``.pkl`` object.
      4. Build sliding windows (5 past steps -> 1 target step) and split them
         chronologically into 80% train / 20% test.
      5. Upload ``X_train``, ``y_train``, ``X_test`` and ``y_test`` as ``.npy``.

    Imports and helpers are kept inside the function body because a KFP
    lightweight component ships only this function's source to the container.

    Raises:
        KeyError: if the CSV lacks the ``timestamp`` or selected feature column.
        Exception: any MinIO error raised while reading or writing objects.
    """
    from io import BytesIO

    import joblib
    import numpy as np
    import pandas as pd
    from minio import Minio
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    bucket_name = "k8s-resources-forecast"
    # Object keys are bucket-relative; no leading "/" (consistent with the
    # keys used for every upload below).
    dataset_object = "data/k8s-preprocessed/prec-pct-k8s-dataset.csv"
    scaler_object = "data/k8s-preprocessed/node-1-cpu-scaler/node-1-scaler"
    selected_feature = 'ac3-node-1-vm_cpu_pct'
    object_names = {
        "X_train": "data/k8s-preprocessed/node-1-X_train/X_train.npy",
        "y_train": "data/k8s-preprocessed/node-1-y_train/y_train.npy",
        "X_test":  "data/k8s-preprocessed/node-1-X_test/X_test.npy",
        "y_test":  "data/k8s-preprocessed/node-1-y_test/y_test.npy",
    }

    # NOTE(review): endpoint and credentials are hard-coded; consider injecting
    # them from a Kubernetes secret / environment variables instead.
    minio_client = Minio(
        "minio-service.kubeflow.svc.cluster.local:9000",
        access_key="minio",
        secret_key="minio123",
        secure=False,
    )

    def read_csv_from_minio(minio_client, bucket, object_name):
        """Load a CSV object into memory and return it sorted by its timestamp index."""
        response = minio_client.get_object(bucket, object_name)
        try:
            df = pd.read_csv(BytesIO(response.read()))
        finally:
            # Close the stream and hand the connection back to the pool.
            response.close()
            response.release_conn()
        # A datetime index enables time-based filtering, resampling and rolling windows.
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df.set_index('timestamp').sort_index()

    def normalize_with_sklearn(df):
        """Min-max scale every column; return the scaled frame and the fitted scaler."""
        scaler = MinMaxScaler()
        df_scaled = pd.DataFrame(
            scaler.fit_transform(df),
            columns=df.columns,
            index=df.index,
        )
        return df_scaled, scaler  # the scaler is needed later to invert predictions

    def save_scaler_to_minio(minio_client, scaler, bucket, object_name):
        """Serialize the scaler with joblib and upload it, forcing a ``.pkl`` suffix."""
        if not object_name.endswith('.pkl'):
            object_name += '.pkl'

        scaler_bytes = BytesIO()
        joblib.dump(scaler, scaler_bytes)
        scaler_bytes.seek(0)  # rewind so put_object reads from the start

        minio_client.put_object(
            bucket_name=bucket,
            object_name=object_name,
            data=scaler_bytes,
            length=scaler_bytes.getbuffer().nbytes,
            content_type='application/octet-stream'
        )
        print(f"Scaler saved to s3://{bucket}/{object_name}")

    def create_sequences(data, window_size=5, horizon=1):
        """Create sliding-window samples from a single-feature time series.

        Args:
            data: Array-like of shape (n_samples, 1); DataFrames are accepted.
            window_size: Number of past steps used as input.
            horizon: Number of steps ahead to predict.

        Returns:
            X of shape (samples, window_size, 1) and y of shape (samples, horizon, 1).
        """
        # Convert once so each window is a cheap ndarray slice.
        values = np.asarray(data)
        n_windows = len(values) - window_size - horizon + 1
        X = [values[i:i + window_size] for i in range(n_windows)]
        y = [values[i + window_size:i + window_size + horizon] for i in range(n_windows)]
        X = np.array(X).reshape(-1, window_size, 1)
        y = np.array(y).reshape(-1, horizon, 1)
        return X, y

    def split_time_series(X, y, test_size=0.2):
        """Split chronologically (no shuffling) so the test set is the most recent data."""
        return train_test_split(X, y, test_size=test_size, shuffle=False)

    def upload_numpy_to_minio(client, array, bucket_name, object_name):
        """Upload a numpy array to MinIO as ``.npy`` directly from memory."""
        buffer = BytesIO()
        np.save(buffer, array)
        buffer.seek(0)
        client.put_object(
            bucket_name=bucket_name,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
            content_type="application/octet-stream"
        )
        print(f"Uploaded to minio://{bucket_name}/{object_name} ({array.shape})")

    # --- Load and select the single feature (kept as a DataFrame, not a Series) ---
    df = read_csv_from_minio(minio_client, bucket_name, dataset_object)
    df_node_1_cpu_feature = df[[selected_feature]]

    # --- Scale and persist the scaler ---
    # NOTE(review): the scaler is fitted on the full series before the
    # train/test split, so test-set min/max influence the scaling — confirm
    # this is intended.
    df_node_1_cpu_feature_scaled, scaler = normalize_with_sklearn(df_node_1_cpu_feature)
    save_scaler_to_minio(minio_client, scaler, bucket_name, scaler_object)

    # --- Window, split and upload ---
    X, y = create_sequences(df_node_1_cpu_feature_scaled)
    X_train, X_test, y_train, y_test = split_time_series(X, y)

    arrays = {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
    }
    for key, array in arrays.items():
        upload_numpy_to_minio(minio_client, array, bucket_name, object_names[key])
    
################################################################################################################################################################################






################################################################################################################################################################################

@dsl.pipeline(name="Sunrise", description="Generate models to forecast cpu% in k8s")
def sunrise():
    """Single-step pipeline: run the data-preparation component with caching enabled."""
    data_prep_task = get_data_batch()
    data_prep_task.set_caching_options(True)


################################################################################################################################################################################
# Pipeline execution 
if __name__ == '__main__':
    import kfp

    # Client is created with no explicit host or credentials.
    client = kfp.Client()
    # Submit one run of the pipeline (it takes no arguments) under the
    # "cpu-forecasting" experiment.
    client.create_run_from_pipeline_func(
        sunrise,
        arguments={},
        experiment_name="cpu-forecasting",
    )



# In [3]:
import kfp
from kfp import dsl

@dsl.component(
    base_image="docker.io/jhofydu/pytorch-kfp:v1.0.0",
    packages_to_install=["minio"]
)
def get_data_batch():
    """Build the node-1 CPU% train/test arrays and store them, plus the scaler, in MinIO.

    The component reads the preprocessed CSV from the ``k8s-resources-forecast``
    bucket, keeps the ``ac3-node-1-vm_cpu_pct`` column, min-max scales it,
    uploads the fitted scaler as a joblib ``.pkl``, creates sliding windows
    (5 input steps, 1 target step), splits them 80/20 without shuffling and
    uploads the four resulting arrays as ``.npy`` objects.

    Everything the component needs is imported and defined inside the function
    because a KFP lightweight component ships only this function's source.

    Raises:
        KeyError: if the CSV lacks the ``timestamp`` or selected feature column.
        Exception: any MinIO error raised while reading or writing objects.
    """
    from io import BytesIO

    import joblib
    import numpy as np
    import pandas as pd
    from minio import Minio
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    # NOTE(review): endpoint and credentials are hard-coded; consider sourcing
    # them from a Kubernetes secret / environment variables.
    minio_client = Minio(
        "minio-service.kubeflow.svc.cluster.local:9000",
        access_key="minio",
        secret_key="minio123",
        secure=False,
    )

    def read_csv_from_minio(minio_client, bucket, object_name):
        """Read a CSV object fully into memory and return it sorted by timestamp index."""
        response = minio_client.get_object(bucket, object_name)
        try:
            df = pd.read_csv(BytesIO(response.read()))
        finally:
            # Close the stream and return the connection to the pool.
            response.close()
            response.release_conn()
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df.set_index('timestamp').sort_index()

    def normalize_with_sklearn(df):
        """Return a min-max scaled copy of ``df`` together with the fitted scaler."""
        scaler = MinMaxScaler()
        df_scaled = pd.DataFrame(
            scaler.fit_transform(df),
            columns=df.columns,
            index=df.index,
        )
        return df_scaled, scaler

    def save_scaler_to_minio(minio_client, scaler, bucket, object_name):
        """Dump the scaler with joblib into memory and upload it with a ``.pkl`` suffix."""
        if not object_name.endswith('.pkl'):
            object_name += '.pkl'
        scaler_bytes = BytesIO()
        joblib.dump(scaler, scaler_bytes)
        scaler_bytes.seek(0)  # rewind so put_object reads from the start
        minio_client.put_object(
            bucket_name=bucket,
            object_name=object_name,
            data=scaler_bytes,
            length=scaler_bytes.getbuffer().nbytes,
            content_type='application/octet-stream'
        )
        print(f"Scaler saved to s3://{bucket}/{object_name}")

    def create_sequences(data, window_size=5, horizon=1):
        """Turn a single-feature series into (X, y) sliding-window samples.

        Args:
            data: Array-like of shape (n_samples, 1); DataFrames are accepted.
            window_size: Number of past steps per input sample.
            horizon: Number of future steps per target.

        Returns:
            X of shape (samples, window_size, 1) and y of shape (samples, horizon, 1).
        """
        # Work on a plain ndarray so each window is a cheap slice rather than
        # a freshly constructed DataFrame.
        values = np.asarray(data)
        n_windows = len(values) - window_size - horizon + 1
        X = [values[i:i + window_size] for i in range(n_windows)]
        y = [values[i + window_size:i + window_size + horizon] for i in range(n_windows)]
        X = np.array(X).reshape(-1, window_size, 1)
        y = np.array(y).reshape(-1, horizon, 1)
        return X, y

    def split_time_series(X, y, test_size=0.2):
        """Chronological split: the last ``test_size`` fraction becomes the test set."""
        return train_test_split(X, y, test_size=test_size, shuffle=False)

    def upload_numpy_to_minio(client, array, bucket_name, object_name):
        """Serialize ``array`` as ``.npy`` in memory and upload it to MinIO."""
        buffer = BytesIO()
        np.save(buffer, array)
        buffer.seek(0)
        client.put_object(
            bucket_name=bucket_name,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
            content_type="application/octet-stream"
        )
        print(f"Uploaded to minio://{bucket_name}/{object_name} ({array.shape})")

    bucket_name = "k8s-resources-forecast"
    object_name = "data/k8s-preprocessed/prec-pct-k8s-dataset.csv"
    df = read_csv_from_minio(minio_client, bucket_name, object_name)

    # Double brackets keep the selection a one-column DataFrame (not a Series).
    selected_feature = 'ac3-node-1-vm_cpu_pct'
    df_node_1_cpu_feature = df[[selected_feature]]

    # NOTE(review): the scaler is fitted on the full series before the
    # train/test split, so test-set min/max influence the scaling — confirm
    # this is intended.
    df_node_1_cpu_feature_scaled, scaler = normalize_with_sklearn(df_node_1_cpu_feature)

    object_name_scaler = "data/k8s-preprocessed/node-1-cpu-scaler/node-1-scaler"
    save_scaler_to_minio(minio_client, scaler, bucket_name, object_name_scaler)

    X, y = create_sequences(df_node_1_cpu_feature_scaled)
    X_train, X_test, y_train, y_test = split_time_series(X, y)

    object_names = {
        "X_train": "data/k8s-preprocessed/node-1-X_train/X_train.npy",
        "y_train": "data/k8s-preprocessed/node-1-y_train/y_train.npy",
        "X_test":  "data/k8s-preprocessed/node-1-X_test/X_test.npy",
        "y_test":  "data/k8s-preprocessed/node-1-y_test/y_test.npy",
    }

    upload_numpy_to_minio(minio_client, X_train, bucket_name, object_names["X_train"])
    upload_numpy_to_minio(minio_client, y_train, bucket_name, object_names["y_train"])
    upload_numpy_to_minio(minio_client, X_test,  bucket_name, object_names["X_test"])
    upload_numpy_to_minio(minio_client, y_test,  bucket_name, object_names["y_test"])


@dsl.pipeline(name="Sunrise", description="Generate models to forecast cpu% in k8s")
def sunrise():
    """Single-step pipeline: run the data-preparation component with caching disabled."""
    batch_task = get_data_batch()
    # Caching is off, so the component is re-executed on every run.
    batch_task.set_caching_options(False)


if __name__ == '__main__':
    import kfp

    # Client is created with no explicit host or credentials; the pipeline
    # takes no arguments and the run is filed under "cpu-forecasting".
    client = kfp.Client()
    client.create_run_from_pipeline_func(sunrise, arguments={}, experiment_name="cpu-forecasting")
