### 1.Stage "Split Data into Train and Test Sets"

In [99]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import joblib
from minio import Minio
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from io import BytesIO


In [12]:
# "Create Minio Client and Reads Data"
df minio_client()

minio_client = Minio(
    "minio-service.kubeflow.svc.cluster.local:9000",
    access_key="minio",
    secret_key="minio123",
    secure=False,
)


In [29]:
#  Function to Get the object as a stream (not saving to file in the pod)
def read_csv_from_minio(minio_client, bucket, object_name):
    with minio_client.get_object(bucket, object_name) as response:
        df = pd.read_csv(BytesIO(response.read()))
    # Parse timestamp column, enabling efficient filtering, resampling, rolling windows
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.set_index('timestamp', inplace=True)
    df.sort_index(inplace=True)

    return df


In [55]:
# Load data 
bucket_name = "k8s-resources-forecast"
object_name = "data/k8s-preprocessed/prec-pct-k8s-dataset.csv"
df = read_csv_from_minio(minio_client, bucket_name, object_name)

df.head(5)


Unnamed: 0_level_0,ac3-master-vm_cpu_pct,ac3-master-vm_mem_pct,ac3-node-1-vm_cpu_pct,ac3-node-1-vm_mem_pct,ac3-node-2-vm_cpu_pct,ac3-node-2-vm_mem_pct,ac3-node-3-vm_cpu_pct,ac3-node-3-vm_mem_pct
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-05-08 18:42:05,4,49,3,17,3,20,6,27
2025-05-08 18:42:35,4,49,3,17,3,20,6,27
2025-05-08 18:43:05,4,49,3,17,3,20,6,26
2025-05-08 18:43:35,4,49,3,17,3,20,6,26
2025-05-08 18:44:05,4,49,3,17,3,20,6,27


In [60]:
# Select only the feature to train my model 'ac3-node-1-vm_cpu_pct'

selected_feature = 'ac3-node-1-vm_cpu_pct'
df_node_1_cpu_feature = df[[selected_feature]]  # Keep as DataFrame (not Series)
df_node_1_cpu_feature.head()

Unnamed: 0_level_0,ac3-node-1-vm_cpu_pct
timestamp,Unnamed: 1_level_1
2025-05-08 18:42:05,3
2025-05-08 18:42:35,3
2025-05-08 18:43:05,3
2025-05-08 18:43:35,3
2025-05-08 18:44:05,3


In [61]:
# Fuction to Normalize teh dataset, the timestamp is not sccaled since it is the index. Values are from (0-100[%])
def normalize_with_sklearn(df):
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df),
        columns=df.columns,
        index=df.index
    )
    return df_scaled, scaler  # return scaler if you want to inverse later

In [64]:
df_node_1_cpu_feature_scaled, scaler = normalize_with_sklearn(df_node_1_cpu_feature)
df_node_1_cpu_feature_scaled.head()

Unnamed: 0_level_0,ac3-node-1-vm_cpu_pct
timestamp,Unnamed: 1_level_1
2025-05-08 18:42:05,0.021739
2025-05-08 18:42:35,0.021739
2025-05-08 18:43:05,0.021739
2025-05-08 18:43:35,0.021739
2025-05-08 18:44:05,0.021739


In [65]:
def save_scaler_to_minio(minio_client, scaler, bucket, object_name):
    if not object_name.endswith('.pkl'):
        object_name += '.pkl'  # Ensure the extension

    # Serialize the scaler to bytes
    scaler_bytes = BytesIO()
    joblib.dump(scaler, scaler_bytes)
    scaler_bytes.seek(0)  # Rewind to the beginning

    # Upload to MinIO
    minio_client.put_object(
        bucket_name=bucket,
        object_name=object_name,
        data=scaler_bytes,
        length=len(scaler_bytes.getvalue()),
        content_type='application/octet-stream'
    )

    print(f"Scaler saved to s3://{bucket}/{object_name}")


In [73]:
#Upload Scaler to Minio as .pkl
bucket_name = "k8s-resources-forecast"
object_name = "data/k8s-preprocessed/node-1-cpu-scaler/node-1-scaler"

save_scaler_to_minio(minio_client, scaler, bucket_name, object_name)

Scaler saved to s3://k8s-resources-forecast/data/k8s-preprocessed/node-1-cpu-scaler/node-1-scaler.pkl


In [74]:
# ===========================================
#    Fucntion Creating sliding window sequences
#    Window size = 5, Forecast horizon = 1
# ===========================================
def create_sequences(data, window_size=5, horizon=1):
    """
    Create sliding window sequences from time series data.

    Args:
        data (np.ndarray): Shape (n_samples, 1)
        window_size (int): Number of past steps as input
        horizon (int): Steps ahead to predict (usually 1)

    Returns:
        X (np.ndarray): Shape (samples, window_size, 1)
        y (np.ndarray): Shape (samples, horizon, 1)
    """
    X, y = [], []
    for i in range(len(data) - window_size - horizon + 1):
        X.append(data[i:(i + window_size)])
        y.append(data[i + window_size:i + window_size + horizon])
    
    X = np.array(X).reshape(-1, window_size, 1)
    y = np.array(y).reshape(-1, horizon, 1)
    return X, y


# Function Splitting data into train and test sets
#    (80% train, 20% test)
# ===========================================
def split_time_series(X, y, test_size=0.2):
    #Split time series without shuffling.
    return train_test_split(X, y, test_size=test_size, shuffle=False)

In [78]:
# Obtaining my Train and test sets
X,y = create_sequences(df_node_1_cpu_feature_scaled)
X_train, X_test, y_train, y_test = split_time_series(X, y)


In [102]:
import io
import numpy as np

def upload_numpy_to_minio(client, array, bucket_name, object_name):
    """
    Uploads a numpy array to MinIO directly from memory as .npy.
    """
    buffer = io.BytesIO()
    np.save(buffer, array)
    buffer.seek(0)
    client.put_object(
        bucket_name=bucket_name,
        object_name=object_name,
        data=buffer,
        length=buffer.getbuffer().nbytes,
        content_type="application/octet-stream"
    )
    print(f"Uploaded to minio://{bucket_name}/{object_name} ({array.shape})")


In [104]:
# Uploading ot Minio

bucket_name = "k8s-resources-forecast"

# These are your desired object paths for each array
object_names = {
    "X_train": "data/k8s-preprocessed/node-1-X_train/X_train.npy",
    "y_train": "data/k8s-preprocessed/node-1-y_train/y_train.npy",
    "X_test":  "data/k8s-preprocessed/node-1-X_test/X_test.npy",
    "y_test":  "data/k8s-preprocessed/node-1-y_test/y_test.npy",
}


# Upload each array
upload_numpy_to_minio(minio_client, X_train, bucket_name, object_names["X_train"])
upload_numpy_to_minio(minio_client, y_train, bucket_name, object_names["y_train"])
upload_numpy_to_minio(minio_client, X_test,  bucket_name, object_names["X_test"])
upload_numpy_to_minio(minio_client, y_test,  bucket_name, object_names["y_test"])


Uploaded to minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-X_train/X_train.npy ((8238, 5, 1))
Uploaded to minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-y_train/y_train.npy ((8238, 1, 1))
Uploaded to minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-X_test/X_test.npy ((2060, 5, 1))
Uploaded to minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-y_test/y_test.npy ((2060, 1, 1))


# Testing my dataset and sets uploaded

In [92]:
#Exploring my A and  y sets
print("X_train.shape:", type(X_train))
print("y_train.shape:", type(y_train))
print("X_test.shape:", type(X_test))
print("y_test.shape:", type(y_test), "\n")

print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)
print("X_test.shape:", X_test.shape)
print("y_test.shape:", y_test.shape)

# Let's see the first few samples (flatten to 1D for readability)
print("\nFirst training input window (X_train[0]):", X_train[0].flatten())
print("First training target (y_train[0]):", y_train[0].flatten())

print("\nLast training input window (X_train[-1]):", X_train[-1].flatten())
print("Last training target (y_train[-1]):", y_train[-1].flatten())

print("\nFirst test input window (X_test[0]):", X_test[0].flatten())
print("First test target (y_test[0]):", y_test[0].flatten())

print("\nLast test input window (X_test[-1]):", X_test[-1].flatten())
print("Last test target (y_test[-1]):", y_test[-1].flatten())

print("First 5 input windows (X_train):")
display(X_train)
print("\nCorresponding targets (y_train):")
display(y_train)


X_train.shape: <class 'numpy.ndarray'>
y_train.shape: <class 'numpy.ndarray'>
X_test.shape: <class 'numpy.ndarray'>
y_test.shape: <class 'numpy.ndarray'> 

X_train.shape: (8238, 5, 1)
y_train.shape: (8238, 1, 1)
X_test.shape: (2060, 5, 1)
y_test.shape: (2060, 1, 1)

First training input window (X_train[0]): [0.02173913 0.02173913 0.02173913 0.02173913 0.02173913]
First training target (y_train[0]): [0.02173913]

Last training input window (X_train[-1]): [0.22826087 0.19565217 0.22826087 0.25       0.20652174]
Last training target (y_train[-1]): [0.22826087]

First test input window (X_test[0]): [0.19565217 0.22826087 0.25       0.20652174 0.22826087]
First test target (y_test[0]): [0.23913043]

Last test input window (X_test[-1]): [0.06521739 0.08695652 0.08695652 0.07608696 0.10869565]
Last test target (y_test[-1]): [0.02173913]
First 5 input windows (X_train):


array([[[0.02173913],
        [0.02173913],
        [0.02173913],
        [0.02173913],
        [0.02173913]],

       [[0.02173913],
        [0.02173913],
        [0.02173913],
        [0.02173913],
        [0.02173913]],

       [[0.02173913],
        [0.02173913],
        [0.02173913],
        [0.02173913],
        [0.02173913]],

       ...,

       [[0.25      ],
        [0.20652174],
        [0.22826087],
        [0.19565217],
        [0.22826087]],

       [[0.20652174],
        [0.22826087],
        [0.19565217],
        [0.22826087],
        [0.25      ]],

       [[0.22826087],
        [0.19565217],
        [0.22826087],
        [0.25      ],
        [0.20652174]]])


Corresponding targets (y_train):


array([[[0.02173913]],

       [[0.02173913]],

       [[0.02173913]],

       ...,

       [[0.25      ]],

       [[0.20652174]],

       [[0.22826087]]])

In [106]:
# dowloading from minio my tran and test sets

def download_numpy_from_minio(client, bucket_name, object_name):
    """
    Download a numpy array from MinIO directly into memory.
    Returns the loaded numpy array.
    """
    try:
        response = client.get_object(bucket_name, object_name)
        data = response.read()  # Read bytes
        array = np.load(io.BytesIO(data))
        response.close()
        response.release_conn()
        print(f"Downloaded from minio://{bucket_name}/{object_name} (shape: {array.shape})")
        return array
    except Exception as e:
        print(f"Failed to download {object_name} from MinIO: {e}")
        return None

# Test download each array
X_train_dl = download_numpy_from_minio(minio_client, bucket_name, object_names["X_train"])
y_train_dl = download_numpy_from_minio(minio_client, bucket_name, object_names["y_train"])
X_test_dl  = download_numpy_from_minio(minio_client, bucket_name, object_names["X_test"])
y_test_dl  = download_numpy_from_minio(minio_client, bucket_name, object_names["y_test"])

# Check they are equal to originals (optional, for assurance)
print(np.allclose(X_train, X_train_dl))  # Should print True
print(np.allclose(y_train, y_train_dl))  # Should print True
print(np.allclose(X_test,  X_test_dl))   # Should print True
print(np.allclose(y_test,  y_test_dl))   # Should print True


Downloaded from minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-X_train/X_train.npy (shape: (8238, 5, 1))
Downloaded from minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-y_train/y_train.npy (shape: (8238, 1, 1))
Downloaded from minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-X_test/X_test.npy (shape: (2060, 5, 1))
Downloaded from minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-y_test/y_test.npy (shape: (2060, 1, 1))
True
True
True
True
