# Exploratory model analysis

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt
import mlflow
from mlflow.tracking import MlflowClient
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import io
import os
from minio import Minio
from io import BytesIO



# 1.  Configuration for Local MLflow Tracking and MinIO

In [3]:
# "Create Minio Client and Reads Data"

minio_client = Minio(
    "minio-service.kubeflow.svc.cluster.local:9000",
    access_key="minio",
    secret_key="minio123",
    secure=False,
    )

# ------------------------
# Configuration for MLflow Tracking
# ------------------------
# --- MLflow Tracking Config ---
TRACKING_URI = "http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080"
EXPERIMENT_NAME = "k8s-cpu-forecasting"
REGISTERED_MODEL_NAME = "cpu-pct"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
client = MlflowClient()

# 2. Early Stopping Utility

'''
Remembers the best validation loss seen so far.

If the current validation loss improves (by at least min_delta[0.01 - 0]), it resets its counter.

If not, it increments the counter.

If the counter reaches patience[5-10] (number of allowed epochs without improvement), it sets early_stop = True, signaling you should halt training.
'''

In [4]:

class EarlyStopping:
    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None or val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True


# 3. Model Definition

nput: x shape is (batch, seq_len, input_size) (e.g., (32, 5, 1) for 32 samples, window size 5, 1 feature).

out, _ = self.lstm(x):

Runs the sequence through the LSTM.

out contains all hidden states for all timesteps (out.shape == (batch, seq_len, hidden_size)).

out[:, -1, :]:

Take the hidden state from the last time step for each batch—this encodes the whole sequence's info for forecasting the next step.

self.fc(...):

Pass the last hidden state through the linear layer to get your forecast.

unsqueeze(1):

Adds an extra dimension for compatibility (PyTorch expects shape (batch, horizon, features)). If forecasting one step, horizon is 1.

           ┌─────────────────────────────────────────────────────────┐
           │                  Input Sequence (batch)                 │
           │      X: (batch, window_size, input_size)                │
           │      Example: [[x₁], [x₂], [x₃], [x₄], [x₅]]           │
           └─────────────────────────────────────────────────────────┘
                                    │
                                    ▼
      ┌────────────────────────────────────────────────────────────┐
      │                  LSTM Layers (stacked)                    │
      │        - num_layers (e.g., 2)                             │
      │        - hidden_size (e.g., 64)                           │
      │        - processes sequence step by step                   │
      └────────────────────────────────────────────────────────────┘
                                    │
                          (All time steps produce hidden states)
                                    │
                                    ▼
       ┌──────────────────────────────────────────────────────────┐
       │     Select LAST hidden state from LSTM output           │
       │     out[:, -1, :]  → shape: (batch, hidden_size)        │
       └──────────────────────────────────────────────────────────┘
                                    │
                                    ▼
      ┌───────────────────────────────────────────────────────────┐
      │    Fully Connected (Linear) Layer                        │
      │    - Maps hidden state to output_size                    │
      │    - Output: (batch, output_size)                        │
      └───────────────────────────────────────────────────────────┘
                                    │
                                    ▼
      ┌───────────────────────────────────────────────────────────┐
      │    Unsqueeze to add horizon dimension                    │
      │    Output: (batch, 1, output_size)                      │
      │    (for 1-step forecasting: horizon=1)                  │
      └───────────────────────────────────────────────────────────┘


Input Batch (X):                Shape: (32, 5, 1)
  └── 32 samples (batch size)
  └── 5 time steps (window size)
  └── 1 feature (CPU%)

           │
           ▼
─────────────────────────────────────────────
         LSTM Layer 1
    - Input:  (32, 5, 1)
    - Output: (32, 5, 64)
─────────────────────────────────────────────
           │
           ▼
─────────────────────────────────────────────
         LSTM Layer 2
    - Input:  (32, 5, 64)
    - Output: (32, 5, 64)
─────────────────────────────────────────────
           │
           ▼
Take the last time step's hidden state:
    - out[:, -1, :]  →  (32, 64)
           │
           ▼
Fully Connected (Linear) Layer:
    - Input:  (32, 64)
    - Output: (32, 1)
           │
           ▼
Unsqueeze to add horizon dim:
    - Output: (32, 1, 1)


### How each part works:

Input:
Each sample is a sequence of 5 timesteps (window), each timestep has 1 value (CPU%).

LSTM Layer 1:
Transforms each input to a hidden state of size 64 for each time step.

LSTM Layer 2:
Takes outputs from Layer 1, produces new hidden state of size 64 (for deeper abstraction).

Select Last Time Step:
Use only the final hidden state for each sample.

Linear Layer:
Maps 64 → 1 to make a prediction for the next CPU%.

Unsqueeze:
Makes output shape (batch, 1, 1) — good for compatibility.

In [5]:
class LSTMForecaster(nn.Module):
    def __init__(self, input_size=1, hidden_size=64, num_layers=2, output_size=1, dropout=0.0):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out.unsqueeze(1)  # Shape: (batch, horizon, features)


# Getting my X_train y_train X_val and y-val from MinIO

In [36]:
# dowloading from minio my tran and test sets

def download_numpy_from_minio(client, bucket_name, object_name):
    """
    Download a numpy array from MinIO directly into memory.
    Returns the loaded numpy array.
    """
    try:
        response = client.get_object(bucket_name, object_name)
        data = response.read()  # Read bytes
        array = np.load(io.BytesIO(data))
        response.close()
        response.release_conn()
        print(f"Downloaded from minio://{bucket_name}/{object_name} (shape: {array.shape})")
        return array
    except Exception as e:
        print(f"Failed to download {object_name} from MinIO: {e}")
        return None

# Test download each array

bucket_name = "k8s-resources-forecast"

# These are your desired object paths for each array
object_names = {
    "X_train": "data/k8s-preprocessed/node-1-X_train/X_train.npy",
    "y_train": "data/k8s-preprocessed/node-1-y_train/y_train.npy",
    "X_test":  "data/k8s-preprocessed/node-1-X_test/X_test.npy",
    "y_test":  "data/k8s-preprocessed/node-1-y_test/y_test.npy",
}

# test= val
X_train = download_numpy_from_minio(minio_client, bucket_name, object_names["X_train"])
y_train = download_numpy_from_minio(minio_client, bucket_name, object_names["y_train"])
X_val  = download_numpy_from_minio(minio_client, bucket_name, object_names["X_test"])
y_val  = download_numpy_from_minio(minio_client, bucket_name, object_names["y_test"])

#PyTorch DataLoaders


train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
val_dataset   = TensorDataset(torch.tensor(X_val,   dtype=torch.float32), torch.tensor(y_val,   dtype=torch.float32))
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False)



# Optional: Exploring my A and  y sets

print("X_train.shape:", type(X_train))
print("y_train.shape:", type(y_train))
print("X_val.shape:", type(X_val))
print("y_test.shape:", type(y_val), "\n")

print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)
print("X_val.shape:", X_val.shape)
print("y_val.shape:", y_val.shape)

# Let's see the first few samples (flatten to 1D for readability)
print("\nFirst training input window (X_train[0]):", X_train[0].flatten())
print("First training target (y_train[0]):", y_train[0].flatten())

print("\nLast training input window (X_train[-1]):", X_train[-1].flatten())
print("Last training target (y_train[-1]):", y_train[-1].flatten())

print("\nFirst test input window (X_val[0]):", X_val[0].flatten())
print("First test target (y_val[0]):", y_val[0].flatten())

print("\nLast test input window (X_val[-1]):", X_val[-1].flatten())
print("Last test target (y_val[-1]):", y_val[-1].flatten())

print("First 5 input windows (X_val):")
display(X_val)
print("\nCorresponding targets (y_val):")
display(y_val)



Downloaded from minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-X_train/X_train.npy (shape: (8238, 5, 1))
Downloaded from minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-y_train/y_train.npy (shape: (8238, 1, 1))
Downloaded from minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-X_test/X_test.npy (shape: (2060, 5, 1))
Downloaded from minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-y_test/y_test.npy (shape: (2060, 1, 1))
X_train.shape: <class 'numpy.ndarray'>
y_train.shape: <class 'numpy.ndarray'>
X_val.shape: <class 'numpy.ndarray'>
y_test.shape: <class 'numpy.ndarray'> 

X_train.shape: (8238, 5, 1)
y_train.shape: (8238, 1, 1)
X_val.shape: (2060, 5, 1)
y_val.shape: (2060, 1, 1)

First training input window (X_train[0]): [0.02173913 0.02173913 0.02173913 0.02173913 0.02173913]
First training target (y_train[0]): [0.02173913]

Last training input window (X_train[-1]): [0.22826087 0.19565217 0.22826087 0.25       0.20652174]
Last training target

array([[[0.19565217],
        [0.22826087],
        [0.25      ],
        [0.20652174],
        [0.22826087]],

       [[0.22826087],
        [0.25      ],
        [0.20652174],
        [0.22826087],
        [0.23913043]],

       [[0.25      ],
        [0.20652174],
        [0.22826087],
        [0.23913043],
        [0.22826087]],

       ...,

       [[0.09782609],
        [0.08695652],
        [0.06521739],
        [0.08695652],
        [0.08695652]],

       [[0.08695652],
        [0.06521739],
        [0.08695652],
        [0.08695652],
        [0.07608696]],

       [[0.06521739],
        [0.08695652],
        [0.08695652],
        [0.07608696],
        [0.10869565]]])


Corresponding targets (y_val):


array([[[0.23913043]],

       [[0.22826087]],

       [[0.25      ]],

       ...,

       [[0.07608696]],

       [[0.10869565]],

       [[0.02173913]]])

# 5. Training Loop 

In [7]:
def train_lstm(
    X_train, y_train, X_val, y_val,
    hidden_size=64, num_layers=2, dropout=0.0, lr=0.001,
    batch_size=32, epochs=100, patience=8, verbose=True
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LSTMForecaster(hidden_size=hidden_size, num_layers=num_layers, dropout=dropout).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
    val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.float32))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    early_stopper = EarlyStopping(patience=patience)
    history = {"train_loss": [], "val_loss": []}
    
    for epoch in range(epochs):
        # --- Train ---
        model.train()
        train_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            output = model(xb)
            loss = criterion(output, yb)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * xb.size(0)
        train_loss /= len(train_loader.dataset)

        # --- Validation ---
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                output = model(xb)
                loss = criterion(output, yb)
                val_loss += loss.item() * xb.size(0)
        val_loss /= len(val_loader.dataset)
        
        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        
        if verbose:
            print(f"Epoch {epoch+1:3d} | Train Loss: {train_loss:.5f} | Val Loss: {val_loss:.5f}")
        
        early_stopper(val_loss)
        if early_stopper.early_stop:
            if verbose: print("Early stopping!")
            break

    return model, history


# 6 Evaluation Functions

In [8]:
def calc_metrics(y_true, y_pred):
    y_pred = y_pred.flatten()
    y_true = y_true.flatten()
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    return {"MAE": mae, "RMSE": rmse, "R2": r2}

def plot_results(y_true, y_pred, history, out_prefix=""):
    # True vs Predicted
    plt.figure(figsize=(12,3))
    plt.plot(y_true, label="True", alpha=0.7)
    plt.plot(y_pred, label="Pred", alpha=0.7)
    plt.title("True vs. Predicted CPU%")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"{out_prefix}true_vs_pred.png"); plt.close()
    
    # Residuals
    plt.figure(figsize=(12,3))
    plt.plot(y_pred - y_true)
    plt.title("Residuals: Prediction Error Over Time")
    plt.tight_layout()
    plt.savefig(f"{out_prefix}residuals.png"); plt.close()
    
    # Learning Curve
    plt.figure(figsize=(8,3))
    plt.plot(history["train_loss"], label="Train")
    plt.plot(history["val_loss"], label="Val")
    plt.title("Learning Curve")
    plt.xlabel("Epoch")
    plt.ylabel("Loss (MSE)")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"{out_prefix}learning_curve.png"); plt.close()


In [9]:
# 7 MLflow-Tracked Training

In [10]:
def run_and_track(
    X_train, y_train, X_val, y_val,
    hidden_size=64, num_layers=2, dropout=0.0, lr=0.001,
    batch_size=32, epochs=40, patience=6,
    window_size=5, horizon=1
):
    with mlflow.start_run() as run:
        # Log all HP
        mlflow.log_params({
            "hidden_size": hidden_size, "num_layers": num_layers, "dropout": dropout,
            "lr": lr, "batch_size": batch_size, "epochs": epochs,
            "window_size": window_size, "horizon": horizon
        })
        # Train
        model, history = train_lstm(
            X_train, y_train, X_val, y_val,
            hidden_size, num_layers, dropout, lr,
            batch_size, epochs, patience
        )
        # Predict (CORRECTED SECTION)
        model.eval()
        y_pred = []
        y_true = []
        device = next(model.parameters()).device
        val_dataset = TensorDataset(
            torch.tensor(X_val, dtype=torch.float32),
            torch.tensor(y_val, dtype=torch.float32)
        )
        val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device)
                preds = model(xb).cpu().numpy()
                y_pred.append(preds)
                y_true.append(yb.numpy())
        y_pred = np.vstack(y_pred).squeeze()
        y_true = np.vstack(y_true).squeeze()
        # Metrics
        metrics = calc_metrics(y_true, y_pred)
        mlflow.log_metrics(metrics)
        # Plots
        plot_results(y_true, y_pred, history, out_prefix="mlflow_")
        mlflow.log_artifact("mlflow_true_vs_pred.png")
        mlflow.log_artifact("mlflow_residuals.png")
        mlflow.log_artifact("mlflow_learning_curve.png")
        # Save model
        model_path = "lstm_model.pt"
        torch.save(model.state_dict(), model_path)
        mlflow.log_artifact(model_path)
        # Register Model
        REGISTERED_MODEL_NAME = "k8s-cpu-forecast-lstm"
        result = mlflow.pytorch.log_model(model, artifact_path="model", registered_model_name=REGISTERED_MODEL_NAME)
        print("MLflow run:", run.info.run_id)
        return run.info.run_id, metrics


In [14]:

run_id, metrics = run_and_track(
    X_train, y_train, X_val, y_val,
    hidden_size=64, num_layers=2, dropout=0.1, lr=0.001,
    batch_size=32, epochs=30, patience=5,
    window_size=5, horizon=1
)
print(metrics)


Epoch   1 | Train Loss: 0.02373 | Val Loss: 0.00113
Epoch   2 | Train Loss: 0.00562 | Val Loss: 0.00076
Epoch   3 | Train Loss: 0.00548 | Val Loss: 0.00061
Epoch   4 | Train Loss: 0.00521 | Val Loss: 0.00100
Epoch   5 | Train Loss: 0.00507 | Val Loss: 0.00111
Epoch   6 | Train Loss: 0.00472 | Val Loss: 0.00059
Epoch   7 | Train Loss: 0.00432 | Val Loss: 0.00062
Epoch   8 | Train Loss: 0.00421 | Val Loss: 0.00072
Epoch   9 | Train Loss: 0.00413 | Val Loss: 0.00078
Epoch  10 | Train Loss: 0.00410 | Val Loss: 0.00069
Epoch  11 | Train Loss: 0.00396 | Val Loss: 0.00067
Early stopping!


Registered model 'k8s-cpu-forecast-lstm' already exists. Creating a new version of this model...
2025/09/04 13:39:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: k8s-cpu-forecast-lstm, version 3
Created version '3' of model 'k8s-cpu-forecast-lstm'.


MLflow run: fdb2407ad84b423a904979036d8374e3
🏃 View run likeable-fox-184 at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/fdb2407ad84b423a904979036d8374e3
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
{'MAE': 0.02034158632159233, 'RMSE': 0.025939794427791898, 'R2': 0.7101677656173706}


In [15]:
print(mlflow.__version__)

2.21.3


# Second version

## 1. Imports & MinIO Download Function

In [27]:
import numpy as np
import io
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import mlflow
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Fucntion to download numpy arrays from MinIO
def download_numpy_from_minio(minio_client, bucket, object_name):
    try:
        with minio_client.get_object(bucket, object_name) as response:
            arr = np.load(io.BytesIO(response.read()))
            print(f"Downloaded: s3://{bucket}/{object_name} shape={arr.shape}")
            return arr
    except Exception as e:
        print(f"Error: {e}")



## 2: Load Train/Val Sets From MinIO

In [37]:
bucket_name = "k8s-resources-forecast"
object_names = {
    "X_train": "data/k8s-preprocessed/node-1-X_train/X_train.npy",
    "y_train": "data/k8s-preprocessed/node-1-y_train/y_train.npy",
    "X_val":   "data/k8s-preprocessed/node-1-X_test/X_test.npy",
    "y_val":   "data/k8s-preprocessed/node-1-y_test/y_test.npy",
}

X_train = download_numpy_from_minio(minio_client, bucket_name, object_names["X_train"])
y_train = download_numpy_from_minio(minio_client, bucket_name, object_names["y_train"])
X_val   = download_numpy_from_minio(minio_client, bucket_name, object_names["X_val"])
y_val   = download_numpy_from_minio(minio_client, bucket_name, object_names["y_val"])

print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape, "y_val shape:", y_val.shape)


Downloaded from minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-X_train/X_train.npy (shape: (8238, 5, 1))
Downloaded from minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-y_train/y_train.npy (shape: (8238, 1, 1))
Downloaded from minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-X_test/X_test.npy (shape: (2060, 5, 1))
Downloaded from minio://k8s-resources-forecast/data/k8s-preprocessed/node-1-y_test/y_test.npy (shape: (2060, 1, 1))
X_train shape: (8238, 5, 1) y_train shape: (8238, 1, 1)
X_val shape: (2060, 5, 1) y_val shape: (2060, 1, 1)


## 3: Build PyTorch DataLoaders

In [38]:
BATCH_SIZE = 32

train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                              torch.tensor(y_train, dtype=torch.float32))
val_dataset   = TensorDataset(torch.tensor(X_val,   dtype=torch.float32),
                              torch.tensor(y_val,   dtype=torch.float32))

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False)


## 4: Define LSTM Model

In [30]:
class LSTMForecaster(nn.Module):
    def __init__(self, input_size=1, hidden_size=64, num_layers=2, output_size=1, dropout=0.0):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)
    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out.unsqueeze(1)  # (batch, horizon, 1)


## 5: Training Loop With Early Stopping and MLflow Logging

In [39]:
def train_model_with_early_stopping(
    train_loader, val_loader, input_size=1, hidden_size=64, num_layers=2,
    lr=0.001, epochs=35, patience=5, dropout=0.0, model_name="cpu-pct", run_name=None
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LSTMForecaster(input_size, hidden_size, num_layers, output_size=1, dropout=dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    best_val_loss = float('inf')
    best_model = None
    wait = 0
    train_losses, val_losses = [], []

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params({
            "input_size": input_size, "hidden_size": hidden_size,
            "num_layers": num_layers, "lr": lr, "epochs": epochs,
            "batch_size": BATCH_SIZE, "dropout": dropout, "patience": patience
        })

        for epoch in range(epochs):
            model.train()
            running_loss = 0
            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                out = model(xb)
                loss = criterion(out, yb)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
            train_loss = running_loss / len(train_loader)
            train_losses.append(train_loss)

            model.eval()
            val_running_loss = 0
            all_pred, all_true = [], []
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    out = model(xb)
                    loss = criterion(out, yb)
                    val_running_loss += loss.item()
                    all_pred.append(out.cpu().numpy())
                    all_true.append(yb.cpu().numpy())
            val_loss = val_running_loss / len(val_loader)
            val_losses.append(val_loss)

            print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.5f} | Val Loss: {val_loss:.5f}")
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = model.state_dict()
                wait = 0
            else:
                wait += 1
                if wait >= patience:
                    mlflow.log_metric("epoch_actual", epoch + 1)  # real epoch that run
                    print("Early stopping triggered!")
                    break

        # Load best
        if best_model: model.load_state_dict(best_model)

        # Final metrics
        model.eval()
        preds, targets = [], []
        with torch.no_grad():
            for xb, yb in val_loader:
                out = model(xb.to(device)).cpu().numpy()
                preds.append(out)
                targets.append(yb.cpu().numpy())
        preds = np.concatenate(preds).reshape(-1)
        targets = np.concatenate(targets).reshape(-1)

        mae  = mean_absolute_error(targets, preds)
        #rmse = mean_squared_error(targets, preds, squared=False) #  scikit-learn new version has this
        rmse = np.sqrt(mean_squared_error(targets, preds))
        r2   = r2_score(targets, preds)
        mlflow.log_metric("val_mae", mae)
        mlflow.log_metric("val_rmse", rmse)
        mlflow.log_metric("val_r2", r2)

        # --- Plots
        plt.figure(figsize=(10,4))
        plt.plot(targets, label="True")
        plt.plot(preds, label="Predicted")
        plt.legend(); plt.title("True vs. Predicted CPU% (Validation)")
        plt.tight_layout(); plt.savefig("true_vs_pred.png"); plt.close()
        mlflow.log_artifact("true_vs_pred.png")

        plt.figure(figsize=(10,4))
        plt.plot(preds - targets)
        plt.title("Residuals Over Time"); plt.xlabel("Time"); plt.ylabel("Residual (Pred - True)")
        plt.tight_layout(); plt.savefig("residuals.png"); plt.close()
        mlflow.log_artifact("residuals.png")

        plt.figure()
        plt.plot(train_losses, label="Train Loss")
        plt.plot(val_losses, label="Val Loss")
        plt.xlabel("Epoch"); plt.ylabel("Loss"); plt.title("Learning Curve")
        plt.legend(); plt.tight_layout(); plt.savefig("learning_curve.png"); plt.close()
        mlflow.log_artifact("learning_curve.png")

        mlflow.pytorch.log_model(model, "model")
        print("Model + artifacts logged in MLflow.")

    return model, (mae, rmse, r2)


## 6: Run the Training & Logging 

In [41]:
EPOCHS = 35
PATIENCE = 5

model, metrics = train_model_with_early_stopping(
    train_loader, val_loader,
    input_size=X_train.shape[-1],
    hidden_size=64,
    num_layers=2,
    lr=0.001,
    epochs=EPOCHS,
    patience=PATIENCE,
    dropout=0.1,
    model_name="cpu-pct",
    run_name="lstm-early-stop"
)

print(f"Final MAE: {metrics[0]:.4f} | RMSE: {metrics[1]:.4f} | R2: {metrics[2]:.4f}")


Epoch 1/35 | Train Loss: 0.02433 | Val Loss: 0.00150
Epoch 2/35 | Train Loss: 0.00557 | Val Loss: 0.00070
Epoch 3/35 | Train Loss: 0.00548 | Val Loss: 0.00065
Epoch 4/35 | Train Loss: 0.00524 | Val Loss: 0.00059
Epoch 5/35 | Train Loss: 0.00507 | Val Loss: 0.00087
Epoch 6/35 | Train Loss: 0.00504 | Val Loss: 0.00086
Epoch 7/35 | Train Loss: 0.00475 | Val Loss: 0.00098
Epoch 8/35 | Train Loss: 0.00444 | Val Loss: 0.00083
Epoch 9/35 | Train Loss: 0.00450 | Val Loss: 0.00062
Early stopping triggered!




Model + artifacts logged in MLflow.
🏃 View run lstm-early-stop at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27/runs/54fc01de8c3f4ea9839f4d5deb32bca0
🧪 View experiment at: http://sunrise-mlflow-tracking.mlflow.svc.cluster.local:5080/#/experiments/27
Final MAE: 0.0195 | RMSE: 0.0248 | R2: 0.7353
