In [9]:
# System and path management
import sys
sys.path.append('/projectnb/peaclab-mon/boztop/resource-allocation/config')

# Data handling and manipulation
import os
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import timedelta, datetime

# Machine learning
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
import xgboost as xgb
from xgboost import XGBRegressor

# Plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm

# Custom imports
from fugaku_data_preprocessing import preprocess_data
from ml_model_training import train_model_per_cluster

# Utility
import itertools


# Data Preprocessing

In [2]:
directory = '/projectnb/peaclab-mon/boztop/resource-allocation/datasets/fugaku/24_04.parquet'
df_success, df_failure, numerical_features = preprocess_data(directory)
print("Preprocessing complete.")
print(f"Number of successful jobs: {len(df_success)}")
print(f"Number of failed jobs: {len(df_failure)}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = label_encoders[col].fit_transform(df[col])


Preprocessing complete.
Number of successful jobs: 338796
Number of failed jobs: 81654


# Menear et al.

In [3]:
#sourced from: https://github.com/NREL/eagle-jobs/blob/master/python_scripts/EVAL_FinalModel.py
def baseline_training(train_df, test_df, train_features, target_feature, filename):
    
    train_df = train_df.dropna()
    test_df = test_df.dropna()
    
    X_train = train_df[train_features]
    X_test = test_df[train_features]
    y_train = train_df[target_feature]
    y_test = test_df[target_feature]


    params = { # From Optuna HPO
        'n_estimators': 168,
        'max_depth': 7,
        'learning_rate': 0.3968571956999504,
        'gamma': 0.640232768439118,
        'subsample': 0.747747407403972,
        'colsample_bytree': 0.6280085182287491
    }

    model = XGBRegressor(**params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    
    pred_vs_act_df = pd.DataFrame(columns=['runtime_pred', 'runtime_act'])
    pred_vs_act_df = pd.DataFrame({'runtime_pred': y_pred, 'runtime_act': y_test})

    r2 = r2_score(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)

    print(f'r2: {r2:.3f}, rmse: {rmse:.0f}')

    # save the results of the baseline    
    pred_vs_act_df.to_pickle(filename)
        

# Execution Time Prediction Experiments

In [5]:
def create_sub_dataframes(df, selected_features, n_clusters):

    feature_data = df[selected_features].replace([np.inf, -np.inf], np.nan).fillna(df.mean(numeric_only=True))
    feature_data_scaled = feature_data.values  

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    
    df['cluster'] = kmeans.fit_predict(feature_data_scaled)
    
    cluster_centers = kmeans.cluster_centers_
    
    sub_dataframes = [df[df['cluster'] == cluster].drop(columns=['cluster']) for cluster in range(n_clusters)]
    
    return sub_dataframes, cluster_centers


In [6]:
def make_predictions(test_df, train_features, target_feature, cluster_centers, models, biases, bias_type, filename, add_bias=False):

    test_features = test_df[train_features].values
    distances = np.linalg.norm(cluster_centers[:, np.newaxis] - test_features, axis=2)
    
    test_df.loc[:, 'cluster'] = np.argmin(distances, axis=0)

    pred_vs_act_df = pd.DataFrame(columns=['runtime_pred', 'runtime_act', 'original_index'])

    for cluster_id, (model, bias_dict) in enumerate(zip(models, biases)):
        cluster_data = test_df[test_df['cluster'] == cluster_id]
        if not cluster_data.empty:
            X_test = cluster_data[train_features].values
            y_test = cluster_data[target_feature].values

            if add_bias:
                # Select the appropriate bias based on the specified bias type
                bias = bias_dict.get(bias_type, 0) 
                y_pred = model.predict(X_test) + bias
            else:
                y_pred = model.predict(X_test)

            new_df = pd.DataFrame({
                'runtime_pred': y_pred,
                'runtime_act': y_test,
                'original_index': cluster_data.index
            })
            pred_vs_act_df = pd.concat([pred_vs_act_df, new_df], ignore_index=True)

    # Sort results by the original index
    pred_vs_act_df = pred_vs_act_df.sort_values(by='original_index').reset_index(drop=True)

    # Save results to a file
    pred_vs_act_df.to_pickle(filename)

    
    

In [11]:
bias_types = ['mean', 'mad', 'std_dev', 'two_sigma']

train_features = ['usr', 'jnam', 'cnumr', 'nnumr', 'elpl', 'mszl', 'freq_req']
target_feature = 'duration'
req_feature = 'elpl'



In [12]:
# Train-Test split
train_df, test_df = train_test_split(df_success, test_size=0.2, random_state=33)


print("[INFO] Creating sub-dataframes and clustering...")
sub_dataframes, cluster_centers = create_sub_dataframes(
    df=train_df, 
    selected_features=train_features, 
    n_clusters=4
)


print("[INFO] Training plain models (XGBoost and RandomForest)...")
xgb_plain_models, plain_biases = train_model_per_cluster(
    sub_dataframes, train_features, target_feature, 'xgboost', alpha=1)
rf_plain_models, plain_biases = train_model_per_cluster(
    sub_dataframes, train_features, target_feature, 'rf', alpha=1)
print("[INFO] Plain models training completed.")


print("[INFO] Running baseline training...")
baseline_training(train_df, test_df, 
                  train_features, target_feature, 
                  f'fugaku_baseline_execution_time_pred_vs_act_df.pkl')
print("[INFO] Baseline training completed.")


print("[INFO] Making plain XGBoost predictions...")
make_predictions(
    test_df, train_features, target_feature, cluster_centers, 
    xgb_plain_models, plain_biases, 'mad', 
    f'fugaku_CP_xgboost_execution_time_pred_vs_act_df.pkl', 
    add_bias=False
)


print("[INFO] Making plain RandomForest predictions...")
make_predictions(
    test_df, train_features, target_feature, cluster_centers, 
    rf_plain_models, plain_biases, 'mad', 
    f'fugaku_CP_rf_execution_time_pred_vs_act_df.pkl', 
    add_bias=False
)

for bias_type in bias_types:
    print(f"[INFO] Evaluating with bias type: {bias_type}...")

    make_predictions(
        test_df, train_features, target_feature, cluster_centers,
        xgb_plain_models, plain_biases, bias_type,
        f'fugaku_CB_xgboost_{bias_type}_execution_time_pred_vs_act_df.pkl', 
        add_bias=True
    )

    make_predictions(
        test_df, train_features, target_feature, cluster_centers,
        rf_plain_models, plain_biases, bias_type,
        f'fugaku_CB_rf_{bias_type}_execution_time_pred_vs_act_df.pkl', 
        add_bias=True
    )


y_test = test_df[target_feature]
req_test = test_df[req_feature]

user_req_df = pd.DataFrame({'runtime_pred': req_test, 'runtime_act': y_test})
filename = f'fugaku_user_req_execution_time_pred_vs_act_df.pkl'
user_req_df.to_pickle(filename)



[INFO] Creating sub-dataframes and clustering...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


[INFO] Training plain models (XGBoost and RandomForest)...
[INFO] Plain models training completed.
[INFO] Running baseline training...
r2: 0.904, rmse: 8820
[INFO] Baseline training completed.
[INFO] Making plain XGBoost predictions...
[INFO] Making plain RandomForest predictions...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


[INFO] Evaluating with bias type: mean...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


[INFO] Evaluating with bias type: mad...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


[INFO] Evaluating with bias type: std_dev...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


[INFO] Evaluating with bias type: two_sigma...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
