In [1]:
# Data manipulation and numerical operations
import pandas as pd
import numpy as np
from pathlib import Path
import datetime
    
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Jupyter notebook import

import tensorflow as tf
from tensorflow.keras import regularizers
from keras.callbacks import EarlyStopping

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [2]:
# Field names of the accounting file, in file order (taken from "man 5 accounting").
col_headers = [
    "qname", "hostname", "group", "owner", "job_name", "job_number", "account", "priority",
    "submission_time", "start_time", "end_time", "failed", "exit_status",
    "ru_wallclock", "ru_utime", "ru_stime", "ru_maxrss", "ru_ixrss", "ru_ismrss", "ru_idrss", "ru_isrss",
    "ru_minflt", "ru_majflt", "ru_nswap", "ru_inblock", "ru_oublock", "ru_msgsnd", "ru_msgrcv",
    "ru_nsignals", "ru_nvcsw", "ru_nivcsw",
    "project", "department", "granted_pe", "slots", "task_number",
    "cpu", "mem", "io", "category", "iow", "pe_taskid", "maxvmem",
    "arid", "ar_submission_time",
]

# Subset of the fields above that is actually loaded into the DataFrame.
use_col_headers = [
    "group", "owner", "job_name", "account",
    "submission_time", "start_time", "end_time", "failed", "exit_status",
    "ru_wallclock", "ru_utime", "ru_stime", "ru_maxrss", "ru_isrss",
    "project", "granted_pe", "slots",
    "cpu", "mem", "io", "category",
]

# Location of the accounting file to analyse.
file_path = "/projectnb/peaclab-mon/boztop/resource-allocation/datasets/bu scc accounting/bu.accounting.2023"

# The file is colon-separated and has no header row of its own; the first four
# lines are skipped and the column names are supplied explicitly.
df = pd.read_csv(
    file_path,
    sep=':',
    skiprows=4,
    names=col_headers,
    usecols=use_col_headers,
    encoding='utf-8',
)

In [3]:
# Keep only jobs with a non-zero wallclock that finished successfully
# (both `failed` and `exit_status` equal to 0).
ran_successfully = (
    (df['ru_wallclock'] != 0)
    & (df['failed'] == 0)
    & (df['exit_status'] == 0)
)
df = df[ran_successfully]

# Lag time: difference between the start and the submission timestamps.
df['lag_time'] = df['start_time'] - df['submission_time']

# Discard jobs whose lag time is not strictly positive, then every row that
# still has a missing value in any column.
has_positive_lag = df['lag_time'] > 0
df = df[has_positive_lag]
df = df.dropna()

In [4]:
# Replace the numeric submission timestamps with datetime objects.
# datetime.fromtimestamp() interprets its argument in the machine's local timezone.
df['submission_time'] = df['submission_time'].map(datetime.datetime.fromtimestamp)

In [5]:
# Derived per-job metrics.

# Run time: end timestamp minus start timestamp.
df['execution_time'] = df['end_time'].sub(df['start_time'])

# Effective core count: `cpu` divided by `ru_wallclock`, rounded up.
# (An earlier variant used ru_utime + ru_stime in place of `cpu`.)
df['ncpu'] = np.ceil(df['cpu'].div(df['ru_wallclock']))

# Requested slots minus effective cores; positive values mean the job asked
# for more cores than it used.
df['cpu_waste'] = df['slots'].sub(df['ncpu'])



In [6]:
# Get the summary stats
# describe() produces per-column summary rows (count, mean, std, min, quartiles, max);
# the following cell reads the 'count' row of 'ru_wallclock' from this table.
all_stats = df.describe()

In [7]:
# Headline figures for the cleaned job table.

# Job count, read from the 'count' row of the describe() summary.
njobs = int(all_stats.loc['count', 'ru_wallclock'])
print(f'Total number of jobs: {njobs}')

# Distinct project groups.
total_groups = df['group'].nunique()
print(f"Total number of different groups: {total_groups}")

# Distinct users.
total_users = df['owner'].nunique()
print(f"Total number of different users: {total_users}")

# Jobs that requested more slots than the cores they effectively used.
jobs_with_request_more_than_usage = df['slots'].gt(df['ncpu']).sum()
print(f'Total number of jobs where the request was more than actual usage: {jobs_with_request_more_than_usage}')

# The same count expressed as a share of all jobs.
percentage_waste = (jobs_with_request_more_than_usage / njobs) * 100
print(f'Percentage of jobs where the request was more than actual usage: {percentage_waste} %')

# Net sum of (slots - ncpu) over all jobs.
total_cpu_wastage = df['cpu_waste'].sum()
print(f"Total CPU wastage (number of cores): {total_cpu_wastage}")


Total number of jobs: 11403153
Total number of different groups: 524
Total number of different users: 1388
Total number of jobs where the request was more than actual usage: 2299636
Percentage of jobs where the request was more than actual usage: 20.1666679382448 %
Total CPU wastage (number of cores): 16058705.0


In [8]:
# Jobs with a positive cpu_waste, i.e. more slots requested than cores used.
df_cpu = df.loc[df['cpu_waste'].gt(0)]

# Display the columns that explain the over-request for those jobs.
df_cpu[[
    'owner', 'job_name',
    'ru_wallclock', 'ru_maxrss', 'ru_utime', 'ru_stime',
    'slots', 'ncpu', 'cpu_waste', 'cpu', 'mem',
]]


Unnamed: 0,owner,job_name,ru_wallclock,ru_maxrss,ru_utime,ru_stime,slots,ncpu,cpu_waste,cpu,mem
1,piotrt,caltech101_erm_5e-5-0-v3_2-eval,14,3468304.0,10.508114,3.146432,4,1.0,3.0,13.654546,113.041253
5,piotrt,hospital3_erm_5e-5-0-v3_3-nosma-full,15585,3567556.0,26864.717200,5754.854540,4,3.0,1.0,32619.571740,782537.612000
11,piotrt,caltech101_erm_5e-5-0-v3_2-nosma-eval,13,3470784.0,10.539715,3.114007,4,2.0,2.0,13.653722,108.458730
13,drhall,sge.jcf,94,133992.0,90.256206,2.717040,28,1.0,27.0,92.973246,16.411151
18,piotrt,caltech101_erm_5e-5-0-v3_2-nosma-full-eval,14,3481124.0,10.547930,3.138102,4,1.0,3.0,13.686032,118.503044
...,...,...,...,...,...,...,...,...,...,...,...
13689662,farrell,nf-haplotype_caller_(61968),5855,2958472.0,11606.093233,60.866751,3,2.0,1.0,11666.959984,115496.741931
13689663,gychuang,sge.jcf,1072,18392.0,1066.370101,0.551287,8,1.0,7.0,1066.921388,105.564728
13689667,okhan01,map-3n7a_L,307,1136084.0,6705.686539,403.934059,28,24.0,4.0,7109.620598,12670.641529
13689669,okhan01,map-3n7a_L,313,1032180.0,6812.607501,169.738650,28,23.0,5.0,6982.346151,12418.058213


In [None]:
# Restrict to users with at least 10 jobs before ranking them.
row_counts = df.groupby('owner').size()
users_with_enough_data = row_counts[row_counts.ge(10)].index
filtered_df = df.loc[df['owner'].isin(users_with_enough_data)]

# The ten users with the highest mean cpu_waste, largest first.
top_10_users = (
    filtered_df
    .groupby('owner')['cpu_waste']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

# Number of jobs belonging to each of those ten users.
is_top_user = df['owner'].isin(top_10_users.index)
row_counts = df.loc[is_top_user].groupby('owner').size()

# Turn the counts into a two-column frame (owner, row_count) for display.
row_counts_df = row_counts.reset_index(name='row_count')

# Show each selected user next to their job count.
print(row_counts_df)

In [None]:
# Inspect the CPU-related columns for the jobs of one specific user.
selected_columns = df.loc[df['owner'] == 'hrhiginb', ['cpu', 'ru_wallclock', 'slots', 'ncpu']]
selected_columns

In [None]:
# Disabled cell: the statements below sit inside a string literal, so none of them run.
# They refer to a `waste_visualize` function imported from a `waste_visualize` module
# that is not defined in this notebook.
'''
from waste_visualize import waste_visualize
waste_visualize(df, 'owner')
waste_visualize(df, 'group')
'''

In [None]:
# Disabled cell: the statements below sit inside a string literal, so none of them run.
# They refer to `process_user_data` from a `statistical_analysis` module that is not
# defined in this notebook.
# NOTE(review): inside the string the variable is named `df_last_1500` but is built
# from df.tail(50000) — reconcile the name and the size before re-enabling.
'''
from statistical_analysis import process_user_data
# Select the last "x" entries
df_last_1500 = df.tail(50000)

# Process data for each user with a step size of 3
user_dfs, accuracy_results = process_user_data(df_last_1500, 'ncpu', 3)

# Print the results
for user, (mae, mape, row_count) in accuracy_results.items():
    print(f"Accuracy for user {user}: MAE = {mae}, MAPE = {mape}%, Rows = {row_count}")
    print()
'''

In [None]:
# --- Helper Functions ----

def custom_loss(alpha, beta):
    """Build an asymmetric squared-error loss function.

    The error is measured as ``y_true - y_pred``. Where that error is positive
    (the prediction is below the target) the squared error is scaled by
    ``alpha``; everywhere else it is scaled by ``beta``.

    Parameters:
    - alpha: weight applied to squared errors where y_true > y_pred.
    - beta: weight applied to all remaining squared errors.

    Returns:
    - A callable ``loss(y_true, y_pred)`` returning the mean weighted squared error.
    """
    def loss(y_true, y_pred):
        residual = y_true - y_pred
        residual_sq = tf.square(residual)
        under_predicted = residual > 0

        penalised = tf.where(under_predicted, alpha * residual_sq, beta * residual_sq)
        return tf.reduce_mean(penalised)
    return loss

def dtw_distance(s1, s2):
    """Calculate Dynamic Time Warping (DTW) distance between two time series.

    Parameters:
    - s1, s2: one-dimensional sequences of numbers (list, tuple, numpy array
      or pandas Series). Elements are always accessed by position.

    Returns:
    - The accumulated absolute-difference cost of the cheapest warping path.
      Two empty sequences give 0; exactly one empty sequence gives inf.
    """
    # Convert to arrays so that indexing is positional. Indexing a pandas
    # Series with an integer is label-based and fails (or picks the wrong
    # element) when the index does not start at 0.
    a = np.asarray(s1)
    b = np.asarray(s2)
    n, m = len(a), len(b)

    # Accumulated-cost table; everything is unreachable (inf) except the origin.
    dtw = np.full((n + 1, m + 1), np.inf)
    dtw[0, 0] = 0

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = abs(a[i - 1] - b[j - 1])
            dtw[i, j] = cost + min(dtw[i - 1, j],      # Insertion
                                   dtw[i, j - 1],      # Deletion
                                   dtw[i - 1, j - 1])  # Match

    return dtw[n, m]

def dtw_metric(y_true, y_pred):
    """Score predictions against the true values with the DTW distance.

    Delegates to ``dtw_distance(y_true, y_pred)`` and returns its result unchanged.
    """
    score = dtw_distance(y_true, y_pred)
    return score

def mean_absolute_error(y_true, y_pred):
    """Return the Mean Absolute Error (MAE): the average of |y_true - y_pred|.

    Both arguments must support element-wise subtraction (e.g. numpy arrays).
    """
    abs_errors = np.abs(y_true - y_pred)
    return np.mean(abs_errors)

def mean_absolute_percentage_error(y_true, y_pred):
    """Calculate Mean Absolute Percentage Error (MAPE) between true and predicted values.

    The percentage error is undefined where the true value is 0, so those
    entries are left out of the average instead of turning the whole score
    into inf/nan. For inputs without zero targets the result is unchanged.

    Parameters:
    - y_true: array-like of true values.
    - y_pred: array-like of predictions, broadcastable against y_true.

    Returns:
    - The mean of |y_true - y_pred| / |y_true| over the non-zero targets,
      times 100; nan when there is no non-zero target (including empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    # Give the denominator the same (possibly broadcast) shape as the error.
    denominator = np.broadcast_to(np.abs(y_true), abs_error.shape)

    defined = denominator != 0
    if not defined.any():
        return float('nan')

    return np.mean(abs_error[defined] / denominator[defined]) * 100

def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Calls ``model.predict(X_test)``, flattens the output and compares it with
    ``y_test`` using dtw_metric, mean_absolute_error and
    mean_absolute_percentage_error, in that order.

    Returns:
    - dict with the keys 'DTW', 'MAE' and 'MAPE'.
    """
    predictions = model.predict(X_test).flatten()

    scorers = (
        ('DTW', dtw_metric),
        ('MAE', mean_absolute_error),
        ('MAPE', mean_absolute_percentage_error),
    )
    return {label: scorer(y_test, predictions) for label, scorer in scorers}

In [None]:
%matplotlib inline

# --- Plotting Graphs ----
def draw_combined_scatter_plot(y_test, lstm_y_pred, cnn_y_pred, bnn_y_pred, y_requested, owner):
    """Scatter the true values, the three models' predictions and the requested cores.

    Every series is plotted against its own positional index on one figure.
    ``owner`` only appears in the figure title.
    """
    plt.figure(figsize=(14, 8))

    # (values, legend label, colour, extra scatter keywords), in drawing order.
    layers = (
        (y_test, 'True CPU Usage', 'blue', {'marker': 'o'}),
        (lstm_y_pred, 'LSTM Predicted CPU Usage', 'orange', {'marker': 'x'}),
        (cnn_y_pred, 'CNN Predicted CPU Usage', 'green', {'marker': '^'}),
        (bnn_y_pred, 'BNN Predicted CPU Usage', 'red', {'marker': 's'}),
        (y_requested, 'Requested CPU cores', 'purple', {'linestyle': '--'}),
    )
    for values, legend_label, colour, extra in layers:
        plt.scatter(range(len(values)), values, label=legend_label, color=colour, **extra)

    plt.xlabel('Job submissions')
    plt.ylabel('CPU Usage')
    plt.title(f'User {owner}: Machine Learning Model Predictions vs True Values')
    plt.legend()
    plt.show()

    
def draw_combined_prediction_results(y_test, lstm_y_pred, cnn_y_pred, bnn_y_pred, y_requested, owner):
    """Line-plot the true values, the three models' predictions and the requested cores.

    All five series share one figure; the requested cores are drawn dashed.
    ``owner`` only appears in the figure title.
    """
    plt.figure(figsize=(14, 8))

    # (values, legend label, colour, extra plot keywords), in drawing order.
    layers = (
        (y_test, 'True CPU Usage', 'blue', {}),
        (lstm_y_pred, 'LSTM Predicted CPU Usage', 'orange', {}),
        (cnn_y_pred, 'CNN Predicted CPU Usage', 'green', {}),
        (bnn_y_pred, 'BNN Predicted CPU Usage', 'red', {}),
        (y_requested, 'Requested CPU cores', 'purple', {'linestyle': '--'}),
    )
    for values, legend_label, colour, extra in layers:
        plt.plot(values, label=legend_label, color=colour, **extra)

    plt.xlabel('Job submissions')
    plt.ylabel('CPU Usage')
    plt.title(f'User {owner}: Machine Learning Model Predictions vs True Values')
    plt.legend()
    plt.show()

    
def draw_prediction_results(y_test, y_pred, y_requested, title):
    """Line-plot true, predicted and requested values for a single model.

    Parameters:
    - y_test: true values.
    - y_pred: the model's predictions.
    - y_requested: the requested values, drawn as a dashed line.
    - title: figure title.
    """
    plt.figure(figsize=(10, 6))

    plt.plot(y_test, label='True Values', color='blue')
    plt.plot(y_pred, label='Predicted Values', color='orange')
    # This series used to be labelled 'True Values' as well, which left the
    # legend with two identical entries.
    plt.plot(y_requested, label='Requested Values', color='green', linestyle='--')

    plt.xlabel('Job submissions')
    plt.ylabel('CPU Usage')
    plt.title(title)
    plt.legend()
    plt.show()

    
# Histogram time!!
def plot_metrics_histogram(results):
    """Draw a grouped bar chart of core savings per user and print the result table.

    Each user gets four side-by-side bars: the share of jobs that used fewer
    cores than requested, followed by the 'cores_saved_percent' of the LSTM,
    CNN and BNN models.

    Parameters:
    - results: list of dicts (or anything ``pd.DataFrame`` accepts) with the
      keys 'owner', 'model', 'cores_less_than_requested_percent' and
      'cores_saved_percent'.

    Fixes compared with the earlier version:
    - the baseline bar and the LSTM bar no longer share the same x position;
    - bar heights and tick labels use the same owner order (first appearance);
    - tick labels sit under the centre of each group;
    - an empty ``results`` prints a message instead of raising KeyError;
    - several rows for the same (owner, model) pair are averaged instead of
      producing a length mismatch.
    """
    results_df = pd.DataFrame(results)
    if results_df.empty:
        print("No results to plot.")
        return

    models = ['LSTM', 'CNN', 'BNN']
    bar_width = 0.2

    # Owners in order of first appearance; every series is reindexed to it.
    owners = results_df['owner'].unique()
    index = np.arange(len(owners))

    # Offset of the left-most bar so that the group is centred on `index`.
    n_series = len(models) + 1
    first_offset = -(n_series - 1) / 2 * bar_width

    plt.figure(figsize=(16, 8))

    baseline = (
        results_df.groupby('owner')['cores_less_than_requested_percent']
        .mean()
        .reindex(owners)
    )
    plt.bar(index + first_offset, baseline, bar_width, label='Cores Used < Requested Slots', color='red')

    for i, model_name in enumerate(models, start=1):
        model_results = results_df[results_df['model'] == model_name]
        # Owners without a row for this model become NaN after the reindex.
        saved = model_results.groupby('owner')['cores_saved_percent'].mean().reindex(owners)

        plt.bar(index + first_offset + i * bar_width, saved, bar_width, label=f'{model_name} Predictions')

    plt.xlabel('Users')
    plt.ylabel('Percentage of Core Saved Job Predictions')
    plt.title('Reducing CPU Waste for Different Users with ML Models')
    plt.xticks(index, owners, rotation=45)
    plt.legend()

    plt.tight_layout()
    plt.show()

    # Print the results table
    print("\nResults Table:")
    print(results_df[['owner', 'model', 'cores_less_than_requested_percent', 'cores_saved_percent']].to_string(index=False))



In [None]:
# ---- Creating Machine Learning Models -----

# --- LSTM ----

def build_lstm_model(input_shape, alpha, beta):
    """Assemble and compile the bidirectional-LSTM regressor.

    Parameters:
    - input_shape: shape of one input sample, handed to the InputLayer.
    - alpha, beta: weights forwarded to ``custom_loss``.

    Returns:
    - The compiled ``tf.keras.Sequential`` model (Adam, learning rate 0.001,
      mean squared error tracked as a metric).
    """
    layers = tf.keras.layers

    stack = [
        layers.InputLayer(input_shape=input_shape),
        # First recurrent block returns the full sequence for the second one.
        layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Bidirectional(layers.LSTM(64)),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        # Single output unit with ReLU activation.
        layers.Dense(1, activation='relu'),
    ]
    lstm_model = tf.keras.Sequential(stack)

    lstm_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss=custom_loss(alpha, beta),
        metrics=['mean_squared_error'],
    )
    return lstm_model

# --- BNN ----

def build_bnn_model(input_shape, alpha, beta):
    """Assemble and compile the model that the notebook labels "BNN".

    Parameters:
    - input_shape: shape of one input sample. It was previously accepted but
      ignored; it is now declared as the model input, in line with
      ``build_lstm_model``, so the model is built up front and rejects
      mismatching data.
    - alpha, beta: weights forwarded to ``custom_loss``.

    Returns:
    - The compiled ``tf.keras.Sequential`` model (Adam, learning rate 0.001,
      mean squared error tracked as a metric).

    NOTE(review): the layer stack is the same Conv1D stack as in
    ``build_cnn_model``; nothing in it is Bayesian or otherwise distinct —
    confirm what "BNN" is meant to be.
    """
    bnn_model = tf.keras.Sequential([
        tf.keras.Input(shape=input_shape),
        tf.keras.layers.Conv1D(128, kernel_size=3, activation='relu', padding='same', kernel_regularizer=regularizers.l2(0.01)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Conv1D(128, kernel_size=3, activation='relu', padding='same', kernel_regularizer=regularizers.l2(0.01)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu', padding='same', kernel_regularizer=regularizers.l2(0.01)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        # Average over the sequence axis before the single-unit output.
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(1, activation='relu')
    ])
    bnn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss=custom_loss(alpha, beta),
                  metrics=['mean_squared_error'])
    return bnn_model


# --- CNN ----

def build_cnn_model(input_shape, alpha, beta):
    """Assemble and compile the 1-D convolutional regressor.

    Parameters:
    - input_shape: shape of one input sample. It was previously accepted but
      ignored; it is now declared as the model input, in line with
      ``build_lstm_model``, so the model is built up front and rejects
      mismatching data.
    - alpha, beta: weights forwarded to ``custom_loss``.

    Returns:
    - The compiled ``tf.keras.Sequential`` model (Adam, learning rate 0.001,
      mean squared error tracked as a metric).
    """
    cnn_model = tf.keras.Sequential([
        tf.keras.Input(shape=input_shape),
        tf.keras.layers.Conv1D(128, kernel_size=3, activation='relu', padding='same', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Conv1D(128, kernel_size=3, activation='relu', padding='same', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu', padding='same', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        # Average over the sequence axis before the single-unit output.
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(1, activation='relu')
    ])
    cnn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                      loss=custom_loss(alpha, beta),
                      metrics=['mean_squared_error'])
    return cnn_model


In [None]:
# ---- Data Preprocessing for ML -----

def preprocess_data(group_data, lag=3):
    """
    Preprocess the data by generating lag features, splitting into train/test
    datasets, and reshaping the features for sequence models.

    The input frame is not modified; all work happens on a copy. Rows are used
    in the order given and the last 20% form the test set.
    (Presumably the rows are in chronological order — verify against the caller.)

    Each row already holds the slots requested for that same job, so the
    requested values are taken from the row itself. The earlier version shifted
    'slots' by ``lag`` rows, which paired every job with the request of the job
    ``lag`` rows before it and discarded ``lag`` additional rows.

    Parameters:
    - group_data (pd.DataFrame): The input dataframe containing 'ncpu' and 'slots' columns.
    - lag (int): The number of lag features to generate.

    Returns:
    - X_train (np.ndarray): Training data features, shape (n_train, lag, 1).
    - X_test (np.ndarray): Testing data features, shape (n_test, lag, 1).
    - y_train (np.ndarray): Training data target values.
    - y_test (np.ndarray): Testing data target values.
    - requested_resource_data (np.ndarray): Requested resource data for the test set,
      aligned element by element with y_test.
    """
    # Work on a copy so the caller's frame keeps its columns and rows.
    frame = group_data.copy()

    lag_features = [f'ncpu_lag_{i}' for i in range(1, lag + 1)]
    for i, column in enumerate(lag_features, start=1):
        frame[column] = frame['ncpu'].shift(i)

    # Drop the rows without a full lag history (the first `lag` rows) and any
    # row missing its target or request. Only the columns used below are
    # checked, so a NaN in an unrelated column no longer discards a row.
    frame = frame.dropna(subset=lag_features + ['ncpu', 'slots'])

    X = frame[lag_features].values
    y = frame['ncpu'].values

    # Ordered 80/20 split (no shuffling).
    split = int(0.8 * len(X))
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    requested_resource_data = frame['slots'].values[split:]

    # Reshape data to (samples, timesteps, 1)
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    return X_train, X_test, y_train, y_test, requested_resource_data


In [None]:
# Train the LSTM, CNN and BNN models for every user in `top_10_users`, score
# their test-set predictions and collect one dict per (user, model) in `results`.
results = []

# Initialize lists to store metrics for each group
# NOTE(review): these five lists are never appended to or read in this cell.
underpredictions_list = []
overpredictions_list = []
cores_less_than_requested_list = []
cores_saved_list = []
group_names = []

# Custom loss parameters
# With alpha == beta the custom loss weights both error signs equally.
alpha = 1.0
beta = 1.0 

for owner in top_10_users.index: # Apply the ML models for all 10 groups
    print(f"Processing user: {owner}")
    # Copy the user's rows before handing them to preprocess_data.
    group_data = df[df['owner'] == owner].copy()
    
    X_train, X_test, y_train, y_test, requested_resource_data = preprocess_data(group_data, lag=3)
    input_shape = (X_train.shape[1], 1)
    # Mean of the training targets; added to every model's raw prediction below.
    # NOTE(review): the models are fitted on the unshifted targets, so this raises
    # all predictions by the training mean — presumably a deliberate margin; confirm.
    bias = np.mean(y_train)


    # One EarlyStopping instance is passed to all three fit() calls of this iteration.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    
    # LSTM Model
    lstm_model = build_lstm_model(input_shape, alpha, beta)
    lstm_history = lstm_model.fit(X_train, y_train, epochs=50, batch_size=32, 
                                 validation_split=0.2, callbacks=[early_stopping], verbose=0)
    lstm_y_pred_without_bias = lstm_model.predict(X_test).flatten()
    lstm_y_pred = lstm_y_pred_without_bias + bias
    mse_lstm = mean_squared_error(y_test, lstm_y_pred)
    
    # CNN Model
    cnn_model = build_cnn_model(input_shape, alpha, beta)
    cnn_history = cnn_model.fit(X_train, y_train, epochs=50, batch_size=32, 
                               validation_split=0.2, callbacks=[early_stopping], verbose=0)
    cnn_y_pred_without_bias = cnn_model.predict(X_test).flatten()
    cnn_y_pred = cnn_y_pred_without_bias + bias
    mse_cnn = mean_squared_error(y_test, cnn_y_pred)
    
    # BNN Model
    bnn_model = build_bnn_model(input_shape, alpha, beta)
    bnn_history = bnn_model.fit(X_train, y_train, epochs=50, batch_size=32, 
                               validation_split=0.2, callbacks=[early_stopping], verbose=0)
    bnn_y_pred_without_bias = bnn_model.predict(X_test).flatten()
    bnn_y_pred = bnn_y_pred_without_bias + bias
    mse_bnn = mean_squared_error(y_test, bnn_y_pred)
    
    # Calculate some more informative metrics
    for y_pred, mse, model_name in zip([lstm_y_pred, cnn_y_pred, bnn_y_pred],
                                        [mse_lstm, mse_cnn, mse_bnn],
                                        ['LSTM', 'CNN', 'BNN']):
        # Counts of test samples predicted below / above the true value.
        underpredictions = np.sum(y_test > y_pred)
        overpredictions = np.sum(y_test < y_pred)
        # Independent of the model: true usage below the requested cores.
        cores_less_than_requested = np.sum(y_test < requested_resource_data)
        # Predictions that stay under the request while still above the true usage.
        cores_saved = np.sum((y_pred < requested_resource_data) & (y_pred > y_test))

        total_samples = len(y_test)  # Total number of test samples

        # Calculate percentages
        # NOTE(review): total_samples is 0 when the test split is empty; the
        # divisions below are not guarded against that.
        underpredictions_percent = (underpredictions / total_samples) * 100
        overpredictions_percent = (overpredictions / total_samples) * 100
        cores_less_than_requested_percent = (cores_less_than_requested / total_samples) * 100
        cores_saved_percent = (cores_saved / total_samples) * 100
        
        # Append results
        results.append({
            'owner': owner,
            'model': model_name,
            'underpredictions_percent': underpredictions_percent,
            'overpredictions_percent': overpredictions_percent,
            'cores_less_than_requested_percent': cores_less_than_requested_percent,
            'cores_saved_percent': cores_saved_percent,
            'mse': mse
        })

    # Plot the results
    draw_combined_prediction_results(y_test, lstm_y_pred, cnn_y_pred, bnn_y_pred, requested_resource_data, owner)
    
# After the loop, lstm_model / cnn_model / bnn_model and X_test / y_test still
# refer to the last user processed; the later evaluation cell reads them.
plot_metrics_histogram(results)


In [None]:
# Draws the summary chart again; the training cell already makes this same call
# as its last statement.
plot_metrics_histogram(results)

In [None]:
# Evaluate all models
# The models and X_test / y_test used here are whatever the training loop left
# behind, i.e. those of the last user it processed.
# NOTE(review): this rebinds `results` from the list of per-user rows to a dict,
# so plot_metrics_histogram(results) can no longer be re-run after this cell.
results = {}

# evaluate_model scores the raw model.predict() output; the `bias` offset that the
# training loop adds to its predictions is not applied here.
for model_name, model in [('LSTM', lstm_model), ('BNN', bnn_model), ('CNN', cnn_model)]:
    results[model_name] = evaluate_model(model, X_test, y_test)

# Print the three scores per model, with a blank line after each model.
for model_name, metrics in results.items():
    print(f"Results for {model_name}:")
    print(f"DTW Score: {metrics['DTW']}")
    print(f"MAE: {metrics['MAE']}")
    print(f"MAPE: {metrics['MAPE']}\n")
    

In [None]:
# --- MULTIPLE RESOURCE PREDICTION ----