In [1]:
import os
import sys
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt


# Eagle

In [None]:
# Load the Eagle experiment results into a nested dictionary indexed as
# eagle_results[start_date][window_size][model][bias_type].
start_time = '2018-11-14'
end_time = '2023-02-01'
target_feature = 'run_time'
models = ['baseline', 'clustering_xgb', 'clustering_rf', 'resampled_xgb', 'resampled_rf']
bias_types = ['none', 'two_sigma']

eagle_results = {}

# Each entry is (spacing of the window start dates, window lengths in days loaded on that grid).
# NOTE: pandas >= 2.2 prefers the alias 'ME' for month-end; '1M' is kept so older versions still work.
window_schedules = [('7D', [7, 14]), ('1M', [30, 60, 90])]

for freq, window_sizes in window_schedules:
    start_dates = pd.date_range(start=start_time, end=end_time, freq=freq)

    for start_date in start_dates:
        # setdefault (not "= {}") keeps what an earlier pass stored for this date: a month-end
        # that also lies on the weekly grid (e.g. 2019-07-31) must retain its 7/14-day results.
        date_entry = eagle_results.setdefault(start_date, {})

        for window_size in window_sizes:
            window_entry = date_entry[window_size] = {}

            for model in models:
                model_entry = window_entry[model] = {}

                for bias_type in bias_types:
                    file_path = (
                        f'/projectnb/peaclab-mon/boztop/resource-allocation/exp_results/eagle/'
                        f'window_size={window_size}/'
                        f'eagle_{model}_bias_{bias_type}_{target_feature}_window_{window_size}_days_{start_date}.pkl'
                    )
                    if os.path.exists(file_path):
                        model_entry[bias_type] = pd.read_pickle(file_path)
                    else:
                        # A missing file is reported and leaves an empty placeholder, in both passes.
                        model_entry[bias_type] = {}
                        print(f"Error: The file {file_path} was not found.")
                
              



In [None]:
# Flatten eagle_results into one long table (one row per job) and persist it as parquet.
value_columns = ['req', 'act', 'pred']
output_columns = ['start_date', 'window_size', 'model', 'bias_type'] + value_columns

data = []  # one DataFrame per (start_date, window_size, model, bias_type) leaf

for start_date, window_data in eagle_results.items():
    for window_size, model_data in window_data.items():
        for model, bias_data in model_data.items():
            for bias_type, result in bias_data.items():
                if isinstance(result, pd.DataFrame) and set(value_columns).issubset(result.columns):
                    # Tag the whole frame at once instead of building a dict per row with
                    # iterrows(), which is slow and upcasts every row to a common dtype.
                    data.append(
                        result[value_columns].assign(
                            start_date=start_date,
                            window_size=window_size,
                            model=model,
                            bias_type=bias_type,
                        )
                    )
                else:
                    print(f"Warning: Missing expected columns in result for {start_date}, {window_size}, {model}, {bias_type}")

if data:
    df = pd.concat(data, ignore_index=True)[output_columns]
else:
    # pd.concat rejects an empty list; keep the schema so the parquet file is still well-formed.
    df = pd.DataFrame(columns=output_columns)

df.to_parquet('eagle_results.parquet', engine='pyarrow', index=False)

print("Saved eagle_results to eagle_results.parquet")

# BU SCC

In [None]:
# Load the BU SCC experiment results into a nested dictionary indexed as
# bu_scc_results[start_date][window_size][model][bias_type].
start_time = '2022-12-31'
end_time = '2023-12-31'
# NOTE(review): target_feature is a list, so it is interpolated into the file name as
# "['execution_time']" - confirm the result files were saved under that name.
target_feature = ['execution_time']
models = ['baseline', 'clustering_xgb', 'clustering_rf', 'resampled_xgb', 'resampled_rf']
bias_types = ['none', 'two_sigma']

bu_scc_results = {}

# Each entry is (spacing of the window start dates, window lengths in days loaded on that grid).
# NOTE: pandas >= 2.2 prefers the alias 'ME' for month-end; '1M' is kept so older versions still work.
window_schedules = [('7D', [7, 14]), ('1M', [30, 60, 90])]

for freq, window_sizes in window_schedules:
    start_dates = pd.date_range(start=start_time, end=end_time, freq=freq)

    for start_date in start_dates:
        # setdefault (not "= {}") keeps what an earlier pass stored for this date: 2022-12-31
        # is on both the weekly and the month-end grid and must retain its 7/14-day results.
        date_entry = bu_scc_results.setdefault(start_date, {})

        for window_size in window_sizes:
            window_entry = date_entry[window_size] = {}

            for model in models:
                model_entry = window_entry[model] = {}

                for bias_type in bias_types:
                    file_path = (
                        f'/projectnb/peaclab-mon/boztop/resource-allocation/exp_results/bu_scc/'
                        f'window_size={window_size}/'
                        f'bu_scc_{model}_bias_{bias_type}_{target_feature}_window_{window_size}_days_{start_date}.pkl'
                    )
                    # A missing file raises FileNotFoundError here (no placeholder is stored).
                    model_entry[bias_type] = pd.read_pickle(file_path)

           

# Fugaku

In [None]:
# Load the Fugaku experiment results into a nested dictionary indexed as
# fugaku_results[start_date][window_size][model][bias_type].
start_time = '2024-04-05'
end_time = '2024-04-30'
# NOTE(review): target_feature is a list, so it is interpolated into the file name as
# "['duration']" - confirm the result files were saved under that name.
target_feature = ['duration']
models = ['baseline', 'clustering_xgb', 'clustering_rf', 'resampled_xgb', 'resampled_rf']
bias_types = ['none', 'two_sigma']

fugaku_results = {}

# Each entry is (spacing of the window start dates, window lengths in days loaded on that grid).
window_schedules = [('7D', [7, 14]), ('3D', [3])]

for freq, window_sizes in window_schedules:
    start_dates = pd.date_range(start=start_time, end=end_time, freq=freq)

    for start_date in start_dates:
        # setdefault (not "= {}") keeps what an earlier pass stored for this date: 2024-04-05
        # and 2024-04-26 are on both grids and must retain their 7/14-day results.
        date_entry = fugaku_results.setdefault(start_date, {})

        for window_size in window_sizes:
            window_entry = date_entry[window_size] = {}

            for model in models:
                model_entry = window_entry[model] = {}

                for bias_type in bias_types:
                    file_path = (
                        f'/projectnb/peaclab-mon/boztop/resource-allocation/exp_results/fugaku/'
                        f'window_size={window_size}/'
                        f'fugaku_{model}_bias_{bias_type}_{target_feature}_window_{window_size}_days_{start_date}.pkl'
                    )
                    # A missing file raises FileNotFoundError here (no placeholder is stored).
                    model_entry[bias_type] = pd.read_pickle(file_path)
                
              



# M100

In [None]:
# Load the M100 experiment results into a nested dictionary indexed as
# m100_results[start_date][window_size][model][bias_type].
start_time = '2021-12-01'
end_time = '2021-12-31'
# NOTE(review): target_feature is a list, so it is interpolated into the file name as
# "['execution_time']" - confirm the result files were saved under that name.
target_feature = ['execution_time']
models = ['baseline', 'clustering_xgb', 'clustering_rf', 'resampled_xgb', 'resampled_rf']
bias_types = ['none', 'two_sigma']

m100_results = {}

# Each entry is (spacing of the window start dates, window lengths in days loaded on that grid).
window_schedules = [('7D', [7, 14]), ('3D', [3])]

for freq, window_sizes in window_schedules:
    start_dates = pd.date_range(start=start_time, end=end_time, freq=freq)

    for start_date in start_dates:
        # setdefault (not "= {}") keeps what an earlier pass stored for this date: 2021-12-01
        # and 2021-12-22 are on both grids and must retain their 7/14-day results.
        date_entry = m100_results.setdefault(start_date, {})

        for window_size in window_sizes:
            window_entry = date_entry[window_size] = {}

            for model in models:
                model_entry = window_entry[model] = {}

                for bias_type in bias_types:
                    file_path = (
                        f'/projectnb/peaclab-mon/boztop/resource-allocation/exp_results/m100/'
                        f'window_size={window_size}/'
                        f'm100_{model}_bias_{bias_type}_{target_feature}_window_{window_size}_days_{start_date}.pkl'
                    )
                    # A missing file raises FileNotFoundError here (no placeholder is stored).
                    model_entry[bias_type] = pd.read_pickle(file_path)
                
              


# Sandia

In [None]:
# Load the Sandia experiment results into a nested dictionary indexed as
# sandia_results[start_date][window_size][model][bias_type].
start_time = '2024-05-01'
end_time = '2024-09-23'
# NOTE(review): target_feature is a list, so it is interpolated into the file name as
# "['execution_time']" - confirm the result files were saved under that name.
target_feature = ['execution_time']
models = ['baseline', 'clustering_xgb', 'clustering_rf', 'resampled_xgb', 'resampled_rf']
bias_types = ['none', 'two_sigma']

sandia_results = {}

# Each entry is (spacing of the window start dates, window lengths in days loaded on that grid).
# NOTE: pandas >= 2.2 prefers the alias 'ME' for month-end; '1M' is kept so older versions still work.
window_schedules = [('7D', [7, 14]), ('1M', [30])]

for freq, window_sizes in window_schedules:
    start_dates = pd.date_range(start=start_time, end=end_time, freq=freq)

    for start_date in start_dates:
        # setdefault (not "= {}") keeps what an earlier pass stored for this date: 2024-07-31
        # is on both the weekly and the month-end grid and must retain its 7/14-day results.
        date_entry = sandia_results.setdefault(start_date, {})

        for window_size in window_sizes:
            window_entry = date_entry[window_size] = {}

            for model in models:
                model_entry = window_entry[model] = {}

                for bias_type in bias_types:
                    file_path = (
                        f'/projectnb/peaclab-mon/boztop/resource-allocation/exp_results/sandia/'
                        f'window_size={window_size}/'
                        f'sandia_{model}_bias_{bias_type}_{target_feature}_window_{window_size}_days_{start_date}.pkl'
                    )
                    # A missing file raises FileNotFoundError here (no placeholder is stored).
                    model_entry[bias_type] = pd.read_pickle(file_path)
                
              
            
            


# Print the underprediction ratio (share of jobs with pred < act) of every model / bias
# combination over the weekly Sandia windows with a 7-day window size.
start_dates = pd.date_range(start=start_time, end=end_time, freq='7D')

for model in models:
    for bias in bias_types:
        total_underpred = 0
        total_jobs = 0

        for start_date in start_dates:
            # Chained .get() so a date without 7-day results is skipped instead of raising.
            df = sandia_results.get(start_date, {}).get(7, {}).get(model, {}).get(bias)

            if df is None:
                continue

            # Count directly on the boolean mask; no copy of the frame is needed.
            total_underpred += int((df['pred'] < df['act']).sum())
            total_jobs += len(df)

        if total_jobs == 0:
            # Nothing was loaded for this combination: report it rather than dividing by zero.
            print(f"Model name: {model}, Bias type: {bias}, UR: n/a (no jobs found)")
            continue

        underpred_ratio = 100 * (total_underpred / total_jobs)

        print(f"Model name: {model}, Bias type: {bias}, UR: {underpred_ratio}")
   

# ALL Results

In [None]:
import pandas as pd

def find_metrics(dictionary, start_time, end_time, input_freq, window_size, df_name):
    """Aggregate under- and over-prediction metrics per model and bias type.

    Parameters
    ----------
    dictionary : dict
        Nested results indexed as ``dictionary[start_date][window_size][model][bias]``.
        Each leaf is expected to be a DataFrame with 'req', 'act' and 'pred' columns.
    start_time, end_time : str
        Bounds passed to ``pd.date_range`` to enumerate the window start dates.
    input_freq : str
        Frequency passed to ``pd.date_range`` (e.g. '3D', '1M').
    window_size : int
        Window-size key to read for every start date.
    df_name : str
        Label written to the "DataFrame" column of the output.

    Returns
    -------
    pandas.DataFrame
        One row per (model, bias) with the success rate, the underprediction ratio,
        the summed user-request and model overprediction (sum of ``req - act`` /
        ``pred - act`` over overpredicted jobs, divided by 3600), and the mean and
        standard deviation of the per-date underprediction counts.

    Notes
    -----
    Start dates missing from ``dictionary``, dates without ``window_size`` and leaves
    that are not DataFrames (e.g. the empty placeholders a loader stores for missing
    files) are reported with ``print`` and skipped instead of raising.
    The division by 3600 presumably converts seconds to hours - confirm the units.
    """
    models = ['baseline', 'clustering_xgb', 'clustering_rf', 'resampled_xgb', 'resampled_rf']
    bias_types = ['none', 'two_sigma']

    start_dates = pd.date_range(start=start_time, end=end_time, freq=input_freq)
    results = []

    for model in models:
        for bias in bias_types:
            total_underpred = 0
            total_jobs = 0
            total_user_req = 0
            total_pred = 0
            underpred_counts = []  # underprediction count of every window that was found

            for start_date in start_dates:
                window_data = dictionary.get(start_date)
                if window_data is None:
                    print(f"Missing start_date {start_date}")
                    continue

                if window_size not in window_data:
                    print(f"Missing window_size {window_size} for {start_date}")
                    continue

                df = window_data[window_size].get(model, {}).get(bias)
                if not isinstance(df, pd.DataFrame):
                    print(f"Missing result for {start_date}, {window_size}, {model}, {bias}")
                    continue

                underpredictions = df[df['pred'] < df['act']]
                underpred_counts.append(len(underpredictions))
                total_underpred += len(underpredictions)
                total_jobs += len(df)

                # Over-request by the user: jobs that used less than what was requested.
                user_overpred = df[df['act'] < df['req']]
                total_user_req += (user_overpred['req'] - user_overpred['act']).sum()

                # Over-prediction by the model: jobs that used less than what was predicted.
                overpred = df[df['act'] < df['pred']]
                total_pred += (overpred['pred'] - overpred['act']).sum()

            if underpred_counts:
                mean_underpred = sum(underpred_counts) / len(underpred_counts)
            else:
                mean_underpred = 0

            # The sample standard deviation is NaN for a single value; report 0 in that
            # case, exactly as for the empty case.
            if len(underpred_counts) > 1:
                std_underpred = float(pd.Series(underpred_counts).std())
            else:
                std_underpred = 0.0

            if total_jobs > 0:
                success_rate = 100 * (1 - total_underpred / total_jobs)
                underpred_ratio = 100 * (total_underpred / total_jobs)
            else:
                success_rate = 0
                underpred_ratio = 0

            result_entry = {
                "DataFrame": df_name,
                "Model": model,
                "Bias": bias,
                "Success Rate (%)": round(success_rate, 2),
                "Underprediction Ratio (%)": round(underpred_ratio, 2),
                "Total User Request Overprediction (hours)": round(total_user_req / 3600, 2),
                "Total Model Overprediction (hours)": round(total_pred / 3600, 2),
                "Mean Underprediction Count": round(mean_underpred, 2),
                "Std Underprediction Count": round(std_underpred, 2)
            }

            results.append(result_entry)

    return pd.DataFrame(results)


In [None]:
# Each tuple: (label, nested results dict, first date, last date, start-date frequency, window size).
datasets = [
    ("Eagle", eagle_results, '2018-11-14', '2023-02-01', '1M', '30'),
    ("BU SCC", bu_scc_results, '2022-12-31', '2023-12-31', '1M', '60'),
    ("Fugaku", fugaku_results, '2024-04-05', '2024-04-30', '3D', '3'),
    ("M100", m100_results, '2021-12-01', '2021-12-31', '3D', '3'),
]
# Currently disabled entry: ("Sandia", sandia_results,'2024-05-01','2024-09-23','7D','7')

all_results = []

for df_name, dictionary, start_time, end_time, input_freq, window_size in datasets:
    # The window size arrives as text; drop any 'D' / 'M' unit letters before converting.
    digits_only = window_size.replace('D', '').replace('M', '')
    window_size = int(digits_only)

    df_results = find_metrics(
        dictionary=dictionary,
        start_time=start_time,
        end_time=end_time,
        input_freq=input_freq,
        window_size=window_size,
        df_name=df_name,
    )
    all_results += [df_results]

# Stack the per-dataset metric tables into one frame and display it.
final_results_df = pd.concat(all_results, ignore_index=True)

final_results_df

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import to_rgb
import seaborn as sns
import matplotlib.colors as mcolors

'''
Possible values are: Accent, Accent_r, Blues, Blues_r, BrBG, BrBG_r, BuGn, BuGn_r, BuPu, 
BuPu_r, CMRmap, CMRmap_r, Dark2, Dark2_r, GnBu, GnBu_r, Greens, Greens_r, Greys, Greys_r, 
OrRd, OrRd_r, Oranges, Oranges_r, PRGn, PRGn_r, Paired, Paired_r, Pastel1, Pastel1_r, Pastel2, 
Pastel2_r, PiYG, PiYG_r, PuBu, PuBuGn, PuBuGn_r, PuBu_r, PuOr, PuOr_r, PuRd, PuRd_r, Purples, 
Purples_r, RdBu, RdBu_r, RdGy, RdGy_r, RdPu, RdPu_r, RdYlBu, RdYlBu_r, RdYlGn, RdYlGn_r, Reds, 
Reds_r, Set1, Set1_r, Set2, Set2_r, Set3, Set3_r, Spectral, Spectral_r, Wistia, Wistia_r, YlGn,
YlGnBu, YlGnBu_r, YlGn_r, YlOrBr, YlOrBr_r, YlOrRd, YlOrRd_r, afmhot, afmhot_r, autumn, autumn_r, 
binary, binary_r, bone, bone_r, brg, brg_r, bwr, bwr_r, cividis, cividis_r, cool, cool_r, coolwarm, 
coolwarm_r, copper, copper_r, crest, crest_r, cubehelix, cubehelix_r, flag, flag_r, flare, flare_r, 
gist_earth, gist_earth_r, gist_gray, gist_gray_r, gist_heat, gist_heat_r, gist_ncar, gist_ncar_r, 
gist_rainbow, gist_rainbow_r, gist_stern, gist_stern_r, gist_yarg, gist_yarg_r, gnuplot, gnuplot2, 
gnuplot2_r, gnuplot_r, gray, gray_r, hot, hot_r, hsv, hsv_r, icefire, icefire_r, inferno, inferno_r, 
jet, jet_r, magma, magma_r, mako, mako_r, nipy_spectral, nipy_spectral_r, CMRmap, ocean_r, pink, pink_r, 
plasma, plasma_r, prism, prism_r, rainbow, rainbow_r, rocket, rocket_r, seismic, seismic_r, spring,
spring_r, summer, summer_r, tab10, tab10_r, tab20, tab20_r, tab20b, tab20b_r, tab20c, tab20c_r, 
terrain, terrain_r, twilight, twilight_r, twilight_shifted, twilight_shifted_r, viridis, viridis_r,
vlag, vlag_r, winter, winter_r
'''
# Bar chart per dataset: total over-requested hours by users next to the total
# over-predicted hours of every model, restricted to the 'two_sigma' bias rows.
filtered_df = final_results_df[final_results_df["Bias"] == "two_sigma"]

datasets = filtered_df["DataFrame"].unique()
models = filtered_df["Model"].unique()

if len(datasets) == 0:
    raise ValueError("final_results_df has no 'two_sigma' rows; nothing to plot.")

# One color for the user-request bar plus one per model (6 colors for the 5 default models).
base_colors = sns.color_palette("Dark2", len(models) + 1)

x_labels = {
    "baseline - User Request": "User Requested",
    "baseline": r'Single-XGB + $2\sigma$',
    "clustering_xgb": r'Clustering-XGB + $2\sigma$',
    "clustering_rf": r'Clustering-RF + $2\sigma$',
    "resampled_xgb": r'Resampling-XGB + $2\sigma$',
    "resampled_rf": r'Resampling-RF + $2\sigma$',
}

# One panel per dataset instead of a fixed four; squeeze=False keeps `axes` an array
# even for a single dataset. With four datasets the figure size is the original (16, 4).
fig, axes = plt.subplots(1, len(datasets), figsize=(4 * len(datasets), 4), squeeze=False)
axes = axes.flatten()

# Bar slots: index 0 is the user request, 1..len(models) are the models.
x = np.arange(len(models) + 1)
width = 0.5

for idx, df_name in enumerate(datasets):

    user_requested_hours_df = []
    model_overprediction_hours_df = []

    for model in models:

        df_subset = filtered_df[(filtered_df["DataFrame"] == df_name) & (filtered_df["Model"] == model)]

        user_hours = df_subset["Total User Request Overprediction (hours)"].sum()
        model_hours = df_subset["Total Model Overprediction (hours)"].sum()

        user_requested_hours_df.append(user_hours)
        model_overprediction_hours_df.append(model_hours)

        ratio = user_hours / model_hours

        print(f"Dataset: {df_name}, Model: {model}, Ratio: {ratio:.2f}")

    ax = axes[idx]

    # Only the first model's user-request total is drawn as the single "User Requested" bar.
    bars_user_request = [
        ax.bar(x[0], user_requested_hours_df[0], width, color=base_colors[0], alpha=0.7)
    ]

    bars_model_overprediction = [
        ax.bar(x[i + 1], hours, width, color=base_colors[i + 1], alpha=0.7)
        for i, hours in enumerate(model_overprediction_hours_df)
    ]

    ax.set_ylabel("Total Overpredicted Hours", fontsize=16)
    ax.set_title(f"{df_name}", fontsize=16)
    # Bars are identified through the shared legend, so the x ticks are hidden.
    ax.set_xticks([])


# The legend reuses the bar containers of the last panel; colors are the same in every panel.
handles_user_request = list(bars_user_request)
handles_model_overprediction = list(bars_model_overprediction)  # one per model

labels_user_request = ["User Requested"]
# Fall back to the raw model name for models without a display label.
labels_model_overprediction = [x_labels.get(model, model) for model in models]

fig.legend(handles_user_request + handles_model_overprediction,
           labels_user_request + labels_model_overprediction,
           title='Model Name', title_fontsize='14', loc="upper center", bbox_to_anchor=(0.5, 1.3), ncol=3, fontsize=14)


fig.suptitle("User Request vs Model Execution Time Overprediction Hours", fontsize=20)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plot_name = 'exec_time_overpred.svg'
file_path = os.path.join('/projectnb/peaclab-mon/boztop/resource-allocation/plots', plot_name)
plt.savefig(file_path, format="svg", dpi=300, bbox_inches='tight')

plt.show()


# Other Resource Types


In [None]:
# Load the Fugaku CPU-prediction results into a nested dictionary indexed as
# fugaku_cpu_results[start_date][window_size][model][bias_type].
start_time = '2024-04-05'
end_time = '2024-04-30'
# NOTE(review): target_feature is a list, so it is interpolated into the file name as
# "['cnumut']" - confirm the result files were saved under that name.
target_feature = ['cnumut']
models = ['baseline', 'clustering_xgb', 'clustering_rf', 'resampled_xgb', 'resampled_rf']
bias_types = ['none', 'two_sigma']

fugaku_cpu_results = {}

# Each entry is (spacing of the window start dates, window lengths in days loaded on that grid).
window_schedules = [('7D', [7, 14]), ('3D', [3])]

for freq, window_sizes in window_schedules:
    start_dates = pd.date_range(start=start_time, end=end_time, freq=freq)

    for start_date in start_dates:
        # setdefault (not "= {}") keeps what an earlier pass stored for this date: 2024-04-05
        # and 2024-04-26 are on both grids and must retain their 7/14-day results.
        date_entry = fugaku_cpu_results.setdefault(start_date, {})

        for window_size in window_sizes:
            window_entry = date_entry[window_size] = {}

            for model in models:
                model_entry = window_entry[model] = {}

                for bias_type in bias_types:
                    file_path = (
                        f'/projectnb/peaclab-mon/boztop/resource-allocation/exp_results/fugaku/cpu_pred/'
                        f'window_size={window_size}/'
                        f'fugaku_{model}_bias_{bias_type}_{target_feature}_window_{window_size}_days_{start_date}.pkl'
                    )
                    # A missing file raises FileNotFoundError here (no placeholder is stored).
                    model_entry[bias_type] = pd.read_pickle(file_path)
                
              



In [None]:
# Load the Fugaku memory-prediction results into a nested dictionary indexed as
# fugaku_mem_results[start_date][window_size][model][bias_type].
start_time = '2024-04-05'
end_time = '2024-04-30'
# NOTE(review): target_feature is a list, so it is interpolated into the file name as
# "['mmszu']" - confirm the result files were saved under that name.
target_feature = ['mmszu']
models = ['baseline', 'clustering_xgb', 'clustering_rf', 'resampled_xgb', 'resampled_rf']
bias_types = ['none', 'two_sigma']

fugaku_mem_results = {}

# Each entry is (spacing of the window start dates, window lengths in days loaded on that grid).
window_schedules = [('7D', [7, 14]), ('3D', [3])]

for freq, window_sizes in window_schedules:
    start_dates = pd.date_range(start=start_time, end=end_time, freq=freq)

    for start_date in start_dates:
        # setdefault (not "= {}") keeps what an earlier pass stored for this date: 2024-04-05
        # and 2024-04-26 are on both grids and must retain their 7/14-day results.
        date_entry = fugaku_mem_results.setdefault(start_date, {})

        for window_size in window_sizes:
            window_entry = date_entry[window_size] = {}

            for model in models:
                model_entry = window_entry[model] = {}

                for bias_type in bias_types:
                    file_path = (
                        f'/projectnb/peaclab-mon/boztop/resource-allocation/exp_results/fugaku/mem_pred/'
                        f'window_size={window_size}/'
                        f'fugaku_{model}_bias_{bias_type}_{target_feature}_window_{window_size}_days_{start_date}.pkl'
                    )
                    # A missing file raises FileNotFoundError here (no placeholder is stored).
                    model_entry[bias_type] = pd.read_pickle(file_path)
                
              


            

In [None]:
# Build results[resource][model][bias]: for each resource ('cpu' / 'mem'), one DataFrame that
# stacks all windows and adds the overestimation factors pred/act and req/act.
datasets = [("Fugaku_cpu", fugaku_cpu_results, '2024-04-05', '2024-04-30', '3D', '3'),
            ("Fugaku_mem", fugaku_mem_results, '2024-04-05', '2024-04-30', '3D', '3')]

models = ['baseline', 'clustering_xgb', 'clustering_rf', 'resampled_xgb', 'resampled_rf']
bias_types = ['two_sigma']

results = {}

for df_name, dictionary, start_time, end_time, input_freq, window_size in datasets:
    window_size = int(window_size.replace('D', '').replace('M', ''))

    # The resource key is derived from the dataset label; other labels are ignored.
    lowered_name = df_name.lower()
    if 'cpu' in lowered_name:
        resource = 'cpu'
    elif 'mem' in lowered_name:
        resource = 'mem'
    else:
        continue

    # Derive the window start dates from this dataset's own tuple instead of relying on a
    # `start_dates` variable left behind by whichever loader cell ran last.
    start_dates = pd.date_range(start=start_time, end=end_time, freq=input_freq)

    results[resource] = {}

    for model in models:
        results[resource][model] = {}

        for bias in bias_types:
            window_frames = []

            for start_date in start_dates:
                # Work on a copy so the loaded results are not modified, which keeps this
                # cell safe to re-run.
                df = dictionary[start_date][window_size][model][bias].copy()
                df['start_date'] = start_date
                df['model'] = model
                df['bias'] = bias

                # Overestimation factors; a value above 1 means more than actually used.
                df[f'{resource}_overprediction_{model}'] = df['pred'] / df['act']
                df[f'{resource}_overprediction_user_req'] = df['req'] / df['act']
                window_frames.append(df)

            results[resource][model][bias] = pd.concat(window_frames, axis=0, ignore_index=True)
                
                

In [None]:
# Display names of the models in figure legends.
rename_models = dict(
    baseline=r'Single-XGB + 2$\sigma$',
    clustering_xgb=r'Clustering-XGB + 2$\sigma$',
    clustering_rf=r'Clustering-RF + 2$\sigma$',
    resampled_xgb=r'Resampling-XGB + 2$\sigma$',
    resampled_rf=r'Resampling-RF + 2$\sigma$',
)

colors = sns.color_palette("Dark2", 15)


def plot_overestimation_mem_kde(dictionary,
                                title='Fugaku Dataset Resource Prediction Overestimation Results'):
    """Draw memory and CPU overestimation-factor densities side by side and save them.

    Parameters
    ----------
    dictionary : dict
        Indexed as ``dictionary[resource][model]['two_sigma']`` with resource in
        {'mem', 'cpu'}; each frame must hold the columns
        ``{resource}_overprediction_{model}`` and ``{resource}_overprediction_user_req``.
    title : str
        Figure-level title.

    The function reads the module-level ``models``, ``colors`` and ``rename_models``,
    writes 'memory_cpu_OF_density.svg' to the project plots directory and shows the figure.
    """
    panel_specs = [
        ('mem', 'Maximum Memory Size (Bytes) Prediction'),
        ('cpu', 'Number of Processors Prediction'),
    ]
    fig, axs = plt.subplots(1, 2, figsize=(16, 5))

    # Styling shared by every density curve.
    kde_style = dict(fill=True, alpha=0.6, linewidth=2, log_scale=True)

    for panel, (resource, panel_title) in zip(axs, panel_specs):
        # Color 0 is reserved for the user request; models start at color 1 in each panel.
        for color_idx, model in enumerate(models, start=1):
            frame = dictionary[resource][model]['two_sigma']

            if model == 'baseline':
                # The user-request curve is drawn once, taken from the baseline frame.
                sns.kdeplot(frame[f'{resource}_overprediction_user_req'],
                            label='User Requested Values', color=colors[0], ax=panel, **kde_style)

            sns.kdeplot(frame[f'{resource}_overprediction_{model}'],
                        label=rename_models[model], color=colors[color_idx], ax=panel, **kde_style)

        panel.axvline(1, color='red', linestyle='--', label='Perfect Match (Ratio=1)')
        panel.set_xscale('log')
        panel.set_title(f"{panel_title}", fontsize=14)
        panel.set_xlabel('')
        panel.set_ylabel('Density', fontsize=16)
        panel.grid(True)

    fig.suptitle(title, fontsize=20, y=1.01)

    # A single legend for the whole figure, built from the entries of the first panel.
    legend_handles, legend_labels = axs[0].get_legend_handles_labels()
    fig.legend(legend_handles, legend_labels, title='Model Name', title_fontsize='14',
               loc='upper center', fontsize=14, bbox_to_anchor=(0.415, 1.31), ncol=3)

    # Shared x-axis caption placed below both panels.
    fig.text(0.5, 0.04, 'Overestimation Factor (Predicted/Actual)', ha='center', fontsize=16)

    plt.grid(True)

    plot_name = 'memory_cpu_OF_density.svg'
    file_path = os.path.join('/projectnb/peaclab-mon/boztop/resource-allocation/plots', plot_name)
    plt.savefig(file_path, format="svg", dpi=300, bbox_inches='tight')

    plt.show()


In [None]:
# Draw, save and show the memory / CPU overestimation density figure built from `results`.
plot_overestimation_mem_kde(results)


In [None]:
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

rename_models = {
    'baseline': r'Single-XGB + 2$\sigma$',
    'clustering_xgb': r'Clustering-XGB + 2$\sigma$',
    'clustering_rf': r'Clustering-RF + 2$\sigma$',
    'resampled_xgb': r'Resampling-XGB + 2$\sigma$',
    'resampled_rf': r'Resampling-RF + 2$\sigma$'
}

colors = sns.color_palette("Dark2", 15)

def plot_overestimation_mem_ridgeline(dictionary,
                                      title='Fugaku Dataset Resource Prediction Overestimation Results'):
    """Plot one overestimation-factor density figure per resource and save each as SVG.

    Parameters
    ----------
    dictionary : dict
        Indexed as ``dictionary[resource][model]['two_sigma']`` with resource in
        {'mem', 'cpu'}; each frame must hold ``{resource}_overprediction_{model}`` and the
        baseline frame must also hold ``{resource}_overprediction_user_req``.
    title : str
        Accepted for interface compatibility; not used by this function.

    For every resource the user-request ratio and each model's ratio are drawn as
    overlaid KDE curves on a shared log-scaled axis (no vertical offset is applied),
    each annotated with its name at the series median. Figures are written to
    '<resource>_overestimation_ridgeline.svg' in the project plots directory.
    """
    resources = ['mem', 'cpu']
    titles = ['Maximum Memory Size (Bytes) Prediction', 'Number of Processors Prediction']
    models_sorted = ['baseline'] + [m for m in rename_models if m != 'baseline']

    for resource, panel_title in zip(resources, titles):
        fig, ax = plt.subplots(figsize=(10, 6))

        # The user-request ratio is its own series, read from the baseline frame. The
        # baseline model keeps its real name instead of being labelled as the user request.
        baseline_df = dictionary[resource]['baseline']['two_sigma']
        series = [('User Requested Values', baseline_df[f'{resource}_overprediction_user_req'])]
        series += [
            (rename_models[model],
             dictionary[resource][model]['two_sigma'][f'{resource}_overprediction_{model}'])
            for model in models_sorted
        ]

        for j, (series_label, data) in enumerate(series):
            sns.kdeplot(data, fill=True, alpha=0.7, linewidth=1.5, log_scale=True,
                        color=colors[j], ax=ax)

            # Name each curve next to its median, stacking the labels vertically.
            ax.text(data.median(), j * 0.3 + 0.5, series_label,
                    fontsize=12, color=colors[j], verticalalignment='bottom')

        ax.axvline(1, color='red', linestyle='--', label='Perfect Match (Ratio=1)')
        ax.set_xscale('log')
        ax.set_title(panel_title, fontsize=14)
        ax.set_xlabel('Overestimation Factor (Predicted/Actual)', fontsize=12)
        ax.set_ylabel('Density', fontsize=12)
        ax.grid(True)

        # Save each figure separately
        plot_name = f'{resource}_overestimation_ridgeline.svg'
        file_path = os.path.join('/projectnb/peaclab-mon/boztop/resource-allocation/plots', plot_name)
        fig.savefig(file_path, format="svg", dpi=300, bbox_inches='tight')

        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)


In [None]:
# Draw, save and show one overestimation density figure per resource from `results`.
plot_overestimation_mem_ridgeline(results)

