Import necessary packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Load the Data

In [45]:
# Available benchmark runs: run id (the sub-directory name under parent_dir)
# mapped to its (NMSG, ITERATIONS) configuration.
RUN_CONFIGS = {
    1: (100_000, 20),
    2: (1_000_000, 10),
    3: (10_000_000, 10),
}
RUN_ID = 3  # Select another run here instead of commenting blocks in and out

NMSG, ITERATIONS = RUN_CONFIGS[RUN_ID]
parent_dir = '../data/ping_pong/'
run_dir = f'{parent_dir}{RUN_ID}/'

# Load Scala benchmark data
scala_idle_raw_data = pd.read_csv(f'{run_dir}ping_pong_akka_actor_benchmark_idle_power.csv')
scala_bench_raw_data = pd.read_csv(f'{run_dir}ping_pong_akka_actor_benchmark_power_metrics.csv')

# Load Erlang benchmark data
erlang_idle_raw_data = pd.read_csv(f'{run_dir}ping_pong_benchmark_idle_power.csv')
erlang_bench_raw_data = pd.read_csv(f'{run_dir}ping_pong_benchmark_power_metrics.csv')

# Replace empty or missing values with "N/A" in non-numeric columns only.
# Filling numeric columns with a string would turn them into object columns
# and break the arithmetic performed on them later in the notebook.
for _raw_frame in (
    scala_idle_raw_data,
    scala_bench_raw_data,
    erlang_idle_raw_data,
    erlang_bench_raw_data,
):
    _text_columns = _raw_frame.select_dtypes(exclude='number').columns
    if len(_text_columns):
        _raw_frame[_text_columns] = _raw_frame[_text_columns].fillna("N/A")

In [None]:
def format_message_count(nmsg):
    """Format a message count in shortened notation (e.g., 1B, 100M, 1.5M, 10K).

    The count is scaled by the largest of 1_000_000_000 (B), 1_000_000 (M)
    or 1_000 (K) that it reaches. One decimal digit is kept (truncated, not
    rounded) when the scaled value is not a whole number, so 1_500_000 is
    rendered as "1.5M" rather than being silently truncated to "1M".
    Values below 1_000 are returned via ``str(nmsg)``.
    """
    units = ((1_000_000_000, "B"), (1_000_000, "M"), (1_000, "K"))
    for divisor, suffix in units:
        if nmsg >= divisor:
            # Work in tenths of the unit so one decimal digit survives.
            tenths = int(nmsg * 10 // divisor)
            whole, fraction = divmod(tenths, 10)
            if fraction:
                return f"{whole}.{fraction}{suffix}"
            return f"{whole}{suffix}"
    return str(nmsg)

# Short label for the configured message count, and the parenthesised
# "(<messages>, <iterations>)" fragment that is embedded in plot titles.
message_label = format_message_count(NMSG)
title_config_prefix = f"({message_label} Messages, {ITERATIONS} Iterations)"

Preview the Data

In [None]:
# Sanity check: print the shape of the raw Scala idle frame; the trailing
# expression yields its first rows for display.
print("Scala Idle Raw Data:")
print(scala_idle_raw_data.shape)
scala_idle_raw_data.head()

In [None]:
# Sanity check: print the shape of the raw Scala benchmark frame; the trailing
# expression yields its first rows for display.
print("Scala Benchmark Raw Data:")
print(scala_bench_raw_data.shape)
scala_bench_raw_data.head()

In [None]:
# Sanity check: print the shape of the raw Erlang idle frame; the trailing
# expression yields its first rows for display.
print("Erlang Idle Raw Data:")
print(erlang_idle_raw_data.shape)
erlang_idle_raw_data.head()

In [None]:
# Sanity check: print the shape of the raw Erlang benchmark frame; the trailing
# expression yields its first rows for display.
print("Erlang Benchmark Raw Data:")
print(erlang_bench_raw_data.shape)
erlang_bench_raw_data.head()

Cleaning the Data


In [None]:
def clean_data(df):
    """Return a cleaned copy of a raw power-metrics frame; ``df`` is not modified.

    The copy differs from the input as follows:

    * ``Timestamp`` is replaced by the datetime parsed from the date-time
      pattern found in the raw cell (``NaT`` when no pattern is found).
    * ``Time Elapsed (ms)`` is added, taken from the ``(<float>ms elapsed)``
      fragment of the raw ``Timestamp`` cell.
    * ``CPU Temp(C)`` is reduced to the first number found in the cell.
    * Each ``... Power(W)`` column listed below is coerced to numeric and a
      matching ``... Energy(J)`` column (power * elapsed seconds) is added.
    * Missing values in numeric columns are replaced by 0.

    Values are coerced rather than assumed to have a particular dtype, so
    numeric temperature columns and power columns containing placeholder
    strings such as "N/A" are handled instead of raising.
    """
    df_copy = df.copy()

    # Work on a string view so the .str accessor is valid whatever the dtype.
    raw_timestamp = df['Timestamp'].astype(str)

    # Convert Timestamp to datetime format (extract date-time pattern)
    df_copy['Timestamp'] = pd.to_datetime(
        raw_timestamp.str.extract(
            r'(\w{3} \w{3} \d{1,2} \d{2}:\d{2}:\d{2} \d{4})', expand=False
        ),
        errors='coerce',
    )

    # Extract elapsed time in milliseconds
    df_copy['Time Elapsed (ms)'] = pd.to_numeric(
        raw_timestamp.str.extract(r'\((\d+\.\d+)ms elapsed\)', expand=False),
        errors='coerce',
    )

    # Extract numeric temperature values
    df_copy['CPU Temp(C)'] = pd.to_numeric(
        df['CPU Temp(C)'].astype(str).str.extract(r'(\d+\.\d+|\d+)', expand=False),
        errors='coerce',
    )

    # Calculate Energy columns (Power * Time Elapsed)
    power_columns = ["CPU Core Power(W)", "GT Power(W)", "DRAM Power(W)", "(CPUs+GT+SA) Power(W)"]
    elapsed_seconds = df_copy['Time Elapsed (ms)'] / 1000  # Convert ms to seconds
    for column in power_columns:
        # Placeholder strings become NaN here and are zero-filled below.
        df_copy[column] = pd.to_numeric(df_copy[column], errors='coerce')
        energy_column = column.replace('Power(W)', 'Energy(J)')
        df_copy[energy_column] = df_copy[column] * elapsed_seconds

    # Fill missing values with 0 in numeric columns only; a blanket fillna(0)
    # would also write an integer 0 into the datetime column.
    numeric_columns = df_copy.select_dtypes(include='number').columns
    df_copy[numeric_columns] = df_copy[numeric_columns].fillna(0)

    return df_copy

In [None]:
# Run every raw frame through clean_data, keeping the idle/benchmark pairs
# for Scala and Erlang under their own names.
(
    scala_idle_clean_data,
    scala_bench_clean_data,
    erlang_idle_clean_data,
    erlang_bench_clean_data,
) = (
    clean_data(raw_frame)
    for raw_frame in (
        scala_idle_raw_data,
        scala_bench_raw_data,
        erlang_idle_raw_data,
        erlang_bench_raw_data,
    )
)

Verify Cleaned Data

In [None]:
# Verify the cleaning step: print the shape of the cleaned Scala idle frame;
# the trailing expression yields its first 15 rows for display.
print("Scala Idle Clean Data:")
print(scala_idle_clean_data.shape)
scala_idle_clean_data.head(15)


In [None]:
# Verify the cleaning step: print the shape of the cleaned Scala benchmark
# frame; the trailing expression yields its first rows for display.
print("Scala Benchmark Clean Data:")
print(scala_bench_clean_data.shape)
scala_bench_clean_data.head()


In [None]:
# Verify the cleaning step: print the shape of the cleaned Erlang idle frame;
# the trailing expression yields its first rows for display.
print("Erlang Idle Clean Data:")
print(erlang_idle_clean_data.shape)
erlang_idle_clean_data.head()


In [None]:
# Verify the cleaning step: print the shape of the cleaned Erlang benchmark
# frame; the trailing expression yields its first rows for display.
print("Erlang Benchmark Clean Data:")
print(erlang_bench_clean_data.shape)
erlang_bench_clean_data.head()

Handling Outliers


In [None]:
def remove_outliers(df, columns):
    """
    Return a copy of ``df`` without rows that fall outside the IQR fences.

    The columns are processed in the order given. For each one, the quartiles
    are computed on the rows that survived the previous columns, and only rows
    whose value lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds included)
    are kept. The input frame is left untouched.
    """
    kept = df.copy()

    for name in columns:
        first_quartile = kept[name].quantile(0.25)
        third_quartile = kept[name].quantile(0.75)
        spread = third_quartile - first_quartile

        fence_low = first_quartile - 1.5 * spread
        fence_high = third_quartile + 1.5 * spread

        # between() is inclusive on both ends, like the >= / <= pair it replaces.
        inside_fences = kept[name].between(fence_low, fence_high)
        kept = kept[inside_fences]

    return kept

In [None]:
# Metrics whose outliers are filtered out of every cleaned frame.
outlier_removal_columns = [
    "CPU Core Power(W)",
    "GT Power(W)",
    "DRAM Power(W)",
    "(CPUs+GT+SA) Power(W)",
    "Avg Num Cores Active",
    "CPU Temp(C)",
]

# Apply the same outlier filter to the idle/benchmark pairs of both languages.
(
    scala_idle_data,
    scala_bench_data,
    erlang_idle_data,
    erlang_bench_data,
) = (
    remove_outliers(clean_frame, outlier_removal_columns)
    for clean_frame in (
        scala_idle_clean_data,
        scala_bench_clean_data,
        erlang_idle_clean_data,
        erlang_bench_clean_data,
    )
)

In [None]:
# After outlier removal: print the shape of the Scala idle frame; the trailing
# expression yields its first rows for display.
print("Scala Idle Data: ")
print(scala_idle_data.shape)
scala_idle_data.head()


In [None]:
# After outlier removal: print the shape of the Scala benchmark frame; the
# trailing expression yields its first rows for display.
print("Scala Benchmark Data: ")
print(scala_bench_data.shape)
scala_bench_data.head()

In [None]:
# After outlier removal: print the shape of the Erlang idle frame; the trailing
# expression yields its first rows for display.
print("Erlang Idle Data: ")
print(erlang_idle_data.shape)
erlang_idle_data.head()

In [None]:
# After outlier removal: print the shape of the Erlang benchmark frame; the
# trailing expression yields its first rows for display.
print("Erlang Benchmark Data: ")
print(erlang_bench_data.shape)
erlang_bench_data.head()

Visualization

In [None]:
def plot_graphs(idle_df, bench_df, title_prefix):
    """
    Draw one figure per metric with the idle run on the left panel and the
    benchmark run on the right panel.

    The x-axis is the running sum of "Time Elapsed (ms)" shifted so the first
    row sits at 0 and converted to seconds. Both input frames are copied
    before the helper column is added, so the caller's data is not modified.
    Each figure is titled "<title_prefix> - <metric>" and shown immediately.
    """
    sns.set_theme(style="whitegrid", palette="muted")

    def _with_elapsed_seconds(frame):
        # Copy the frame and add the zero-based cumulative time axis.
        timed = frame.copy()
        intervals_ms = timed["Time Elapsed (ms)"]
        timed["Elapsed Time (s)"] = (intervals_ms.cumsum() - intervals_ms.iloc[0]) / 1000
        return timed

    idle_timed = _with_elapsed_seconds(idle_df)
    bench_timed = _with_elapsed_seconds(bench_df)

    metrics = (
        "CPU Core Power(W)",
        "GT Power(W)",
        "DRAM Power(W)",
        "(CPUs+GT+SA) Power(W)",
        "Avg Num Cores Active",
        "CPU Temp(C)",
    )

    for metric in metrics:
        # 1 row, 2 columns: idle on the left, benchmark on the right.
        fig, (idle_axis, bench_axis) = plt.subplots(1, 2, figsize=(12, 5))

        panels = (
            (idle_axis, idle_timed, "Idle", {"color": 'blue', "linestyle": '--'}),
            (bench_axis, bench_timed, "Benchmark", {"color": 'red'}),
        )
        for axis, frame, label, line_options in panels:
            sns.lineplot(
                data=frame,
                x="Elapsed Time (s)",
                y=metric,
                label=label,
                ax=axis,
                **line_options,
            )
            axis.set_title(f"{label} - {metric}", fontsize=12)
            axis.set_xlabel("Elapsed Time (s)", fontsize=10)
            axis.set_ylabel(metric, fontsize=10)
            axis.legend()
            axis.grid(True)

        fig.suptitle(f"{title_prefix} - {metric}", fontsize=14)

        # Leave the top 5% of the figure free for the main title.
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.show()


In [None]:
# Idle-vs-benchmark figures, first for Scala and then for Erlang.
for _idle_frame, _bench_frame, _figure_title in (
    (scala_idle_data, scala_bench_data, "Scala Benchmark - Idle vs Active"),
    (erlang_idle_data, erlang_bench_data, "Erlang Benchmark - Idle vs Active"),
):
    plot_graphs(_idle_frame, _bench_frame, _figure_title)

In [None]:
def plot_erlang_vs_scala(erlang_df, scala_df, idle_erlang_df, idle_scala_df, title_prefix):
    """
    Overlay the Erlang and Scala benchmark runs, one figure per metric.

    For every metric the mean of the matching idle frame is subtracted from
    the benchmark values and the result is floored at 0. The x-axis is the
    running sum of "Time Elapsed (ms)" shifted so the first row sits at 0 and
    converted to seconds. The benchmark frames are copied before helper
    columns are added; the idle frames are only read.
    """
    sns.set_theme(style="whitegrid", palette="muted")

    def _with_elapsed_seconds(frame):
        # Copy the frame and add the zero-based cumulative time axis.
        timed = frame.copy()
        intervals_ms = timed["Time Elapsed (ms)"]
        timed["Elapsed Time (s)"] = (intervals_ms.cumsum() - intervals_ms.iloc[0]) / 1000
        return timed

    erlang_timed = _with_elapsed_seconds(erlang_df)
    scala_timed = _with_elapsed_seconds(scala_df)

    metrics = (
        "CPU Core Power(W)",
        "GT Power(W)",
        "DRAM Power(W)",
        "(CPUs+GT+SA) Power(W)",
        "Avg Num Cores Active",
        "CPU Temp(C)",
    )

    for metric in metrics:
        fig, ax = plt.subplots(figsize=(10, 5))
        adjusted_name = "Adjusted " + metric

        # Remove the idle baseline; negative remainders are floored at 0.
        erlang_baseline = idle_erlang_df[metric].mean()
        erlang_timed[adjusted_name] = np.maximum(0, erlang_timed[metric] - erlang_baseline)
        scala_baseline = idle_scala_df[metric].mean()
        scala_timed[adjusted_name] = np.maximum(0, scala_timed[metric] - scala_baseline)

        # Erlang is drawn first (blue), then Scala (red), on the same axes.
        for frame, label, colour in (
            (erlang_timed, "Erlang", 'blue'),
            (scala_timed, "Scala", 'red'),
        ):
            sns.lineplot(
                data=frame,
                x="Elapsed Time (s)",
                y=adjusted_name,
                label=label,
                color=colour,
                ax=ax,
            )

        ax.set_title(f"{title_prefix} - {metric}", fontsize=14)
        ax.set_xlabel("Elapsed Time (s)", fontsize=12)
        ax.set_ylabel(metric, fontsize=12)
        ax.legend()
        ax.grid(True)

        plt.tight_layout()
        plt.show()


In [None]:
# Compare the idle-adjusted Erlang and Scala benchmark runs; the run
# configuration is appended to every figure title.
plot_erlang_vs_scala(
    erlang_bench_data,
    scala_bench_data,
    erlang_idle_data,
    scala_idle_data,
    f"Erlang vs Scala Ping-Pong Benchmark {title_config_prefix}",
)


Compute Average Metrics

In [None]:
# Columns averaged for the idle-vs-benchmark comparison.
metrics_columns = [
    "CPU Core Power(W)",
    "GT Power(W)",
    "DRAM Power(W)",
    "(CPUs+GT+SA) Power(W)",
    "CPU Temp(C)",
    "Avg Num Cores Active",
    "CPU Core Energy(J)",
    "GT Energy(J)",
    "DRAM Energy(J)",
    "(CPUs+GT+SA) Energy(J)",
]

# Per-column means of each outlier-filtered frame.
(
    scala_idle_avgs,
    erlang_idle_avgs,
    scala_bench_avgs,
    erlang_bench_avgs,
) = (
    filtered_frame[metrics_columns].mean()
    for filtered_frame in (
        scala_idle_data,
        erlang_idle_data,
        scala_bench_data,
        erlang_bench_data,
    )
)

# Net cost of the benchmark: benchmark average minus idle average.
scala_net_avgs = scala_bench_avgs.sub(scala_idle_avgs)
erlang_net_avgs = erlang_bench_avgs.sub(erlang_idle_avgs)

print("Net Average Metrics Comparison:\n")
print("Scala Benchmark:")
print(scala_net_avgs, "\n")
print("Erlang Benchmark:")
print(erlang_net_avgs)