# WORKFLOW 2

In [None]:
# IMPORTS
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import json

In [None]:
# GLOBAL VARIABLES
# Selects the `execution_<n>` sub-folder that is read for every date.
execution_count = 1  # Adjust as needed
# Rolling-window sizes (number of rows) used for the smoothed mean/median
# and for the smoothed standard deviation respectively.
window_size = 100
window_size_std = 400

# Experiment days; each entry is used as a folder name under ./logs_analysis.
dates = ["2025-02-03", "2025-02-04", "2025-02-05", "2025-02-06", "2025-02-07", "2025-02-08", "2025-02-09"]

# Destination folder for every saved plot and table.
folder_to_save = "./experiment_plots"
# Neither savefig nor DataFrame.to_json creates missing directories,
# so make sure the output folder exists before any cell writes to it.
os.makedirs(folder_to_save, exist_ok=True)

optimized_color = sns.color_palette("Blues", n_colors=1)[0]  # Main blue shade
baseline_color = sns.color_palette("Oranges", n_colors=1)[0]  # Main orange shade


# Default seaborn theme and palette shared by the plotting cells.
theme_style = "darkgrid"
color_palette_style = "deep"
palette = sns.color_palette(color_palette_style)
sns.set_theme(style=theme_style)

In [None]:
# PREPARE DATA GROUPED


def _load_daily_logs(function_name):
    """Load and concatenate the per-day log CSVs of one function variant.

    For every entry of ``dates`` whose log file exists, the CSV is read and
    extended with:
      * ``day``                      - calendar date of each timestamp
      * ``duration_minutes``         - minutes since that day's first timestamp
      * ``smoothed_execution_time``  - rolling mean of ``execution_time``
                                       over ``window_size`` rows (per day)

    Args:
        function_name: File-name infix, e.g. ``"optimizedFunction"``.

    Returns:
        One DataFrame with the rows of all days that were found.

    Raises:
        FileNotFoundError: If no log file exists for any date.
    """
    frames = []
    for date in dates:
        file_path = f'./logs_analysis/{date}/execution_{execution_count}/{date}-{function_name}-logs.csv'
        if not os.path.exists(file_path):  # Days without a log are skipped
            continue

        day_df = pd.read_csv(file_path)
        day_df['timestamp'] = pd.to_datetime(day_df['timestamp'])
        day_df['day'] = day_df['timestamp'].dt.date
        # Normalize time so every day starts at minute 0
        day_df['duration_minutes'] = (day_df['timestamp'] - day_df['timestamp'].min()).dt.total_seconds() / 60
        day_df['smoothed_execution_time'] = day_df['execution_time'].rolling(window=window_size).mean()
        frames.append(day_df)

    if not frames:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate"
        raise FileNotFoundError(
            f"No '{function_name}' log files found for execution_{execution_count} in any of: {dates}"
        )
    return pd.concat(frames, ignore_index=True)


def _with_grouped_rolling_stats(all_days):
    """Return ``all_days`` sorted by ``duration_minutes`` with rolling stats.

    Adds a centered rolling median (``window_size`` rows) and a centered
    rolling standard deviation (``window_size_std`` rows) of
    ``execution_time`` and drops the rows where either one is NaN.
    The input frame is not modified (``sort_values`` returns a new frame).
    """
    # Sorting first makes the rolling windows run along the time axis
    ordered = all_days.sort_values(by='duration_minutes')
    ordered['smoothed_median'] = ordered['execution_time'].rolling(window=window_size, center=True).median()
    ordered['smoothed_std'] = ordered['execution_time'].rolling(window=window_size_std, center=True).std()
    return ordered.dropna(subset=['smoothed_median', 'smoothed_std'])


# Per-day data (one smoothed curve per day)
df_all_days_optimized = _load_daily_logs("optimizedFunction")
df_all_days_baseline = _load_daily_logs("baselineFunction")

# GROUP ALL DAYS TOGETHER (one smoothed median and std band over all days)
df_optimized_all_days = _with_grouped_rolling_stats(df_all_days_optimized)
df_baseline_all_days = _with_grouped_rolling_stats(df_all_days_baseline)

# Get max duration (shared right-hand x-axis limit of the time plots)
max_duration = max(df_optimized_all_days['duration_minutes'].max(), df_baseline_all_days['duration_minutes'].max())

In [None]:
# PREPARE DATA FOR EACH METRIC (IMPROVEMENTS)

# Maps metric name -> list of (date, % improvement) tuples, one per day found
metric_improvements = {}

# READ FILES
for date in dates:
    data_dir = f"./logs_analysis/{date}/execution_{execution_count}/"
    file_path = os.path.join(data_dir, "table.json")
    try:
        with open(file_path, "r") as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"Warning: File for {date} not found, skipping.")
        continue

    for entry in data:
        # Keep the original improvement value, tagged with its date
        metric_improvements.setdefault(entry["Metric"], []).append((date, entry["% Improvement"]))


# PREPARE METRICS DATA FOR PLOTS

# Flatten the dictionary into one row per (date, metric)
df_list = []
for metric, values in metric_improvements.items():
    for date, improvement in values:
        df_list.append({"Date": date, "Metric": metric, "% Improvement": improvement})

# Explicit columns keep the frame usable even when no table.json was found
df = pd.DataFrame(df_list, columns=["Date", "Metric", "% Improvement"])

# Convert Date column to datetime format for proper plotting
df["Date"] = pd.to_datetime(df["Date"])

# Define relevant metrics for plotting
relevant_metrics = ["Function Execution Time", "Download Duration", "Linear Regression Execution Time"]

# Filtered views for the individual plots. `.copy()` makes them independent
# frames so that adding `day_label` below neither triggers a
# SettingWithCopyWarning nor depends on pandas' view/copy behaviour.
df_filtered = df[df["Metric"].isin(relevant_metrics)].copy()
df_filtered2 = df[df["Metric"].isin(["Function Execution Time", "Linear Regression Execution Time"])].copy()
df_function_execution_time = df[df["Metric"].isin(["Function Execution Time"])].copy()

# Convert 'dates' list to datetime format before creating the mapping
dates_datetime = pd.to_datetime(dates)  # Ensures correct format
# Map each (sorted) date to a "Day <n>" label, using datetime keys
date_mapping = {date: f"Day {i+1}" for i, date in enumerate(sorted(dates_datetime))}
# Apply the mapping to the 'Date' column of every filtered frame
df_filtered['day_label'] = df_filtered['Date'].map(date_mapping)
df_filtered2['day_label'] = df_filtered2['Date'].map(date_mapping)
df_function_execution_time['day_label'] = df_function_execution_time['Date'].map(date_mapping)


# Number of distinct days for which metrics were found
num_days = df["Date"].nunique()

# Define y axis label
y_axis_label = "% Improvement Over Baseline"

In [None]:
# ANALYSING BENCHMARK RESULTS: PASSED ("BM PASSED: ...") AND FAILED ("BM: ... > ...") ENTRIES

# Collects one tuple of benchmark statistics per daily log file:
# (date, bm_passed_avg, bm_passed_median, bm_failed_avg, bm_failed_median, bm_threshold)
benchmark_stats_list = []

# Function to extract benchmark numbers (as already defined)
def extract_bm_passed(value):
    """Return the number of a ``"BM PASSED: <number>"`` entry as a float.

    Anything that is not a string starting with ``"BM PASSED:"`` yields ``None``.
    """
    if not (isinstance(value, str) and value.startswith("BM PASSED:")):
        return None
    # The number is the piece between the first and the second colon
    number_text = value.split(":")[1]
    return float(number_text.strip())

def extract_bm_failed(value):
    """Return the measured number of a ``"BM: <number> > <threshold>"`` entry.

    The number is the text between the first colon and the first ``>``.
    Anything that is not a string starting with ``"BM:"`` yields ``None``.
    """
    if not isinstance(value, str):
        return None
    if not value.startswith("BM:"):
        return None
    after_prefix = value.split(":")[1]
    measured_text, *_ = after_prefix.split(">")
    return float(measured_text.strip())

def extract_bm_threshold(value):
    """Return the threshold of a ``"BM: <number> > <threshold>"`` entry.

    The threshold is the text after the first ``>``. Values that are not
    strings, do not start with ``"BM:"`` or contain no ``>`` yield ``None``.
    """
    is_failed_entry = isinstance(value, str) and value.startswith("BM:") and ">" in value
    if not is_failed_entry:
        return None
    threshold_text = value.split(">")[1]
    return float(threshold_text.strip())


# Per-day results, one entry per date whose optimized log file exists
# (dates without a file are skipped, so the lists may be shorter than `dates`).
failed_benchmark_count_array = []
failed_benchmark_count_percentage_array = []
bm_threshold_array = []
# Loop through all dates and compute benchmark stats
for date in dates:
    file_path = f'./logs_analysis/{date}/execution_{execution_count}/{date}-optimizedFunction-logs.csv'

    if not os.path.exists(file_path):  # Check if file exists to avoid errors
        continue

    df = pd.read_csv(file_path)

    # Apply benchmark extraction functions
    df["bm_passed"] = df["benchmark_duration"].apply(extract_bm_passed)
    df["bm_failed"] = df["benchmark_duration"].apply(extract_bm_failed)
    # The threshold is taken from the first failed entry of the day; a day
    # without any failed benchmark has no threshold in its log, so fall back
    # to NaN instead of raising IndexError on `.iloc[0]`.
    thresholds = df["benchmark_duration"].apply(extract_bm_threshold).dropna()
    bm_threshold = thresholds.iloc[0] if not thresholds.empty else np.nan

    # Compute statistics (NaN entries are the rows of the other kind)
    bm_passed_avg = np.nanmean(df["bm_passed"])
    bm_passed_median = np.nanmedian(df["bm_passed"])
    bm_failed_avg = np.nanmean(df["bm_failed"])
    bm_failed_median = np.nanmedian(df["bm_failed"])

    # Append stats as a tuple to the list
    benchmark_stats_list.append((date, bm_passed_avg, bm_passed_median, bm_failed_avg, bm_failed_median, bm_threshold))

    # Count failed benchmarks, absolute and as a percentage of all rows
    failed_benchmark_count = df["bm_failed"].notna().sum()
    failed_benchmark_count_array.append(failed_benchmark_count)

    total_count = len(df)
    failed_benchmark_percentage = (failed_benchmark_count / total_count) * 100 if total_count > 0 else 0
    failed_benchmark_count_percentage_array.append(failed_benchmark_percentage)

    bm_threshold_array.append(bm_threshold)


# Convert benchmark stats list to DataFrame
stats_df = pd.DataFrame(benchmark_stats_list, columns=["Date", "bm_passed_avg", "bm_passed_median", "bm_failed_avg", "bm_failed_median", "bm_threshold"])
# `Date` holds plain strings while `date_mapping` is keyed by Timestamps;
# convert explicitly so the lookup does not depend on implicit coercion.
stats_df['day_label'] = pd.to_datetime(stats_df['Date']).map(date_mapping)

# Long-format frames (one row per day and metric) for the line plots
df_melted_avg = stats_df.melt(id_vars=["Date", "day_label"],
                          value_vars=["bm_passed_avg", "bm_failed_avg", "bm_threshold"],
                          var_name="Metric",
                          value_name="Value")

df_melted_median = stats_df.melt(id_vars=["Date", "day_label"],
                          value_vars=["bm_passed_median", "bm_failed_median", "bm_threshold"],
                          var_name="Metric",
                          value_name="Value")

df_bm_and_failedavg = stats_df.melt(id_vars=["Date", "day_label"],
                             value_vars=["bm_failed_avg", "bm_threshold"],
                             var_name="Metric",
                             value_name="Value")

##### GRAPHS FOR EXECUTIONS: SINGULAR AND GROUPED

In [None]:
# OPTIMIZED EXECUTIONS
# One smoothed execution-time curve per day for the optimized function.
sns.set_theme(style=theme_style)
num_days = df_all_days_optimized["day"].nunique()
# Request two extra shades and trim the first and last one of the blue ramp
palette = sns.color_palette("light:b", n_colors=num_days + 2)[1:-1]

fig, ax = plt.subplots(figsize=(12, 6))

sns.lineplot(
    data=df_all_days_optimized,
    x='duration_minutes',
    y='smoothed_execution_time',
    hue='day',
    palette=palette,
    lw=2,
    ax=ax,
)

# Axis labels and title
ax.set_xlabel('Duration (min)')
ax.set_ylabel('Optimized Execution Time (ms)')
ax.set_title('Optimized Execution Time Over Multiple Days (Smoothed)')

# Both axes start at 0; the x-axis stops at max_duration
ax.set_xlim(left=0, right=max_duration)
ax.set_ylim(bottom=0)

ax.legend(title="Day")

fig.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "execution-times-optimized.pdf")
fig.savefig(plot_filename)
# SHOW PLOT
plt.show()

In [None]:
# BASELINE EXECUTIONS
# One smoothed execution-time curve per day for the baseline function.
sns.set_theme(style=theme_style)
# Size the palette from the baseline data that is actually plotted here
num_days = df_all_days_baseline["day"].nunique()
# Request two extra shades and trim the first and last one of the red ramp
palette = sns.color_palette("light:r", n_colors=num_days+2)[1:-1]

# Create the plot
plt.figure(figsize=(12, 6))

# Plot execution time for different days.
# Fix: this previously plotted df_all_days_optimized, so the figure labelled
# "Baseline" showed the optimized function's data.
sns.lineplot(
    data=df_all_days_baseline,
    x='duration_minutes',
    y='smoothed_execution_time',
    hue='day',
    lw=2,
    palette=palette
)

# Add labels and title
plt.xlabel('Duration (min)')
plt.ylabel('Baseline Execution Time (ms)')
plt.title('Baseline Execution Time Over Multiple Days (Smoothed)')

# Set x-axis and y-axis to start at 0
plt.xlim(left=0, right=max_duration)
plt.ylim(bottom=0)

# Add a legend
plt.legend(title="Day")

# Shrink layout
plt.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "execution-times-baseline.pdf")
plt.savefig(plot_filename)
# SHOW PLOT
plt.show()

In [None]:
# BASELINE AND OPTIMIZED EXECUTIONS IN ONE GRAPH
sns.set_theme(style=theme_style)
num_days = df_all_days_optimized["day"].nunique()
# The baseline logs may cover a different number of days (missing files are
# skipped on load), so size its palette from its own data.
num_days_baseline = df_all_days_baseline["day"].nunique()

# Define color palettes: blue shades for optimized, red shades for baseline
optimized_palette = sns.color_palette("light:b", n_colors=num_days+2)[1:-1]
baseline_palette = sns.color_palette("light:r", n_colors=num_days_baseline+2)[1:-1]

plt.figure(figsize=(12, 6))

# Plot optimized execution times (one line per day)
sns.lineplot(
    data=df_all_days_optimized,
    x='duration_minutes',
    y='smoothed_execution_time',
    hue='day',
    lw=2,
    palette=optimized_palette
)

# Plot baseline execution times on the same axes (one line per day)
sns.lineplot(
    data=df_all_days_baseline,
    x='duration_minutes',
    y='smoothed_execution_time',
    hue='day',
    lw=2,
    palette=baseline_palette
)

# Labels and title
plt.xlabel('Duration (min)')
plt.ylabel('Execution Time (ms)')
plt.title('Optimized vs. Baseline Execution Time Over Multiple Days (Smoothed)')

plt.xlim(left=0, right=max_duration)
plt.ylim(bottom=0)

# Add a legend
# NOTE(review): both line sets use `day` as hue, so the legend entries are
# only distinguishable by colour (blue = optimized, red = baseline).
plt.legend(title="Day")

# Shrink layout
plt.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "execution-times-optimized-and-baseline.pdf")
plt.savefig(plot_filename)

# SHOW PLOT
plt.show()

In [None]:
# GROUPED OPTIMIZED RESULTS WITH ERROR BAND
# Smoothed median over all days with a +/- one smoothed-std band.

sns.set_theme(style=theme_style)
palette = sns.color_palette(color_palette_style)

fig, ax = plt.subplots(figsize=(12, 6))

# Median line
sns.lineplot(
    data=df_optimized_all_days,
    x='duration_minutes',
    y='smoothed_median',
    label='Optimized Function (Smoothed Median)',
    color=palette[0],
    lw=2,
    ax=ax,
)

# Band of one smoothed standard deviation around the median
band_center = df_optimized_all_days['smoothed_median']
band_halfwidth = df_optimized_all_days['smoothed_std']
ax.fill_between(
    df_optimized_all_days['duration_minutes'],
    band_center - band_halfwidth,
    band_center + band_halfwidth,
    color=palette[0],
    alpha=0.2,
    label='Standard Deviation',
)

# Axis labels and title
ax.set_xlabel('Duration (min)')
ax.set_ylabel('Execution Time (ms)')
ax.set_title('Smoothed Median Execution Time with Error Band (Optimized Function)')

# Both axes start at 0; the x-axis stops at max_duration
ax.set_xlim(left=0, right=max_duration)
ax.set_ylim(bottom=0)

ax.legend()

fig.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "grouped-optimized-error-bands.pdf")
fig.savefig(plot_filename)

# Show plot
plt.show()

In [None]:
# GROUPED BASELINE RESULTS WITH ERROR BAND
# Smoothed median over all days with a +/- one smoothed-std band.

# Plot the data
sns.set_theme(style=theme_style)
palette = sns.color_palette(color_palette_style)

plt.figure(figsize=(12, 6))

# Plot smoothed median execution time.
# Fix: label and title previously said "Optimized Function" although this
# figure shows the baseline data.
sns.lineplot(
    data=df_baseline_all_days,
    x='duration_minutes',
    y='smoothed_median',
    color=palette[1],
    label='Baseline Function (Smoothed Median)',
    lw=2
)

# Add error band using standard deviation
plt.fill_between(
    df_baseline_all_days['duration_minutes'],
    df_baseline_all_days['smoothed_median'] - df_baseline_all_days['smoothed_std'],
    df_baseline_all_days['smoothed_median'] + df_baseline_all_days['smoothed_std'],
    color=palette[1],
    alpha=0.2,
    label='Standard Deviation'
)

# Add labels and title
plt.xlabel('Duration (min)')
plt.ylabel('Execution Time (ms)')
plt.title('Smoothed Median Execution Time with Error Band (Baseline Function)')

# Set x-axis and y-axis to start at 0
plt.xlim(left=0, right=max_duration)
plt.ylim(bottom=0)

# Add a legend
plt.legend()

# Shrink layout
plt.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "grouped-baseline-error-bands.pdf")
plt.savefig(plot_filename)

# Show plot
plt.show()

In [None]:
# GROUPED RESULTS WITH ERROR BANDS
# Optimized and baseline smoothed medians, each with its own std band.

sns.set_theme(style=theme_style)
palette = sns.color_palette(color_palette_style)

fig, ax = plt.subplots(figsize=(12, 6))

# Draw line + band for the optimized function first, then for the baseline
for series_name, series_df, series_color in (
    ("Optimized", df_optimized_all_days, palette[0]),
    ("Baseline", df_baseline_all_days, palette[1]),
):
    sns.lineplot(
        data=series_df,
        x='duration_minutes',
        y='smoothed_median',
        color=series_color,
        label=f'{series_name} Function (Smoothed Median)',
        lw=2,
        ax=ax,
    )
    ax.fill_between(
        series_df['duration_minutes'],
        series_df['smoothed_median'] - series_df['smoothed_std'],
        series_df['smoothed_median'] + series_df['smoothed_std'],
        color=series_color,
        alpha=0.2,
        label=f'{series_name} Std Dev (Smoothed)',
    )

# Labels and title
ax.set_xlabel('Duration (min)')
ax.set_ylabel('Execution Time (ms)')
ax.set_title('Execution Time Comparison: Optimized vs. Baseline')

# Axis limits
ax.set_xlim(left=0, right=max_duration)
ax.set_ylim(bottom=0)

ax.legend()

fig.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "grouped-optimized-and-baseline-error-bands.pdf")
fig.savefig(plot_filename)

# Show plot
plt.show()

##### TABLES

In [None]:
# IMPROVEMENT OVER ALL DAYS
# Builds, saves (JSON) and displays one per-day table for every metric.

# Shared look of the rendered tables
header_style = {
    'selector': 'thead th',
    'props': [
        ('font-size', '14px'),
        ('text-align', 'center'),
        ('font-weight', 'bold')
    ],
}
cell_style = {
    'text-align': 'center',
    'font-size': '12px',
    'padding': '7px',
}

tables = {}
for metric, values in metric_improvements.items():
    df = pd.DataFrame(values, columns=["Date", "% Improvement"]).sort_values("Date")

    # Save table under a file name derived from the metric name
    safe_metric_name = "_".join(metric.lower().split(" "))
    file_name = f"table-{safe_metric_name}.json"
    output_path = os.path.join(folder_to_save, file_name)
    df.to_json(output_path, orient="records", indent=4)

    # Render the improvement with two decimals and a percent sign
    styled_df = df.style.format({"% Improvement": "{:.2f}%"})
    styled_df = styled_df.set_table_styles([header_style])
    styled_df = styled_df.set_properties(**cell_style)
    styled_df = styled_df.hide(axis="index")

    tables[metric] = styled_df  # One styled table per metric

# DISPLAY TABLES
for metric, styled_df in tables.items():
    print(f"\n{metric}")
    display(styled_df)  # Rich rendering inside the notebook

In [None]:
# AVG IMPROVEMENT FOR EACH METRIC

# CALCULATE: arithmetic mean of the per-day improvements of each metric
avg_improvements = {}
for metric, values in metric_improvements.items():
    if not values:
        continue
    total_improvement = sum(pair[1] for pair in values)
    avg_improvements[metric] = total_improvement / len(values)

# AVG INTO DATA FRAME
avg_df = pd.DataFrame(avg_improvements.items(), columns=["Metric", "Average % Improvement"])

# Style: bold centered header, centered padded cells, no index column
avg_header_style = {
    'selector': 'thead th',
    'props': [
        ('font-size', '14px'),
        ('text-align', 'center'),
        ('font-weight', 'bold')
    ],
}
avg_cell_style = {
    'text-align': 'center',
    'font-size': '12px',
    'padding': '7px',
}
styled_avg_df = avg_df.style.format({"Average % Improvement": "{:.2f}%"})
styled_avg_df = styled_avg_df.set_table_styles([avg_header_style])
styled_avg_df = styled_avg_df.set_properties(**avg_cell_style)
styled_avg_df = styled_avg_df.hide(axis="index")

print("\nAverage Improvements Across All Dates")
display(styled_avg_df)

# SAVE AVG TABLE
file_name = "table-avg.json"
output_path = os.path.join(folder_to_save, file_name)
avg_df.to_json(output_path, orient="records", indent=4)

In [None]:
# MEDIAN IMPROVEMENT FOR EACH METRIC
# Median of the per-day improvements of each metric (metrics without values are left out)
median_improvements = {}
for metric, values in metric_improvements.items():
    if not values:
        continue
    daily_improvements = [pair[1] for pair in values]
    median_improvements[metric] = np.median(daily_improvements)

# CONVERT TO DATAFRAME
median_df = pd.DataFrame(median_improvements.items(), columns=["Metric", "Median % Improvement"])

# Style: bold centered header, centered padded cells, no index column
median_header_style = {
    'selector': 'thead th',
    'props': [
        ('font-size', '14px'),
        ('text-align', 'center'),
        ('font-weight', 'bold')
    ],
}
median_cell_style = {
    'text-align': 'center',
    'font-size': '12px',
    'padding': '7px',
}
styled_median_df = median_df.style.format({"Median % Improvement": "{:.2f}%"})
styled_median_df = styled_median_df.set_table_styles([median_header_style])
styled_median_df = styled_median_df.set_properties(**median_cell_style)
styled_median_df = styled_median_df.hide(axis="index")

print("\nMedian Improvements Across All Dates")
display(styled_median_df)

# SAVE MEDIAN TABLE
file_name = "table-median.json"
output_path = os.path.join(folder_to_save, file_name)
median_df.to_json(output_path, orient="records", indent=4)

##### GRAPHS FOR METRICS

In [None]:
# 1ST FUNC EXEC TIME, DOWNLOAD DURATION, LR EXEC TIME
# % improvement per day, one line per metric in df_filtered.
fig, ax = plt.subplots(figsize=(12, 6))

sns.lineplot(
    data=df_filtered,
    x="day_label",
    y="% Improvement",
    hue="Metric",
    palette=palette[:3],
    marker="o",
    lw=2,
    ax=ax,
)

# Labels and title (the day labels on the ticks need no axis title)
ax.set_xlabel(None)
ax.set_ylabel(y_axis_label)
ax.set_title("Improvement Over Time for All Three Metrics")

# Format x-axis ticks
plt.xticks(rotation=45)

# The y-axis is deliberately left auto-scaled (not forced to start at 0)

ax.legend(title="Metric")

fig.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "improvement_all_metrics.pdf")
fig.savefig(plot_filename)

# Show plot
plt.show()

In [None]:
# 2ND FUNC EXEC TIME vs LR EXECUTION TIME
# % improvement per day for the two metrics in df_filtered2 (no title set).

fig, ax = plt.subplots(figsize=(12, 6))

sns.lineplot(
    data=df_filtered2,
    x="day_label",
    y="% Improvement",
    hue="Metric",
    palette=palette[:2],  # first two palette colours, one per metric
    marker="o",
    lw=2,
    ax=ax,
)

# Labels
ax.set_xlabel(None)
ax.set_ylabel(y_axis_label)

# Format x-axis ticks
plt.xticks(rotation=45)

# Start y-axis at 0
ax.set_ylim(bottom=0)

ax.legend(title="Metric")

fig.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "improvement_function_exec_vs_linear_regression.pdf")
fig.savefig(plot_filename)

# Show plot
plt.show()

In [None]:
# 3RD BOXPLOT: FUNC EXEC TIME, DOWNLOAD DURATION, LR EXEC TIME
# Distribution of the per-day % improvement, one box per metric.

fig, ax = plt.subplots(figsize=(12, 6))

sns.boxplot(
    data=df_filtered,
    x="Metric",
    y="% Improvement",
    hue="Metric",  # colour each box by its metric
    palette=palette[:3],
    width=0.5,
    ax=ax,
)

# Labels: the metric names on the ticks need no axis title; no plot title
# is set and the y-axis is left auto-scaled
ax.set_xlabel(None)
ax.set_ylabel(y_axis_label)

fig.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "boxplot_improvement_by_metric.pdf")
fig.savefig(plot_filename)

# Show plot
plt.show()

In [None]:
# 4TH FUNCTION EXEC TIME vs FAILED BM COUNT -> COUNT
fig, ax1 = plt.subplots(figsize=(8, 4))

# First y-axis: % improvement of the function execution time per day
sns.lineplot(
    data=df_function_execution_time,
    x="day_label",
    y="% Improvement",
    hue="Metric",
    lw=2,
    palette=palette[:1],  # single metric -> single colour
    marker="o",
    ax=ax1
)

# Set labels for the first y-axis
ax1.set_xlabel(None)
ax1.set_ylabel(y_axis_label)
ax1.tick_params(axis='y')

# Start y-axis at 0
ax1.set_ylim(bottom=0)

# Second y-axis: number of failed benchmark attempts per day.
# NOTE(review): assumes failed_benchmark_count_array has one entry per day
# label, i.e. the same days have a table.json and an optimized log file.
ax2 = ax1.twinx()
ax2.plot(
    df_function_execution_time["day_label"].unique(),
    failed_benchmark_count_array,
    color=palette[3],
    linestyle="--",
    marker="s",
    label="Failed BM attempts"
)

# Set labels for the second y-axis
ax2.set_ylabel("Count")
ax2.tick_params(axis='y')

# Add legends.
# Fix: the second legend was titled "Threshold" although it describes the
# failed-benchmark count, not the threshold.
ax1.legend(title="Metric", loc="lower left")
ax2.legend(title="Benchmark", loc="lower right")

ax2.set_ylim(bottom=0)

# Adjust layout
fig.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "function_execution-vs-failed_bm_count.pdf")
fig.savefig(plot_filename)

# Show plot
plt.show()

In [None]:
# 5TH FUNCTION EXEC TIME vs FAILED BM COUNT -> PERCENTAGE
fig, ax1 = plt.subplots(figsize=(8, 4))

# First y-axis: % improvement of the function execution time per day
sns.lineplot(
    data=df_function_execution_time,
    x="day_label",
    y="% Improvement",
    hue="Metric",
    lw=2,
    palette=palette[:1],  # single metric -> single colour
    marker="o",
    ax=ax1
)

# Set labels for the first y-axis
ax1.set_xlabel(None)
ax1.set_ylabel(y_axis_label)
ax1.tick_params(axis='y')

# Start y-axis at 0
ax1.set_ylim(bottom=0)

# Second y-axis: share of failed benchmark attempts per day (in %).
# NOTE(review): assumes failed_benchmark_count_percentage_array has one entry
# per day label, i.e. the same days have a table.json and an optimized log.
ax2 = ax1.twinx()
ax2.plot(
    df_function_execution_time["day_label"].unique(),
    failed_benchmark_count_percentage_array,
    color=palette[3],
    linestyle="--",
    marker="s",
    label="Failed BM attempts"
)

# Set labels for the second y-axis
ax2.set_ylabel("% Failed BM attempts")
ax2.tick_params(axis='y')

# Add legends.
# Fix: the second legend was titled "Threshold" although it describes the
# failed-benchmark percentage, not the threshold.
ax1.legend(title="Metric", loc="lower left")
ax2.legend(title="Benchmark", loc="lower right")

ax2.set_ylim(bottom=0)

# Adjust layout
fig.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "function_execution-vs-failed_bm_percentage.pdf")
fig.savefig(plot_filename)

# Show plot
plt.show()

In [None]:
# 6TH ANALYSE FAILED BM ATTEMPTS
# One scatter strip per day showing when (relative to the start of that
# day's log) failed benchmark attempts were logged.


def _is_failed_benchmark(value):
    """Return True for a failed-benchmark entry, i.e. a string starting with "BM:"."""
    return isinstance(value, str) and value.startswith("BM:")


for date in dates:
    file_path = f'./logs_analysis/{date}/execution_{execution_count}/{date}-optimizedFunction-logs.csv'
    # Skip missing days like the other loops do instead of crashing in read_csv
    if not os.path.exists(file_path):
        print(f"Warning: File for {date} not found, skipping.")
        continue

    df = pd.read_csv(file_path)

    # Convert timestamp to datetime format
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Determine the experiment start time (earliest timestamp)
    experiment_start = df['timestamp'].min()

    # Minutes since experiment start for every failed benchmark row
    # (row-wise mask instead of a slow iterrows() loop)
    failed_mask = df['benchmark_duration'].apply(_is_failed_benchmark).astype(bool)
    failed_offsets = df.loc[failed_mask, 'timestamp'] - experiment_start
    failed_timestamps = (failed_offsets.dt.total_seconds() / 60).tolist()

    # Create a DataFrame for seaborn visualization
    df_plot = pd.DataFrame({
        'Timestamp': failed_timestamps,
        'Benchmark Type': ['Failed'] * len(failed_timestamps)
    })

    # Plot with Seaborn
    plt.figure(figsize=(12, 2))
    sns.scatterplot(
        data=df_plot,
        x='Timestamp',
        y='Benchmark Type',
        hue='Benchmark Type',
        palette=palette[:1],
        s=50
    )

    # Customize plot
    plt.title(f"Failed Benchmark Attempts on {date}")
    plt.xlabel("Relative Time (Minutes)")
    plt.ylabel("Benchmark Type")
    plt.grid(True)
    plt.show()

In [None]:
# 7TH THRESHOLD, FAILED AVG, % FAILED BM
fig, ax1 = plt.subplots(figsize=(8, 6))

# First y-axis: benchmark threshold and average failed benchmark value per day
sns.lineplot(
    data=df_bm_and_failedavg,
    x="day_label",
    y="Value",
    hue="Metric",
    lw=2,
    marker="o",
    ax=ax1
)

# Set labels for the first y-axis
ax1.set_xlabel(None)
ax1.set_ylabel("milliseconds")
ax1.tick_params(axis='y')

# Start y-axis at 0
ax1.set_ylim(bottom=0)

# Second y-axis: share of failed benchmark attempts per day (in %)
ax2 = ax1.twinx()
ax2.plot(
    df_filtered2["day_label"].unique(),
    failed_benchmark_count_percentage_array,
    color=palette[3],
    linestyle="--",
    marker="s",
    label="% Failed BM attempts"
)

# Set labels for the second y-axis
ax2.set_ylabel("% Failed BM attempts")
ax2.tick_params(axis='y')

# Add legends.
# Fix: the second legend was titled "Threshold" although it describes the
# failed-benchmark percentage.
ax1.legend(title="Metric", loc="lower left")
ax2.legend(title="Benchmark", loc="lower right")

ax2.set_ylim(bottom=0)

# Adjust layout
fig.tight_layout()

# SAVE PLOT
# Fix: this plot used to be written to
# "improvement_function_exec_vs_linear_regression.pdf" and thereby overwrote
# the 2ND plot's file; it now gets its own file name.
plot_filename = os.path.join(folder_to_save, "threshold-failed_avg-vs-failed_bm_percentage.pdf")
fig.savefig(plot_filename)

# Show plot
plt.show()

In [None]:
# 8TH FUNC EXEC TIME vs LR EXEC TIME vs THRESHOLD
fig, ax1 = plt.subplots(figsize=(12, 6))

# Left axis: % improvement per day for the two metrics in df_filtered2
sns.lineplot(
    data=df_filtered2,
    x="day_label",
    y="% Improvement",
    hue="Metric",
    palette=palette[:2],  # first two palette colours, one per metric
    marker="o",
    lw=2,
    ax=ax1,
)
ax1.set_xlabel(None)
ax1.set_ylabel(y_axis_label)
ax1.tick_params(axis='y')
ax1.set_ylim(bottom=0)  # improvement axis starts at 0

# Right axis: benchmark threshold per day (left auto-scaled)
ax2 = ax1.twinx()
threshold_days = df_filtered2["day_label"].unique()
ax2.plot(
    threshold_days,
    bm_threshold_array,
    label="Threshold (ms)",
    color=palette[3],
    marker="s",
    linestyle="--",
)
ax2.set_ylabel("Threshold (ms)")
ax2.tick_params(axis='y')

# One legend per axis
ax1.legend(title="Metric", loc="upper left")
ax2.legend(title="Threshold", loc="upper right")

fig.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "function_exec-vs-lr_exec-vs-threshold.pdf")
fig.savefig(plot_filename)

# Show plot
plt.show()

In [None]:
# 9TH FUNC EXEC TIME vs THRESHOLD
fig, ax1 = plt.subplots(figsize=(12, 6))

# Left axis: % improvement of the function execution time per day
sns.lineplot(
    data=df_function_execution_time,
    x="day_label",
    y="% Improvement",
    hue="Metric",
    palette=palette[:1],  # single metric -> single colour
    marker="o",
    lw=2,
    ax=ax1,
)
ax1.set_xlabel(None)
ax1.set_ylabel(y_axis_label)
ax1.tick_params(axis='y')
ax1.set_ylim(bottom=0)  # improvement axis starts at 0

# Right axis: benchmark threshold per day (left auto-scaled)
ax2 = ax1.twinx()
threshold_days = df_filtered2["day_label"].unique()
ax2.plot(
    threshold_days,
    bm_threshold_array,
    label="Threshold (ms)",
    color=palette[3],
    marker="s",
    linestyle="--",
)
ax2.set_ylabel("Threshold (ms)")
ax2.tick_params(axis='y')

# One legend per axis
ax1.legend(title="Metric", loc="upper left")
ax2.legend(title="Threshold", loc="upper right")

fig.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "function_exec-vs-threshold.pdf")
fig.savefig(plot_filename)

# Show plot
plt.show()

In [None]:
# Passed Avg, Failed Avg, Threshold
# Per-day averages of passed/failed benchmark values next to the threshold.
fig, ax = plt.subplots(figsize=(12, 6))

sns.lineplot(
    data=df_melted_avg,
    x="day_label",
    y="Value",
    hue="Metric",
    palette=palette[:3],
    marker="o",
    lw=2,
    ax=ax,
)

# Labels and title (x tick labels are left unrotated)
ax.set_xlabel(None)
ax.set_ylabel("Benchmark Value")
ax.set_title("Benchmark Metrics Over Time (Passed Avg, Failed Avg, Threshold)")

ax.legend(title="Metric")

fig.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "benchmark_metrics_over_time_avg.pdf")
fig.savefig(plot_filename)

# Show plot
plt.show()

In [None]:
# Passed Median, Failed Median, Threshold
# Per-day medians of passed/failed benchmark values next to the threshold.
fig, ax = plt.subplots(figsize=(12, 6))

sns.lineplot(
    data=df_melted_median,
    x="day_label",
    y="Value",
    hue="Metric",
    palette=palette[:3],
    marker="o",
    lw=2,
    ax=ax,
)

# Labels and title
ax.set_xlabel(None)
ax.set_ylabel("Benchmark Value")
ax.set_title("Benchmark Metrics Over Time (Passed Median, Failed Median, Threshold)")

ax.legend(title="Metric")

fig.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "benchmark_metrics_over_time_median.pdf")
fig.savefig(plot_filename)

# Show plot
plt.show()

##### Single day

In [None]:
# SETUP DATA FOR PLOT

# Day analysed in this section and the rolling-mean window (rows) used for it
single_day = "2025-02-04"
single_day_window = 75
single_day_dir = f"./logs_analysis/{single_day}/execution_{execution_count}"

df_optimized = pd.read_csv(f"{single_day_dir}/{single_day}-optimizedFunction-logs.csv")
df_baseline = pd.read_csv(f"{single_day_dir}/{single_day}-baselineFunction-logs.csv")

# Convert 'timestamp' col to datetime
df_optimized['timestamp'] = pd.to_datetime(df_optimized['timestamp'])
df_baseline['timestamp'] = pd.to_datetime(df_baseline['timestamp'])
# Create col duration in minutes for x-axis (minutes since the first row)
df_optimized['duration_minutes'] = (df_optimized['timestamp'] - df_optimized['timestamp'].min()).dt.total_seconds() / 60
df_baseline['duration_minutes'] = (df_baseline['timestamp'] - df_baseline['timestamp'].min()).dt.total_seconds() / 60
# Create col for smoothed execution time (computed on all rows, before the
# lr_duration filter below)
df_optimized['smoothed_execution_time'] = df_optimized['execution_time'].rolling(window=single_day_window).mean()
df_baseline['smoothed_execution_time'] = df_baseline['execution_time'].rolling(window=single_day_window).mean()
# Create col for smoothed lr-duration.
# Rows with lr_duration == 0 are dropped from the optimized data first;
# `.copy()` turns the filtered slice into an independent frame so the column
# assignment below does not raise a SettingWithCopyWarning. The second,
# identical filter that used to follow was a no-op and has been removed.
df_optimized = df_optimized[df_optimized['lr_duration'] != 0].copy()
df_optimized['smoothed_lr_duration'] = df_optimized['lr_duration'].rolling(window=single_day_window).mean()
df_baseline['smoothed_lr_duration'] = df_baseline['lr_duration'].rolling(window=single_day_window).mean()

In [None]:
# EXECUTION TIME ROLLING AVG
# Smoothed execution time of the optimized and the baseline function for the single day.

sns.set_theme(style="darkgrid")
palette = sns.color_palette("deep")

fig, ax = plt.subplots(figsize=(12, 6))

# Optimized first, then baseline, each in its own palette colour
sns.lineplot(
    data=df_optimized,
    x='duration_minutes',
    y='smoothed_execution_time',
    label='Optimized Function (Smoothed)',
    color=palette[0],
    lw=2,
    ax=ax,
)
sns.lineplot(
    data=df_baseline,
    x='duration_minutes',
    y='smoothed_execution_time',
    label='Baseline Function (Smoothed)',
    color=palette[1],
    lw=2,
    ax=ax,
)

# Axis labels and title
ax.set_xlabel('Duration (min)')
ax.set_ylabel('Execution Time (ms)')
ax.set_title('Comparison of Optimized vs Baseline Function Execution Times (Smoothed)')

# Both axes start at 0; the x-axis stops at max_duration
ax.set_xlim(left=0, right=max_duration)
ax.set_ylim(bottom=0)

ax.legend()

fig.tight_layout()

# SAVE PLOT
plot_filename = os.path.join(folder_to_save, "2025-02-04-execution-times-rolling-avg.pdf")
fig.savefig(plot_filename)
# SHOW PLOT
plt.show()