# WORKFLOW 3

In [1]:
# IMPORTS
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os

In [2]:
# GLOBAL VARIABLES

# --- Plot styling ---
theme_style = "darkgrid"
color_palette_style = "deep"
palette = sns.color_palette(color_palette_style)

# --- Break-even analysis ---
# Warm-up phase: this many leading executions of each day are ignored
# when searching for the break-even point.
warmup_executions = 200  # Adjust as needed

# Size (in rows) of the rolling windows used for smoothing execution times.
window_size = 100

# --- Experiment days ---
# One entry per day; each is used as the folder name and file prefix of the logs.
days = [
    "2025-02-03",
    "2025-02-04",
    "2025-02-05",
    "2025-02-06",
    "2025-02-07",
    "2025-02-08",
    "2025-02-09",
]
# Human-readable labels, one per entry in `days` ("Day 1" ... "Day 7").
day_labels = [f"Day {number}" for number in range(1, len(days) + 1)]

# --- Output ---
folder_break_even = "./experiment_plots/break-even-analysis"
file_ending = ".pdf"

In [3]:
# Create the output folder for the break-even plots (including missing parents);
# an already existing folder is not an error.
os.makedirs(name=folder_break_even, exist_ok=True)

##### FUNCTIONS

In [4]:
# format minutes to time
def format_minutes_to_time(minutes):
    """Render a duration given in decimal minutes as an ``MM:SS`` string.

    The duration is converted to seconds and truncated with ``int()``
    (no rounding). Both fields are zero-padded to at least two digits;
    the minutes field grows beyond two digits for durations of 100 minutes
    or more.

    Args:
        minutes: Duration in minutes, may be fractional.

    Returns:
        The formatted duration, e.g. ``"01:30"`` for ``1.5``.
    """
    whole_minutes, leftover_seconds = divmod(int(minutes * 60), 60)
    return f"{whole_minutes:02}:{leftover_seconds:02}"

In [5]:
# calculate break even day
def _prepare_execution_log(df):
    """Return a chronologically sorted copy of ``df`` with derived time columns.

    Adds ``cumulative_time`` (running sum of ``execution_time``) and
    ``duration_minutes`` (minutes elapsed since the earliest timestamp).
    The frame passed in is not modified.
    """
    # Parse first so the sort is chronological, not lexicographic on strings.
    df = df.assign(timestamp=pd.to_datetime(df["timestamp"]))
    df = df.sort_values(by="timestamp").reset_index(drop=True)

    df["cumulative_time"] = df["execution_time"].cumsum()
    df["duration_minutes"] = (df["timestamp"] - df["timestamp"].min()).dt.total_seconds() / 60
    return df


def calculate_break_even_day(baseline_df, optimized_df, warmup_executions=0):
    """Find the first execution at which the optimized function breaks even.

    The break-even point is the first row position, at or after
    ``warmup_executions``, where the cumulative execution time of the
    optimized function is strictly lower than that of the baseline function.
    The two logs are compared row by row over their common length.

    Args:
        baseline_df: Log of the baseline function with ``timestamp`` and
            ``execution_time`` columns.
        optimized_df: Log of the optimized function with the same columns.
        warmup_executions: Number of leading executions to ignore when
            searching for the break-even point.

    Returns:
        A tuple ``(baseline_df, optimized_df, break_even_index)``. Both frames
        are sorted copies of the inputs, extended with ``cumulative_time`` and
        ``duration_minutes``; ``break_even_index`` is the positional row index
        of the break-even point.

    Raises:
        ValueError: If no break-even point exists in the compared range
            (including the case of an empty log). Previously this case was
            silently reported as index 0.
    """
    baseline_df = _prepare_execution_log(baseline_df)
    optimized_df = _prepare_execution_log(optimized_df)

    # Compare only the executions both logs have; a shorter optimized log
    # would otherwise make the element-wise comparison fail.
    n_compared = min(len(baseline_df), len(optimized_df))
    baseline_cumulative = baseline_df["cumulative_time"].to_numpy()[:n_compared]
    optimized_cumulative = optimized_df["cumulative_time"].to_numpy()[:n_compared]

    # Candidate rows: optimized is cheaper in total AND the warm-up phase is over.
    is_break_even = (optimized_cumulative < baseline_cumulative) & (
        pd.RangeIndex(n_compared) >= warmup_executions
    )

    # argmax/idxmax on an all-False mask returns 0, which would look like an
    # immediate break-even; report the missing break-even explicitly instead.
    if not is_break_even.any():
        raise ValueError(
            f"No break-even point found within {n_compared} compared executions "
            f"(warmup_executions={warmup_executions})."
        )

    break_even_index = int(is_break_even.argmax())

    return baseline_df, optimized_df, break_even_index

In [6]:
# calculate break even each day
def calculate_break_even_each_day():
    """Run the break-even analysis once for every entry of the global ``days``.

    For each day the baseline and optimized CSV logs of ``execution_1`` are
    read and passed to ``calculate_break_even_day`` together with the global
    ``warmup_executions``.

    Returns:
        A tuple ``(optimized_df_list, baseline_df_list, break_even_points,
        break_even_indices)`` of four lists with one element per day:
        the prepared optimized frames, the prepared baseline frames, the
        baseline ``duration_minutes`` value at the break-even row, and the
        break-even row index.
    """
    # One (baseline, optimized, index) triple per day, in the order of `days`.
    daily_results = []
    for day in days:
        raw_baseline = pd.read_csv(f"./logs_analysis/{day}/execution_1/{day}-baselineFunction-logs.csv")
        raw_optimized = pd.read_csv(f"./logs_analysis/{day}/execution_1/{day}-optimizedFunction-logs.csv")
        daily_results.append(
            calculate_break_even_day(raw_baseline, raw_optimized, warmup_executions)
        )

    baseline_df_list = [baseline for baseline, _, _ in daily_results]
    optimized_df_list = [optimized for _, optimized, _ in daily_results]
    break_even_indices = [index for _, _, index in daily_results]

    # Break-even time = elapsed minutes of the baseline run at the break-even row.
    break_even_points = [
        baseline["duration_minutes"].iloc[index]
        for baseline, _, index in daily_results
    ]

    return optimized_df_list, baseline_df_list, break_even_points, break_even_indices

In [7]:
# gather grouped data
def _gather_function_logs_all_days(function_name):
    """Pool the per-day logs of one function into a single smoothed frame.

    For every entry of the global ``days`` the file
    ``./logs_analysis/<day>/execution_1/<day>-<function_name>-logs.csv`` is
    loaded if it exists; days without a file are skipped.

    Args:
        function_name: Name part of the log file, e.g. ``"optimizedFunction"``.

    Returns:
        A frame with all loaded rows ordered by ``duration_minutes`` (minutes
        since the first timestamp of the row's own day), containing the extra
        columns ``duration_minutes``, ``smoothed_execution_time`` (per-day
        rolling mean of ``execution_time``) and ``smoothed_median`` (centered
        rolling median over the pooled, sorted rows). Rows without a
        ``smoothed_median`` value are dropped and the index is reset.

    Raises:
        ValueError: If no log file was found for any day.
    """
    df_list = []
    for date in days:
        file_path = f'./logs_analysis/{date}/execution_1/{date}-{function_name}-logs.csv'
        if not os.path.exists(file_path):  # Skip missing days instead of failing
            continue

        df = pd.read_csv(file_path)
        df['timestamp'] = pd.to_datetime(df['timestamp'])  # Convert timestamp to datetime
        # Normalize time: minutes elapsed since the first execution of this day
        df['duration_minutes'] = (df['timestamp'] - df['timestamp'].min()).dt.total_seconds() / 60
        df['smoothed_execution_time'] = df['execution_time'].rolling(window=window_size).mean()

        df_list.append(df)

    if not df_list:
        # pd.concat([]) would fail with a message that hides the real cause.
        raise ValueError(f"No {function_name} log files found for days: {days}")

    # Concatenate all dataframes
    df_all_days = pd.concat(df_list, ignore_index=True)
    # Sort by elapsed time so the rolling window runs over comparable points of
    # each day. A stable sort keeps tied rows in day order.
    df_all_days = df_all_days.sort_values(by='duration_minutes', kind='stable')
    # Compute centered rolling median across the pooled rows
    df_all_days['smoothed_median'] = df_all_days['execution_time'].rolling(window=window_size, center=True).median()
    # Drop rows for which no rolling median is available
    df_all_days = df_all_days.dropna(subset=['smoothed_median'])

    return df_all_days.reset_index(drop=True)


def gather_optimized_all_days():
    """Return the pooled, smoothed logs of the optimized function for all days."""
    return _gather_function_logs_all_days("optimizedFunction")


def gather_baseline_all_days():
    """Return the pooled, smoothed logs of the baseline function for all days."""
    return _gather_function_logs_all_days("baselineFunction")

##### ALL DAYS INDIVIDUALLY

In [8]:
# execute break even for all 7 days
# One break-even time (minutes since the day's first baseline execution) per day.
break_even_points = []

# Loop through each day and calculate the break-even point
for day in days:
    # Load the data for the current day
    baseline_df = pd.read_csv(f"./logs_analysis/{day}/execution_1/{day}-baselineFunction-logs.csv")
    optimized_df = pd.read_csv(f"./logs_analysis/{day}/execution_1/{day}-optimizedFunction-logs.csv")

    # Get the break-even point for this day
    baseline_df, optimized_df, break_even_index = calculate_break_even_day(baseline_df, optimized_df, warmup_executions)

    # Record the break-even time exactly once per day, so that
    # len(break_even_points) == len(days).
    break_even_time = baseline_df["duration_minutes"].iloc[break_even_index]
    break_even_points.append(break_even_time)


# Calculate the average and median break-even point across all days
avg_break_even = sum(break_even_points) / len(break_even_points)
median_break_even = pd.Series(break_even_points).median()

print(f"Average break-even time: {avg_break_even:.2f} min")
print(f"Median break-even time: {median_break_even:.2f} min")

Average break-even time: 7.31 min
Median break-even time: 6.70 min


##### ALL DAYS GROUPED

In [9]:
# GROUPED FIGURE
# Pooled optimized data across all days, plus its running total of execution time
all_optimized_df = gather_optimized_all_days()
all_optimized_df["cumulative_time"] = all_optimized_df["execution_time"].cumsum()

# Pooled baseline data across all days, plus its running total of execution time
all_baseline_df = gather_baseline_all_days()
all_baseline_df["cumulative_time"] = all_baseline_df["execution_time"].cumsum()

# Per-day break-even results (one list element per day)
(
    optimized_df_list,
    baseline_df_list,
    break_even_points,
    break_even_indices,
) = calculate_break_even_each_day()

# Summary statistics over the days: break-even time in minutes and in executions
avg_break_even = sum(break_even_points) / len(break_even_points)
median_break_even = pd.Series(break_even_points).median()
median_run_amount = pd.Series(break_even_indices).median()

print(sorted(break_even_points))
print(sorted(break_even_indices))
print(f"Median run amount: {median_run_amount}")

[np.float64(1.28449505), np.float64(2.4433270333333335), np.float64(6.05216795), np.float64(6.699711033333333), np.float64(7.483290533333333), np.float64(7.868791933333333), np.float64(19.3455568)]
[200, 389, 938, 1023, 1206, 1230, 3093]
Median run amount: 1023.0
