In [None]:
import os
import ast
import math
import numpy as np
import pandas as pd

# Load the benchmark result CSVs for both methods (paths are relative to the
# current working directory); the new-method file is read first, then the old.
new_method_df, old_method_df = (
    pd.read_csv(os.curdir + csv_path)
    for csv_path in (
        '/results/single-queries/tpch/2025-05-13-22H/results.csv',
        '/results/single-queries/tpch/2025-05-10-15H/results.csv',
    )
)

In [None]:
def add_avg_and_uncertainty(
        df:pd.DataFrame, 
        time_columns:list[str],
        avg_col_name:str,
        uncertainty_col_name:str
    ) -> pd.DataFrame:
    """Add a rounded mean and its rounded standard error to every row of ``df``.

    For each row the values in ``time_columns`` are converted to float, then
    the mean and the standard error of the mean (sample standard deviation
    with ``ddof=1`` divided by ``sqrt(n)``) are computed. Both numbers are
    rounded to the decimal position of the leading significant digit of the
    standard error.

    If the standard error is zero or not finite (e.g. all timings identical,
    or fewer than two timings) there is no meaningful digit to round to, so
    the mean and the error are stored unrounded instead of being truncated to
    an integer.

    Args:
        df: Frame holding one row per measurement series. It is modified in
            place (two columns are added/overwritten).
        time_columns: Names of the columns holding the individual timings.
        avg_col_name: Name of the column that receives the rounded mean.
        uncertainty_col_name: Name of the column that receives the rounded
            standard error.

    Returns:
        The same ``df`` object, with the two columns added.
    """
    def _compute_for_row(times: np.ndarray) -> tuple[float, float]:
        # Mean and standard error of the mean for one row of timings
        mean_val = times.mean()
        std_error = times.std(ddof=1) / math.sqrt(len(times))

        # log10 is undefined for 0 and meaningless for NaN/inf: keep the raw values
        if not (math.isfinite(std_error) and std_error > 0):
            return mean_val, std_error

        # Decimal position of the first significant digit of the error
        # (negative when the error is >= 10, i.e. round to tens, hundreds, ...)
        decimals = -int(math.floor(math.log10(std_error)))
        return round(mean_val, decimals), round(std_error, decimals)

    # One 1-D float array per row; also works for an empty frame
    time_values = df[time_columns].astype(float).to_numpy()
    rounded = [_compute_for_row(row_times) for row_times in time_values]

    # Assign each target column explicitly instead of relying on positional
    # alignment of an intermediate frame with unrelated column names
    df[avg_col_name] = np.array([mean for mean, _ in rounded], dtype=float)
    df[uncertainty_col_name] = np.array([err for _, err in rounded], dtype=float)
    return df

# Compute the rounded mean and uncertainty for both result frames.
# Only "Iteration 1" .. "Iteration 4" feed the statistics ("Iteration 0" is not included).
timed_iteration_columns = [f"Iteration {i}" for i in range(1, 5)]

new_method_df = add_avg_and_uncertainty(
    df=new_method_df,
    time_columns=timed_iteration_columns,
    avg_col_name="New Method, Time",
    uncertainty_col_name="New Method, Uncertainty",
)
old_method_df = add_avg_and_uncertainty(
    df=old_method_df,
    time_columns=timed_iteration_columns,
    avg_col_name="Old Method, Time",
    uncertainty_col_name="Old Method, Uncertainty",
)
new_method_df.head(25)

In [None]:
# Columns that are not needed in the comparison table: the individual
# per-iteration execution times and the "Average (last 4 runs)" column.
raw_timing_columns = [f"Iteration {i}" for i in range(5)] + ["Average (last 4 runs)"]
join_keys = ["Query", "Materialization"]

# Build new, indexed frames instead of dropping / re-indexing the loaded
# frames in place. The in-place version raised a KeyError when this cell was
# run a second time (the columns were already gone) and silently changed the
# frame displayed by the previous cell.
new_method_times = new_method_df.drop(columns=raw_timing_columns).set_index(join_keys)
old_method_times = old_method_df.drop(columns=raw_timing_columns).set_index(join_keys)

# Merge: left join of the old-method columns onto the new-method rows by
# (Query, Materialization); rows without an old-method match get NaN.
results_df = new_method_times.join(old_method_times, on=join_keys)

# Reset index so "Query" and "Materialization" are ordinary columns again
results_df = results_df.reset_index()

In [None]:
# Add a new column telling which method is faster (with uncertainty)
def label_fastest_method(df):
    """Add a "Fastest Method" column that accounts for the uncertainties.

    A method is only declared faster when its whole interval
    ``time +/- uncertainty`` lies strictly below the other method's interval.
    Overlapping or touching intervals (and rows where a comparison involves
    NaN) are labelled "Uncertain".

    The column is written into ``df`` itself, which is also returned.
    """
    new_time, new_err = df["New Method, Time"], df["New Method, Uncertainty"]
    old_time, old_err = df["Old Method, Time"], df["Old Method, Uncertainty"]

    # Slowest plausible value of one method vs. fastest plausible value of the other
    new_clearly_faster = (new_time + new_err) < (old_time - old_err)
    old_clearly_faster = (old_time + old_err) < (new_time - new_err)

    # "New" takes precedence, then "Old", otherwise "Uncertain"
    df["Fastest Method"] = np.where(
        new_clearly_faster,
        "New",
        np.where(old_clearly_faster, "Old", "Uncertain"),
    )
    return df

# Label every row of the merged results and preview the first rows
results_df = label_fastest_method(results_df)
results_df.head()

In [None]:

# Add column with time difference (new - old) and its uncertainty.
# Uncertainties of a difference combine in quadrature:
#     sigma_diff = sqrt(sigma_new**2 + sigma_old**2)
# The square root matters: without it the column holds a variance, and the
# later aggregation (which squares this column before summing) would be wrong.
results_df['Time Difference'] = results_df['New Method, Time'] - results_df['Old Method, Time']
results_df["Time Difference, Uncertainty"] = np.sqrt(
    results_df['New Method, Uncertainty']**2 + results_df['Old Method, Uncertainty']**2
)
results_df.head()

In [None]:
# Convert the materialization column into an actual list
def parse_materialization(x):
    """Turn one "Materialization" cell into a Python object.

    * a value that is already a list is returned as is;
    * a missing value (as detected by ``pd.isna``) becomes ``None``;
    * anything else is handed to ``ast.literal_eval`` (e.g. the text
      "['str1', 'str2']" becomes a list of two strings);
    * if that evaluation fails with ``ValueError`` or ``SyntaxError`` the
      input is returned unchanged.
    """
    # The list check must come first: pd.isna on a list is not a single boolean
    if isinstance(x, list):
        return x
    if pd.isna(x):
        return None

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError):
        # Not a valid Python literal: hand the value back untouched
        parsed = x
    return parsed
    

results_df["Materialization"] = results_df["Materialization"].apply(
    parse_materialization)

# Add length of materialization.
# parse_materialization returns None for missing values and the raw input when
# it cannot be parsed; a bare len() would raise TypeError on None and count
# *characters* for an unparsed string. Only real lists/tuples are counted, any
# other value gets NaN so it is visible instead of being silently miscounted.
results_df['Materialization Length'] = results_df['Materialization'].apply(
    lambda columns: len(columns) if isinstance(columns, (list, tuple)) else float("nan")
)

In [None]:
# One frame per number of materialized columns (0, 1, 2 and 3)
m0_df, m1_df, m2_df, m3_df = (
    results_df[results_df["Materialization Length"] == n_columns]
    for n_columns in range(4)
)
m0_df.head(25)

In [None]:
# How often each method is fastest: overall first, then per materialization length
fastest_counts = results_df['Fastest Method'].value_counts()
print("Overall")
print(fastest_counts)

for heading, subset_df in (
    ("0 columns materialized", m0_df),
    ("1 columns materialized", m1_df),
    ("2 columns materialized", m2_df),
    ("3 columns materialized", m3_df),
):
    fastest_counts = subset_df['Fastest Method'].value_counts()
    print("-------------------------")
    print(heading)
    print(fastest_counts)


In [None]:
# Total (rounded) execution time per method: overall first, then per materialization length
new_sum = results_df['New Method, Time'].sum()
old_sum = results_df['Old Method, Time'].sum()

print("Overall")
print(f"New Method sum: {new_sum}")
print(f"Old Method sum: {old_sum}")

for heading, subset_df in (
    ("0 columns materialized", m0_df),
    ("1 columns materialized", m1_df),
    ("2 columns materialized", m2_df),
    ("3 columns materialized", m3_df),
):
    new_sum = subset_df['New Method, Time'].sum()
    old_sum = subset_df['Old Method, Time'].sum()
    print("-------------------------")
    print(heading)
    print(f"New Method sum: {new_sum}")
    print(f"Old Method sum: {old_sum}")

In [None]:
def group_by_materialization(df: pd.DataFrame):
    """Summarise the comparison per "Materialization Length".

    For every group the result contains:

    * ``Sum_Time_Diff``: sum of "Time Difference";
    * ``New_Fastest_Count`` / ``Old_Fastest_Count`` /
      ``Uncertain_Fastest_Count``: number of rows with that "Fastest Method"
      label (kept as integers);
    * ``Combined_Uncertainty``: square root of the sum of the squared
      "Time Difference, Uncertainty" values;
    * ``Rounded_Uncertainty`` / ``Rounded_Sum_Time_Diff``: the combined
      uncertainty and the summed difference, both rounded to the decimal
      position of the uncertainty's leading significant digit. When the
      combined uncertainty is zero or not finite, both values are left
      unrounded.

    Args:
        df: Frame with the columns "Materialization Length",
            "Time Difference", "Time Difference, Uncertainty" and
            "Fastest Method". It is not modified.

    Returns:
        A new frame indexed by "Materialization Length".
    """
    def _round_to_uncertainty(value, uncertainty):
        # No leading digit to round to: keep both numbers as they are
        if not (math.isfinite(uncertainty) and uncertainty > 0):
            return value, uncertainty
        decimals = -int(math.floor(math.log10(uncertainty)))
        return round(value, decimals), round(uncertainty, decimals)

    agg_df = df.groupby("Materialization Length").agg(
        Sum_Time_Diff=("Time Difference", "sum"),
        Time_Diff_Uncertainty=("Time Difference, Uncertainty", lambda x: (x**2).sum()),
        New_Fastest_Count=('Fastest Method', lambda x: (x == 'New').sum()),
        Old_Fastest_Count=('Fastest Method', lambda x: (x == 'Old').sum()),
        Uncertain_Fastest_Count=('Fastest Method', lambda x: (x == 'Uncertain').sum())
    )

    # Per-row uncertainties are combined in quadrature
    agg_df["Combined_Uncertainty"] = np.sqrt(agg_df["Time_Diff_Uncertainty"])
    agg_df = agg_df.drop(columns=["Time_Diff_Uncertainty"])

    # Round column-wise rather than with DataFrame.apply(axis=1): a row-wise
    # apply turns every mixed int/float row into floats, which converted the
    # *_Fastest_Count columns to float.
    rounded = [
        _round_to_uncertainty(total, uncertainty)
        for total, uncertainty in zip(agg_df["Sum_Time_Diff"], agg_df["Combined_Uncertainty"])
    ]
    agg_df["Rounded_Uncertainty"] = np.array([unc for _, unc in rounded], dtype=float)
    agg_df["Rounded_Sum_Time_Diff"] = np.array([total for total, _ in rounded], dtype=float)
    return agg_df

# Aggregate the merged results per materialization length and show the summary table
grouped_by_materialization = group_by_materialization(results_df)
grouped_by_materialization.head(20)

In [None]:
# Per (Query, Materialization Length): summed time difference and how often each label occurs
grouped_by_query_and_materialization = (
    results_df
    .groupby(['Query', 'Materialization Length'])
    .agg(
        Sum_Time_Difference=('Time Difference', 'sum'),
        New_Fastest_Count=('Fastest Method', lambda x: (x == 'New').sum()),
        Old_Fastest_Count=('Fastest Method', lambda x: (x == 'Old').sum()),
        Uncertain_Fastest_Count=('Fastest Method', lambda x: (x == 'Uncertain').sum()),
    )
    .reset_index()
)

# Show only the groups whose summed difference lies outside [-5, 5], largest first
summed_difference = grouped_by_query_and_materialization["Sum_Time_Difference"]
outside_band = (summed_difference < -5) | (summed_difference > 5)
(
    grouped_by_query_and_materialization[outside_band]
    .sort_values("Sum_Time_Difference", ascending=False)
    .head(50)
)

In [None]:
# Collapse the per-(Query, Materialization Length) table to one row per query
# by summing the time difference and the three label counts.
summed_columns = [
    'Sum_Time_Difference',
    'New_Fastest_Count',
    'Old_Fastest_Count',
    'Uncertain_Fastest_Count',
]
grouped_by_query = (
    grouped_by_query_and_materialization
    .groupby(['Query'])[summed_columns]
    .sum()
    .reset_index()
)

grouped_by_query.sort_values("Sum_Time_Difference").head(21)

In [None]:

# Queries that have at least one row where the new method is labelled fastest
new_is_fastest = results_df["Fastest Method"] == "New"
print(sorted(results_df[new_is_fastest]["Query"].unique()))

# Alternative views, currently disabled:
# results_df[(results_df["Fastest Method"] == "New") & (~results_df['Query'].isin(['q1', 'q18', 'q21']))].sort_values("Time Difference", ascending=False).head(50)
# results_df[(results_df["Fastest Method"] == "Old") & (~results_df['Query'].isin(['q8', 'q7', 'q20', 'q11', 'q9', 'q21', 'q17', 'q2', 'q10']))].sort_values("Time Difference", ascending=False).head(50)

# Rows where the old method is labelled fastest, largest time difference first
old_is_fastest = results_df["Fastest Method"] == "Old"
results_df[old_is_fastest].sort_values("Time Difference", ascending=False).head(50)