In [17]:
import os
import ast
import pandas as pd

# Directory (relative to the current working directory) that holds the result
# files read and written by this notebook.
RESULTS_PATH = f"{os.curdir}/results/single-queries/tpch/2025-05-10-15H"

# Load the measurements stored in that directory.
results_df = pd.read_csv(f"{RESULTS_PATH}/results.csv")

In [None]:
# Only keep query, materialization and average execution time (renamed).
# Done as one non-inplace chain that ends in an explicit copy: results_df is then
# an independent frame, so assigning to its columns later on cannot trigger a
# SettingWithCopyWarning (which an in-place rename followed by a column slice can).
results_df = (
    results_df
    .rename(columns={"Average (last 4 runs)": "Execution Time"})
    .loc[:, ["Query", "Materialization", "Execution Time"]]
    .copy()
)

# Convert the materialization column into an actual list
def parse_materialization(x):
    """Turn one cell of the materialization column into a Python object.

    Args:
        x: The raw cell value: an existing list, a missing value, or the
            textual form of a Python literal (e.g. "['str1', 'str2']").

    Returns:
        * ``x`` itself when it already is a list,
        * ``None`` when ``x`` is missing according to ``pd.isna``,
        * the result of ``ast.literal_eval(x)`` when it can be evaluated,
        * ``x`` unchanged when evaluation raises ValueError or SyntaxError.
    """
    # Lists are passed straight through (and must not reach pd.isna, which
    # would evaluate them element-wise).
    if isinstance(x, list):
        return x

    if pd.isna(x):
        return None

    try:
        # literal_eval only accepts Python literals, so no code is executed
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError):
        # Best effort: hand back whatever could not be parsed
        return x
    else:
        return parsed
    

# Replace the raw column with its parsed counterpart
parsed_materialization = results_df["Materialization"].apply(parse_materialization)
results_df["Materialization"] = parsed_materialization

# Preview the rows of a single query
results_df.loc[results_df["Query"] == 'q7'].head()

In [None]:
# Split into 0, 1, 2, and 3 columns materialized.
# The per-row length of the materialization is computed once and reused as the
# selector for all four subsets.
materialized_count = results_df["Materialization"].str.len()
m0_df, m1_df, m2_df, m3_df = (
    results_df[materialized_count == k] for k in range(4)
)
m2_df.loc[m2_df["Query"] == 'q7'].head()

In [None]:
# Build a new df. First, for all 1-materializations, get the improvement from 0:
# the run of the same query with nothing materialized supplies the "Previous Time".
baseline_times = m0_df[["Query", "Execution Time"]].rename(
    columns={"Execution Time": "Previous Time"})
expanded_results_m1: pd.DataFrame = pd.merge(
    m1_df,
    baseline_times,
    on="Query",
    how="left"  # keep every 1-materialization row, even without a baseline
)

# Positive when the execution time dropped relative to the baseline.
expanded_results_m1["Improvement"] = (
    expanded_results_m1["Previous Time"] - expanded_results_m1["Execution Time"]
)

# Nothing was materialized before the first column. A comprehension is used so
# that every row gets its own empty list instead of sharing one object.
expanded_results_m1["Previous Materializations"] = [
    [] for _ in range(len(expanded_results_m1.index))
]

# Unwrap the single-element list into its only entry. `.str[0]` works
# element-wise and, unlike a row-wise apply, also copes with an empty frame.
expanded_results_m1["Materialization"] = expanded_results_m1["Materialization"].str[0]
expanded_results_m1[expanded_results_m1["Query"] == 'q7'].head()

In [None]:
# Expand m2_df s.t. each column in the materialization is counted both as Materialization and as Previous Materialization
m2_records = []
for _, row in m2_df.iterrows():
    a, b = row["Materialization"]
    # One record per column treated as the newly added one; the other column
    # is then the materialization that was already in place.
    for added, previous in ((a, [b]), (b, [a])):
        m2_records.append(
            {
                "Query": row["Query"],
                "Execution Time": row["Execution Time"],
                "Materialization": added,
                "Previous Materializations": previous
            }
        )

# Naming the columns explicitly keeps the frame well-formed (and mergeable)
# even when there are no 2-materialization rows at all.
expanded_results_m2 = pd.DataFrame(
    m2_records,
    columns=["Query", "Execution Time", "Materialization", "Previous Materializations"],
)

# The 1-materialization timings act as the "previous" measurement.
# Lists are not hashable, so their string form is used as the merge key.
m1_df_copy = (
    m1_df
    .rename(columns={"Execution Time": "Previous Time"})
    .assign(merge_key=lambda df: df["Materialization"].map(str))
)

# Inner merge: rows without a matching 1-materialization timing are dropped.
expanded_results_m2 = pd.merge(
    expanded_results_m2.assign(
        merge_key=expanded_results_m2["Previous Materializations"].map(str)),
    m1_df_copy[["Query", "Previous Time", "merge_key"]],
    on=["Query", "merge_key"]
).drop(columns="merge_key")  # the key was only needed for the merge

# Calculate the time difference (positive when the execution time dropped)
expanded_results_m2["Improvement"] = (
    expanded_results_m2["Previous Time"] - expanded_results_m2["Execution Time"]
)

expanded_results_m2[expanded_results_m2["Query"] == 'q7'].head()

In [None]:
# Expand m3_df s.t. each column in the materialization is counted both as Materialization and as Previous Materialization
m3_records = []
for _, row in m3_df.iterrows():
    a, b, c = row["Materialization"]
    # One record per column treated as the newly added one; the remaining two
    # columns are then the materializations that were already in place.
    for added, previous in ((a, [b, c]), (b, [a, c]), (c, [a, b])):
        m3_records.append(
            {
                "Query": row["Query"],
                "Execution Time": row["Execution Time"],
                "Materialization": added,
                "Previous Materializations": previous
            }
        )

# Naming the columns explicitly keeps the frame well-formed (and mergeable)
# even when there are no 3-materialization rows at all.
expanded_results_m3 = pd.DataFrame(
    m3_records,
    columns=["Query", "Execution Time", "Materialization", "Previous Materializations"],
)


def _m3_merge_key(columns):
    """Build an order-independent, hashable merge key from a list of columns."""
    # Sorting first means ['a', 'b'] and ['b', 'a'] produce the same key, so a
    # pair is still found when m2_df stores it in a different order than the
    # triple it was derived from. Lists themselves cannot be merged on.
    return str(sorted(columns))


# The 2-materialization timings act as the "previous" measurement.
m2_df_copy = (
    m2_df
    .rename(columns={"Execution Time": "Previous Time"})
    .assign(merge_key=lambda df: df["Materialization"].map(_m3_merge_key))
)

# Inner merge: rows without a matching 2-materialization timing are dropped.
expanded_results_m3 = pd.merge(
    expanded_results_m3.assign(
        merge_key=expanded_results_m3["Previous Materializations"].map(_m3_merge_key)),
    m2_df_copy[["Query", "Previous Time", "merge_key"]],
    on=["Query", "merge_key"]
).drop(columns="merge_key")  # the key was only needed for the merge

# Calculate the time difference (positive when the execution time dropped)
expanded_results_m3["Improvement"] = (
    expanded_results_m3["Previous Time"] - expanded_results_m3["Execution Time"]
)

# remove empty entries
expanded_results_m3 = expanded_results_m3.dropna()
expanded_results_m3[expanded_results_m3["Query"] == 'q7'].head()

In [23]:
# Stack the 1-, 2- and 3-materialization expansions into a single frame.
# ignore_index=True renumbers the rows: every part carries its own 0-based
# index, so without it the combined frame (and the index column written to the
# CSV) would contain duplicate labels.
expanded_results = pd.concat(
    [expanded_results_m1, expanded_results_m2, expanded_results_m3],
    ignore_index=True,
)
expanded_results.to_csv(RESULTS_PATH + '/expanded_results.csv')