In [1]:
import pandas as pd
import numpy as np

import os
import sys
# Put the parent directory on sys.path so the `testing` package imported below
# can be resolved (presumably the project root sits one level up - TODO confirm).
module_path = os.path.abspath(os.path.join('..'))
# Guard against appending a duplicate entry when this cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)


import testing.tpch.setup as tpch_setup

In [None]:
# Small dataset, 500 queries - m1, m2, m3 ...
RESULTS_PATH = f"{os.path.curdir}/results/load-based-N-fields/tpch/2025-03-26-15H/"

# Columns of the run summary that are kept for the rest of the analysis.
META_COLUMNS = ["Test", "Load", "Total query time",
                "Materialized Columns", "Materialization"]

meta_results_file = os.path.join(RESULTS_PATH, "meta_results.csv")
results_df = pd.read_csv(meta_results_file)[META_COLUMNS]

results_df.head(50)



In [None]:
# Only look at load-based tests: drop the non-load-based strategies and the
# m20-m35 load-based variants.
EXCLUDED_TESTS = [
    "full_materialization",
    "schema_based_materialization",
    "load_based_m20",
    "load_based_m25",
    "load_based_m30",
    "load_based_m35",
]

# A single membership filter replaces six chained passes. The explicit copy
# detaches the result from the loaded frame, so adding columns to results_df
# afterwards does not trigger pandas' chained-assignment warning.
results_df = results_df[~results_df["Test"].isin(EXCLUDED_TESTS)].copy()
results_df

In [None]:
def _last_listed_column(materialization):
    """Return the last entry of a stringified list such as "['a', 'b']".

    The surrounding brackets are stripped, the text is split on ', ' and the
    quotes around the final element are removed.
    """
    return materialization.strip('[').strip(']').split(', ')[-1].strip("'")


# Get the materialized column: the last column named in "Materialization".
# Series.map works on the one column that is needed; a row-wise
# DataFrame.apply(axis=1) is slower and yields a DataFrame (not a Series) when
# the frame is empty, which breaks the assignment.
results_df["Materialized Column"] = results_df["Materialization"].map(_last_listed_column)
results_df

In [None]:
# Add the datatype of each materialized column from COLUMN_MAP; columns that
# are not present in COLUMN_MAP are labelled "Unknown".
# Series.map is used instead of a row-wise DataFrame.apply(axis=1): only one
# column is read, and map still returns a Series when the frame is empty.
results_df["Column Type"] = results_df["Materialized Column"].map(
    lambda column: tpch_setup.COLUMN_MAP[column]["type"]
    if column in tpch_setup.COLUMN_MAP else "Unknown"
)



# Factor applied to the base row counts of the scalable tables below.
scale_factor = 0.5

# Row counts before scaling, for the tables whose size depends on scale_factor.
_BASE_ROWS = {
    "customer": 150000,
    "lineitem": 6000000,
    "orders": 1500000,
    "part": 200000,
    "partsupp": 800000,
    "supplier": 10000,
}

tpch_table_sizes = {
    table: int(rows * scale_factor) for table, rows in _BASE_ROWS.items()
}
tpch_table_sizes["nation"] = 25  # Not scaled
tpch_table_sizes["region"] = 5   # Not scaled

# Column-name prefix -> owning table. No prefix is a prefix of another one
# ("p_" does not match "ps_..."), so the lookup order is irrelevant.
_PREFIX_TO_TABLE = (
    ("c_", "customer"),
    ("l_", "lineitem"),
    ("o_", "orders"),
    ("p_", "part"),
    ("ps_", "partsupp"),
    ("s_", "supplier"),
    ("n_", "nation"),
    ("r_", "region"),
)


def get_table_size(column_name):
    """Return the row count of the table that `column_name` belongs to.

    The table is identified by the column-name prefix (e.g. "l_" -> lineitem).
    Returns 0 when the prefix matches none of the known tables.
    """
    for prefix, table in _PREFIX_TO_TABLE:
        if column_name.startswith(prefix):
            return tpch_table_sizes[table]
    return 0


# Size of the table that owns each materialized column (0 if the prefix is unknown).
table_size_per_row = results_df["Materialized Column"].map(get_table_size)
results_df["Table Size"] = table_size_per_row
results_df.head(30)

In [6]:
# Enrich results_df with per-query mean and sum timings read from each load's CSV.
for n in range(10):
    load_csv = os.path.join(RESULTS_PATH, f"q4_m400_l{n}.csv")

    # Drop the tests that are not part of the load-based comparison.
    load_df = pd.read_csv(load_csv).drop(
        columns=["schema_based_materialization", "full_materialization",
                 "load_based_m20", "load_based_m25", "load_based_m30", "load_based_m35"])

    # Leftover index column written by an earlier to_csv, if present.
    if "Unnamed: 0" in load_df.columns:
        load_df = load_df.drop(columns=["Unnamed: 0"])

    test_columns = [name for name in load_df.columns if name != 'q']

    # One row per query, one column per test.
    timings_per_query = load_df.groupby('q')[test_columns]
    mean_by_query = timings_per_query.mean()
    sum_by_query = timings_per_query.sum()

    for idx in results_df.index[results_df["Load"] == n]:
        test_name = results_df.at[idx, "Test"]

        has_mean = test_name in mean_by_query.columns
        has_sum = test_name in sum_by_query.columns
        if not (has_mean and has_sum):
            print(f"No match found for Test='{test_name}' at Load={n}")
            continue

        for i in range(1, 22):
            q_col = f"q{i}"
            # Queries that do not occur in this load's file are skipped.
            if q_col in mean_by_query.index:
                results_df.at[idx, f"{q_col}_mean"] = mean_by_query.at[q_col, test_name]
                results_df.at[idx, f"{q_col}_sum"] = sum_by_query.at[q_col, test_name]





In [None]:
queries = tpch_setup.QUERIES

# q5 is left out of every column list in this analysis, so it is skipped here too.
query_ids = [i for i in range(1, 22) if i != 5]
stats = ["mean", "sum"]

lead_cols = ["Load", "Materialized Columns", "Materialized Column", "Column Type", "Table Size"]
timing_cols = [f"q{i}_{stat}" for i in query_ids for stat in stats]

# One row per materialization step, ordered so that consecutive rows of the
# same load differ by the newly materialized column.
results_df_q = results_df[lead_cols + timing_cols].sort_values(["Load", "Materialized Columns"])

# shift/diff are evaluated within each load. Evaluated over the whole frame,
# the first row of a load would take its "previous" timings from the last row
# of the preceding load.
timings_by_load = results_df_q.groupby("Load", sort=False)

# Rows whose newly materialized column is actually referenced by query i;
# the time gain is only meaningful for those rows.
used_by_query = {
    i: results_df_q["Materialized Column"].isin(queries[f"q{i}"].columns_used())
    for i in query_ids
}

# Collect all derived columns first and attach them in a single concat
# instead of inserting ~80 columns one at a time.
derived = {}

# Timings at the previous materialization step of the same load.
for i in query_ids:
    for stat in stats:
        derived[f"Pre-materialization q{i} {stat}"] = timings_by_load[f"q{i}_{stat}"].shift(1)

# Reduction in execution time relative to the previous step (positive = faster),
# blanked out where the materialized column is not used by the query.
for stat in stats:
    for i in query_ids:
        time_gain = -timings_by_load[f"q{i}_{stat}"].diff()
        derived[f"Time gain q{i} {stat}"] = time_gain.where(used_by_query[i])

results_df_q = pd.concat(
    [results_df_q, pd.DataFrame(derived, index=results_df_q.index)], axis=1
)

# TODO: Implement no materialization
# Rows with a single materialized column are dropped; presumably because there
# is no zero-materialization baseline to compare them against yet.
results_df_q = results_df_q[results_df_q["Materialized Columns"] != 1].copy()

# Get base columns and dynamically build query time gain columns
base_cols = ["Materialized Column", "Load", "Materialized Columns", "Column Type", "Table Size"]
time_gain_cols = [f"Time gain q{i} {stat}" for i in query_ids for stat in stats]
pre_mat_cols = [f"Pre-materialization q{i} {stat}" for i in query_ids for stat in stats]
results_df_q[base_cols + time_gain_cols + pre_mat_cols].head(50)









In [None]:
# Reshape the wide per-query columns into one row per (materialization step, query).
# The rows are collected in a list and turned into a DataFrame once; growing a
# DataFrame with pd.concat inside the loop copies the whole frame on every
# iteration and starts from an empty frame, which pandas deprecates in concat.
expanded_rows = []
for row in results_df_q.to_dict("records"):
    for i in range(1, 22):
        if i == 5:  # Skip q5 since it was excluded earlier
            continue

        expanded_rows.append({
            'Load': row['Load'],
            'Materialized Column': row['Materialized Column'],
            'Materialized Columns': row['Materialized Columns'],
            "Column Type": row["Column Type"],
            "Table Size": row["Table Size"],
            'Query': f'q{i}',
            'Time gain mean': row[f'Time gain q{i} mean'],
            'Time gain sum': row[f'Time gain q{i} sum'],
            "Pre-materialization mean": row[f"Pre-materialization q{i} mean"],
            "Pre-materialization sum": row[f"Pre-materialization q{i} sum"]
        })

expanded_results_df_q = pd.DataFrame(expanded_rows)



def _count_column_occurrences(query_name, materialized_column, position=None):
    """Count how often `materialized_column` is listed for a query.

    Looks the query up in the module-level `queries` mapping. With
    `position=None` the count is taken over `columns_used()`; otherwise over
    the entry `position` ("join", "where" or "select") of
    `columns_used_with_position()`.
    """
    query = queries[query_name]
    if position is None:
        columns = query.columns_used()
    else:
        columns = query.columns_used_with_position()[position]
    return columns.count(materialized_column)


def get_field_frequency_for_query(query_name, materialized_column):
    """Number of times the column appears among all columns used by the query."""
    return _count_column_occurrences(query_name, materialized_column)


def get_field_join_frequency_for_query(query_name, materialized_column):
    """Number of times the column appears in the query's "join" columns."""
    return _count_column_occurrences(query_name, materialized_column, "join")


def get_field_where_frequency_for_query(query_name, materialized_column):
    """Number of times the column appears in the query's "where" columns."""
    return _count_column_occurrences(query_name, materialized_column, "where")


def get_field_select_frequency_for_query(query_name, materialized_column):
    """Number of times the column appears in the query's "select" columns."""
    return _count_column_occurrences(query_name, materialized_column, "select")



queries = tpch_setup.QUERIES

# For every (query, materialized column) row, record how often the column is
# listed for the query overall and per position (join / where / select).
frequency_counters = {
    "Total Frequency": get_field_frequency_for_query,
    "Join Frequency": get_field_join_frequency_for_query,
    "Where Frequency": get_field_where_frequency_for_query,
    "Select Frequency": get_field_select_frequency_for_query,
}
for frequency_col, counter in frequency_counters.items():
    expanded_results_df_q[frequency_col] = expanded_results_df_q.apply(
        lambda row, counter=counter: counter(row["Query"], row["Materialized Column"]), axis=1
    )

# Keep only rows where a time gain was computed.
temp_df = expanded_results_df_q.dropna(subset=["Time gain mean", "Time gain sum"])
# Alternative view:
# temp_df.sort_values(["Select Frequency", "Query", "Materialized Columns"], ascending=False).head(50)

# Rows where the column is listed under "select" more often than under "join"
# and more often than under "where", best mean time gain first.
select_beats_join = temp_df["Select Frequency"] > temp_df["Join Frequency"]
select_beats_where = temp_df["Select Frequency"] > temp_df["Where Frequency"]
temp_df[select_beats_join & select_beats_where].sort_values("Time gain mean", ascending=False).head(50)








In [None]:
# Count number of materialized columns per load/query combination
temp_df_new = (
    temp_df[["Load", "Query"]]
    .groupby(["Load", "Query"])
    .size()
    .reset_index(name='count')
)

# Largest groups first.
temp_df_new.sort_values("count", ascending=False).head(50)

* Datatype
* Tabellstørrelse
* Kjøretid før materialisering
* Er motparten materialisert?



Materialized column
Materialized columns
Query
Time gain
Time
Prev time

Rangere materialiseringene innad i spørring
Lage percentiler