In [7]:
import pandas as pd
import numpy as np

import os
import sys
# Put the parent of the current working directory on sys.path (once), so the
# `testing` package imported below can be resolved from this notebook.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)


import testing.tpch.setup as tpch_setup

In [None]:
# Small dataset, 500 queries - m1, m2, m3 ...
RESULTS_PATH = f"{os.path.curdir}/results/load-based-N-fields/tpch/2025-03-26-15H/"

# Read the run's meta results and restrict the frame to these five columns.
kept_columns = [
    "Test",
    "Load",
    "Total query time",
    "Materialized Columns",
    "Materialization",
]
results_df = pd.read_csv(RESULTS_PATH + "meta_results.csv")[kept_columns]

# Cell output: at most the first 50 rows.
results_df.head(50)



In [None]:
import pandas as pd
import random

queries = tpch_setup.QUERIES

# Fixed seed and a dedicated generator so the dummy data (and every cell derived
# from it) is identical after "Restart Kernel -> Run All".
DUMMY_DATA_SEED = 42
N_DUMMY_ROWS = 100  # number of dummy measurement rows to generate
dummy_rng = random.Random(DUMMY_DATA_SEED)

# The set of query names does not change inside the loop, so build it once.
query_names = list(queries.keys())

rows = []
for _ in range(N_DUMMY_ROWS):
    query_name = dummy_rng.choice(query_names)
    query_columns = queries[query_name].columns_used()

    new_materialized = dummy_rng.choice(query_columns)

    # Candidates for the previously materialized columns: every other column of
    # the query. columns_used() can apparently list a column more than once
    # (other cells count its occurrences), so de-duplicate while keeping order;
    # otherwise the sample below could contain the same column twice.
    # NOTE(review): confirm columns_used() returns hashable column names.
    possible_prev_columns = list(dict.fromkeys(
        col for col in query_columns if col != new_materialized
    ))
    prev_materialized = dummy_rng.sample(
        possible_prev_columns,
        k=min(dummy_rng.randint(0, 5), len(possible_prev_columns)),
    )

    # prev_time is in [1, 10]; the gain is drawn from [0.1, prev_time], so the
    # resulting time is never negative (up to rounding).
    prev_time = round(dummy_rng.uniform(1, 10), 2)
    time_gain = round(dummy_rng.uniform(0.1, prev_time), 2)
    time = round(prev_time - time_gain, 2)

    rows.append({
        'Query': query_name,
        'Materialized Column': new_materialized,
        'Previously Materialized Columns': prev_materialized,
        'Time gain': time_gain,
        'Time': time,
        'Prev time': prev_time
    })

# Create the DataFrame in one go from the collected records
df = pd.DataFrame(rows)

# Display the first few rows of the DataFrame
df.head(20)

In [None]:
# Rank each row within its query group by time gain (1 = largest gain).
# method='min' gives tied gains the same whole-number rank (1, 1, 3, ...).
# The default 'average' method yields fractional ranks for ties, which
# astype(int) would silently truncate (e.g. three tied leaders all become 2).
df['Rank'] = (
    df.groupby('Query')['Time gain']
    .rank(method='min', ascending=False)
    .astype(int)
)

# Percentile rank of the time gain within each query group, stored as a
# fraction between 0 and 1 (not 0-100); a higher value means a larger gain.
df['Percentile'] = df.groupby('Query')['Time gain'].rank(pct=True).round(2)

# Cell output only: the sorted view is displayed, df keeps its row order.
df.sort_values(['Query', 'Time gain'], ascending=[True, False])


In [11]:
def get_field_frequency_for_query(query_name, materialized_column):
    """Count the occurrences of a column among all columns a query uses.

    Looks up ``query_name`` in the module-level ``queries`` mapping and returns
    how many times ``materialized_column`` appears in that query's
    ``columns_used()`` result. Raises ``KeyError`` for an unknown query name.
    """
    return queries[query_name].columns_used().count(materialized_column)

def get_field_join_frequency_for_query(query_name, materialized_column):
    """Count the join usages recorded for a column in a query.

    ``columns_used_in_join()`` of the query (looked up in the module-level
    ``queries`` mapping) is indexed by column; the result is the length of the
    entry stored for ``materialized_column``, or 0 when the column has no entry.
    """
    join_columns = queries[query_name].columns_used_in_join()
    if materialized_column not in join_columns:
        return 0
    return len(join_columns[materialized_column])

def get_field_where_frequency_for_query(query_name, materialized_column):
    """Count the occurrences of a column in a query's "where" position.

    Uses the ``"where"`` entry of the query's ``columns_used_with_position()``
    result; the query is looked up in the module-level ``queries`` mapping.
    """
    where_columns = queries[query_name].columns_used_with_position()["where"]
    return where_columns.count(materialized_column)

def get_field_select_frequency_for_query(query_name, materialized_column):
    """Count the occurrences of a column in a query's "select" position.

    Uses the ``"select"`` entry of the query's ``columns_used_with_position()``
    result; the query is looked up in the module-level ``queries`` mapping.
    """
    select_columns = queries[query_name].columns_used_with_position()["select"]
    return select_columns.count(materialized_column)



queries = tpch_setup.QUERIES

# Output column -> helper computing it from (query name, materialized column).
# Dict order fixes the order in which the columns are added to df.
frequency_features = {
    "Total Frequency": get_field_frequency_for_query,
    "Join Frequency": get_field_join_frequency_for_query,
    "Where Frequency": get_field_where_frequency_for_query,
    "Select Frequency": get_field_select_frequency_for_query,
}

for feature_name, frequency_fn in frequency_features.items():
    # Bind the helper as a default argument so each lambda uses its own function.
    df[feature_name] = df.apply(
        lambda row, fn=frequency_fn: fn(row["Query"], row["Materialized Column"]),
        axis=1,
    )