In [None]:
import os
import pandas as pd

from testing.twitter import setup

# NOTE(review): `setup` is imported from testing.twitter, while RESULTS_PATH below points
# at a tpch results directory — confirm that this is the intended setup module.
QUERIES = setup.QUERIES
COLUMN_MAP = setup.COLUMN_MAP

# Directory holding the result files of one run (note the trailing slash).
RESULTS_PATH = os.curdir + "/results/single-queries/tpch/2025-05-10-15H/"

# RESULTS_PATH already ends with "/", so joining with os.path.join avoids the doubled
# separator ("...15H//expanded_results.csv") that plain string concatenation produces.
results_df = pd.read_csv(os.path.join(RESULTS_PATH, "expanded_results.csv"))

# Drop the "Unnamed: 0" column when the CSV contains one; errors="ignore" makes this a
# no-op otherwise. (Presumably a saved index column — verify against the writer.)
results_df = results_df.drop(columns=["Unnamed: 0"], errors="ignore")

results_df

In [None]:
# Table sizes used to annotate the results.
# The first six tables are given as a base size that is multiplied by `scale_factor`
# and truncated to an int; nation and region are fixed and not scaled.
scale_factor = 0.5
tpch_table_sizes = {
    table: int(base_size * scale_factor)
    for table, base_size in (
        ("customer", 150000),
        ("lineitem", 6000000),
        ("orders", 1500000),
        ("part", 200000),
        ("partsupp", 800000),
        ("supplier", 10000),
    )
}
tpch_table_sizes["nation"] = 25  # Not scaled
tpch_table_sizes["region"] = 5   # Not scaled


def get_table_size(column_name):
    """Return the size of the table that ``column_name`` belongs to.

    The table is selected by the column's prefix (e.g. ``c_`` -> customer) and its
    size is read from ``tpch_table_sizes``. A column with none of the known prefixes
    yields 0.
    """
    prefix_to_table = (
        ("c_", "customer"),
        ("l_", "lineitem"),
        ("o_", "orders"),
        ("p_", "part"),
        ("ps_", "partsupp"),
        ("s_", "supplier"),
        ("n_", "nation"),
        ("r_", "region"),
    )
    for prefix, table in prefix_to_table:
        if column_name.startswith(prefix):
            return tpch_table_sizes[table]
    return 0
    
def get_table_name(column_name):
    """Return the name of the table that ``column_name`` belongs to.

    The table is selected by the column's prefix (e.g. ``ps_`` -> partsupp). A column
    with none of the known prefixes yields ``"unknown"``.
    """
    prefix_to_table = (
        ("c_", "customer"),
        ("l_", "lineitem"),
        ("o_", "orders"),
        ("p_", "part"),
        ("ps_", "partsupp"),
        ("s_", "supplier"),
        ("n_", "nation"),
        ("r_", "region"),
    )
    for prefix, table in prefix_to_table:
        if column_name.startswith(prefix):
            return table
    return "unknown"
    
# Size of the table each row's materialized column belongs to (0 for unknown prefixes).
# Only the "Materialization" column is needed, so map over that column directly instead
# of applying a function row by row over the whole frame.
results_df["Table size"] = results_df["Materialization"].map(get_table_size)

In [None]:
# Add rank and percentile within query and globally.

# Rank within each query group; rank 1 is the largest improvement. method="min" gives
# tied rows the same whole-number rank. The default "average" method can yield
# fractional ranks (e.g. 1.5) that astype(int) would silently truncate.
results_df['Query Rank'] = (
    results_df.groupby('Query')['Improvement'].rank(method="min", ascending=False).astype(int)
)

# Percentile rank within each query group, as a fraction between 0 and 1 rounded to two
# decimals; a larger improvement gets a higher value.
results_df['Query Percentile'] = results_df.groupby('Query')['Improvement'].rank(pct=True).round(2)

# Rank over all rows, regardless of query; rank 1 is the largest improvement overall.
results_df['Global Rank'] = results_df['Improvement'].rank(method="min", ascending=False).astype(int)

# Percentile rank over all rows, as a fraction between 0 and 1 rounded to two decimals.
results_df['Global Percentile'] = results_df['Improvement'].rank(pct=True).round(2)


In [None]:
# Order the rows from the largest to the smallest improvement.
results_df = results_df.sort_values("Improvement", ascending=False)

In [None]:
# Add query usage frequency
def get_field_frequency_for_query(query_name, materialized_column):
    """Count how often ``materialized_column`` occurs in the query's used columns.

    The query is looked up in ``QUERIES`` by ``query_name``; the count is taken over the
    value returned by its ``columns_used()``.
    """
    used_columns = QUERIES[query_name].columns_used()
    return used_columns.count(materialized_column)

def get_field_join_frequency_for_query(query_name, materialized_column):
    """Return the number of join entries recorded for ``materialized_column``.

    Uses ``columns_used_in_join()`` of the query found in ``QUERIES``: the result is the
    length of the entry stored under ``materialized_column``, or 0 when the column has
    no entry.
    """
    join_columns = QUERIES[query_name].columns_used_in_join()
    if materialized_column not in join_columns:
        return 0
    return len(join_columns[materialized_column])


def get_field_where_frequency_for_query(query_name, materialized_column):
    """Count occurrences of ``materialized_column`` in the query's "where" columns.

    The columns are the ``"where"`` entry of ``columns_used_with_position()`` for the
    query found in ``QUERIES``.
    """
    where_columns = QUERIES[query_name].columns_used_with_position()["where"]
    return where_columns.count(materialized_column)

def get_field_select_frequency_for_query(query_name, materialized_column):
    """Count occurrences of ``materialized_column`` in the query's "select" columns.

    The columns are the ``"select"`` entry of ``columns_used_with_position()`` for the
    query found in ``QUERIES``.
    """
    select_columns = QUERIES[query_name].columns_used_with_position()["select"]
    return select_columns.count(materialized_column)

def get_field_group_by_frequency_for_query(query_name, materialized_column):
    """Count occurrences of ``materialized_column`` in the query's "group_by" columns.

    The columns are the ``"group_by"`` entry of ``columns_used_with_position()`` for the
    query found in ``QUERIES``.
    """
    group_by_columns = QUERIES[query_name].columns_used_with_position()["group_by"]
    return group_by_columns.count(materialized_column)

def get_field_order_by_frequency_for_query(query_name, materialized_column):
    """Count occurrences of ``materialized_column`` in the query's "order_by" columns.

    The columns are the ``"order_by"`` entry of ``columns_used_with_position()`` for the
    query found in ``QUERIES``.
    """
    order_by_columns = QUERIES[query_name].columns_used_with_position()["order_by"]
    return order_by_columns.count(materialized_column)

def get_self_join_frequency_for_query(query_name, materialized_column):
    """Return the self-join value stored for ``materialized_column`` in the query.

    Reads the ``"self_join"`` entry of ``columns_used_with_position()`` for the query
    found in ``QUERIES``. Returns 0 when there is no ``"self_join"`` entry or when the
    column is not one of its keys.
    """
    positions = QUERIES[query_name].columns_used_with_position()
    if "self_join" not in positions:
        return 0
    if materialized_column not in positions["self_join"].keys():
        return 0
    return positions["self_join"][materialized_column]


# One frequency column per usage kind. Each helper is called with the row's query name
# and materialized column; the columns are added in the order listed here.
frequency_columns = {
    "Total Frequency": get_field_frequency_for_query,
    "Join Frequency": get_field_join_frequency_for_query,
    "Where Frequency": get_field_where_frequency_for_query,
    "Select Frequency": get_field_select_frequency_for_query,
    "Group By Frequency": get_field_group_by_frequency_for_query,
    "Order By Frequency": get_field_order_by_frequency_for_query,
    "Self Join Frequency": get_self_join_frequency_for_query,
}

for column_label, frequency_fn in frequency_columns.items():
    results_df[column_label] = results_df.apply(
        lambda row: frequency_fn(row["Query"], row["Materialization"]), axis=1
    )


In [None]:
# Show the rows for query q11 with ps_suppkey as the materialized column.
results_df.loc[
    results_df["Query"].eq('q11') & results_df["Materialization"].eq('ps_suppkey')
]

## Negative Values
Some of the materializations give negative results

In [None]:
# Rows whose improvement lies below this (negative) cut-off.
treshold = -0.05
negative_improvement_df = results_df.loc[results_df["Improvement"].lt(treshold)]

# Distinct queries and materialized fields among those rows.
negative_queries = negative_improvement_df["Query"].unique()
negative_fields = negative_improvement_df["Materialization"].unique()

print(f'There are {len(negative_improvement_df)} with improvement less than {treshold}')
print(f'The queries with negative improvmement are {negative_queries}')
print(f'The fields whose improvement was negative are {negative_fields}')

# Only Single Joins

In [None]:
# Rows where "Join Frequency" is exactly 1 and "Where Frequency" is 0.
is_single_join = results_df["Join Frequency"].eq(1) & results_df["Where Frequency"].eq(0)
single_join_df = results_df.loc[is_single_join]

# Subset of those rows whose improvement is below `treshold`.
sj_negative_improvement_df = single_join_df.loc[single_join_df["Improvement"].lt(treshold)]

# Queries that have single-join rows above the 0.8 global percentile.
print(single_join_df.loc[single_join_df["Global Percentile"].gt(0.8), "Query"].unique())

In [None]:
# Single-join rows for query q18 with l_orderkey as the materialized column.
sj_q18 = single_join_df.loc[
    single_join_df["Query"].eq('q18') & single_join_df["Materialization"].eq('l_orderkey')
]

# Alternative selection (disabled):
# sj_q18 = single_join_df[single_join_df["Query"] == 'q3']
sj_q18.head(n=5)

In [None]:
# Last five rows of the q18 / l_orderkey selection.
sj_q18.tail(n=5)

In [None]:
# Summarise the single-join rows whose improvement is below `treshold`.
sj_negative_queries = sj_negative_improvement_df["Query"].unique()
sj_negative_fields = sj_negative_improvement_df["Materialization"].unique()

print(f'There are {len(sj_negative_improvement_df)} with improvement less than {treshold}')
print(f'The queries with negative improvmement are {sj_negative_queries}')
print(f'The fields whose improvement was negative are {sj_negative_fields}')

# Single Where

In [None]:
# Rows where "Join Frequency" is 0 and "Where Frequency" is exactly 1.
is_single_where = results_df["Join Frequency"].eq(0) & results_df["Where Frequency"].eq(1)
single_where_df = results_df.loc[is_single_where]

# Queries that have single-where rows above the 0.8 global percentile.
print(single_where_df.loc[single_where_df["Global Percentile"].gt(0.8), "Query"].unique())

In [None]:
# Narrow the single-where rows to query q16, then to ps_suppkey as materialized column.
single_where_query_df = (
    single_where_df
    .loc[single_where_df["Query"].eq('q16')]
    .loc[lambda frame: frame["Materialization"].eq('ps_suppkey')]
)
# Other selections (disabled):
# single_where_query_df[single_where_query_df["Materialization"] == 'o_orderstatus'].head()
# single_where_query_df[single_where_query_df["Previous Materializations"] == "['l_commitdate']"]
# single_where_query_df[single_where_query_df["Previous Materializations"] == "[]"]
# single_where_query_df[single_where_query_df["Materialization"] == 'p_container']
single_where_query_df
# TODO 16 - ps_suppkey

In [None]:
# Last 25 rows of the current single-where selection.
# Alternative (disabled):
# single_where_query_df[single_where_df["Materialization"] == 'o_orderstatus'].tail(25)
single_where_query_df.tail(n=25)

# Things to investigate 
- q10: n_nationkey gives the best performance when n_name is materialized
- q21: n_nationkey has a negative improvement
- q21: why is s_nationkey always so good?

In [None]:
# Look 

In [None]:
# Rows whose improvement lies above this cut-off.
# NOTE: this rebinds `treshold`, which the negative-improvement cells also read;
# re-running those cells after this one would use 1.5 instead of -0.05.
treshold = 1.5
positive_df = results_df[results_df["Improvement"] > treshold]

# The messages describe the `> treshold` selection made above.
print(f'There are {len(positive_df)} with improvement greater than {treshold}')
print(f'The queries with improvement above the threshold are {positive_df["Query"].unique()}')
print(f'The fields whose improvement was above the threshold are {positive_df["Materialization"].unique()}')

# First ten selected rows for query q20.
positive_df[positive_df['Query'] == 'q20'].head(10)

# Queries
Print queries and materialization queries

In [None]:
# Queries to print, and the sets of materialized fields to print them with.
query_names = ["q14"]
materializations = [["delete_status_userIdStr"]]

# Alternative materialization sets (disabled):
# materializations = [['l_orderkey'], ['l_orderkey', 'o_custkey'], ['o_orderkey'], ['o_orderkey', 'o_custkey'],['o_totalprice'], ['o_totalprice', 'o_custkey'] ]
# materializations = [['o_custkey', 's_nationkey'], ['o_custkey', 's_nationkey', 'o_orderkey'], ['l_discount', 's_nationkey'], ['l_discount', 's_nationkey', 'o_orderkey']]
# materializations = [[],['s_nationkey']]

In [None]:
# For every (materialization set, query) pair, print the SQL that materializes the fields
# on test_table, followed by the query text generated for that setup.
for materialization in materializations:
    for query_name in query_names:
        query_obj = QUERIES[query_name]
        print(f"################# {query_name.upper()}, {materialization} #################")

        # One "<field> = <access expression>" assignment per materialized field.
        assignments = []
        for field in materialization:
            field_obj = COLUMN_MAP[field]

            print(f"ALTER TABLE test_table DROP COLUMN IF EXISTS {field};")
            print(f"ALTER TABLE test_table ADD {field} {field_obj['type']};")
            assignments.append(f"{field} = {field_obj['access']}")

        # Only emit the UPDATE when there is something to set: with an empty
        # materialization list, trimming a trailing ", " would cut into the statement
        # itself and print the malformed "UPDATE test_table SE;".
        if assignments:
            print("UPDATE test_table SET " + ", ".join(assignments) + ";")

        # Create the field-materialization setup for this test: every known column, its
        # COLUMN_MAP entry, and whether it is materialized in this run.
        fields = [
            (field, access_query, field in materialization)
            for field, access_query in COLUMN_MAP.items()
        ]

        print(query_obj.get_query(fields=fields))
