In [2113]:
import ast
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)



from queries.query import Query
import testing.tpch.setup as tpch_setup

In [None]:
# Benchmark run under analysis (small dataset, 500 queries - m1, m2, m3 ...).
RESULTS_PATH = f"{os.path.curdir}/results/load-based-N-fields/tpch/2025-03-26-15H/"

# Keep only the columns the rest of the notebook reads.
wanted_columns = ["Test", "Load", "Total query time",
                  "Materialized Columns", "Materialization"]
results_df = pd.read_csv(RESULTS_PATH + "meta_results.csv").loc[:, wanted_columns]
results_df.info()




In [None]:
# List the distinct test names present in the results.
results_df["Test"].unique()

In [None]:
# Only look at load-based tests: drop the full / schema-based runs and the
# load-based m20-m35 runs.
excluded_tests = [
    "full_materialization",
    "schema_based_materialization",
    "load_based_m20",
    "load_based_m25",
    "load_based_m30",
    "load_based_m35",
]
results_df = results_df[~results_df["Test"].isin(excluded_tests)]
results_df

In [None]:
# Get the materialized column: the last entry of the stringified list stored
# in "Materialization".
def _last_listed_column(materialization):
    """Return the last entry of a "['a', 'b']"-style string, without quotes."""
    entries = materialization.strip('[').strip(']').split(', ')
    return entries[-1].strip("'")


results_df["Materialized Column"] = results_df["Materialization"].map(_last_listed_column)
results_df

In [None]:
# Add the datatype of each materialized column from COLUMN_MAP
# ("Unknown" for columns that COLUMN_MAP does not list).
results_df["Column Type"] = [
    tpch_setup.COLUMN_MAP[column]["type"] if column in tpch_setup.COLUMN_MAP else "Unknown"
    for column in results_df["Materialized Column"]
]



scale_factor = 0.5

# Table name -> row count. Six tables are multiplied by scale_factor;
# nation and region are fixed.
tpch_table_sizes = {
    table: int(base_rows * scale_factor)
    for table, base_rows in (
        ("customer", 150000),
        ("lineitem", 6000000),
        ("orders", 1500000),
        ("part", 200000),
        ("partsupp", 800000),
        ("supplier", 10000),
    )
}
tpch_table_sizes.update({"nation": 25, "region": 5})  # Not scaled


def get_table_size(column_name):
    """Return the row count of the table a column belongs to, judged by its prefix.

    The prefixes are tried in a fixed order and the first match wins; 0 is
    returned when the name starts with none of them.
    """
    prefix_to_table = (
        ("c_", "customer"),
        ("l_", "lineitem"),
        ("o_", "orders"),
        ("p_", "part"),
        ("ps_", "partsupp"),
        ("s_", "supplier"),
        ("n_", "nation"),
        ("r_", "region"),
    )
    for prefix, table in prefix_to_table:
        if column_name.startswith(prefix):
            return tpch_table_sizes[table]
    return 0


# Attach the row count of the table each materialized column belongs to.
results_df["Table Size"] = results_df["Materialized Column"].map(get_table_size)
results_df.head(30)

In [2119]:
# Get the reduction in execution time brought by each additional materialized column.

# Extract relevant columns
results_df_current = results_df[["Load", "Materialized Column", "Total query time",
                                 "Materialized Columns", "Column Type", "Table Size"]].copy()

# Make sure the tests are sorted based on 1. Load, 2. Materialized Columns
results_df_current = results_df_current.sort_values(["Load", "Materialized Columns"])

# Time gain = total query time of the previous test in the SAME load minus this
# test's total query time. Shifting per load keeps the first test of a load
# from being compared against the last test of the preceding load.
previous_time = results_df_current.groupby("Load")["Total query time"].shift(1)
results_df_current["Time gain"] = previous_time - results_df_current["Total query time"]

# Remove all tests where only one column was materialized
results_df_current = results_df_current[results_df_current["Materialized Columns"] != 1]

# Rows without a predecessor inside their load have no defined gain; drop them
# so the integer rounding below never sees NaN.
results_df_current = results_df_current.dropna(subset=["Time gain"])

# Extract relevant columns
results_df_current = results_df_current[["Load", "Materialized Column", "Time gain",
                                         "Materialized Columns", "Column Type",
                                         "Table Size"]].copy()

# Time gain rounded to the nearest ten, stored as an integer.
results_df_current["Rounded time gain"] = results_df_current["Time gain"].map(
    lambda gain: int(round(gain, -1)))

# NOTE: this rebinds results_df, so re-running the cell without re-running the
# cells above fails ("Total query time" is no longer a column).
results_df = results_df_current


In [2120]:
# Read the query list of every load that appears in the results.

load_nos = results_df["Load"].unique()

# load number -> list of query names (unknown load numbers yield an empty list)
loads = defaultdict(list)

for load_no in load_nos:
    with open(f"{os.path.curdir}/loads/without_q5/load{load_no}.txt", mode="r") as file:
        load_string = file.read()
    # The file is expected to hold a Python-literal list of query names, e.g.
    # "['q1', 'q2']". Parsing it as a literal (instead of stripping brackets and
    # quotes by hand) tolerates a trailing newline and yields [] for "[]".
    parsed = ast.literal_eval(load_string.strip())
    load = [parsed] if isinstance(parsed, str) else list(parsed)
    assert "q5" not in load
    loads[load_no] = load

# Per load: query name -> number of times it occurs in that load.
query_freq = {load_no: {q: load.count(q) for q in load} for load_no, load in loads.items()}

In [2121]:
# Query objects from tpch_setup; the frequency helpers below index this by query name.
queries = tpch_setup.QUERIES

In [None]:
# Add frequency of occurrence for the materialized column across the load

def get_field_frequency(load_no, materialized_column):
    """Count the queries in load `load_no` whose used columns include `materialized_column`.

    Every occurrence of a query in the load counts once, however often the
    query itself mentions the column.
    """
    load = loads[load_no]
    return sum(
        load.count(name)
        for name in set(load)
        if materialized_column in queries[name].columns_used()
    )

def get_total_field_frequency(load_no, materialized_column):
    """Count every mention of `materialized_column` across load `load_no`.

    For each distinct query, the number of times the column appears in the
    query's used columns is multiplied by how often the query occurs in the load.
    """
    load = loads[load_no]
    total = 0
    for name in set(load):
        used = queries[name].columns_used()
        mentions = used.count(materialized_column) if materialized_column in used else 0
        total += mentions * load.count(name)
    return total

# Overall usage of each row's materialized column within that row's load.
load_column_pairs = list(zip(results_df["Load"], results_df["Materialized Column"]))

results_df["Frequency"] = [
    get_field_frequency(load_no, column) for load_no, column in load_column_pairs
]

results_df["Total Frequency"] = [
    get_total_field_frequency(load_no, column) for load_no, column in load_column_pairs
]

results_df


In [2123]:
# Add frequency of occurrence in a join for the materialized column across the load

def get_field_join_frequency(load_no, materialized_column):
    """Count the queries in load `load_no` that list `materialized_column` under "join".

    Every occurrence of a query in the load counts once.
    """
    load = loads[load_no]
    return sum(
        load.count(name)
        for name in set(load)
        if materialized_column in queries[name].columns_used_with_position()["join"]
    )

def get_field_total_join_frequency(load_no, materialized_column):
    """Count every "join" mention of `materialized_column` across load `load_no`.

    For each distinct query, the number of times the column appears in the
    query's "join" columns is multiplied by how often the query occurs in the load.
    """
    load = loads[load_no]
    total = 0
    for name in set(load):
        join_cols = queries[name].columns_used_with_position()["join"]
        mentions = join_cols.count(materialized_column) if materialized_column in join_cols else 0
        total += mentions * load.count(name)
    return total

# Join usage of each row's materialized column within that row's load.
load_column_pairs = list(zip(results_df["Load"], results_df["Materialized Column"]))

results_df["Join Frequency"] = [
    get_field_join_frequency(load_no, column) for load_no, column in load_column_pairs
]
results_df["Total Join Frequency"] = [
    get_field_total_join_frequency(load_no, column) for load_no, column in load_column_pairs
]


In [2124]:
# Add frequency of occurrence in a where for the materialized column across the load

def get_field_where_frequency(load_no, materialized_column):
    """Count the queries in load `load_no` that list `materialized_column` under "where".

    Every occurrence of a query in the load counts once.
    """
    load = loads[load_no]
    return sum(
        load.count(name)
        for name in set(load)
        if materialized_column in queries[name].columns_used_with_position()["where"]
    )

def get_field_total_where_frequency(load_no, materialized_column):
    """Count every "where" mention of `materialized_column` across load `load_no`.

    For each distinct query, the number of times the column appears in the
    query's "where" columns is multiplied by how often the query occurs in the load.
    """
    load = loads[load_no]
    total = 0
    for name in set(load):
        where_cols = queries[name].columns_used_with_position()["where"]
        mentions = where_cols.count(materialized_column) if materialized_column in where_cols else 0
        total += mentions * load.count(name)
    return total


# Where usage of each row's materialized column within that row's load.
load_column_pairs = list(zip(results_df["Load"], results_df["Materialized Column"]))

results_df["Where Frequency"] = [
    get_field_where_frequency(load_no, column) for load_no, column in load_column_pairs
]
results_df["Total Where Frequency"] = [
    get_field_total_where_frequency(load_no, column) for load_no, column in load_column_pairs
]

In [2125]:
# Add frequency of occurrence in a select for the materialized column across the load

def get_field_select_frequency(load_no, materialized_column):
    """Count the queries in load `load_no` that list `materialized_column` under "select".

    Every occurrence of a query in the load counts once.
    """
    load = loads[load_no]
    return sum(
        load.count(name)
        for name in set(load)
        if materialized_column in queries[name].columns_used_with_position()["select"]
    )

def get_field_total_select_frequency(load_no, materialized_column):
    """Count every "select" mention of `materialized_column` across load `load_no`.

    For each distinct query, the number of times the column appears in the
    query's "select" columns is multiplied by how often the query occurs in the load.
    """
    load = loads[load_no]
    total = 0
    for name in set(load):
        select_cols = queries[name].columns_used_with_position()["select"]
        mentions = select_cols.count(materialized_column) if materialized_column in select_cols else 0
        total += mentions * load.count(name)
    return total


# Select usage of each row's materialized column within that row's load.
load_column_pairs = list(zip(results_df["Load"], results_df["Materialized Column"]))

results_df["Select Frequency"] = [
    get_field_select_frequency(load_no, column) for load_no, column in load_column_pairs
]
results_df["Total Select Frequency"] = [
    get_field_total_select_frequency(load_no, column) for load_no, column in load_column_pairs
]



In [2126]:
# Add frequency of occurrence in a group_by for the materialized column across the load

def get_field_absolute_group_by_frequency(load_no, materialized_column):
    """Count the queries in load `load_no` that list `materialized_column` under "group_by".

    Every occurrence of a query in the load counts once.
    """
    load = loads[load_no]
    return sum(
        load.count(name)
        for name in set(load)
        if materialized_column in queries[name].columns_used_with_position()["group_by"]
    )


# Group-by usage of each row's materialized column within that row's load.
results_df["Group By Frequency, Absolute"] = [
    get_field_absolute_group_by_frequency(load_no, column)
    for load_no, column in zip(results_df["Load"], results_df["Materialized Column"])
]

In [2127]:
# Add frequency of occurrence in an order_by for the materialized column across the load

def get_field_absolute_order_by_frequency(load_no, materialized_column):
    """Count the queries in load `load_no` that list `materialized_column` under "order_by".

    Every occurrence of a query in the load counts once.
    """
    load = loads[load_no]
    return sum(
        load.count(name)
        for name in set(load)
        if materialized_column in queries[name].columns_used_with_position()["order_by"]
    )


# Order-by usage of each row's materialized column within that row's load.
results_df["Order By Frequency, Absolute"] = [
    get_field_absolute_order_by_frequency(load_no, column)
    for load_no, column in zip(results_df["Load"], results_df["Materialized Column"])
]

In [None]:
# Rank all rows by rounded time gain (largest first), then preview load 0.
sorted_results_df = results_df.sort_values("Rounded time gain", ascending=False)
is_load_zero = sorted_results_df["Load"] == 0
sorted_results_df.loc[is_load_zero].head()


In [2129]:
# Same sort as in the previous cell: rows ordered by rounded time gain, largest first.
sorted_results_df = results_df.sort_values("Rounded time gain", ascending=False)
# sorted_results_df[[
#     "Rounded time gain",
#     "Materialized Column", 
#     "Frequency", 
#     "Join Frequency", 
#     "Where Frequency, Absolute",
#     "Select Frequency, Absolute",
#     "Group By Frequency, Absolute",
#     "Order By Frequency, Absolute", 
# ]].head(30)
#sorted_results_df = sorted_results_df.sort_values(["Load", "Materialized Columns"], inplace=False)
# sorted_results_df = sorted_results_df[sorted_results_df["Load"] == 7]

# # Calculate correlations with Rounded time gain
# correlations = results_df[["Rounded time gain", "Total Frequency", "Total Join Frequency", 
#                           "Total Where Frequency", "Total Select Frequency", "Table Size"]].corr()["Rounded time gain"]

# print("Correlations with Rounded time gain:")
# print(correlations.sort_values(ascending=False))

# # Perform multiple linear regression
# from sklearn.linear_model import LinearRegression
# from sklearn.preprocessing import StandardScaler

# # Prepare features and target
# X = results_df[["Total Frequency", "Total Join Frequency", "Total Where Frequency", 
#                 "Total Select Frequency", "Table Size"]]
# y = results_df["Rounded time gain"]

# # Scale the features
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # Fit regression model
# reg = LinearRegression().fit(X_scaled, y)

# # Get feature importance scores
# importance = pd.DataFrame({
#     'Feature': X.columns,
#     'Coefficient': reg.coef_
# })
# print("\nFeature importance from linear regression:")
# print(importance.sort_values('Coefficient', ascending=False))







In [None]:
import matplotlib.pyplot as plt

# Order rows by descending time gain (in place) and snapshot them for plotting.
results_df.sort_values("Time gain", ascending=False, inplace=True)
results_df_head = results_df.copy()

# Where vs Join bubble charts: first the plain ("Classic") frequency columns,
# then their "Total" counterparts.
# NOTE(review): 'Time gain' * 10 is the marker size, so rows with a negative
# gain (if any) get a negative size - confirm how matplotlib renders those.
bubble_charts = [
    ('Join Frequency', 'Where Frequency',
     'Query Performance Impact by Where and Join Clause Frequency\n(Bubble size and color indicate time gain)'),
    ('Total Join Frequency', 'Total Where Frequency',
     'Query Performance Impact by Total Where and Join Frequency\n(Bubble size and color indicate time gain)'),
]
for x_col, y_col, chart_title in bubble_charts:
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        x=results_df_head[x_col],
        y=results_df_head[y_col],
        s=results_df_head['Time gain'] * 10,
        c=results_df_head['Time gain'],
        cmap='viridis',
        alpha=0.6,
    )
    plt.colorbar(scatter, label='Time Gain (ms)')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(chart_title)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

# List of frequency columns to plot
frequency_columns = [
    'Total Frequency',
    'Total Join Frequency',
    'Total Where Frequency',
    'Total Select Frequency',
]

# One plain scatter plot per frequency column against 'Time gain'.
for col in frequency_columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(results_df_head[col], results_df_head['Time gain'], alpha=0.7)
    ax.set_xlabel(col)
    ax.set_ylabel("Time gain")
    ax.set_title(f"Scatter Plot: {col} vs Time gain")
    ax.grid(True)
    plt.show()





In [None]:
# Join vs Select bubble charts: first the plain ("Classic") frequency columns,
# then their "Total" counterparts.
# NOTE(review): 'Time gain' * 10 is the marker size, so rows with a negative
# gain (if any) get a negative size - confirm how matplotlib renders those.
bubble_charts = [
    ('Select Frequency', 'Join Frequency',
     'Query Performance Impact by Select and Join Frequency\n(Bubble size and color indicate time gain)'),
    ('Total Select Frequency', 'Total Join Frequency',
     'Query Performance Impact by Total Select and Join Frequency\n(Bubble size and color indicate time gain)'),
]
for x_col, y_col, chart_title in bubble_charts:
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        x=results_df_head[x_col],
        y=results_df_head[y_col],
        s=results_df_head['Time gain'] * 10,
        c=results_df_head['Time gain'],
        cmap='viridis',
        alpha=0.6,
    )
    plt.colorbar(scatter, label='Time Gain (ms)')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(chart_title)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
# Select vs Where bubble charts: first the plain ("Classic") frequency columns,
# then their "Total" counterparts.
# NOTE(review): 'Time gain' * 10 is the marker size, so rows with a negative
# gain (if any) get a negative size - confirm how matplotlib renders those.
bubble_charts = [
    ('Select Frequency', 'Where Frequency',
     'Query Performance Impact by Select and Where Frequency\n(Bubble size and color indicate time gain)'),
    ('Total Select Frequency', 'Total Where Frequency',
     'Query Performance Impact by Total Select and Where Frequency\n(Bubble size and color indicate time gain)'),
]
for x_col, y_col, chart_title in bubble_charts:
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        x=results_df_head[x_col],
        y=results_df_head[y_col],
        s=results_df_head['Time gain'] * 10,
        c=results_df_head['Time gain'],
        cmap='viridis',
        alpha=0.6,
    )
    plt.colorbar(scatter, label='Time Gain (ms)')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(chart_title)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:

# Join vs Overall Frequency bubble charts: first the plain ("Classic")
# frequency columns, then their "Total" counterparts.
# NOTE(review): 'Time gain' * 10 is the marker size, so rows with a negative
# gain (if any) get a negative size - confirm how matplotlib renders those.
bubble_charts = [
    ('Join Frequency', 'Frequency',
     'Query Performance Impact by Join and Overall Frequency\n(Bubble size and color indicate time gain)'),
    ('Total Join Frequency', 'Total Frequency',
     'Query Performance Impact by Total Join and Overall Frequency\n(Bubble size and color indicate time gain)'),
]
for x_col, y_col, chart_title in bubble_charts:
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        x=results_df_head[x_col],
        y=results_df_head[y_col],
        s=results_df_head['Time gain'] * 10,
        c=results_df_head['Time gain'],
        cmap='viridis',
        alpha=0.6,
    )
    plt.colorbar(scatter, label='Time Gain (ms)')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(chart_title)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
# Where vs Overall Frequency bubble charts: first the plain ("Classic")
# frequency columns, then their "Total" counterparts.
# NOTE(review): 'Time gain' * 10 is the marker size, so rows with a negative
# gain (if any) get a negative size - confirm how matplotlib renders those.
bubble_charts = [
    ('Where Frequency', 'Frequency',
     'Query Performance Impact by Where and Overall Frequency\n(Bubble size and color indicate time gain)'),
    ('Total Where Frequency', 'Total Frequency',
     'Query Performance Impact by Total Where and Overall Frequency\n(Bubble size and color indicate time gain)'),
]
for x_col, y_col, chart_title in bubble_charts:
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        x=results_df_head[x_col],
        y=results_df_head[y_col],
        s=results_df_head['Time gain'] * 10,
        c=results_df_head['Time gain'],
        cmap='viridis',
        alpha=0.6,
    )
    plt.colorbar(scatter, label='Time Gain (ms)')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(chart_title)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
# Select vs Overall Frequency bubble charts: first the plain ("Classic")
# frequency columns, then their "Total" counterparts.
# NOTE(review): 'Time gain' * 10 is the marker size, so rows with a negative
# gain (if any) get a negative size - confirm how matplotlib renders those.
bubble_charts = [
    ('Select Frequency', 'Frequency',
     'Query Performance Impact by Select and Overall Frequency\n(Bubble size and color indicate time gain)'),
    ('Total Select Frequency', 'Total Frequency',
     'Query Performance Impact by Total Select and Overall Frequency\n(Bubble size and color indicate time gain)'),
]
for x_col, y_col, chart_title in bubble_charts:
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        x=results_df_head[x_col],
        y=results_df_head[y_col],
        s=results_df_head['Time gain'] * 10,
        c=results_df_head['Time gain'],
        cmap='viridis',
        alpha=0.6,
    )
    plt.colorbar(scatter, label='Time Gain (ms)')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(chart_title)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
# Bubble charts: Table Size (x) against Frequency, then against Total Frequency (y).
# Bubble size stays proportional to the time gain; the colour encodes it as well.
# NOTE(review): 'Time gain' * 10 is the marker size, so rows with a negative
# gain (if any) get a negative size - confirm how matplotlib renders those.
for y_col in ('Frequency', 'Total Frequency'):
    plt.figure(figsize=(12, 8))

    scatter = plt.scatter(
        x=results_df['Table Size'],
        y=results_df[y_col],
        s=results_df['Time gain'] * 10,
        c=results_df['Time gain'],
        cmap='viridis',
        alpha=0.6,
    )

    plt.colorbar(scatter, label='Time Gain (ms)')
    plt.xlabel('Table Size (number of rows)')
    plt.ylabel(y_col)
    plt.title('Query Performance Impact by Table Size\n(Bubble size and color indicate time gain)')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()




In [None]:
import seaborn as sns  # only for getting nice categorical colors

# Re-sort by descending time gain (in place) and refresh the plotting snapshot.
results_df.sort_values("Time gain", ascending=False, inplace=True)
results_df_head = results_df.copy()

# Violin plot: one violin of time gains per column type, in order of first appearance.
column_types = results_df['Column Type'].unique()
gains_by_type = [
    results_df.loc[results_df['Column Type'] == dtype, 'Time gain']
    for dtype in column_types
]

plt.figure(figsize=(12, 8))
violin_plot = plt.violinplot(gains_by_type, showmeans=True)

# Violins sit at positions 1..n, so label those positions with the type names.
tick_positions = range(1, len(column_types) + 1)
plt.xticks(tick_positions, column_types, rotation=45, ha='right')

plt.xlabel('Column Type')
plt.ylabel('Time Gain (ms)')
plt.title('Distribution of Time Gain by Column Type')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()



import pandas as pd
import matplotlib.pyplot as plt

# 1) Summary of the time gain per column type, smallest mean first
stats = (
    results_df.groupby('Column Type')['Time gain']
    .agg(['mean', 'std', 'count', 'median', 'quantile'])
    .sort_values('mean', ascending=True)
)

# 2) Horizontal bars of the means, with the standard deviation as error bars
fig, ax = plt.subplots(figsize=(6, 4))
ax.barh(stats.index, stats['mean'], xerr=stats['std'], alpha=0.7)
ax.set_xlabel("Average Time Gain (ms)")
ax.set_title("Average Performance Gain by Column Type")
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1) Rows ordered by descending time gain
df_sorted = results_df.sort_values("Time gain", ascending=False)

# 2) One colour per column type, shared by every figure below
all_types = df_sorted['Column Type'].unique()
palette = sns.color_palette('tab10', len(all_types))
color_map = {col_type: colour for col_type, colour in zip(all_types, palette)}

# 3) The three subsets to plot
subsets = {
    "All Columns": df_sorted,
    "Top 50 by Time Gain": df_sorted.head(50),
    "Bottom 50 by Time Gain": df_sorted.tail(50),
}

# 4) One Join/Where scatter per subset, coloured by column type
# NOTE(review): 'Time gain' * 10 is the marker size, so rows with a negative
# gain (if any) get a negative size - confirm how matplotlib renders those.
for title, subdf in subsets.items():
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(
        subdf['Join Frequency'],
        subdf['Where Frequency'],
        s=subdf['Time gain'] * 10,
        c=subdf['Column Type'].map(color_map),
        alpha=0.7,
        edgecolors='w',
    )
    ax.set_xlabel("Join Frequency")
    ax.set_ylabel("Where Frequency")
    ax.set_title(f"Time‐gain by Join/Where Frequency\n({title})")

    # Empty scatters act as legend entries, one per column type.
    for type_name, colour in color_map.items():
        ax.scatter([], [], c=[colour], label=type_name, alpha=0.7, s=100)
    ax.legend(title="Column Type", bbox_to_anchor=(1.05,1), loc='upper left')

    fig.tight_layout()
    plt.show()



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1) Rows ordered by descending time gain
df_sorted = results_df.sort_values("Time gain", ascending=False)

# 2) One colour per column type, shared by every figure below
all_types = df_sorted['Column Type'].unique()
palette = sns.color_palette('tab10', len(all_types))
color_map = {col_type: colour for col_type, colour in zip(all_types, palette)}

# 3) The three subsets to plot
subsets = {
    "All Columns": df_sorted,
    "Top 50 by Time Gain": df_sorted.head(50),
    "Bottom 50 by Time Gain": df_sorted.tail(50),
}

# 4) One Total Join / Total Where scatter per subset, coloured by column type
# NOTE(review): the figure title says "Join/Where Frequency" although the
# "Total ..." columns are plotted - confirm whether the title should say so.
# NOTE(review): 'Time gain' * 10 is the marker size, so rows with a negative
# gain (if any) get a negative size - confirm how matplotlib renders those.
for title, subdf in subsets.items():
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(
        subdf['Total Join Frequency'],
        subdf['Total Where Frequency'],
        s=subdf['Time gain'] * 10,
        c=subdf['Column Type'].map(color_map),
        alpha=0.7,
        edgecolors='w',
    )
    ax.set_xlabel("Total Join Frequency")
    ax.set_ylabel("Total Where Frequency")
    ax.set_title(f"Time‐gain by Join/Where Frequency\n({title})")

    # Empty scatters act as legend entries, one per column type.
    for type_name, colour in color_map.items():
        ax.scatter([], [], c=[colour], label=type_name, alpha=0.7, s=100)
    ax.legend(title="Column Type", bbox_to_anchor=(1.05,1), loc='upper left')

    fig.tight_layout()
    plt.show()


In [2140]:
# grouped_results_df = results_df[[
#     'Materialized Column',
#     'Time gain',
#     'Rounded time gain',
#     'Frequency',
#     'Join Frequency',
#     'Where Frequency',
#     'Select Frequency',
#     'Group By Frequency',
#     'Order By Frequency'
# ]].groupby("Materialized Column").agg(['max', 'min', 'sum']).reset_index()

# # Optional: Flatten the MultiIndex columns for easier access
# grouped_results_df.columns = ['_'.join(col).strip(
#     '_') for col in grouped_results_df.columns.values]

# grouped_results_df.sort_values("Time gain_sum", inplace=True, ascending=False)


# grouped_results_df.head(30)

In [None]:
import matplotlib.pyplot as plt

# Frequency metrics of results_df that are summed per materialized column.
frequency_metrics = [
    'Frequency',
    'Join Frequency',
    'Where Frequency',
    'Select Frequency',
    'Group By Frequency, Absolute',
    'Order By Frequency, Absolute',
]

# Sum the time gain and every frequency metric over all rows of each
# materialized column. The aggregate is built here because no active cell
# defines grouped_results_df (the only construction of it is commented out),
# and the names above are the columns results_df actually carries.
grouped_results_df = (
    results_df.groupby("Materialized Column")[['Time gain'] + frequency_metrics]
    .sum()
    .add_suffix('_sum')
    .reset_index()
)

# List of frequency columns to plot
frequency_columns = [f"{metric}_sum" for metric in frequency_metrics]

# Create scatter plots for each summed frequency column vs. the summed time gain
for col in frequency_columns:
    plt.figure(figsize=(8, 6))
    plt.scatter(grouped_results_df[col], grouped_results_df['Time gain_sum'], alpha=0.7)
    plt.xlabel(col)
    plt.ylabel("Time gain, summed")
    plt.title(f"Scatter Plot: {col} vs Time gain, summed")
    plt.grid(True)
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Mean / std / min / max of the time gain for every materialized column.
grouped = (
    results_df.groupby("Materialized Column")["Time gain"]
    .agg(['mean', 'std', 'min', 'max'])
    .reset_index()
)

# Bars show the mean; the error bars show the standard deviation.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(grouped["Materialized Column"], grouped["mean"],
       yerr=grouped["std"], capsize=5)
ax.set_xlabel("Materialized Column")
ax.set_ylabel("Average Time gain")
ax.set_title("Average Time gain by Materialized Column (with std error bars)")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# One box of time gains per materialized column. groupby (default sort=True)
# yields its groups in sorted key order, which is the order of the sorted labels.
labels = sorted(results_df["Materialized Column"].unique())
data = [
    column_rows["Time gain"].values
    for _, column_rows in results_df.groupby("Materialized Column")
]

plt.figure(figsize=(10, 6))
plt.boxplot(data, labels=labels)
plt.xlabel("Materialized Column")
plt.ylabel("Time gain")
plt.title("Distribution of Time gain by Materialized Column")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [314]:
# import matplotlib.pyplot as plt

# # Define the frequency usage columns
# usage_cols = [
#     'Join Frequency',
#     'Where Frequency',
#     'Select Frequency',
#     'Group By Frequency',
#     'Order By Frequency'
# ]

# # Compute the dominant usage type for each row


# def dominant_usage(row):
#     # idxmax returns the column name with the maximum value among the usage columns
#     return row[usage_cols].idxmax()


# results_df['Dominant Usage'] = results_df.apply(dominant_usage, axis=1)

# # Group by the dominant usage and compute the average Rounded time gain
# grouped_usage = results_df.groupby('Dominant Usage')[
#     'Rounded time gain'].mean()

# plt.figure(figsize=(8, 6))
# plt.bar(grouped_usage.index, grouped_usage.values)
# plt.xlabel('Dominant Usage Type')
# plt.ylabel('Average Rounded Time Gain')
# plt.title('Average Performance Gain by Dominant Usage Type')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()

In [315]:
# import statsmodels.api as sm

# # Prepare the predictor variables and the target
# X = results_df[usage_cols]
# y = results_df['Rounded time gain']
# X = sm.add_constant(X)  # add an intercept

# # Fit the regression model
# model = sm.OLS(y, X).fit()
# print(model.summary())

# # Plot the regression coefficients (ignoring the constant)
# coefs = model.params[1:]

# plt.figure(figsize=(8, 6))
# plt.bar(coefs.index, coefs.values)
# plt.xlabel('Frequency Metric')
# plt.ylabel('Coefficient')
# plt.title(
#     'Regression Coefficients for Frequency Metrics Predicting Rounded Time Gain')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()

In [316]:
# # Group the data by Dominant Usage and extract the Rounded time gain values
# grouped_data = [group['Rounded time gain'].values for name,
#                 group in results_df.groupby('Dominant Usage')]
# labels = list(results_df.groupby('Dominant Usage').groups.keys())

# plt.figure(figsize=(8, 6))
# plt.boxplot(grouped_data, labels=labels)
# plt.xlabel('Dominant Usage Type')
# plt.ylabel('Rounded Time Gain')
# plt.title('Distribution of Rounded Time Gain by Dominant Usage Type')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()

In [169]:
# Rows whose materialized column has a join frequency above zero, ordered by
# 1) load, 2) the part of the column name after its last underscore,
# 3) number of materialized columns. The temporary sort key is dropped again.
r_df = (
    results_df.loc[
        results_df["Join Frequency"] > 0,
        ["Load", "Materialized Column", "Rounded time gain", "Join Frequency",
         "Materialized Columns", "Table Size", "Total Join Frequency"],
    ]
    .assign(col_suffix=lambda frame: frame['Materialized Column'].str.split('_').str[-1])
    .sort_values(by=['Load', 'col_suffix', "Materialized Columns"])
    .drop('col_suffix', axis=1)
)

r_df.to_csv("r_df.csv", index=False)

In [None]:
# Preview the first rows. display() is used because a bare expression in the
# middle of a cell is evaluated but never rendered; only the last one is.
display(r_df.head(50))

# Display summary statistics of numeric columns
print("\nSummary Statistics:")
r_df.describe()
