In [None]:
import ast
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Results directory analysed by this notebook.
RESULTS_PATH = "../results/tpch/2025-03-05-15H"

# Load meta_results.csv from that directory into a DataFrame.
meta_results_csv = f"{RESULTS_PATH}/meta_results.csv"
meta_results = pd.read_csv(meta_results_csv)

# Function to count the number of materialized fields from the string representation
def count_materialized_fields(s: str):
    """Count the entries in the string form of a collection of fields.

    Two spellings are accepted, with or without surrounding whitespace:

    * a plain Python literal, e.g. ``"['a', 'b']"``
    * the ``dict_keys([...])`` repr, e.g. ``"dict_keys(['a', 'b'])"``

    Args:
        s: String representation of the collection. Non-string values
            (such as NaN) are tolerated and reported as unparseable.

    Returns:
        The number of entries as an ``int``, or ``None`` when ``s`` is not a
        string or cannot be parsed into something that has a length. A short
        message is printed in the ``None`` case so bad rows are visible.
    """
    if not isinstance(s, str):
        # Handle this up front instead of relying on an AttributeError from
        # .startswith() being swallowed further down.
        print(f"count_materialized_fields: expected a string, got {s!r}")
        return None

    text = s.strip()

    #TODO Remove this when the bug in perform_load_test is fixed
    prefix = "dict_keys("
    if text.startswith(prefix) and text.endswith(")"):
        # Peel off the wrapper so only the inner list literal is parsed.
        text = text[len(prefix):-1]

    try:
        # literal_eval only evaluates literals, never arbitrary code.
        fields = ast.literal_eval(text)
        return len(fields)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        # Malformed literal, or a literal without a length (e.g. "5").
        print(f"count_materialized_fields: could not parse {s!r}: {e}")
        return None

# Derive the per-row field count by parsing the "Materialization" column.
materialized_field_counts = meta_results["Materialization"].apply(count_materialized_fields)
meta_results["num_materialized_fields"] = materialized_field_counts

# Keep every row whose Test is not full_materialization.
not_full_materialization = meta_results["Test"] != "full_materialization"
meta_results_no_full = meta_results[not_full_materialization]

# Scatter of field count against database size, using the explicit Axes API.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    meta_results_no_full["num_materialized_fields"],
    meta_results_no_full["Database size"],
)
ax.set(
    xlabel="Number of Materialized Fields",
    ylabel="Database Size",
    title="Scatter Plot: Number of Materialized Fields vs. Database Size",
)
ax.grid(True)
plt.show()


In [None]:
# Scatter of field count against materialization time, over every row of meta_results.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(meta_results["num_materialized_fields"], meta_results["Time taken"])
ax.set(
    xlabel="Number of Materialized Fields",
    ylabel="Time Taken to Materialize",
    title="Scatter Plot: Number of Materialized Fields vs. Time Taken to Materialize",
)
ax.grid(True)
plt.show()

In [None]:
# Only rows with a strictly positive total query time are plotted.
has_positive_query_time = meta_results["Total query time"] > 0
subset = meta_results[has_positive_query_time]

# Scatter of field count against total query time.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(subset["num_materialized_fields"], subset["Total query time"])
ax.set(
    xlabel="Number of Materialized Fields",
    ylabel="Total Query Time",
    title="Scatter Plot: Number of Materialized Fields vs. Total Query Time",
)
ax.grid(True)
plt.show()

In [None]:
import matplotlib.patches as mpatches


# Define the tests to include (this order is also the legend order)
tests_to_include = [
    "full_materialization",
    "schema_based_materialization",
    "load_based_t0.33",
    "load_based_t0.5"
]

# .copy() makes this an independent frame rather than a slice of meta_results,
# so adding columns to it later does not raise pandas' SettingWithCopyWarning.
df_filtered_on_tests = meta_results[meta_results["Test"].isin(tests_to_include)].copy()

# Restrict to the single configuration shown in this figure:
# Query proportion == 3 and Majority proportion == 80.
df_filtered_on_tests_and_load = df_filtered_on_tests[
    (df_filtered_on_tests["Query proportion"] == 3)
    & (df_filtered_on_tests["Majority proportion"] == 80)
]

# Pivot so that each Load is a row and each Test is a column holding
# "Total query time". DataFrame.pivot raises if a (Load, Test) pair is duplicated.
pivot_df = df_filtered_on_tests_and_load.pivot(
    index="Load", columns="Test", values="Total query time"
).sort_index()

colors = {
    "full_materialization": "#4C72B0",          # muted blue
    "schema_based_materialization": "#55A868",  # muted green
    "load_based_t0.33": "#C44E52",               # muted red
    "load_based_t0.5": "#8172B3"                 # muted purple
}

fig, ax = plt.subplots(figsize=(10, 6))
loads = pivot_df.index.values  # expected loads: 0, 1, 2, 3, 4
x = np.arange(len(loads))        # group positions
width = 0.2                    # width of each bar

# Loop over each load group
for i, load in enumerate(loads):
    # Take this load's row and order its tests by total query time (ascending),
    # so bar order inside a group follows the ranking, not a fixed test order.
    group = pivot_df.loc[load]
    sorted_group = group.sort_values()
    # Plot each bar in sorted order (left to right); colour identifies the test
    for j, test in enumerate(sorted_group.index):
        ax.bar(x[i] + j * width, sorted_group[test], width, color=colors[test])

# Put each x-tick at the centre of its group. The offset is derived from the
# number of tests instead of being hard-coded for exactly four bars.
ax.set_xticks(x + width * (len(tests_to_include) - 1) / 2)
ax.set_xticklabels(loads)
ax.set_xlabel("Load")
ax.set_ylabel("Total query time")
ax.set_title("Total query time for 3|80")

# Create a custom legend (order as in tests_to_include)
legend_handles = [mpatches.Patch(color=colors[test], label=test) for test in tests_to_include]
ax.legend(handles=legend_handles)

plt.show()


In [None]:


# Label every row with its "<Query proportion>/<Majority proportion>" pair.
# .assign returns a new frame, so nothing is written into a boolean-indexed
# slice of meta_results (which would raise SettingWithCopyWarning), and
# re-running this cell gives the same result.
df_filtered_on_tests = df_filtered_on_tests.assign(
    combination=df_filtered_on_tests["Query proportion"].astype(str)
    + "/"
    + df_filtered_on_tests["Majority proportion"].astype(str)
)

# Mean, sample standard deviation and count of "Total query time" per (combination, Test)
grouped = (
    df_filtered_on_tests.groupby(["combination", "Test"])["Total query time"]
    .agg(mean="mean", std="std", count="count")
    .reset_index()
)

# Standard error of the mean; NaN for groups with a single sample (std is undefined)
grouped["stderr"] = grouped["std"] / np.sqrt(grouped["count"])

# Pivot the DataFrame for means and for standard errors separately
pivot_df = grouped.pivot(index="combination", columns="Test", values="mean").sort_index()
stderr_df = grouped.pivot(index="combination", columns="Test", values="stderr").sort_index()

colors = {
    "full_materialization": "#4C72B0",          # muted blue
    "schema_based_materialization": "#55A868",  # muted green
    "load_based_t0.33": "#C44E52",              # muted red
    "load_based_t0.5": "#8172B3"                # muted purple
}

fig, ax = plt.subplots(figsize=(10, 6))
combinations_sorted = pivot_df.index.values  # These are our x-axis groups
x = np.arange(len(combinations_sorted))
width = 0.2  # width of each bar

# Loop over each combination group
for i, comb in enumerate(combinations_sorted):
    # Means for this combination, dropping tests that have no data
    group = pivot_df.loc[comb].dropna()
    # Sort tests by average total query time (ascending)
    sorted_group = group.sort_values()
    # Align the uncertainties with the sorted means. reindex keeps a test whose
    # stderr is NaN; dropping it and indexing by label would raise KeyError.
    sorted_stderr = stderr_df.loc[comb].reindex(sorted_group.index)
    # Plot each bar in the sorted order for this combination with error bars
    for j, test in enumerate(sorted_group.index):
        err = sorted_stderr[test]
        ax.bar(x[i] + j * width, sorted_group[test], width, color=colors[test],
               # No error bar when the standard error is undefined
               yerr=None if pd.isna(err) else err, capsize=5)

# Set x-ticks in the center of each group
# tests_to_include should be defined as a list of tests in the order you want them in the legend
ax.set_xticks(x + width * (len(tests_to_include) - 1) / 2)
ax.set_xticklabels(combinations_sorted)
ax.set_xlabel("Query proportion / Majority proportion")
ax.set_ylabel("Average Total Query Time")
ax.set_title("Average Total Query Time Across Combinations\n(Each Bar is Averaged Over 5 Instances with Uncertainty)")
ax.grid(True, axis='y')

# Create a custom legend (the order is as defined in tests_to_include)
legend_handles = [mpatches.Patch(color=colors[test], label=test) for test in tests_to_include]
ax.legend(handles=legend_handles)

plt.show()
