In [None]:
import os
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Barchart Workload

In [None]:
# Directory (relative to the current working directory) that holds the
# outputs of the benchmark run analysed in this notebook.
RESULTS_PATH = os.curdir + "/results/load-based-N-fields/tpch/2025-03-26-15H"

# Read the per-run results table stored in that directory.
meta_results_csv = RESULTS_PATH + '/meta_results.csv'
results_df = pd.read_csv(meta_results_csv)

# Display the last rows as a quick sanity check of what was loaded.
results_df.tail()

In [None]:
# Count the materialized fields encoded in a stringified Python collection
def count_materialized_fields(s: str):
    """Return the number of elements in the Python literal encoded by ``s``.

    ``s`` is parsed with ``ast.literal_eval`` and ``len`` of the parsed
    object is returned (e.g. ``"['a', 'b']"`` gives ``2``).

    If parsing or taking the length raises, the exception is printed and
    ``None`` is returned instead of propagating the error.
    """
    try:
        parsed = ast.literal_eval(s)
        field_count = len(parsed)
    except Exception as exc:
        # Best-effort: report the problem and fall back to None.
        print(exc)
        return None
    return field_count


# Derive, for every run, the number of materialized fields from the
# stringified "Materialization" column.
results_df["num_materialized_fields"] = results_df["Materialization"].apply(
    count_materialized_fields
)

# Only runs with a strictly positive total query time are considered below.
has_positive_time = results_df["Total query time"] > 0
subset = results_df[has_positive_time]

# Runs whose Test name starts with "load_based_m"
is_load_based = subset["Test"].str.startswith("load_based_m")
load_based = subset[is_load_based]

# Runs whose Test name is exactly "schema_based_materialization"
is_schema_based = subset["Test"] == "schema_based_materialization"
schema_based = subset[is_schema_based]

# Pair load-based rows with the schema-based rows that share the same
# "Query proportion", "Majority proportion" and "Load" values.
merged = load_based.merge(
    schema_based,
    on=["Query proportion", "Majority proportion", "Load"],
    suffixes=('_load', '_schema'),
)
# Total query time of the load-based row divided by that of the matching
# schema-based row.
merged["query_time_ratio"] = merged["Total query time_load"].div(
    merged["Total query time_schema"]
)

# Palette used for the bars; the load-based tests cycle through five colors.
_ORANGE = "#FF7F0E"  # Bright orange for clear visibility
_BLUE = "#1f77b4"
_GREEN = "#2ca02c"
_RED = "#d62728"
_PURPLE = "#9467bd"
_BROWN = "#8c564b"

# Color of each test (adjust as needed). Insertion order matters: it is
# reused as the test order below.
colors = {
    "schema_based_materialization": _ORANGE,

    "no_materialization": _BLUE,
    "load_based_m1": _GREEN,
    "load_based_m2": _RED,
    "load_based_m3": _PURPLE,
    "load_based_m4": _BROWN,

    "load_based_m5": _BLUE,
    "load_based_m6": _GREEN,
    "load_based_m7": _RED,
    "load_based_m8": _PURPLE,
    "load_based_m9": _BROWN,

    "load_based_m10": _BLUE,
    "load_based_m11": _GREEN,
    "load_based_m12": _RED,
    "load_based_m13": _PURPLE,
    "load_based_m14": _BROWN,

    "load_based_m15": _BLUE,
    "load_based_m20": _GREEN,
    "load_based_m25": _RED,
    "load_based_m30": _PURPLE,
    "load_based_m35": _BROWN,
}

# The test order is the key order of the colors dictionary.
test_order = list(colors)


def shorten_label_into_number(test: str):
    """Map a test name to the value used to order it.

    * ``"schema_based_materialization"`` -> ``20``
    * ``"load_based_<x><digits>"`` -> ``int(<digits>)``: the first character
      of the last ``_``-separated segment is dropped and the rest is parsed
      as an integer (``"load_based_m5"`` -> ``5``)
    * ``"no_materialization"`` (case-insensitive) -> ``0``
    * anything else is returned unchanged (i.e. still a string)

    NOTE(review): the schema-based test and ``load_based_m20`` both map to
    ``20``, so this key alone does not order them relative to each other;
    confirm that the tie is intended.
    """
    if test == "schema_based_materialization":
        return 20
    if test.startswith("load_based_"):
        last_segment = test.rsplit("_", 1)[1]
        # Strip the leading letter ("m") and keep the numeric part.
        return int(last_segment[1:])
    if test.lower() == "no_materialization":
        return 0
    return test


# Keep only the runs whose Test name is one of the known tests.
df_current = results_df[results_df["Test"].isin(test_order)].copy()

# Per-test summary statistics of the total query time.
grouped_total_query_time = df_current.groupby("Test")["Total query time"].agg(
    mean="mean", std="std", count="count", max="max", min="min"
).reset_index()
# Standard error of the mean: std / sqrt(count).
grouped_total_query_time["stderr"] = grouped_total_query_time["std"] / \
    np.sqrt(grouped_total_query_time["count"])

# Ordered categorical, so that sorting by "Test" follows test_order.
grouped_total_query_time["Test"] = pd.Categorical(
    grouped_total_query_time["Test"], categories=test_order, ordered=True
)

# Order the rows by the numeric value from shorten_label_into_number.
#
# That key is not unique ("schema_based_materialization" and
# "load_based_m20" both map to 20) and the default sort algorithm is not
# stable, so a single default sort leaves the relative order of tied rows
# unspecified. To make the result deterministic:
#   1. pre-sort by the categorical, i.e. by test_order;
#   2. apply a stable sort on the numeric key, so ties keep test_order.
# The key is computed from the plain strings rather than from the
# categorical column, so it stays a plain numeric column whether or not the
# mapping happens to be one-to-one.
grouped_total_query_time = grouped_total_query_time.sort_values(by="Test")
grouped_total_query_time = grouped_total_query_time.sort_values(
    by="Test",
    key=lambda col: col.astype(str).map(shorten_label_into_number),
    kind="mergesort",
)

# Helper that turns a full test name into a short axis label
def shorten_label(test):
    """Return the short label used for ``test`` on plots.

    * ``"schema_based_materialization"`` -> ``"s16"``
    * ``"load_based_<suffix>"`` -> ``"<suffix>"``, the text after the last
      underscore (``"load_based_m5"`` -> ``"m5"``)
    * ``"no_materialization"`` (case-insensitive) -> ``"m0"``
    * anything else is returned unchanged
    """
    if test == "schema_based_materialization":
        return "s16"
    if test.startswith("load_based_"):
        # Keep only what follows the last underscore.
        return test.rpartition("_")[2]
    if test.lower() == "no_materialization":
        return "m0"
    return test


# Short tick labels, in the same order as the rows (and therefore the bars).
short_labels = [
    shorten_label(test_name)
    for test_name in grouped_total_query_time["Test"]
]

# Store the short label next to each row so rows can be looked up by label.
grouped_total_query_time["test label"] = grouped_total_query_time.apply(
    lambda row: shorten_label(row["Test"]),
    axis=1,
)

# One x position per test, and the width shared by all bars.
x = np.arange(len(grouped_total_query_time))
bar_width = 0.6

# One figure is drawn per entry of this list.
plot_configs = [
    {
        "mode": "errorbar",
        "title": "Average Total Query Time for each Test (with Uncertainty)",
    },
    # {"mode": "minmax",
    #     "title": "Total Query Time for each Test (with Min and Max Values)"}
]

for config in plot_configs:
    fig, ax = plt.subplots(figsize=(12, 6))
    mode = config["mode"]

    if mode == "errorbar":
        # Mean total query time per test, with the standard error as error bar.
        bar_colors = [colors[test] for test in grouped_total_query_time["Test"]]
        ax.bar(
            x,
            grouped_total_query_time["mean"],
            width=bar_width,
            color=bar_colors,
            yerr=grouped_total_query_time["stderr"],
            capsize=5,
        )
    # elif mode == "minmax":
    #     # Plot bars without error bars
    #     ax.bar(
    #         x,
    #         grouped_total_query_time["mean"],
    #         bar_width,
    #         color=[colors[test] for test in grouped_total_query_time["Test"]],
    #         capsize=5
    #     )
    #     # Overlay dashed lines and markers for min and max values
    #     for i, (_, row) in enumerate(grouped_total_query_time.iterrows()):
    #         ax.plot([x[i], x[i]], [row["min"], row["max"]],
    #                 linestyle="--", color="black", linewidth=1.5)
    #         ax.scatter(x[i], row["min"], color="black", marker="v", s=50)
    #         ax.scatter(x[i], row["max"], color="black", marker="^", s=50)

    # Axis decoration shared by every mode.
    ax.set_xticks(x)
    ax.set_xticklabels(short_labels, rotation=45)
    ax.set(
        xlabel="Test",
        ylabel="Average Total Query Time (s)",
        title=config["title"],
    )
    ax.grid(True, axis='y')

In [None]:
def _mean(label):
    """Return the mean total query time for ``label``, rounded to 2 decimals.

    The value is taken from the first row of ``grouped_total_query_time``
    whose "test label" equals ``label``.
    """
    label_matches = grouped_total_query_time["test label"] == label
    means = grouped_total_query_time.loc[label_matches, "mean"]
    return round(means.iloc[0], 2)
def _err(label):
    """Return the standard error for ``label`` formatted as ``"<err>,<err>"``.

    The standard error is read from the first row of
    ``grouped_total_query_time`` whose "test label" equals ``label`` and is
    rounded to 2 decimals; the same rounded value appears on both sides of
    the comma.
    """
    label_matches = grouped_total_query_time["test label"] == label
    stderr_values = grouped_total_query_time.loc[label_matches, "stderr"]
    err = round(stderr_values.iloc[0], 2)
    return f"{err},{err}"

# Build the LaTeX source of a TikZ `axis` bar chart from the aggregated
# statistics. Inside this f-string, doubled braces ({{ and }}) produce literal
# braces, `\\` produces a single backslash, and each {_mean("...")} /
# {_err("...")} placeholder is evaluated when this statement runs.
# The labels are hard-coded: _mean/_err read the first matching row with
# .iloc[0], so every label used below must be present in
# grouped_total_query_time["test label"].
gigantic_f_string = f"""
\\begin{{tikzpicture}}
        \\begin{{axis}}[
            ybar,   
            axis lines=box,
            width=1\\textwidth,   
            xtick pos=bottom,
            ytick pos=left,
            height=6cm,               
            xlabel={{Test}},
            ylabel={{Total Query Time (s)}},
            title={{Load 1: Total Query Time}},
            symbolic x coords={{
                m0, 
                m1, 
                m2,
                m3,
                m4,
                m5,
                m6,
                m7,
                m8,
                m9,
                m10,
                m11,
                m12,
                m13,
                m14,
                m15,
                s16,
                m20,
                m25,
                m30,
                m35
            }},
            xtick={{
                m0, 
                m1, 
                m2,
                m3,
                m4,
                m5,
                m6,
                m7,
                m8,
                m9,
                m10,
                m11,
                m12,
                m13,
                m14,
                m15,
                s16,
                m20,
                m25,
                m30,
                m35
            }},           
            x tick label style={{
                rotate=45,
                anchor=east
            }},
            ymin=0,
            ytick={{0, 250, 500, 750, 1000, 1250, 1500, 1750, 2000}},
            ymajorgrids=true
        ]
        
        \\addplot+[
            draw=none,
            fill=red, 
            bar shift=0pt,
            error bars/.cd,
                y dir=both,
                y explicit,
                error bar style={{black}}
        ] coordinates {{                
            (s16,{_mean("s16")}) +- ({_err("s16")})
        }};

        \\addplot+[
            draw=none,
            fill=green, 
            bar shift=0pt,
            error bars/.cd,
                y dir=both,
                y explicit,
                error bar style={{black}}
        ] coordinates {{  
            (m20,{_mean("m20")}) +- ({_err("m20")})           
            (m25,{_mean("m25")}) +- ({_err("m25")})           
            (m30,{_mean("m30")}) +- ({_err("m30")})           
            (m35,{_mean("m35")}) +- ({_err("m35")})           
        }};
        
      \\addplot+[
        bar shift=0pt,
        draw=none, 
        fill=blue,
        error bars/.cd,
                y dir=both,
                y explicit,
                error bar style={{black}}
      ] coordinates {{
        (m0,{_mean("m0")}) +- ({_err("m0")})
        (m1,{_mean("m1")}) +- ({_err("m1")})
        (m2,{_mean("m2")}) +- ({_err("m2")})
        (m3,{_mean("m3")}) +- ({_err("m3")})
        (m4,{_mean("m4")}) +- ({_err("m4")})
        (m5,{_mean("m5")}) +- ({_err("m5")})
        (m6,{_mean("m6")}) +- ({_err("m6")})
        (m7,{_mean("m7")}) +- ({_err("m7")})
        (m8,{_mean("m8")}) +- ({_err("m8")})
        (m9,{_mean("m9")}) +- ({_err("m9")})
        (m10,{_mean("m10")}) +- ({_err("m10")})
        (m11,{_mean("m11")}) +- ({_err("m11")})
        (m12,{_mean("m12")}) +- ({_err("m12")})
        (m13,{_mean("m13")}) +- ({_err("m13")})
        (m14,{_mean("m14")}) +- ({_err("m14")})
        (m15,{_mean("m15")}) +- ({_err("m15")})
      }};
        \\end{{axis}}
        \\end{{tikzpicture}}
"""
# Print the generated LaTeX source as plain text.
print(gigantic_f_string)

In [None]:
# Display the aggregated per-test statistics (at most the first 50 rows).
grouped_total_query_time.head(50)
