In [198]:
import os
import pandas as pd
import ast
import matplotlib.pyplot as plt
import numpy as np

from testing.tpch import setup as tpch_setup


In [199]:
def load_results_df(results_path=None):
    """Load the ``write_times.csv`` results file into a cleaned DataFrame.

    Parameters
    ----------
    results_path : str or path-like, optional
        Directory that contains ``write_times.csv``. When omitted, the
        directory ``results/single-queries/tpch/2025-03-26-15H`` relative to
        ``os.curdir`` is used.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with:

        * the ``"Unnamed: 0"`` column dropped if present,
        * ``"Write time"`` renamed to ``"Write Time"``,
        * ``"Materialization"`` parsed with ``ast.literal_eval`` (non-string
          cells, e.g. missing values, become an empty list),
        * ``"DB Size"`` parsed from its string form into a tuple of ints.
    """
    if results_path is None:
        results_path = os.path.join(
            os.curdir, "results", "single-queries", "tpch", "2025-03-26-15H"
        )

    results_df = pd.read_csv(os.path.join(results_path, "write_times.csv"))

    # An index column written without a header is read back as "Unnamed: 0".
    if "Unnamed: 0" in results_df.columns:
        results_df = results_df.drop(columns=["Unnamed: 0"])

    # Normalise the column name so it matches the other "... Time" columns.
    results_df = results_df.rename(columns={"Write time": "Write Time"})

    # Parse the stringified materialization list; non-string cells become [].
    results_df["Materialization"] = results_df["Materialization"].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else []
    )

    # Parse the stringified size tuple into a tuple of ints.
    results_df["DB Size"] = results_df["DB Size"].apply(_parse_int_tuple)

    return results_df


def _parse_int_tuple(text):
    """Parse a string such as ``"(1, 2, 3)"`` into a tuple of ints."""
    # Trim surrounding whitespace before removing the parentheses, then split on
    # bare commas: int() ignores padding, so "1, 2" and "1,2" both parse.
    inner = text.strip().strip("()")
    return tuple(int(part) for part in inner.split(",") if part.strip())


In [200]:
def add_number_materialized_fields(df):
    """Add an ``"N Materialized Fields"`` column holding ``len`` of each
    ``"Materialization"`` entry.

    The column is written onto ``df`` itself, and that same object is returned.
    """
    field_counts = df["Materialization"].apply(len)
    df["N Materialized Fields"] = field_counts
    return df




In [201]:
def plot_scatter(x, y, title, xlabel, ylabel, colorby=None, cmap='tab20', alpha=0.5, figsize=(12, 8)):
    """Create a scatter plot of ``y`` against ``x`` with optional color coding.

    Parameters
    ----------
    x, y : array-like
        Point coordinates, passed straight to ``Axes.scatter``.
    title, xlabel, ylabel : str
        Text for the axes title and the two axis labels.
    colorby : array-like, optional
        One value per point. Values are converted to a pandas categorical; each
        point is colored by its category code and a colorbar labelled with the
        category values is added.
    cmap : str
        Colormap used when ``colorby`` is given.
    alpha : float
        Marker transparency.
    figsize : tuple
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    (matplotlib.figure.Figure, matplotlib.axes.Axes)
        The new figure and the axes holding the scatter plot.
    """
    fig, ax = plt.subplots(figsize=figsize)

    if colorby is not None:
        # Take the codes and the tick labels from the same categorical so each
        # label always lines up with its code. Missing values get code -1 and
        # are not listed among the categories.
        categorical = pd.Series(colorby).astype('category')
        codes = categorical.cat.codes
        labels = categorical.cat.categories

        scatter = ax.scatter(x, y, c=codes, alpha=alpha, cmap=cmap)

        # One colorbar tick per category, labelled with the category value.
        colorbar = fig.colorbar(scatter, ax=ax)
        colorbar.set_ticks(list(range(len(labels))))
        colorbar.set_ticklabels([str(label) for label in labels])
    else:
        ax.scatter(x, y, alpha=alpha)

    # Labels and title
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)

    # Dashed background grid
    ax.grid(True, linestyle='--', alpha=0.7)

    fig.tight_layout()

    return fig, ax

# Main

In [None]:
# Load the results file, then add the per-row count of materialized fields.
results_df = add_number_materialized_fields(load_results_df())

# Bare last expression so the notebook renders the frame.
results_df

In [None]:

# Materialization time against the number of materialized fields.
fig, ax = plot_scatter(
    x=results_df["N Materialized Fields"],
    y=results_df["Materialization Time"],
    title="Mat time / N mat fields",
    xlabel="N mat fields",
    ylabel="Mat time",
)

In [None]:
# Write time against the number of materialized fields.
fig, ax = plot_scatter(
    results_df["N Materialized Fields"],
    results_df["Write Time"],
    "Write time / N mat fields",
    "N mat fields",
    # The y-axis shows the "Write Time" column, so label it as write time.
    "Write time"
)

In [None]:
# Element at index 2 of each "DB Size" tuple against the number of
# materialized fields.
fig, ax = plot_scatter(
    x=results_df["N Materialized Fields"],
    y=results_df["DB Size"].apply(lambda size: size[2]),
    title="DB Size / N mat fields",
    xlabel="N mat fields",
    ylabel="DB Size (bytes)",
)