In [None]:
import os
import pandas as pd
import ast
import matplotlib.pyplot as plt
import numpy as np

from testing.tpch import setup as tpch_setup

# Global matplotlib text settings: use the cmr10 font at size 16 for every figure.
plt.rcParams.update({
    "font.family": "cmr10",
    "font.serif": ["cmr10"],
    "font.size": 16,
})

In [None]:
# Scale factor applied to every scalable table size below.
SCALE_FACTOR = 2

# Size of each TPC-H table at the configured scale factor.
# NOTE(review): the base numbers match the TPC-H SF-1 row cardinalities, so
# these are presumably row counts — confirm.
TPCH_TABLE_SIZES = {
    "lineitem": int(6_000_000 * SCALE_FACTOR),
    "orders": int(1_500_000 * SCALE_FACTOR),
    "partsupp": int(800_000 * SCALE_FACTOR),
    "part": int(200_000 * SCALE_FACTOR),
    "customer": int(150_000 * SCALE_FACTOR),
    "supplier": int(10_000 * SCALE_FACTOR),
    "nation": 25,  # Not scaled
    "region": 5,   # Not scaled
}

# Combined size of all tables, computed once rather than once per entry.
_TPCH_TOTAL_SIZE = sum(TPCH_TABLE_SIZES.values())

# Fraction of the combined size contributed by each table. Derived directly
# from TPCH_TABLE_SIZES so the two mappings always cover the same tables
# (key order follows TPCH_TABLE_SIZES).
TPCH_TABLE_PROPORTIONS = {
    table: size / _TPCH_TOTAL_SIZE
    for table, size in TPCH_TABLE_SIZES.items()
}

In [None]:
# Column-name prefix (the text before the first underscore) -> TPC-H table name.
table_prefix_to_name = {
    "ps": "partsupp",
    "c": "customer",
    "l": "lineitem",
    "n": "nation",
    "o": "orders",
    "p": "part",
    "r": "region",
    "s": "supplier",
}


def get_table_name(field):
    """Return the table a field belongs to, based on its prefix.

    The prefix is everything before the first underscore in ``field`` (the
    whole string when there is no underscore). Prefixes missing from
    ``table_prefix_to_name`` yield the string "Unknown".
    """
    prefix, _, _ = field.partition("_")
    if prefix in table_prefix_to_name:
        return table_prefix_to_name[prefix]
    return "Unknown"

def get_table_size(table):
    """Look up ``table`` in ``TPCH_TABLE_SIZES``.

    Raises:
        KeyError: if ``table`` is not a key of ``TPCH_TABLE_SIZES``.
    """
    size = TPCH_TABLE_SIZES[table]
    return size

def get_table_proportions(table_name):
    """Return the entry of ``TPCH_TABLE_PROPORTIONS`` for ``table_name``.

    Tables that are not present in the mapping yield 0.
    """
    if table_name in TPCH_TABLE_PROPORTIONS:
        return TPCH_TABLE_PROPORTIONS[table_name]
    return 0

In [None]:
def load_write_load_df(results_path=None):
    """Load ``write_times.csv`` and label every row with a "Document Table".

    For each row, the first column holding a non-null value is located, the
    text before the first underscore of that column name is taken as a prefix,
    and the prefix is translated with ``get_table_name``.

    Args:
        results_path: Directory containing ``write_times.csv``. Defaults to
            ``./results/write-performance/final/``.

    Returns:
        The loaded DataFrame with an added "Document Table" column. Rows
        without any populated column, or whose prefix is not recognised by
        ``get_table_name``, are labelled "Unknown".
    """
    if results_path is None:
        results_path = f"{os.curdir}/results/write-performance/final/"
    write_load_df = pd.read_csv(os.path.join(results_path, "write_times.csv"))

    # NOTE(review): this presumes the populated columns are TPC-H field names
    # such as "l_orderkey"; verify against the actual CSV layout.
    def first_populated_prefix(row):
        # The "" default keeps an all-null row from leaking StopIteration out
        # of DataFrame.apply; get_table_name("") resolves to "Unknown".
        return next(
            (col.split('_')[0] for col in row.index if pd.notna(row[col])),
            "",
        )

    prefixes = write_load_df.apply(first_populated_prefix, axis=1)
    write_load_df["Document Table"] = prefixes.map(get_table_name)

    return write_load_df

In [None]:
def add_column_types(results_df):
    """Add a "Column Type" column derived from each row's "Test" value.

    The value is ``tpch_setup.COLUMN_MAP[test]["type"]`` when the test name is
    a key of ``COLUMN_MAP`` and "Unknown" otherwise.

    The frame is modified in place and also returned for chaining.
    """
    def column_type_for(test):
        if test in tpch_setup.COLUMN_MAP:
            return tpch_setup.COLUMN_MAP[test]["type"]
        return "Unknown"

    # Only the "Test" column is needed, so map over that Series instead of
    # building a Series object for every row with DataFrame.apply(axis=1).
    results_df["Column Type"] = results_df["Test"].map(column_type_for)

    return results_df

def add_field_frequency_in_TPCH(results_df):
    """Add a "Field Frequency" column computed from each row's "Table Name".

    Each table name is passed through ``get_field_frequency_in_TPCH``. The
    frame is modified in place and also returned for chaining.
    """
    # Map over the single input column rather than applying row by row.
    results_df["Field Frequency"] = results_df["Table Name"].map(get_field_frequency_in_TPCH)
    return results_df

def get_field_frequency_in_TPCH(table_name):
    """Return the proportion that ``get_table_proportions`` reports for ``table_name``."""
    frequency = get_table_proportions(table_name)
    return frequency

def add_table_name(results_df):
    """Add a "Table Name" column by passing each "Test" value to ``get_table_name``.

    The frame is modified in place and also returned for chaining.
    """
    # Map over the single input column rather than applying row by row.
    results_df["Table Name"] = results_df["Test"].map(get_table_name)
    return results_df

def add_table_size(results_df):
    """Add a "Table Size" column by passing each "Table Name" to ``get_table_size``.

    The frame is modified in place and also returned for chaining. Any
    exception raised by ``get_table_size`` for a table name propagates.
    """
    # Map over the single input column rather than applying row by row.
    results_df["Table Size"] = results_df["Table Name"].map(get_table_size)
    return results_df



In [None]:
def _parse_int_tuple(text):
    """Parse a stringified tuple such as ``"(12, 3456)"`` into a tuple of ints.

    Whitespace around the parentheses and around each element is tolerated, so
    both ``"(1, 2)"`` and ``"(1,2)"`` are accepted.
    """
    parts = text.strip().strip("()").split(",")
    return tuple(int(part) for part in parts if part.strip())


def load_results_df(results_path=None):
    """Load ``write_times.csv`` and normalise its columns for analysis.

    Steps performed:
      * drop the "Unnamed: 0" column when present;
      * rename "Write time" to "Write Time";
      * parse "Materialization" from its string form into a Python object
        (non-string cells, e.g. missing values, become an empty list);
      * parse "DB Size Before" / "DB Size After" into tuples of ints;
      * add "DB Size Before N Blocks" and "DB Size After N Blocks" (the first
        element of the respective tuple) and their difference,
        "DB Size N Blocks Increase".

    Args:
        results_path: Directory containing ``write_times.csv``. Defaults to
            ``./results/write-performance/final/``.

    Returns:
        The cleaned DataFrame.
    """
    if results_path is None:
        results_path = f"{os.curdir}/results/write-performance/final/"
    results_df = pd.read_csv(os.path.join(results_path, "write_times.csv"))

    # Clean up any unnamed columns
    results_df = results_df.drop(columns=["Unnamed: 0"], errors="ignore")

    # Change name of Write time to Write Time
    results_df = results_df.rename(columns={"Write time": "Write Time"})

    # Parse previous materializations into list
    results_df["Materialization"] = results_df["Materialization"].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else []
    )

    # Parse both DB size columns with the same helper and expose the first
    # tuple element as the "N Blocks" value.
    for state in ("Before", "After"):
        size_col = f"DB Size {state}"
        results_df[size_col] = results_df[size_col].apply(_parse_int_tuple)
        results_df[f"{size_col} N Blocks"] = results_df[size_col].apply(lambda size: size[0])

    results_df["DB Size N Blocks Increase"] = (
        results_df["DB Size After N Blocks"] - results_df["DB Size Before N Blocks"]
    )

    return results_df


def add_number_materialized_fields(df):
    """Add an "N Materialized Fields" column holding ``len()`` of each "Materialization" entry.

    The frame is modified in place and also returned for chaining.
    """
    field_counts = df["Materialization"].map(len)
    df["N Materialized Fields"] = field_counts
    return df


# Plot functions

In [None]:
def plot_scatter(x, y, title, xlabel, ylabel, colorby=None, cmap='tab10', alpha=0.5, figsize=(12, 8)):
    """Create a scatter plot with optional color coding.

    Args:
        x, y: Values for the horizontal and vertical axes.
        title, xlabel, ylabel: Text for the title and axis labels.
        colorby: Optional pandas Series; points are coloured by its distinct
            values and a colorbar labelled with those values is added.
        cmap: Colormap used when ``colorby`` is given.
        alpha: Marker opacity.
        figsize: Figure size passed to matplotlib.

    Returns:
        ``(fig, ax)`` for the created figure and its plotting axes.
    """
    fig, ax = plt.subplots(figsize=figsize)

    if colorby is not None:
        categorical = colorby.astype('category')
        points = ax.scatter(x, y, c=categorical.cat.codes, alpha=alpha, cmap=cmap)

        # Bind the colorbar to this scatter explicitly and take the labels
        # from the same categorical that produced the codes, so tick i always
        # names category i. Missing values get pandas' code -1 and therefore
        # have no labelled tick.
        colorbar = fig.colorbar(points, ax=ax)
        labels = [str(category) for category in categorical.cat.categories]
        colorbar.set_ticks(range(len(labels)))
        colorbar.set_ticklabels(labels)
    else:
        ax.scatter(x, y, alpha=alpha)

    # Add labels and title
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)

    # Add grid
    ax.grid(True, linestyle='--', alpha=0.7)

    # Adjust layout
    fig.tight_layout()

    return fig, ax

In [None]:
def plot_violin(data_dict, title, xlabel, ylabel, figsize=(12, 8), showmeans=True):
    """Create a violin plot from a dictionary of data.

    Args:
        data_dict: Mapping of group label -> sequence of values; one violin is
            drawn per entry, in the mapping's iteration order.
        title, xlabel, ylabel: Text for the title and axis labels.
        figsize: Figure size passed to matplotlib.
        showmeans: Forwarded to ``violinplot``.

    Returns:
        ``(fig, ax)`` for the created figure and its plotting axes.
    """
    fig, ax = plt.subplots(figsize=figsize)

    group_names = list(data_dict.keys())
    ax.violinplot(list(data_dict.values()), showmeans=showmeans)

    # Violins are drawn at positions 1..N, so the ticks start at 1
    ax.set_xticks(range(1, len(group_names) + 1))
    ax.set_xticklabels(group_names, rotation=45, ha='right')

    # Add labels and title
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)

    # Annotate each violin with its sample count, placed just below the
    # current upper y-limit.
    ymin, ymax = ax.get_ylim()
    text_y = ymax - (ymax - ymin) * 0.02
    for position, values in enumerate(data_dict.values(), 1):
        ax.text(position, text_y, f'n={len(values)}', ha='center', va='top')

    # Adjust layout
    fig.tight_layout()

    return fig, ax

# Main

In [None]:
# Load the results and attach the number of materialized fields per row
results_df = add_number_materialized_fields(load_results_df())

# Keep rows whose Test starts with "frequ" or is exactly "no_materialization"
is_frequency_test = results_df["Test"].str.startswith("frequ")
is_no_materialization = results_df["Test"] == "no_materialization"
results_df = results_df[is_frequency_test | is_no_materialization]

In [None]:
write_load_df = load_write_load_df()
write_load_table_df = write_load_df["Document Table"]

# A small problem here is that the distribution is completely uniform, which
# makes it very hard to see the effect of frequency.
write_load_count = write_load_table_df.value_counts()


In [None]:
# Display the results frame for visual inspection
results_df

# DB Size

In [None]:
# Database size after each run versus number of materialized fields, coloured by Load
fig, ax = plot_scatter(
    x=results_df["N Materialized Fields"],
    y=results_df["DB Size After N Blocks"],
    title="Database Size / Number of Materialized Fields",
    xlabel="Number of Materialized Fields",
    ylabel="Database Size (Data Blocks)",
    colorby=results_df["Load"],
)

# Materialization time

In [None]:
# Materialization time versus number of materialized fields
fig, ax = plot_scatter(
    x=results_df["N Materialized Fields"],
    y=results_df["Materialization Time"],
    title="Time to Materialize / Number of Materialized Fields",
    xlabel="Number of Materialized Fields",
    ylabel="Materialization Time (s)",
)

# Write time

In [None]:
# Write time versus number of materialized fields
fig, ax = plot_scatter(
    x=results_df["N Materialized Fields"],
    y=results_df["Write Time"],
    title="Time to Write 5000 Documents / Number of Materialized Fields",
    xlabel="Number of Materialized Fields",
    ylabel="Write Time (s)",
)