In [None]:
import matplotlib as mpl
# Select the PGF backend before pyplot is imported; figures are later saved
# as .pgf files.
mpl.use("pgf")
mpl.rcParams.update({
    # TeX engine used by the PGF backend.
    "pgf.texsystem": "lualatex",
    # "pgf.preamble": "\n".join([
    #      r"\usepackage[utf8x]{inputenc}",
    #      r"\usepackage[T1]{fontenc}",
    # ]),
})

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import matplotlib.cm as cm
import pickle
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.optimize import curve_fit
from scipy.interpolate import interp1d
from scipy.ndimage import gaussian_filter1d
import scienceplots

# Apply the "science" style together with "no-latex"; presumably both are
# registered by the scienceplots import above — TODO confirm.
plt.style.use(["science", "no-latex"])

In [None]:
# Upper bound on the rows and columns kept from each execution-time matrix;
# the plotting cells slice results as [0:max_dim, 0:max_dim].
max_dim = 25

In [None]:
# Maps each pruning method to the hash that names its results directory,
# i.e. ../experiment_results/<target>/<experiment>/<hash>/execution_times.npy.
pruning_config_hashes = {
    "none": "fdda88f15729a5f42b5c1dd5e2f95aaf",
    "each-operation": "374102aefbd5748a5ac334ca9be92bd1",
    "each-step": "048896136dd1358b8ce8269be3283ff4",
    "on-finish": "73c4344e9c575c9f83c4d7f6eaca3cf1",
}

In [None]:
# Display names of the benchmarked systems, keyed by the target identifier
# that is also used in the results directory layout.
target_labels = dict(
    postgres="DuBio (Temp. Table)",
    postgres_inmemory="DuBio (In Mem.)",
    databricks="Doubtless",
)

def get_target_label(target):
    """Return the display name for ``target``; raises KeyError if it is unknown."""
    label = target_labels[target]
    return label

def get_experiment_label(experiment):
    """Return the upper-cased experiment name wrapped in a LaTeX ``\\texttt`` command."""
    upper_name = experiment.upper()
    return "\\texttt{" + upper_name + "}"

# Legend text for each pruning method identifier.
prune_labels = dict([
    ("none", "None"),
    ("each-step", "Each Step"),
    ("each-operation", "Each Operation"),
    ("on-finish", "On Finish"),
])

def get_prune_label(prune_method):
    """Return the legend text for ``prune_method``; raises KeyError if it is unknown."""
    label = prune_labels[prune_method]
    return label

In [None]:
targets = ["postgres", "postgres_inmemory", "databricks"]
experiments = ["count", "sum", "avg", "min", "max"]

# Nested lookup filled below: experiment_results[target][experiment][prune_method]
experiment_results = {}

for target in targets:
    experiment_results[target] = {name: {} for name in experiments}
    for experiment in experiments:
        for prune_method, hashname in pruning_config_hashes.items():
            results_path = f"../experiment_results/{target}/{experiment}/{hashname}/execution_times.npy"
            # NOTE: allow_pickle=True unpickles object arrays, which can run
            # arbitrary code; only load result files you produced yourself.
            execution_times = np.load(results_path, allow_pickle=True)
            experiment_results[target][experiment][prune_method] = execution_times

experiment_results

In [None]:
def draw_timeout(ax, resize=True, timeout_seconds=15 * 60):
    """Mark the experiment timeout with a dotted horizontal line labelled "Timeout".

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to annotate.
    resize : bool
        When true, raise the upper y-limit to five times the timeout so the
        label has room above the line; the lower limit is kept as it is.
    timeout_seconds : float
        Y-value of the timeout line. Defaults to 15 minutes (900 seconds).
    """
    ax.axhline(timeout_seconds, linestyle=(0, (1, 3)), color="black")
    # The y-axis transform takes x in axes coordinates and y in data
    # coordinates, so the label stays near the left edge whatever the x-limits.
    ax.text(0.02, timeout_seconds * 1.15, "Timeout", ha="left", va="bottom",
            transform=ax.get_yaxis_transform())

    if resize:
        ylim_bottom, _ = ax.get_ylim()
        ax.set_ylim(ylim_bottom, timeout_seconds * 5)

def draw_legend(ax, name, resize=True, loc="lower right", fontsize=None):
    """Add a titled legend to ``ax`` and optionally widen the x-range for it.

    ``fontsize`` is applied to both the entries and the legend title. With
    ``resize`` enabled the right x-limit is multiplied by 100 (two decades on
    a log axis); the left limit is left unchanged.
    """
    ax.legend(title=name, loc=loc, fontsize=fontsize, title_fontsize=fontsize)

    if not resize:
        return

    left, right = ax.get_xlim()
    ax.set_xlim(left, right * 10 ** 2)

## Order w.r.t Alternatives (Mappings)

In [None]:
def draw_order(ax, execution_times, prune_method, fixed_variable):
    """Plot one row of an execution-time matrix against its 1-based column index.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    execution_times : numpy.ndarray
        2-D float matrix; NaN marks a missing measurement.
    prune_method : str
        Legend label for the line.
    fixed_variable : int
        1-based number of the row to plot. Nothing is drawn when the matrix
        has no such row.

    Notes
    -----
    The title is built from the notebook-level ``target`` variable, so this
    must be called while the surrounding loop has ``target`` bound.
    """
    # Row number `fixed_variable` lives at index fixed_variable - 1, so it
    # exists as soon as the matrix has at least that many rows. Values below 1
    # are rejected rather than wrapping around to a row from the end.
    if not 1 <= fixed_variable <= len(execution_times):
        return
    times = execution_times[fixed_variable - 1]
    xrange = np.arange(1, len(times) + 1)

    measured = ~np.isnan(times)
    line, = ax.plot(xrange[measured], times[measured], label=prune_method)
    ax.set_title(f"{get_target_label(target)} Aggregations")

    # The cumulative count of measured points reaches its maximum at the
    # last measured index (argmax returns the first occurrence).
    last_measured = measured.cumsum().argmax()
    if last_measured != times.shape[0] - 1:
        # Experiments were aborted due to timeout, add an X
        ax.plot(xrange[last_measured], times[last_measured], "-x", color=line.get_color())

In [None]:
%matplotlib inline

variables = 8 # Aka "tuples"

# One figure per (target, aggregation): the matrix row for `variables` random
# variables, drawn once per pruning method.
for target in targets:
    for experiment in experiments:
        fig, ax = plt.subplots(figsize=(4,3))
        # Reversed dict order, so "none" is drawn last.
        for prune_method in reversed(list(pruning_config_hashes.keys())):
            execution_times = np.array(
                experiment_results[target][experiment][prune_method],
                dtype=float
            )[0:max_dim, 0:max_dim]

            # Skip configurations whose matrix is entirely NaN.
            if np.all(np.isnan(execution_times)):
                continue

            draw_order(ax, execution_times, prune_method, variables)

        #draw_timeout(ax, resize=False)
        draw_legend(ax, "Pruning Method", resize=False, loc="best")

        # Lay out this figure once it is fully drawn. Calling
        # plt.tight_layout() before plt.subplots() acted on a stray figure.
        fig.tight_layout()

        print(target)
        display(fig)
        # fig.savefig(f"../figures/agg_scatter_{target}_{experiment}.pgf", bbox_inches="tight")
        plt.close(fig)

## Order w.r.t Variables (Tuples)

In [None]:
%matplotlib inline

alternatives = 3 # Aka "mappings"

# One figure per (target, aggregation): the matrix column for `alternatives`
# alternatives (taken as a row of the transpose), drawn once per pruning method.
for target in targets:
    for experiment in experiments:
        fig, ax = plt.subplots(figsize=(4,3))
        # Reversed dict order, so "none" is drawn last.
        for prune_method in reversed(list(pruning_config_hashes.keys())):
            execution_times = np.array(
                experiment_results[target][experiment][prune_method],
                dtype=float
            )[0:max_dim, 0:max_dim]

            # Skip configurations whose matrix is entirely NaN.
            if np.all(np.isnan(execution_times)):
                continue

            draw_order(ax, np.transpose(execution_times), prune_method, alternatives)

        #draw_timeout(ax, resize=False)
        draw_legend(ax, "Pruning Method", resize=False, loc="best")

        # Lay out this figure once it is fully drawn. Calling
        # plt.tight_layout() before plt.subplots() acted on a stray figure.
        fig.tight_layout()

        print(target)
        display(fig)
        # fig.savefig(f"../figures/agg_scatter_{target}_{experiment}.pgf", bbox_inches="tight")
        plt.close(fig)

## Combined Aggregation Plots w.r.t Alternatives

In [None]:
variables = 8 # Aka "tuples"

# One figure per target: for every aggregation, the unpruned runtime of the
# row with `variables` random variables, plotted over alternatives per variable.
for target in targets:
    fig, ax = plt.subplots(figsize=(4,3))
    for experiment in experiments:
        execution_times = np.array(
            experiment_results[target][experiment]["none"],
            dtype=float
        )[0:10, 0:10]

        # Skip aggregations whose matrix is entirely NaN.
        if np.all(np.isnan(execution_times)):
            continue

        # The row is selected by `variables`. The previous code passed
        # `alternatives`, a value leaked from an earlier cell, and therefore
        # plotted a different row than the one configured above.
        draw_order(ax, execution_times, get_experiment_label(experiment), variables)

    #draw_timeout(ax, resize=False)
    draw_legend(ax, "Aggregation", resize=False, loc="best")
    ax.set_xlabel("Alternatives per Variable")
    ax.set_ylabel("Computation Time (Seconds)")

    # Lay out this figure once it is fully drawn. Calling
    # plt.tight_layout() before plt.subplots() acted on a stray figure.
    fig.tight_layout()

    print(target)
    display(fig)
    fig.savefig(f"../figures/agg_combined_across_alternatives_{target}.pgf", bbox_inches="tight")
    plt.close(fig)

## Combined Aggregation Plots w.r.t Variables

In [None]:
alternatives = 3 # Aka "mappings"

# One figure per target: for every aggregation, the unpruned runtime of the
# column with `alternatives` alternatives, plotted over the number of variables.
for target in targets:
    fig, ax = plt.subplots(figsize=(4,3))
    for experiment in experiments:
        execution_times = np.array(
            experiment_results[target][experiment]["none"],
            dtype=float
        )[0:10, 0:10]

        # Skip aggregations whose matrix is entirely NaN.
        if np.all(np.isnan(execution_times)):
            continue

        draw_order(ax, np.transpose(execution_times), get_experiment_label(experiment), alternatives)

    #draw_timeout(ax, resize=False)
    draw_legend(ax, "Aggregation", resize=False, loc="best")
    ax.set_xlabel("Random Variables")
    ax.set_ylabel("Computation Time (Seconds)")

    # Lay out this figure once it is fully drawn. Calling
    # plt.tight_layout() before plt.subplots() acted on a stray figure.
    fig.tight_layout()

    print(target)
    display(fig)
    fig.savefig(f"../figures/agg_combined_across_variables_{target}.pgf", bbox_inches="tight")
    plt.close(fig)

## Combined Line Plots

In [None]:
def draw_scatter(ax, execution_times, prune_method):
    """Scatter every measured time against its number of possible worlds.

    Axis 0 of ``execution_times`` is treated as the number of random variables
    (1-based) and axis 1 as the number of alternatives per variable (1-based);
    the world count of a cell is ``alternatives ** variables``. Marker areas
    grow with the square of the number of alternatives.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on; both axes are switched to log scale.
    execution_times : numpy.ndarray
        2-D float matrix of runtimes.
    prune_method : str
        Legend label for the scatter series.

    Notes
    -----
    The title is built from the notebook-level ``target`` and ``experiment``
    variables.
    """
    # Float grids: integer powers such as 25 ** 25 overflow int64 silently.
    xrange = np.arange(1, execution_times.shape[0] + 1, dtype=np.float64)
    yrange = np.arange(1, execution_times.shape[1] + 1, dtype=np.float64)

    # indexing="ij" gives the grids the same (row, column) layout as
    # execution_times, so the flattened arrays pair up element by element
    # (the default "xy" layout is the transpose).
    X, Y = np.meshgrid(xrange, yrange, indexing="ij")
    world_counts = (Y ** X).flatten()
    flat_execution_times = execution_times.flatten()
    areas = Y.flatten() ** 2

    ax.set_title(f"{target}\n{experiment}")
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.scatter(world_counts, flat_execution_times, s=areas, label=prune_method)

def draw_lines(ax, execution_times, prune_method):
    """Plot every row of the matrix as its own line over possible worlds.

    Row ``i`` is treated as the runs with ``i + 1`` random variables and is
    plotted against ``alternatives ** (i + 1)`` for
    ``alternatives = 1..len(row)``, the same convention the other world-count
    plots in this notebook use. All lines share one colour and only the first
    carries the legend label.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on; both axes are switched to log scale.
    execution_times : numpy.ndarray
        2-D float matrix of runtimes.
    prune_method : str
        Legend label for the group of lines.

    Notes
    -----
    The title is built from the notebook-level ``target`` and ``experiment``
    variables.
    """
    ax.set_title(f"{target}\n{experiment}")
    ax.set_xscale("log")
    ax.set_yscale("log")

    color = None
    for i, times in enumerate(execution_times):
        variables = i + 1
        # Sized from the row itself so x and y always have the same length;
        # float dtype because integer powers overflow int64 silently.
        alternatives = np.arange(1, len(times) + 1, dtype=np.float64)
        world_counts = alternatives ** variables

        kwargs = {
            "alpha": 0.8,
        }

        if i == 0:
            line, = ax.plot(world_counts, times, label=prune_method, **kwargs)
            color = line.get_color()
        else:
            ax.plot(world_counts, times, color=color, **kwargs)


def draw_avg_banded(ax, execution_times, prune_method,
               sigma=1.5, num_points=150):
    """Plot the geometric-mean runtime over possible worlds with a percentile band.

    Each row of ``execution_times`` is one run: row ``i`` is paired with the
    world counts ``alternatives ** (i + 1)`` for ``alternatives = 1..len(row)``.
    All runs are linearly interpolated onto a shared x-grid, averaged in log10
    space, optionally smoothed, and drawn on log-log axes together with a
    shaded percentile band. If the mean curve ends before the last grid point,
    its last finite point is marked with an "x".

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    execution_times : array-like of float, 2-D
        Runtimes; NaN marks a missing measurement.
    prune_method : str
        Legend label for the mean line.
    sigma: float
        Standard deviation (in grid-point units) for Gaussian smoothing of the
        **mean** line; the band edges use ``0.8 * sigma``. 0 = no smoothing.
    num_points : int
        Resolution of the common x-grid.

    Notes
    -----
    The title is built from the notebook-level ``experiment`` variable.
    """
    def nan_aware_smooth(values, width):
        # Normalised convolution: NaN samples get zero weight instead of
        # turning every output within the kernel radius into NaN, which would
        # shorten the curve and shift the timeout marker to the left.
        finite = ~np.isnan(values)
        weights = gaussian_filter1d(finite.astype(np.float64), sigma=width, mode='nearest')
        blurred = gaussian_filter1d(np.where(finite, values, 0.0), sigma=width, mode='nearest')
        smoothed = np.full(values.shape, np.nan)
        smoothed[finite] = blurred[finite] / weights[finite]
        return smoothed

    # ------------------------------------------------------------------ #
    # 1. Collect (x, y) pairs for every individual run
    # ------------------------------------------------------------------ #
    data = []
    for i, times in enumerate(execution_times):
        times = np.asarray(times)
        variables = i + 1
        xrange = np.arange(1, len(times) + 1, dtype=np.float64)
        world_counts = xrange ** variables
        data.append((world_counts, times))

    if not data:
        return

    # ------------------------------------------------------------------ #
    # 2. Common x-grid (log-spaced because world_counts grow exponentially)
    # ------------------------------------------------------------------ #
    all_x = np.concatenate([x for x, _ in data])
    x_min, x_max = all_x.min(), all_x.max()
    if x_min <= 0 or x_min == x_max:
        x_grid = np.linspace(x_min, x_max, num_points)
    else:
        x_grid = np.logspace(np.log10(x_min), np.log10(x_max), num_points)
        # 10 ** log10(x) can land one ulp outside [x_min, x_max]; the
        # interpolators would then report NaN at the very first/last point.
        x_grid[0], x_grid[-1] = x_min, x_max

    # ------------------------------------------------------------------ #
    # 3. Interpolate every run onto the grid (linear, no extrapolation)
    # ------------------------------------------------------------------ #
    interps = [
        interp1d(x, y, kind='linear', bounds_error=False, fill_value=np.nan)
        for x, y in data
    ]
    y_grid = np.array([f(x_grid) for f in interps])   # (n_runs, n_grid)

    # ------------------------------------------------------------------ #
    # 4. Statistics (ignore NaNs)
    # ------------------------------------------------------------------ #
    # Only reduce over grid points where at least one run has a value; this
    # yields the same numbers without the all-NaN-slice warnings.
    has_data = ~np.isnan(y_grid).all(axis=0)
    y_mean = np.full(x_grid.shape, np.nan)
    y_lower = np.full(x_grid.shape, np.nan)
    y_upper = np.full(x_grid.shape, np.nan)
    if has_data.any():
        observed = y_grid[:, has_data]
        # Geometric mean: average in log10 space, then transform back.
        y_mean[has_data] = np.power(10, np.nanmean(np.log10(observed), axis=0))
        # NOTE(review): the band is asymmetric (5th to 90th percentile) —
        # confirm that this is intended.
        y_lower[has_data] = np.nanpercentile(observed, 5, axis=0)
        y_upper[has_data] = np.nanpercentile(observed, 90, axis=0)

    # ------------------------------------------------------------------ #
    # 5. SMOOTHING
    # ------------------------------------------------------------------ #
    if sigma > 0:
        y_mean = nan_aware_smooth(y_mean, sigma)
        y_lower = nan_aware_smooth(y_lower, sigma * 0.8)
        y_upper = nan_aware_smooth(y_upper, sigma * 0.8)

    # ------------------------------------------------------------------ #
    # 6. Plot
    # ------------------------------------------------------------------ #
    line, = ax.plot(x_grid, y_mean, label=prune_method)
    color = line.get_color()
    ax.fill_between(x_grid, y_lower, y_upper, color=color, alpha=0.25)

    finite_indices = np.flatnonzero(~np.isnan(y_mean))
    if finite_indices.size and finite_indices[-1] != y_mean.shape[0] - 1:
        # Experiments were aborted due to timeout, add an X
        last_finite = finite_indices[-1]
        ax.plot(x_grid[last_finite], y_mean[last_finite], "-x", color=color)

    ax.set_title(f"{get_experiment_label(experiment)} Aggregation")
    ax.set_xscale("log")
    ax.set_yscale("log")

In [None]:
%matplotlib inline

# One figure per (target, aggregation): smoothed mean runtime with a
# percentile band for every pruning method, plus the timeout line.
for target in targets:
    for i, experiment in enumerate(experiments):
        fig, ax = plt.subplots(figsize=(4,3))
        # Reversed dict order, so "none" is drawn last.
        for prune_method in reversed(list(pruning_config_hashes.keys())):
            execution_times = np.array(
                experiment_results[target][experiment][prune_method],
                dtype=float
            )[0:max_dim, 0:max_dim]

            # Skip configurations whose matrix is entirely NaN.
            if np.all(np.isnan(execution_times)):
                continue

            draw_avg_banded(ax, execution_times, get_prune_label(prune_method))

        # The logarithm base is written as a subscript.
        ax.set_xlabel("Possible Worlds ($\\log_{10}$)")
        ax.set_ylabel("Computation Time (Seconds, $\\log_{10}$)")

        draw_timeout(ax)
        # Only the first aggregation's figure carries the legend.
        if i == 0:
            draw_legend(ax, "Pruning Method", fontsize=9)

        # Lay out this figure once it is fully drawn. Calling
        # plt.tight_layout() before plt.subplots() acted on a stray figure.
        fig.tight_layout()

        print(target)
        display(fig)
        # fig.savefig(f"../figures/agg_scatter_{target}_{experiment}.pgf", bbox_inches="tight")
        plt.close(fig)

## Individual 3D Plots

In [None]:
def log_tick_formatter(val, pos=None):
    """Format a log10 tick value as a power-of-ten math string, e.g. ``$10^{3}$``.

    ``val`` is truncated towards zero to an integer; ``pos`` is accepted for
    the matplotlib ``FuncFormatter`` call signature and is not used.
    """
    exponent = int(val)
    return "$10^{" + str(exponent) + "}$"

# One figure per (target, aggregation) with one 3D surface per pruning method:
# log10 runtime over (variables, alternatives per variable).
for target in targets:
    for experiment in experiments:
        fig, axs = plt.subplots(
            nrows=1,
            # One panel per pruning method instead of a hard-coded count.
            ncols=len(pruning_config_hashes),
            figsize=(18,4),
            subplot_kw=dict(projection="3d"),
            squeeze=False,
        )
        fig.suptitle(f"{target}\n{experiment}")
        for ax, prune_method in zip(axs[0], pruning_config_hashes.keys()):
            execution_times = np.array(
                experiment_results[target][experiment][prune_method],
                dtype=float
            )[0:max_dim, 0:max_dim]

            xrange = np.arange(1, execution_times.shape[0] + 1, dtype=np.float64)
            yrange = np.arange(1, execution_times.shape[1] + 1, dtype=np.float64)

            # indexing="ij" makes X[r, c] / Y[r, c] describe execution_times[r, c].
            # The default "xy" layout is the transpose, which drew every value at
            # swapped (variables, alternatives) coordinates and only worked for
            # square matrices.
            X, Y = np.meshgrid(xrange, yrange, indexing="ij")

            ax.set_title(prune_method)
            ax.plot_surface(X, Y, np.log10(execution_times), cmap="viridis")

            ax.set_xlabel("Variables")
            ax.set_ylabel("Alternatives per Variable")
            ax.set_zlabel("Execution Time ($\\log_{10}$)")

            # z holds log10 values; show integer ticks as powers of ten.
            ax.zaxis.set_major_formatter(mticker.FuncFormatter(log_tick_formatter))
            ax.zaxis.set_major_locator(mticker.MaxNLocator(integer=True))

            # ":g" prints the float grid values as "1", "2", ... not "1.0".
            ax.set_xticks(xrange, [f"{x:g}" for x in xrange])
            ax.set_yticks(yrange, [f"{y:g}" for y in yrange])

        print(target)
        display(fig)
        plt.close(fig)

## Comparison Line Plots

In [None]:
def draw_avg_comparison(ax, execution_times, system, sigma=1.5, num_points=150):
    """Plot the geometric-mean runtime of one system over possible worlds.

    Each row of ``execution_times`` is one run: row ``i`` is paired with the
    world counts ``alternatives ** (i + 1)`` for ``alternatives = 1..len(row)``.
    All runs are linearly interpolated onto a shared x-grid, averaged in log10
    space, optionally smoothed, and drawn on log-log axes. If the curve ends
    before the last grid point, its last finite point is marked with an "x".

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    execution_times : array-like of float, 2-D
        Runtimes; NaN marks a missing measurement.
    system : str
        Legend label for the line.
    sigma: float
        Standard deviation (in grid-point units) for Gaussian smoothing of the
        **mean** line. 0 = no smoothing.
    num_points : int
        Resolution of the common x-grid.

    Notes
    -----
    The title is built from the notebook-level ``experiment`` variable.
    """
    # ------------------------------------------------------------------ #
    # 1. Collect (x, y) pairs for every individual run
    # ------------------------------------------------------------------ #
    data = []
    for i, times in enumerate(execution_times):
        times = np.asarray(times)
        variables = i + 1
        xrange = np.arange(1, len(times) + 1, dtype=np.float64)
        world_counts = xrange ** variables
        data.append((world_counts, times))

    if not data:
        return

    # ------------------------------------------------------------------ #
    # 2. Common x-grid (log-spaced because world_counts grow exponentially)
    # ------------------------------------------------------------------ #
    all_x = np.concatenate([x for x, _ in data])
    x_min, x_max = all_x.min(), all_x.max()
    if x_min <= 0 or x_min == x_max:
        x_grid = np.linspace(x_min, x_max, num_points)
    else:
        x_grid = np.logspace(np.log10(x_min), np.log10(x_max), num_points)
        # 10 ** log10(x) can land one ulp outside [x_min, x_max]; the
        # interpolators would then report NaN at the very first/last point.
        x_grid[0], x_grid[-1] = x_min, x_max

    # ------------------------------------------------------------------ #
    # 3. Interpolate every run onto the grid (linear, no extrapolation)
    # ------------------------------------------------------------------ #
    interps = [
        interp1d(x, y, kind='linear', bounds_error=False, fill_value=np.nan)
        for x, y in data
    ]
    y_grid = np.array([f(x_grid) for f in interps])   # (n_runs, n_grid)

    # ------------------------------------------------------------------ #
    # 4. Statistics (ignore NaNs)
    # ------------------------------------------------------------------ #
    # Only reduce over grid points where at least one run has a value; this
    # yields the same numbers without the all-NaN-slice warnings.
    has_data = ~np.isnan(y_grid).all(axis=0)
    y_mean = np.full(x_grid.shape, np.nan)
    if has_data.any():
        # Geometric mean: average in log10 space, then transform back.
        y_mean[has_data] = np.power(10, np.nanmean(np.log10(y_grid[:, has_data]), axis=0))

    # ------------------------------------------------------------------ #
    # 5. SMOOTHING
    # ------------------------------------------------------------------ #
    if sigma > 0:
        # Normalised convolution: NaN samples get zero weight instead of
        # turning every output within the kernel radius into NaN, which would
        # shorten the curve and shift the timeout marker to the left.
        finite = ~np.isnan(y_mean)
        weights = gaussian_filter1d(finite.astype(np.float64), sigma=sigma, mode='nearest')
        blurred = gaussian_filter1d(np.where(finite, y_mean, 0.0), sigma=sigma, mode='nearest')
        y_mean = np.full(x_grid.shape, np.nan)
        y_mean[finite] = blurred[finite] / weights[finite]

    # ------------------------------------------------------------------ #
    # 6. Plot
    # ------------------------------------------------------------------ #
    line, = ax.plot(x_grid, y_mean, label=system)

    finite_indices = np.flatnonzero(~np.isnan(y_mean))
    if finite_indices.size and finite_indices[-1] != y_mean.shape[0] - 1:
        # Experiments were aborted due to timeout, add an X
        last_finite = finite_indices[-1]
        ax.plot(x_grid[last_finite], y_mean[last_finite], "-x", color=line.get_color())

    ax.set_title(f"{get_experiment_label(experiment)} Aggregation")
    ax.set_xscale("log")
    ax.set_yscale("log")

In [None]:
%matplotlib inline

# One figure per aggregation comparing the unpruned mean runtime of all systems.
for i, experiment in enumerate(experiments):
    fig, ax = plt.subplots(figsize=(4,3))
    for target in targets:
        execution_times = np.array(
            experiment_results[target][experiment]["none"],
            dtype=float
        )[0:max_dim, 0:max_dim]

        draw_avg_comparison(ax, execution_times, get_target_label(target))

    draw_timeout(ax)
    # Only the first aggregation's figure carries the legend.
    if i == 0:
        draw_legend(ax, "System")

    # The logarithm base is written as a subscript.
    ax.set_xlabel("Possible Worlds ($\\log_{10}$)")
    ax.set_ylabel("Computation Time (Seconds, $\\log_{10}$)")

    # Lay out this figure once it is fully drawn. Calling
    # plt.tight_layout() before plt.subplots() acted on a stray figure.
    fig.tight_layout()

    display(fig)
    fig.savefig(f"../figures/agg_line_comparison_{experiment}.pgf", bbox_inches="tight")
    plt.close(fig)

## Variable Alternatives Each Step

In [None]:
# Colormap that draw_lines_altmap uses to colour each line by its number of
# random variables; the colorbar in the plotting cell uses the same map.
mycmap = cm.viridis

def draw_lines_altmap(ax, execution_times, sigma=0.5):
    """Plot one line per row over possible worlds, coloured by row number.

    Row ``i`` is treated as the runs with ``i + 1`` random variables and is
    plotted against ``alternatives ** (i + 1)`` for
    ``alternatives = 1..len(row)``. Rows that stop before their last column
    get an "x" on their last measured point.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on; both axes are switched to log scale.
    execution_times : numpy.ndarray
        2-D float matrix of runtimes; NaN marks a missing measurement.
    sigma : float
        Standard deviation (in samples) of the Gaussian smoothing applied
        along each row. 0 = no smoothing.

    Notes
    -----
    Uses the notebook-level ``mycmap`` for colours and ``experiment`` for the
    title.
    """
    norm = plt.Normalize(vmin=1, vmax=execution_times.shape[0])

    ax.set_title(f"{get_experiment_label(experiment)} Aggregation")
    ax.set_xscale("log")
    ax.set_yscale("log")

    for i, times in enumerate(execution_times):
        xrange = np.arange(1, len(times) + 1, dtype=np.float64)

        variables = i + 1
        world_counts = xrange ** variables

        measured = ~np.isnan(times)
        if not measured.any():
            # Nothing to show for this row.
            continue

        if sigma > 0:
            # Normalised convolution: NaN samples get zero weight. A plain
            # gaussian_filter1d turns every sample within the kernel radius of
            # a NaN into NaN, which erased the last measured points of each
            # timed-out row.
            weights = gaussian_filter1d(measured.astype(np.float64), sigma=sigma, mode='nearest')
            blurred = gaussian_filter1d(np.where(measured, times, 0.0), sigma=sigma, mode='nearest')
            smoothed = np.full(times.shape, np.nan)
            smoothed[measured] = blurred[measured] / weights[measured]
            times = smoothed

        y = times[measured]
        x = world_counts[measured]

        line, = ax.plot(x, y, color=mycmap(norm(variables)))

        last_measured = np.flatnonzero(measured)[-1]
        if last_measured != times.shape[0] - 1:
            # Experiments were aborted due to timeout, add an X
            ax.plot(world_counts[last_measured], times[last_measured], "-x", color=line.get_color())

In [None]:
# One figure per (target, aggregation): unpruned runtime with one line per
# number of random variables, colour-coded via the colorbar.
for target in targets:
    for experiment in experiments:
        fig, ax = plt.subplots(figsize=(4,3))

        execution_times = np.array(
            experiment_results[target][experiment]["none"],
            dtype=float
        )[0:max_dim, 0:max_dim]

        draw_lines_altmap(ax, execution_times)

        # draw_timeout(ax)

        # The logarithm base is written as a subscript.
        ax.set_xlabel("Possible Worlds ($\\log_{10}$)")
        ax.set_ylabel("Computation Time (Seconds, $\\log_{10}$)")

        # Same colormap and value range that draw_lines_altmap uses per line.
        sm = plt.cm.ScalarMappable(cmap=mycmap, norm=plt.Normalize(vmin=1, vmax=execution_times.shape[0]))
        fig.colorbar(sm, ax=ax, label="Random Variables")

        # Lay out this figure once it is fully drawn. Calling
        # plt.tight_layout() before plt.subplots() acted on a stray figure.
        fig.tight_layout()

        display(fig)
        fig.savefig(f"../figures/agg_line_alternatives_{target}_{experiment}.pgf", bbox_inches="tight")
        plt.close(fig)

## System Comparison Table

In [None]:
rows = []

# For every aggregation/system pair, find the largest possible-world count
# that still has a measurement (unpruned runs) and the time of that query.
# Ties between rows are resolved in favour of the shorter time.
for experiment in experiments:
    for target in targets:
        execution_times = np.array(
            experiment_results[target][experiment]["none"],
            dtype=float
        )[0:max_dim, 0:max_dim]

        max_world_count = None
        max_world_count_time = None
        for variables, times in enumerate(execution_times, start=1):
            times = np.asarray(times)
            measured = ~np.isnan(times)
            if not measured.any():
                continue

            # Row number = random variables, column number = alternatives.
            alternatives = np.arange(1, len(times) + 1, dtype=np.float64)
            reached_world_counts = (alternatives ** variables)[measured]
            largest_index = np.argmax(reached_world_counts)
            row_world_count = reached_world_counts[largest_index]
            row_time = times[measured][largest_index]

            if max_world_count is None or row_world_count > max_world_count:
                max_world_count = row_world_count
                max_world_count_time = row_time
            elif row_world_count == max_world_count and row_time < max_world_count_time:
                max_world_count_time = row_time

        rows.append({
            "Aggregation": experiment.upper(),
            "System": target,
            "Max Possible Worlds": max_world_count,
            "Query Time": max_world_count_time
        })

df = pd.DataFrame(rows)
display(df)

latex_formatters = {
    "System": get_target_label,
    "Max Possible Worlds": lambda w: f"{w:.0f}",
    "Aggregation": lambda a: f"\\multirow{{3}}{{*}}{{\\texttt{{{a}}}}}"
}
print(df.to_latex(
    index=False,
    formatters=latex_formatters,
    float_format="{:.3f}".format
))