In [None]:
import matplotlib as mpl
# Select the PGF backend; the plotting cells below export their figures as
# .pgf files.
mpl.use("pgf")

# Let the PGF backend run LuaLaTeX as its TeX system.
mpl.rcParams["pgf.texsystem"] = "lualatex"

# Optional custom preamble, currently disabled:
# mpl.rcParams["pgf.preamble"] = "\n".join([
#     r"\usepackage[utf8x]{inputenc}",
#     r"\usepackage[T1]{fontenc}",
# ])

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import matplotlib.cm as cm
import pickle
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.optimize import curve_fit
from scipy.interpolate import interp1d
from scipy.ndimage import gaussian_filter1d
import scienceplots

# Apply the "science" and "no-latex" style sheets to all figures created below
# (presumably registered by the `scienceplots` import above -- confirm).
plt.style.use(["science", "no-latex"])

In [None]:
# NOTE(review): `step_size` is not defined in any cell visible in this
# notebook. It must be assigned before this cell runs, otherwise the cell
# raises NameError on a fresh kernel.


def _data_size_range(max_rows):
    """Return the sampled data sizes for one experiment / cluster size.

    The range starts at 1, never exceeds ``max_rows`` and advances by
    ``max_rows // step_size + 1``, so it contains at most ``step_size`` points.
    """
    return range(1, max_rows + 1, max_rows // step_size + 1)


# Upper bound of the data-size range per cluster size (keys 1-10). The three
# sentence / dictionary experiments use identical bounds.
_SHARED_MAX_ROWS = {
    1: 8434380,
    2: 2024422,
    3: 850446,
    4: 468156,
    5: 288195,
    6: 251544,
    7: 175714,
    8: 141016,
    9: 109665,
    10: 88940,
}

# Upper bound of the data-size range per cluster size for the probability
# calculation experiment.
_PROBABILITY_CALCULATION_MAX_ROWS = {
    1: 274648,
    2: 75262,
    3: 22380,
    4: 29064,
    5: 11070,
    6: 17214,
    7: 3759,
    8: 9304,
    9: 3519,
    10: 4380,
}

# experiment name -> cluster size -> range of data sizes. Every experiment
# gets its own inner dict, so changing one entry never affects another.
experiments_data_sizes = {
    experiment: {
        cluster_size: _data_size_range(max_rows)
        for cluster_size, max_rows in max_rows_by_cluster.items()
    }
    for experiment, max_rows_by_cluster in (
        ("sentence_creation", _SHARED_MAX_ROWS),
        ("sentence_consolidation", _SHARED_MAX_ROWS),
        ("probability_dictionary_creation", _SHARED_MAX_ROWS),
        ("probability_calculation", _PROBABILITY_CALCULATION_MAX_ROWS),
    )
}

In [None]:
# Execution targets to load; switch to the commented list to include Postgres.
# targets = ["postgres", "databricks"]
targets = ["databricks"]

# Experiments to load; the commented list contains all four experiments.
# experiments = ["sentence_creation", "sentence_consolidation", "probability_dictionary_creation", "probability_calculation"]
experiments = ["sentence_creation", "sentence_consolidation"]

# target -> experiment -> unpickled execution times.
experiment_results = {}

for target in targets:
    results_for_target = experiment_results[target] = {}
    for experiment in experiments:
        results_path = f"../experiment_results/{target}/{experiment}/execution_times.pkl"
        # NOTE: pickle.load can execute arbitrary code -- only load result
        # files that come from a trusted source.
        with open(results_path, "rb") as file:
            execution_times = pickle.load(file)
        results_for_target[experiment] = execution_times

# Bare last expression so the notebook displays the loaded results.
experiment_results

In [None]:
# Human-readable plot titles, keyed by experiment name.
labels = dict(
    sentence_creation="Sentence Creation",
    sentence_consolidation="Sentence Consolidation",
    probability_dictionary_creation="Prob. Dict. Creation",
    probability_calculation="Prob. Calculation",
)


def get_label(dataset):
    """Return the display label stored in ``labels`` for ``dataset``.

    Raises:
        KeyError: if ``dataset`` has no entry in ``labels``.
    """
    label = labels[dataset]
    return label

## Over Data Sizes

In [None]:
# Data sizes above this limit are left out of the "over data sizes" plots.
max_data_size = 500_000
# Colormap shared by the line plots and their colorbars.
mycmap = cm.viridis


def plot_over_data_size(ax, execution_times, experiment, cluster_sizes):
    """Plot execution time against data size, one line per cluster size.

    Args:
        ax: Matplotlib axes to draw on.
        execution_times: Indexable by cluster size; each entry is a sequence
            of timings that is paired by position with
            ``experiments_data_sizes[experiment][cluster_size]``.
        experiment: Key into ``experiments_data_sizes``.
        cluster_sizes: Cluster sizes to plot. Their minimum and maximum define
            the color scale.
    """
    norm = plt.Normalize(vmin=np.min(cluster_sizes), vmax=np.max(cluster_sizes))

    for cluster_size in cluster_sizes:
        data_sizes = np.array(experiments_data_sizes[experiment][cluster_size])
        # The sizes are ascending, so this mask keeps a leading prefix and the
        # matching timings are simply the first len(data_sizes) entries.
        data_sizes = data_sizes[data_sizes <= max_data_size]
        times = execution_times[cluster_size][:len(data_sizes)]
        # ax.plot needs x and y of equal length; if fewer timings than data
        # sizes are available, drop the sizes that have no timing.
        data_sizes = data_sizes[:len(times)]

        ax.plot(data_sizes, times, color=mycmap(norm(cluster_size)))

In [None]:
%matplotlib inline

# Render one "time over data size" figure per (target, experiment) pair, show
# it inline and export it as PGF.
for target in targets:
    for experiment in experiments:
        # Create the figure first and address it explicitly afterwards. A
        # pyplot-level call made before this point would act on a different,
        # implicitly created figure that is never closed.
        fig, ax = plt.subplots(figsize=(4,3))

        execution_times = experiment_results[target][experiment]

        cluster_sizes = np.arange(1, 11)
        plot_over_data_size(ax, execution_times, experiment, cluster_sizes)

        #draw_timeout(ax, resize=False)
        #draw_legend(ax, "Experiment", resize=False, loc="best")
        # The mappable holds no data; it only gives the colorbar the colormap
        # and the cluster-size range used for the line colors.
        sm = plt.cm.ScalarMappable(
            cmap=mycmap,
            norm=plt.Normalize(vmin=np.min(cluster_sizes), vmax=np.max(cluster_sizes))
        )
        fig.colorbar(sm, ax=ax, label="Cluster Size")
        ax.set_title(get_label(experiment))
        ax.set_ylabel("Computation Time (Seconds)")
        ax.set_xlabel("Data Size (Rows)")

        display(fig)
        # bbox_inches="tight" crops the surrounding whitespace at save time.
        fig.savefig(f"../figures/integration_data_sizes_{target}_{experiment}.pgf", bbox_inches="tight")
        # Close this figure explicitly so figures do not accumulate in the loop.
        plt.close(fig)

## Over Cluster Sizes

In [None]:
# Data sizes (10_000 .. 100_000 in steps of 10_000) at which execution times
# are compared across cluster sizes.
data_size_range = np.arange(10_000, 110_000, 10_000)


def plot_over_cluster_size(ax, execution_times, experiment, cluster_sizes, data_size):
    """Plot the execution time at ``data_size`` against cluster size.

    For every cluster size the timings are linearly interpolated over the
    sampled data sizes and evaluated at ``data_size``; values outside the
    sampled range are linearly extrapolated.

    Args:
        ax: Matplotlib axes to draw on.
        execution_times: Indexable by cluster size; each entry holds the
            timings for ``experiments_data_sizes[experiment][cluster_size]``.
        experiment: Key into ``experiments_data_sizes``.
        cluster_sizes: Cluster sizes forming the x axis.
        data_size: Data size to evaluate at; it also selects the line color
            on the ``data_size_range`` color scale.
    """
    color_scale = plt.Normalize(vmin=np.min(data_size_range), vmax=np.max(data_size_range))

    estimated_times = []
    for cluster_size in cluster_sizes:
        estimate_time = interp1d(
            experiments_data_sizes[experiment][cluster_size],
            execution_times[cluster_size],
            kind='linear',
            bounds_error=False,
            fill_value="extrapolate",
        )
        estimated_times.append(estimate_time(data_size))

    line_color = mycmap(color_scale(data_size))
    ax.plot(cluster_sizes, estimated_times, color=line_color)

In [None]:
%matplotlib inline

# Render one "time over cluster size" figure per (target, experiment) pair,
# show it inline and export it as PGF.
for target in targets:
    for experiment in experiments:
        # Create the figure first and address it explicitly afterwards. A
        # pyplot-level call made before this point would act on a different,
        # implicitly created figure that is never closed.
        fig, ax = plt.subplots(figsize=(4,3))

        execution_times = experiment_results[target][experiment]

        cluster_sizes = np.arange(1, 11)

        # One line per data size in data_size_range.
        for data_size in data_size_range:
            plot_over_cluster_size(ax, execution_times, experiment, cluster_sizes, data_size)

        #draw_timeout(ax, resize=False)
        #draw_legend(ax, "Experiment", resize=False, loc="best")
        # The mappable holds no data; it only gives the colorbar the colormap
        # and the data-size range used for the line colors.
        sm = plt.cm.ScalarMappable(
            cmap=mycmap,
            norm=plt.Normalize(vmin=np.min(data_size_range), vmax=np.max(data_size_range))
        )
        fig.colorbar(sm, ax=ax, label="Data Size (Rows)")
        ax.set_title(get_label(experiment))
        ax.set_ylabel("Computation Time (Seconds)")
        ax.set_xlabel("Cluster Size")
        # Anchor the time axis at zero.
        ax.set_ylim(bottom=0)

        display(fig)
        # bbox_inches="tight" crops the surrounding whitespace at save time.
        fig.savefig(f"../figures/integration_cluster_sizes_{target}_{experiment}.pgf", bbox_inches="tight")
        # Close this figure explicitly so figures do not accumulate in the loop.
        plt.close(fig)