In [1]:
!uv add matplotlib --active

[2mResolved [1m290 packages[0m [2min 2ms[0m[0m
[2mAudited [1m285 packages[0m [2min 0.34ms[0m[0m


In [10]:
import matplotlib.pyplot as plt
import pickle
import pandas as pd
import numpy as np
import scipy.stats as stats

In [4]:
targets = ["postgres", "databricks"]
experiments = ["projection", "selection", "cross_join", "inner_join", "deduplication"]

experiment_results = {}

for target in targets:
    experiment_results[target] = {}
    for experiment in experiments:
        with open(f"../experiment_results/{target}/{experiment}/execution_times.pkl", "rb") as file:
            execution_times = pickle.load(file)
        experiment_results[target][experiment] = execution_times

experiment_results

{'postgres': {'projection': {'control': [7.019179,
    6.823811,
    7.01017,
    6.611277,
    7.598873,
    7.363184,
    7.273753,
    7.047453,
    6.754139,
    6.871913,
    7.075005],
   'small_bdds': [7.162543,
    7.079457,
    7.624968,
    7.7874,
    7.485013,
    6.715519,
    6.882292,
    7.226457,
    6.756706,
    6.764177,
    6.890771],
   'large_bdds': [145.864202,
    144.694556,
    150.085721,
    151.102075,
    156.861868,
    150.042722,
    145.563103,
    150.447646,
    145.252299,
    165.235013,
    169.430841],
   'large_strings': [74.574656,
    90.796731,
    77.252515,
    79.707949,
    78.562211,
    84.104308,
    88.824455,
    73.046837,
    88.939362,
    84.986995,
    79.909054]},
  'selection': {'control': [28.186322,
    8.413236,
    6.431541,
    6.910853,
    6.746205,
    6.614529,
    5.965652,
    6.661466,
    5.890489,
    6.716903,
    6.398875],
   'small_bdds': [6.704943,
    6.205651,
    6.562319,
    6.464557,
    6.565338,
   

In [22]:
labels = {
    "projection": "Projection",
    "selection": "Selection",
    "cross_join": "Cartesian Product",
    "inner_join": "Join",
    "deduplication": "Deduplication",
}

def get_label(dataset):
    return labels[dataset]

## Individual Tables

In [26]:
for target in targets:
    rows = []
    for experiment in experiments:
        results = experiment_results[target][experiment]
        for dataset, times in results.items():
            average_time = np.mean(times)
            low, high = stats.t.interval(
                0.95,
                df=len(times)-1,
                loc=average_time,
                scale=np.std(times, ddof=1) / np.sqrt(len(times))
            )
            row = {
                "Experiment": experiment,
                "Dataset": dataset,
                "Average Time": average_time,
                "95% Conf. Int. Low": low,
                "95% Conf. Int. High": high,
            }
            rows.append(row)
    df = pd.DataFrame(rows)
    print(target)
    display(df)
    print(df.to_latex(
        index=False,
        formatters={ "Experiment": get_label },
        float_format="{:.3f}".format
    ))

postgres


Unnamed: 0,Experiment,Dataset,Average Time,95% Conf. Int. Low,95% Conf. Int. High
0,projection,control,7.040796,6.849061,7.232531
1,projection,small_bdds,7.125028,6.875515,7.37454
2,projection,large_bdds,152.23455,146.651147,157.817953
3,projection,large_strings,81.882279,77.825833,85.938725
4,selection,control,8.630552,4.250826,13.010278
5,selection,small_bdds,6.459049,6.285873,6.632225
6,selection,large_bdds,92.30662,84.277873,100.335367
7,selection,large_strings,42.336901,37.883221,46.790582
8,cross_join,control,4.450614,4.283058,4.61817
9,cross_join,small_bdds,5.201058,4.987709,5.414407


\begin{tabular}{llrrr}
\toprule
Experiment & Dataset & Average Time & 95% Conf. Int. Low & 95% Conf. Int. High \\
\midrule
Projection & control & 7.041 & 6.849 & 7.233 \\
Projection & small_bdds & 7.125 & 6.876 & 7.375 \\
Projection & large_bdds & 152.235 & 146.651 & 157.818 \\
Projection & large_strings & 81.882 & 77.826 & 85.939 \\
Selection & control & 8.631 & 4.251 & 13.010 \\
Selection & small_bdds & 6.459 & 6.286 & 6.632 \\
Selection & large_bdds & 92.307 & 84.278 & 100.335 \\
Selection & large_strings & 42.337 & 37.883 & 46.791 \\
Cartesian Product & control & 4.451 & 4.283 & 4.618 \\
Cartesian Product & small_bdds & 5.201 & 4.988 & 5.414 \\
Cartesian Product & medium_bdds & 386.889 & 386.146 & 387.632 \\
Cartesian Product & medium_strings & 13.070 & 12.913 & 13.228 \\
Join & control & 0.318 & 0.298 & 0.338 \\
Join & small_bdds & 0.359 & 0.317 & 0.401 \\
Join & medium_bdds & 4.379 & 4.308 & 4.450 \\
Join & medium_strings & 0.418 & 0.374 & 0.462 \\
Deduplication & control & 17.71

Unnamed: 0,Experiment,Dataset,Average Time,95% Conf. Int. Low,95% Conf. Int. High
0,projection,control,8.09378,7.643423,8.544137
1,projection,small_bdds,7.805642,7.502743,8.108541
2,projection,large_bdds,141.625571,139.836233,143.414909
3,projection,large_strings,296.362285,291.219307,301.505263
4,selection,control,6.92586,6.731948,7.119771
5,selection,small_bdds,6.962744,6.84519,7.080297
6,selection,large_bdds,71.860597,70.327738,73.393456
7,selection,large_strings,150.133414,147.221448,153.045379
8,cross_join,control,6.964751,6.728805,7.200697
9,cross_join,small_bdds,7.374334,7.252491,7.496177


\begin{tabular}{llrrr}
\toprule
Experiment & Dataset & Average Time & 95% Conf. Int. Low & 95% Conf. Int. High \\
\midrule
Projection & control & 8.094 & 7.643 & 8.544 \\
Projection & small_bdds & 7.806 & 7.503 & 8.109 \\
Projection & large_bdds & 141.626 & 139.836 & 143.415 \\
Projection & large_strings & 296.362 & 291.219 & 301.505 \\
Selection & control & 6.926 & 6.732 & 7.120 \\
Selection & small_bdds & 6.963 & 6.845 & 7.080 \\
Selection & large_bdds & 71.861 & 70.328 & 73.393 \\
Selection & large_strings & 150.133 & 147.221 & 153.045 \\
Cartesian Product & control & 6.965 & 6.729 & 7.201 \\
Cartesian Product & small_bdds & 7.374 & 7.252 & 7.496 \\
Cartesian Product & medium_bdds & 347.842 & 344.403 & 351.281 \\
Cartesian Product & medium_strings & 42.327 & 41.253 & 43.401 \\
Join & control & 3.134 & 3.032 & 3.235 \\
Join & small_bdds & 3.181 & 3.079 & 3.283 \\
Join & medium_bdds & 9.945 & 9.810 & 10.081 \\
Join & medium_strings & 7.349 & 7.283 & 7.415 \\
Deduplication & control & 

## Combined Table

In [25]:
rows = []
for experiment in experiments:
    results_postgres = experiment_results["postgres"][experiment]
    results_databricks = experiment_results["databricks"][experiment]
    for dataset in results_postgres.keys():
        if dataset not in ["control", "small_bdds", "medium_bdds", "large_bdds"]:
            continue

        times_postgres = results_postgres[dataset]
        times_databricks = results_databricks[dataset]
        average_time_postgres = np.mean(times_postgres)
        low_postgres, high_postgres = stats.t.interval(
            0.95,
            df=len(times_postgres)-1,
            loc=average_time_postgres,
            scale=np.std(times_postgres, ddof=1) / np.sqrt(len(times_postgres))
        )
        
        times_databricks = results_databricks[dataset]
        times_databricks = results_databricks[dataset]
        average_time_databricks = np.mean(times_databricks)
        low_databricks, high_databricks = stats.t.interval(
            0.95,
            df=len(times_databricks)-1,
            loc=average_time_databricks,
            scale=np.std(times_databricks, ddof=1) / np.sqrt(len(times_databricks))
        )

        t_stat, p_value = stats.ttest_ind(times_databricks, times_postgres)
        
        row = {
            "Experiment": experiment,
            "Dataset": dataset,
            "Doubtless Average Time": average_time_databricks,
            "DuBio Average Time": average_time_postgres,
            "P-Value": p_value
        }
        rows.append(row)
df = pd.DataFrame(rows)
print("combined")
display(df)
print(df.to_latex(
    index=False,
    formatters={ "Experiment": get_label },
    float_format="{:.3f}".format
))

combined


Unnamed: 0,Experiment,Dataset,Doubtless Average Time,DuBio Average Time,P-Value
0,projection,control,8.09378,7.040796,0.0001107077
1,projection,small_bdds,7.805642,7.125028,0.0009659357
2,projection,large_bdds,141.625571,152.23455,0.0006532723
3,selection,control,6.92586,8.630552,0.3965487
4,selection,small_bdds,6.962744,6.459049,3.007667e-05
5,selection,large_bdds,71.860597,92.30662,1.86688e-05
6,cross_join,control,6.964751,4.450614,2.012391e-14
7,cross_join,small_bdds,7.374334,5.201058,1.428083e-14
8,cross_join,medium_bdds,347.842072,386.889146,1.814261e-16
9,inner_join,control,3.133778,0.317838,3.958488e-24


\begin{tabular}{llrrr}
\toprule
Experiment & Dataset & Doubtless Average Time & DuBio Average Time & P-Value \\
\midrule
Projection & control & 8.094 & 7.041 & 0.000 \\
Projection & small_bdds & 7.806 & 7.125 & 0.001 \\
Projection & large_bdds & 141.626 & 152.235 & 0.001 \\
Selection & control & 6.926 & 8.631 & 0.397 \\
Selection & small_bdds & 6.963 & 6.459 & 0.000 \\
Selection & large_bdds & 71.861 & 92.307 & 0.000 \\
Cartesian Product & control & 6.965 & 4.451 & 0.000 \\
Cartesian Product & small_bdds & 7.374 & 5.201 & 0.000 \\
Cartesian Product & medium_bdds & 347.842 & 386.889 & 0.000 \\
Join & control & 3.134 & 0.318 & 0.000 \\
Join & small_bdds & 3.181 & 0.359 & 0.000 \\
Join & medium_bdds & 9.945 & 4.379 & 0.000 \\
Deduplication & control & 6.210 & 17.713 & 0.000 \\
Deduplication & small_bdds & 6.324 & 18.678 & 0.000 \\
Deduplication & medium_bdds & 291.986 & 27.139 & 0.000 \\
\bottomrule
\end{tabular}

