In [33]:
from importlib import reload
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
from scipy.optimize import least_squares

# Plotly defaults for every figure in this notebook: the "vscode" renderer
# and the "plotly_white" template.
pio.renderers.default = "vscode"
pio.templates.default = "plotly_white"

# Project-local helper modules (their source is not part of this notebook).
import processing
from plotting import build_lines_fig, build_comparison_boxplot

# Reload `processing` before importing `load_data` so the name is bound to the
# freshly reloaded module rather than a previously imported version.
# NOTE(review): `plotting` is imported without a reload, so edits to it still
# require a kernel restart (or an equivalent reload) to take effect.
reload(processing)
from processing import load_data

## Loading Data

In [34]:
# Directory holding the result CSVs loaded below. The relative path is resolved
# against the kernel's current working directory, so fail early, and with the
# resolved path in the message, if the notebook is started from somewhere else.
DATA_DIR = Path('../data/results').resolve()
assert DATA_DIR.is_dir(), f"Results directory not found: {DATA_DIR}"

# OUTPUT_DIR = Path('./output').resolve()
# OUTPUT_DIR.mkdir(exist_ok=True)

In [35]:
def _load_metrics(times_path, reads_path):
    """Load one algorithm's `times_*` and `reads_*` files into a dict keyed by metric."""
    return {
        'times': load_data(times_path),
        'reads': load_data(reads_path),
    }


# Result files, one (times, reads) pair per bulk-loading algorithm.
nearestx_times_path = DATA_DIR / "times_nearest_x.csv"
nearestx_reads_path = DATA_DIR / "reads_nearest_x.csv"

hilbert_times_path = DATA_DIR / "times_hilbert_curve.csv"
hilbert_reads_path = DATA_DIR / "reads_hilbert_curve.csv"

str_times_path = DATA_DIR / "times_sort_tile_recursive.csv"
str_reads_path = DATA_DIR / "reads_sort_tile_recursive.csv"

# Each algorithm ends up as {'times': <table>, 'reads': <table>}.
nearestx = _load_metrics(nearestx_times_path, nearestx_reads_path)
hilbert = _load_metrics(hilbert_times_path, hilbert_reads_path)
sort_tile_recursive = _load_metrics(str_times_path, str_reads_path)

In [36]:
# Sanity check: display the loaded nearest-x timing table (the cell's last
# expression is rendered as its output).
nearestx['times']

Unnamed: 0,q1,q2,q3,q4,q5,q6,q7,q8,q9,q10,...,q91,q92,q93,q94,q95,q96,q97,q98,q99,q100
2¹⁰,565,66,105,98,87,94,65,65,69,90,...,70,67,63,88,91,66,67,43,45,68
2¹¹,561,291,106,128,114,132,68,91,94,93,...,124,92,36,107,95,88,74,55,73,76
2¹²,792,505,320,232,166,129,37,72,63,177,...,78,80,18,81,79,66,44,35,43,55
2¹³,769,764,500,153,104,130,57,127,120,268,...,135,137,21,166,152,113,81,66,67,92
2¹⁴,979,1066,352,267,199,260,88,233,227,516,...,253,255,35,283,294,207,159,106,114,175
2¹⁵,1412,765,991,809,362,489,161,420,430,793,...,492,493,29,560,550,390,253,187,201,333
2¹⁶,1671,1407,1597,1059,920,1178,271,831,812,908,...,1005,943,29,1077,1039,791,479,351,380,601
2¹⁷,2652,2440,1897,2775,1531,2007,530,1650,1623,2317,...,2314,1894,56,2217,2172,1530,949,659,772,1474
2¹⁸,5574,3656,3688,4775,3287,3913,1104,3856,3447,4167,...,3864,3749,66,4345,4371,3024,1847,1335,1489,2372
2¹⁹,9724,6951,4979,9180,5901,8553,2045,6868,7658,7801,...,7503,7403,101,9210,8609,5994,3633,2621,2909,4923


## Results (Tables)

In [37]:
# TODO: LaTeX table showing the results with confidence intervals

## Plotting Data

### Comparing time

In [38]:
# Timing tables keyed by the column label each algorithm gets in the summaries.
_time_tables = {
    "nearestx": nearestx["times"],
    "hilbert": hilbert["times"],
    "sort_tile_recursive": sort_tile_recursive["times"],
}

# Row-wise mean across the columns of each table, one summary column per algorithm.
time_means = pd.concat(
    [table.mean(axis=1).rename(label) for label, table in _time_tables.items()],
    axis=1,
)

# Row-wise standard deviation, laid out exactly like `time_means`.
times_std = pd.concat(
    [table.std(axis=1).rename(label) for label, table in _time_tables.items()],
    axis=1,
)

#### Time Comparison

In [39]:
# Line figure of the mean execution time per algorithm.
# `scale="log"` is forwarded to the project helper; presumably it selects a
# logarithmic axis -- confirm in plotting.build_lines_fig.
fig = build_lines_fig(
    means=time_means,
    title="Execution time for each Bulk Loading Algorithm",
    yaxis_title="Execution time [µs]",
    scale="log",
)
fig.show()

In [40]:
# Box-plot comparison of the execution times, one entry per algorithm column.
# NOTE(review): this receives the per-row means rather than the raw samples, so
# each box presumably summarises the spread of the means across input sizes --
# confirm against plotting.build_comparison_boxplot.
fig = build_comparison_boxplot(
    means=time_means,
    title="Execution time for each Bulk Loading Algorithm",
    yaxis_title="Execution time [µs]",
    scale="log",
)
fig.show()

#### Reads Comparison

In [41]:
# Read-count tables keyed by the column label each algorithm gets in the summaries.
_read_tables = {
    "nearestx": nearestx["reads"],
    "hilbert": hilbert["reads"],
    "sort_tile_recursive": sort_tile_recursive["reads"],
}

# Row-wise mean across the columns of each table, one summary column per algorithm.
reads_mean = pd.concat(
    [table.mean(axis=1).rename(label) for label, table in _read_tables.items()],
    axis=1,
)

# Row-wise standard deviation, laid out exactly like `reads_mean`.
reads_std = pd.concat(
    [table.std(axis=1).rename(label) for label, table in _read_tables.items()],
    axis=1,
)

In [42]:
# Line figure of the mean number of reads per algorithm.
# `scale="log"` is forwarded to the project helper; presumably it selects a
# logarithmic axis -- confirm in plotting.build_lines_fig.
fig = build_lines_fig(
    means=reads_mean,
    title="Number of reads for each Bulk Loading Algorithm",
    yaxis_title="Number of reads",
    scale="log",
)
fig.show()

In [43]:
# Box-plot comparison of the number of reads, one entry per algorithm column.
# NOTE(review): this receives the per-row means rather than the raw samples, so
# each box presumably summarises the spread of the means across input sizes --
# confirm against plotting.build_comparison_boxplot.
fig = build_comparison_boxplot(
    means=reads_mean,
    title="Number of reads for each Bulk Loading Algorithm",
    yaxis_title="Number of reads",
    scale="log",
)
fig.show()

## Model fitting

In [44]:
def time_function(
    n: "float | list | tuple | np.ndarray", c: float, alpha: float
) -> "float | np.ndarray":
    """Evaluate the power-law model ``f(n) = c * n^alpha``.

    Args:
        n: Input size(s). May be a scalar or any sequence/array of numbers;
            sequences are evaluated element-wise.
        c: Multiplicative constant of the model.
        alpha: Exponent of the model.

    Returns:
        A NumPy float for scalar ``n``, otherwise a float array with one value
        per element of ``n``.
    """
    # Coerce to a float array first: a plain Python list does not support `**`,
    # so `list ** alpha` only worked when `alpha` happened to be a NumPy scalar.
    sizes = np.asarray(n, dtype=float)
    return c * sizes ** alpha

In [45]:
# The solver minimises the sum of the squares of the values returned here.
def residuals(params, x, y):
    """Return the observed values minus the model prediction for ``params``.

    ``params`` is the pair ``(c, alpha)`` of the power-law model, ``x`` the
    input sizes and ``y`` the observed values at those sizes.
    """
    scale, exponent = params
    predicted = time_function(x, scale, exponent)
    return y - predicted

def fit_curve(x, y):
    """Fit ``c * x^alpha`` to the points ``(x, y)`` by least squares.

    The optimisation starts from ``c = 1, alpha = 1`` and the fitted
    ``(c, alpha)`` pair is returned as the solver's solution vector.
    """
    initial_guess = [1, 1]
    solution = least_squares(fun=residuals, x0=initial_guess, args=(x, y))
    return solution.x

def build_curve_fig(means, std, title, yaxis_title, scale="linear", min_exponent=10):
    """Plot measured means with error bars next to their fitted power-law curve.

    For every column of ``means`` two traces are added: the measured values
    (with ``std`` as vertical error bars) and the curve ``c * n^alpha`` fitted
    to them with ``fit_curve``.

    Args:
        means: DataFrame with one row per input size and one column per series.
        std: DataFrame with the same columns as ``means``; used for error bars.
        title: Figure title.
        yaxis_title: Label of the y-axis.
        scale: Plotly y-axis type (e.g. ``"linear"`` or ``"log"``).
        min_exponent: Exponent of the smallest input size. Rows are assumed to
            be the consecutive powers of two ``2**min_exponent``,
            ``2**(min_exponent + 1)``, ... -- verify against the loaded data.

    Returns:
        The populated ``plotly.graph_objects.Figure``.
    """
    # One x value per measured row. The range used to be fixed at 2**10..2**25
    # (16 points), which cannot be fitted against tables with a different
    # number of rows because x and y would differ in length.
    x = [2**i for i in range(min_exponent, min_exponent + len(means))]

    fig = go.Figure()
    for col in means.columns:
        y = means[col].values
        c, alpha = fit_curve(x, y)

        # Measured values, with the standard deviation as error bars.
        fig.add_trace(
            go.Scatter(
                x=x,
                y=y,
                name=col,
                error_y=dict(
                    type="data",
                    array=std[col].values,
                    visible=True,
                ),
            )
        )
        # Fitted model evaluated at the same input sizes.
        fig.add_trace(
            go.Scatter(
                x=x,
                y=time_function(x, c, alpha),
                name=f"{col} fit",
                mode="lines",
            )
        )
    fig.update_layout(
        title=title,
        xaxis_title="Number of points",
        yaxis_title=yaxis_title,
        yaxis_type=scale,
    )
    return fig

# Fit the power-law model to the mean execution times and plot measurements
# (with their standard deviation as error bars) together with the fitted
# curves; `scale="log"` sets the y-axis type of the figure.
fig = build_curve_fig(
    means=time_means,
    std=times_std,
    title="Execution time for each Bulk Loading Algorithm",
    yaxis_title="Execution time [µs]",
    scale="log",
)
fig.show()