In [None]:
import ibis
import gcsfs
import plotly.express as px

from ibis_bench.utils.monitor import get_timings_dir

# Notebook-wide display settings for ibis expressions and plotly figures.
ibis.options.interactive = True
# Cap rendered tables at 20 rows; max_columns=None presumably removes the
# column cap -- confirm against the ibis options documentation.
ibis.options.repr.interactive.max_rows = 20
ibis.options.repr.interactive.max_columns = None

# Default template applied to every plotly express figure created below.
px.defaults.template = "plotly_dark"

In [None]:
# YOLO
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Toggle for the log source: True reads the timing logs from the GCS bucket,
# False reads them from the local timings directory.
cloud_logs = True

In [None]:
# Connect to GCS and preview the last five entries of the timings directory.
#
# The preview is bound to a name and echoed as the final top-level expression
# because Jupyter only renders the last *top-level* expression of a cell; an
# expression nested inside the `if` body is evaluated and silently discarded.
recent_timing_logs = None  # a trailing None renders nothing when cloud logs are off

if cloud_logs:
    BUCKET = "ibis-bench"

    fs = gcsfs.GCSFileSystem()
    recent_timing_logs = fs.ls(f"{BUCKET}/{get_timings_dir()}")[-5:]

recent_timing_logs

In [None]:
# DuckDB connection (no database file given) used to query the timing logs.
con = ibis.connect("duckdb://")

if cloud_logs:
    # Register the gcsfs filesystem created in the cloud-logs cell so the
    # connection can read the gs:// glob built in the next cell.
    con.register_filesystem(fs)

In [None]:
# Glob matching every timing log, either in the bucket or on local disk.
timings_dir = get_timings_dir()
if cloud_logs:
    glob_path = f"gs://{BUCKET}/{timings_dir}/*.json"
else:
    glob_path = f"{timings_dir}/*.json"

# Read all matching JSON logs (ignore_errors=True is forwarded to the reader),
# cast the timestamp column to a real timestamp type, and cache the result
# for reuse by the cells below.
raw_timings = con.read_json(glob_path, ignore_errors=True)
t = raw_timings.mutate(timestamp=ibis._["timestamp"].cast("timestamp")).cache()
t

In [None]:
# number of timing rows loaded
t.count()

In [None]:
# sum of execution_seconds across all loaded rows
t["execution_seconds"].sum()

In [None]:
# same total as above, materialized to a Python value and converted to minutes
f"runtime minutes: {t['execution_seconds'].sum().to_pandas() / 60:.2f}"

In [None]:
# mean of execution_seconds over all rows
# (assumes each row is one query execution -- TODO confirm against the log schema)
t["execution_seconds"].mean()

In [None]:
# Per (system, sf, n_partitions, query_number) summary of the timing rows:
# mean execution time plus the maximum observed peak CPU and peak memory.
# Only rows with sf >= 1 are included.
agg = (
    t.filter(ibis._["sf"] >= 1)
    # Optional narrowing filters, kept for ad-hoc exploration:
    # .filter(ibis._["system"].contains("duckdb") | ibis._["system"].contains("datafusion"))
    # .filter(ibis._["query_number"] == 1)
    .group_by("system", "sf", "n_partitions", "query_number")
    .agg(
        mean_execution_seconds=ibis._["execution_seconds"].mean(),
        max_peak_cpu=ibis._["peak_cpu"].max(),
        max_peak_memory=ibis._["peak_memory"].max(),
    )
    # Plain string keys sort ascending; only the descending keys are wrapped.
    .order_by(
        ibis.desc("sf"),
        "n_partitions",
        "query_number",
        ibis.desc("system"),
        "mean_execution_seconds",
    )
)
agg

In [None]:
# raw timing rows for the polars-lazy system at scale factor 150
t.filter(
    t["system"] == "polars-lazy",
    t["sf"] == 150,
)

In [None]:
# number of timing rows per session_id
t["session_id"].value_counts()

In [None]:
# number of timing rows per system
t["system"].value_counts()

In [None]:
# distinct scale factors present in the aggregate, as a plain Python list
# (unsorted here; the plotting cell sorts them before iterating)
sfs = agg.select("sf").distinct().to_pandas()["sf"].tolist()
sfs

In [None]:
def _sorted_distinct(column):
    """Return the sorted distinct values of ``column`` in ``agg`` as a list."""
    return sorted(agg.select(column).distinct().to_pandas()[column].tolist())


# Fixed category ordering handed to plotly so every chart lays out its
# queries, systems and partition counts in the same sorted order.
category_orders = {
    column: _sorted_distinct(column)
    for column in ("query_number", "system", "n_partitions")
}

# One grouped bar chart per scale factor: mean execution time per query,
# colored by system and patterned by partition count.
for sf in sorted(sfs):
    # The on-disk Parquet size is estimated as 2/5 of the in-memory size.
    # True division is required: floor division reported "~0GB" for sf=1 and
    # sf=2. The `g` format drops the trailing ".0" for whole numbers.
    parquet_gb = sf * 2 / 5
    c = px.bar(
        agg.filter(agg["sf"] == sf),
        x="query_number",
        y="mean_execution_seconds",
        color="system",
        barmode="group",
        pattern_shape="n_partitions",
        category_orders=category_orders,
        title=f"scale factor: {sf} (~{sf} GB of data in memory; ~{parquet_gb:g}GB on disk in Parquet)",
    )
    c.show()