In [1]:
import ibis
import gcsfs

from ibis_bench.utils.monitor import get_timings_dir

# Notebook display settings for ibis expressions.
interactive_repr = ibis.options.repr.interactive
interactive_repr.max_rows = 20
# NOTE(review): None presumably removes the cap on displayed columns — confirm
# against the ibis options documentation.
interactive_repr.max_columns = None

# Interactive mode: expressions are executed and rendered when displayed.
ibis.options.interactive = True

In [2]:
# YOLO
import warnings

warnings.filterwarnings("ignore")

In [3]:
# GCP project and bucket that the benchmark timing logs are read from.
PROJECT = "voltrondata-demo"
BUCKET = "ibis-benchy"

# Filesystem handle for GCS, scoped to the project above.
fs = gcsfs.GCSFileSystem(project=PROJECT)

# List the timings directory and show only the last five entries as a quick
# sanity check that the bucket is reachable and populated.
timings_listing = fs.ls(f"{BUCKET}/{get_timings_dir()}")
timings_listing[-5:]

['ibis-benchy/benchy_logs_v2/d53be7c2-3b19-46d9-95e5-0d53d7b48171.json',
 'ibis-benchy/benchy_logs_v2/dd136bd6-8384-4f4f-884e-50317b18a61d.json',
 'ibis-benchy/benchy_logs_v2/e55be847-7ea4-4b40-9ca7-380363a5ac13.json',
 'ibis-benchy/benchy_logs_v2/e75e7e09-046a-436d-904d-d5ca4e69fd02.json',
 'ibis-benchy/benchy_logs_v2/f74cda0c-33cd-4d38-b009-08d431faab86.json']

In [4]:
# Open a DuckDB connection through ibis.
# NOTE(review): a bare "duckdb://" URL presumably yields an in-memory
# database — confirm against the ibis DuckDB backend docs.
con = ibis.connect("duckdb://")
# Hand the gcsfs filesystem to the connection; the next cell reads a gs:// path
# through this connection.
con.register_filesystem(fs)

In [5]:
# Read every JSON timing log in the bucket's timings directory as one table.
timings_glob = f"gs://{BUCKET}/{get_timings_dir()}/*.json"
raw_timings = con.read_json(timings_glob)

# Cast the `timestamp` column to a timestamp type, then cache the result so
# the cells below work from the cached table instead of the raw JSON read.
t = raw_timings.mutate(
    timestamp=raw_timings["timestamp"].cast("timestamp"),
).cache()
t

In [6]:
# Number of rows in the cached timings table `t`.
t.count()

[1;36m30[0m

In [7]:
# Summarise query 1 for the DuckDB and DataFusion systems at scale factor >= 1.

# Row filters; passing them to a single filter() call ANDs them together.
scale_factor_at_least_one = t["sf"] >= 1
duckdb_or_datafusion = t["system"].contains("duckdb") | t["system"].contains(
    "datafusion"
)
first_query_only = t["query_number"] == 1

# Per-group metrics: mean runtime plus the maximum observed CPU and memory peaks.
summary_metrics = {
    "mean_execution_seconds": t["execution_seconds"].mean(),
    "max_peak_cpu": t["peak_cpu"].max(),
    "max_peak_memory": t["peak_memory"].max(),
}

# Sort order: largest scale factor first, then fewest partitions, then system
# name descending, then query number and mean runtime ascending.
sort_keys = [
    ibis.desc("sf"),
    ibis.asc("n_partitions"),
    ibis.desc("system"),
    ibis.asc("query_number"),
    ibis.asc("mean_execution_seconds"),
]

(
    t.filter(scale_factor_at_least_one, duckdb_or_datafusion, first_query_only)
    .group_by("system", "sf", "n_partitions", "query_number")
    .agg(**summary_metrics)
    .order_by(*sort_keys)
)

In [8]:
# Count how many rows of `t` carry each distinct `session_id` value.
t["session_id"].value_counts()