In [None]:
!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 ibis-framework 

In [None]:
import pandas as pd
import polars as pl

pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True

In [None]:
from typing import Any

def q2_pandas_native(
    region_ds: Any,
    nation_ds: Any,
    supplier_ds: Any,
    part_ds: Any,
    part_supp_ds: Any,
):
    """Run query 2 with plain pandas operations.

    Joins part -> partsupp -> supplier -> nation -> region, keeps size-15 parts
    whose type ends in "BRASS" supplied from region "EUROPE", and for each part
    retains only the offers priced at that part's minimum supply cost.

    Returns:
        The first 100 rows ordered by ``s_acctbal`` descending, then ``n_name``,
        ``s_name`` and ``p_partkey`` ascending, restricted to the supplier/part
        description columns.
    """
    size_wanted = 15
    type_suffix = "BRASS"
    region_wanted = "EUROPE"

    # Build the fully joined table one relationship at a time.
    part_offers = part_ds.merge(
        part_supp_ds, left_on="p_partkey", right_on="ps_partkey"
    )
    with_supplier = part_offers.merge(
        supplier_ds, left_on="ps_suppkey", right_on="s_suppkey"
    )
    with_nation = with_supplier.merge(
        nation_ds, left_on="s_nationkey", right_on="n_nationkey"
    )
    joined = with_nation.merge(
        region_ds, left_on="n_regionkey", right_on="r_regionkey"
    )

    # Apply the three predicates one after another, each on the previous result.
    candidates = (
        joined.loc[lambda frame: frame["p_size"] == size_wanted]
        .loc[lambda frame: frame["p_type"].str.endswith(type_suffix)]
        .loc[lambda frame: frame["r_name"] == region_wanted]
    )

    # Cheapest supply cost per part, then back to the offers that hit that price.
    cheapest = candidates.groupby("p_partkey", as_index=False)["ps_supplycost"].min()
    best_offers = cheapest.merge(candidates, on=["p_partkey", "ps_supplycost"])

    output_columns = [
        "s_acctbal",
        "s_name",
        "n_name",
        "p_partkey",
        "p_mfgr",
        "s_address",
        "s_phone",
        "s_comment",
    ]
    ordered = best_offers[output_columns].sort_values(
        by=["s_acctbal", "n_name", "s_name", "p_partkey"],
        ascending=[False, True, True, True],
    )

    return ordered.head(100)  # type: ignore[no-any-return]

In [None]:
from typing import Any
from datetime import datetime
import narwhals as nw

def q2(
    region_ds_raw: Any,
    nation_ds_raw: Any,
    supplier_ds_raw: Any,
    part_ds_raw: Any,
    part_supp_ds_raw: Any,
) -> Any:
    """Run query 2 through Narwhals on whichever native frames are passed in.

    Each input is wrapped with ``nw.from_native``; the query is expressed with
    Narwhals operations and the result is unwrapped with ``nw.to_native`` before
    being returned.

    The query joins part -> partsupp -> supplier -> nation -> region, keeps
    size-15 parts whose type ends in "BRASS" from region "EUROPE", retains the
    offers priced at each part's minimum supply cost, and returns the first 100
    rows ordered by ``s_acctbal`` descending, then ``n_name``, ``s_name`` and
    ``p_partkey`` ascending.
    """
    size_wanted = 15
    type_suffix = "BRASS"
    region_wanted = "EUROPE"

    output_columns = [
        "s_acctbal",
        "s_name",
        "n_name",
        "p_partkey",
        "p_mfgr",
        "s_address",
        "s_phone",
        "s_comment",
    ]

    # Wrap the native inputs so the rest of the function is backend-agnostic.
    region_ds, nation_ds, supplier_ds, part_ds, part_supp_ds = (
        nw.from_native(raw)
        for raw in (
            region_ds_raw,
            nation_ds_raw,
            supplier_ds_raw,
            part_ds_raw,
            part_supp_ds_raw,
        )
    )

    joined = (
        part_ds.join(part_supp_ds, left_on="p_partkey", right_on="ps_partkey")
        .join(supplier_ds, left_on="ps_suppkey", right_on="s_suppkey")
        .join(nation_ds, left_on="s_nationkey", right_on="n_nationkey")
        .join(region_ds, left_on="n_regionkey", right_on="r_regionkey")
    )
    candidates = joined.filter(
        nw.col("p_size") == size_wanted,
        nw.col("p_type").str.ends_with(type_suffix),
        nw.col("r_name") == region_wanted,
    )

    # Minimum supply cost per part, joined back to pick the offers at that price.
    cheapest = candidates.group_by("p_partkey").agg(
        nw.min("ps_supplycost").alias("ps_supplycost")
    )
    best_offers = cheapest.join(
        candidates,
        left_on=["p_partkey", "ps_supplycost"],
        right_on=["p_partkey", "ps_supplycost"],
    )

    ordered = best_offers.select(output_columns).sort(
        by=["s_acctbal", "n_name", "s_name", "p_partkey"],
        descending=[True, False, False, False],
    )

    return nw.to_native(ordered.head(100))

In [None]:
from typing import Any
from datetime import datetime
import ibis

def q2_ibis(
    region: Any,
    nation: Any,
    supplier: Any,
    part: Any,
    partsupp: Any,
    *,
    tool: str,
) -> Any:
    """Run query 2 as an ibis expression and materialise it for ``tool``.

    Joins part -> partsupp -> supplier -> nation -> region, keeps size-15 parts
    whose type ends in "BRASS" from region "EUROPE", retains the offers priced at
    each part's minimum supply cost, and returns the first 100 rows ordered by
    ``s_acctbal`` descending, then ``n_name``, ``s_name`` and ``p_partkey``.

    Args:
        region, nation, supplier, part, partsupp: ibis table expressions.
        tool: ``'pandas'`` to return ``to_pandas()``, ``'polars'`` to return
            ``to_polars()``.

    Raises:
        ValueError: if ``tool`` is neither ``'pandas'`` nor ``'polars'``.
    """
    var1 = 15
    var2 = "BRASS"
    var3 = "EUROPE"

    q2 = (
        part.join(partsupp, part["p_partkey"] == partsupp["ps_partkey"])
        .join(supplier, partsupp["ps_suppkey"] == supplier["s_suppkey"])
        .join(nation, supplier["s_nationkey"] == nation["n_nationkey"])
        .join(region, nation["n_regionkey"] == region["r_regionkey"])
        .filter(ibis._["p_size"] == var1)
        .filter(ibis._["p_type"].endswith(var2))
        .filter(ibis._["r_name"] == var3)
    )

    q_final = (
        q2.group_by("p_partkey")
        .agg(ps_supplycost=ibis._["ps_supplycost"].min())
        # Join on the minimum cost as well as the part key. Joining on the part
        # key alone would bring back every supplier of the part rather than only
        # the cheapest ones, which is not the query the pandas and Narwhals
        # implementations in this notebook run.
        .join(q2, ["p_partkey", "ps_supplycost"])
        .select(
            "s_acctbal",
            "s_name",
            "n_name",
            "p_partkey",
            "p_mfgr",
            "s_address",
            "s_phone",
            "s_comment",
        )
        .order_by(ibis.desc("s_acctbal"), "n_name", "s_name", "p_partkey")
        .limit(100)
    )
    if tool == 'pandas':
        return q_final.to_pandas()
    if tool == 'polars':
        return q_final.to_polars()
    raise ValueError("expected pandas or polars")

In [None]:
# Location of the parquet inputs, one file per table.
dir_ = "/kaggle/input/tpc-h-data-parquet-s-2/"
region = f"{dir_}region.parquet"
nation = f"{dir_}nation.parquet"
customer = f"{dir_}customer.parquet"
lineitem = f"{dir_}lineitem.parquet"
orders = f"{dir_}orders.parquet"
supplier = f"{dir_}supplier.parquet"
part = f"{dir_}part.parquet"
partsupp = f"{dir_}partsupp.parquet"

In [None]:
import ibis

# ibis connections used by the "[ibis]" loaders below.
con_pd = ibis.pandas.connect()
con_pl = ibis.polars.connect()

# One loader per benchmarked tool label; each takes a parquet path and returns the
# object that the matching query function is called with.
IO_FUNCS = {
    "pandas": lambda path: pd.read_parquet(path, engine="pyarrow"),
    "pandas[pyarrow]": lambda path: pd.read_parquet(path, engine="pyarrow", dtype_backend="pyarrow"),
    "pandas[pyarrow][ibis]": lambda path: con_pd.read_parquet(path, engine="pyarrow", dtype_backend="pyarrow"),
    "polars[eager]": lambda path: pl.read_parquet(path),
    "polars[lazy]": lambda path: pl.scan_parquet(path),
    "polars[lazy][ibis]": lambda path: con_pl.read_parquet(path),
}

In [None]:
# Collected benchmark output: tool label -> `all_runs` of that tool's %timeit result.
# Filled in by the benchmark cells below and written to JSON at the end.
# NOTE(review): `all_runs` appears to hold the total time of each repeat, not the
# per-loop time; if %timeit picks more than one loop for some tools the stored
# numbers would not be directly comparable — confirm, and consider `timings.timings`.
results = {}

## pandas, pyarrow dtypes, via ibis

In [None]:
# Time q2_ibis with tables loaded through the ibis pandas connection (con_pd).
# The fn(...) loader calls sit inside the timed expression, so loading is measured too.
tool = 'pandas[pyarrow][ibis]'
fn = IO_FUNCS[tool]
timings = %timeit -o q2_ibis(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp), tool='pandas')
# Store the recorded runs under this tool's label.
results[tool] = timings.all_runs

## Polars scan_parquet via ibis

In [None]:
# Time q2_ibis with tables loaded through the ibis polars connection (con_pl).
# The fn(...) loader calls sit inside the timed expression, so loading is measured too.
tool = 'polars[lazy][ibis]'
fn = IO_FUNCS[tool]
timings = %timeit -o q2_ibis(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp), tool='polars')
# Store the recorded runs under this tool's label.
results[tool] = timings.all_runs

## pandas, pyarrow dtypes, native

In [None]:
# Time the hand-written pandas query on pyarrow-backed frames.
# The fn(...) loader calls sit inside the timed expression, so loading is measured too.
tool = 'pandas[pyarrow]'
fn = IO_FUNCS[tool]
timings = %timeit -o q2_pandas_native(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))
# The '[native]' suffix keeps this entry distinct from the Narwhals run that uses
# the same 'pandas[pyarrow]' loader key.
results[tool+'[native]'] = timings.all_runs

## pandas via Narwhals

In [None]:
# Time the Narwhals query (q2) on frames from the plain 'pandas' loader.
# The fn(...) loader calls sit inside the timed expression, so loading is measured too.
tool = 'pandas'
fn = IO_FUNCS[tool]
timings = %timeit -o q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))
# Store the recorded runs under this tool's label.
results[tool] = timings.all_runs

## pandas, pyarrow dtypes, via Narwhals

In [None]:
# Time the Narwhals query (q2) on pandas frames loaded with dtype_backend='pyarrow'.
# The fn(...) loader calls sit inside the timed expression, so loading is measured too.
tool = 'pandas[pyarrow]'
fn = IO_FUNCS[tool]
timings = %timeit -o q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))
# Store the recorded runs under this tool's label.
results[tool] = timings.all_runs

## Polars read_parquet, via Narwhals

In [None]:
# Time the Narwhals query (q2) on frames loaded with pl.read_parquet.
# The fn(...) loader calls sit inside the timed expression, so loading is measured too.
tool = 'polars[eager]'
fn = IO_FUNCS[tool]
timings = %timeit -o q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))
# Store the recorded runs under this tool's label.
results[tool] = timings.all_runs

## Polars scan_parquet, via Narwhals

In [None]:
# Time the Narwhals query (q2) on inputs created with pl.scan_parquet.
# .collect() is part of the timed expression so the returned lazy result is
# materialised within each measurement.
tool = 'polars[lazy]'
fn = IO_FUNCS[tool]
timings = %timeit -o q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp)).collect()
# Store the recorded runs under this tool's label.
results[tool] = timings.all_runs

## Save

In [None]:
import json
# Persist the collected timings as JSON in the current working directory.
with open('results.json', 'w') as out_file:
    out_file.write(json.dumps(results))
