In [12]:
!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals 



In [13]:
import pandas as pd
import polars as pl

pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True

In [14]:
from typing import Any
import narwhals as nw

def q19(
    lineitem_ds_raw: Any,
    part_ds_raw: Any,
) -> Any:
    """Compute the discounted-revenue total of TPC-H query 19 through Narwhals.

    Both inputs are wrapped with ``nw.from_native`` and the result is unwrapped
    with ``nw.to_native``, so the caller gets back an object of the backend it
    passed in.

    Parameters
    ----------
    lineitem_ds_raw
        Native frame with the line items. The columns ``l_partkey``,
        ``l_shipmode``, ``l_shipinstruct``, ``l_quantity``,
        ``l_extendedprice`` and ``l_discount`` are read.
    part_ds_raw
        Native frame with the parts. The columns ``p_partkey``, ``p_brand``,
        ``p_container`` and ``p_size`` are read.

    Returns
    -------
    Any
        Native frame with a single ``revenue`` column holding the sum of
        ``l_extendedprice * (1 - l_discount)`` over the matching rows, rounded
        to two decimals. For lazy inputs the caller is expected to collect it.
    """
    lineitems = nw.from_native(lineitem_ds_raw)
    parts = nw.from_native(part_ds_raw)

    # One predicate per brand: brand, container family, quantity range, size range.
    brand_12_small = (
        (nw.col("p_brand") == "Brand#12")
        & nw.col("p_container").is_in(["SM CASE", "SM BOX", "SM PACK", "SM PKG"])
        & (nw.col("l_quantity").is_between(1, 11))
        & (nw.col("p_size").is_between(1, 5))
    )
    brand_23_medium = (
        (nw.col("p_brand") == "Brand#23")
        & nw.col("p_container").is_in(["MED BAG", "MED BOX", "MED PKG", "MED PACK"])
        & (nw.col("l_quantity").is_between(10, 20))
        & (nw.col("p_size").is_between(1, 10))
    )
    brand_34_large = (
        (nw.col("p_brand") == "Brand#34")
        & nw.col("p_container").is_in(["LG CASE", "LG BOX", "LG PACK", "LG PKG"])
        & (nw.col("l_quantity").is_between(20, 30))
        & (nw.col("p_size").is_between(1, 15))
    )

    joined = parts.join(lineitems, left_on="p_partkey", right_on="l_partkey")

    # Shipping conditions are shared by every branch, so they are applied first.
    air_in_person = joined.filter(
        nw.col("l_shipmode").is_in(["AIR", "AIR REG"])
    ).filter(nw.col("l_shipinstruct") == "DELIVER IN PERSON")

    matching = air_in_person.filter(
        brand_12_small | brand_23_medium | brand_34_large
    )

    revenue = (
        (nw.col("l_extendedprice") * (1 - nw.col("l_discount")))
        .sum()
        .round(2)
        .alias("revenue")
    )

    return nw.to_native(matching.select(revenue))

In [15]:
# Directory holding the TPC-H parquet files, and the two tables q19 reads.
dir_ = "/kaggle/input/tpc-h-data-parquet-s-2/"
lineitem = f"{dir_}lineitem.parquet"
part = f"{dir_}part.parquet"

In [16]:
# Backend label -> one-argument loader that reads a parquet path with that backend.
IO_FUNCS = {
    "pandas": lambda path: pd.read_parquet(path, engine="pyarrow"),
    "pandas[pyarrow]": lambda path: pd.read_parquet(
        path, engine="pyarrow", dtype_backend="pyarrow"
    ),
    "polars[eager]": lambda path: pl.read_parquet(path),
    "polars[lazy]": lambda path: pl.scan_parquet(path),
}

In [17]:
# Timings keyed by backend label; each benchmark cell stores its runs here and
# the final cell dumps the mapping to JSON.
results = dict()

## pandas via Narwhals

In [18]:
# Benchmark q19 on the 'pandas' loader from IO_FUNCS.
tool = 'pandas'
fn = IO_FUNCS[tool]
# Both parquet reads sit inside the timed statement, so the measurement covers
# loading the files as well as running the query.
# `-o` makes %timeit return its result object so the runs can be stored.
timings = %timeit -o q19(fn(lineitem), fn(part))
results[tool] = timings.all_runs

23.1 s ± 441 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## pandas, pyarrow dtypes, via Narwhals

In [19]:
# Benchmark q19 on the 'pandas[pyarrow]' loader (pandas with dtype_backend='pyarrow').
tool = 'pandas[pyarrow]'
fn = IO_FUNCS[tool]
# Both parquet reads sit inside the timed statement, so the measurement covers
# loading the files as well as running the query.
# `-o` makes %timeit return its result object so the runs can be stored.
timings = %timeit -o q19(fn(lineitem), fn(part))
results[tool] = timings.all_runs

21.6 s ± 57.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Polars read_parquet

In [20]:
# Benchmark q19 on the 'polars[eager]' loader (pl.read_parquet).
tool = 'polars[eager]'
fn = IO_FUNCS[tool]
# Both parquet reads sit inside the timed statement, so the measurement covers
# loading the files as well as running the query.
# `-o` makes %timeit return its result object so the runs can be stored.
timings = %timeit -o q19(fn(lineitem), fn(part))
results[tool] = timings.all_runs

5.38 s ± 156 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Polars scan_parquet

In [21]:
# Benchmark q19 on the 'polars[lazy]' loader (pl.scan_parquet).
tool = 'polars[lazy]'
fn = IO_FUNCS[tool]
# Unlike the other benchmark cells, the timed statement ends with `.collect()`:
# presumably q19 returns a lazy frame for scanned inputs, so nothing would be
# executed (or measured) without it -- confirm against the Narwhals/Polars docs.
# `-o` makes %timeit return its result object so the runs can be stored.
timings = %timeit -o q19(fn(lineitem), fn(part)).collect()
results[tool] = timings.all_runs

956 ms ± 89.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Save

In [22]:
import json
# Write the collected timings to results.json (path relative to the working directory).
with open('results.json', 'w') as out_file:
    out_file.write(json.dumps(results))
