In [33]:
# Use the %pip magic so packages are changed in the environment the running
# kernel imports from (a bare `!pip` resolves whatever `pip` is first on PATH).
# apache-beam is removed first — presumably because it constrains the versions
# the upgrade below may install; confirm before dropping this step.
# NOTE(review): versions are left unpinned, as before; pin them if the recorded
# timings need to be reproducible.
%pip uninstall -y apache-beam
%pip install -U pandas polars pyarrow narwhals



In [34]:
import pandas as pd
import polars as pl

pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True

In [35]:
from typing import Any
import narwhals as nw

def q17(
    lineitem_ds_raw: Any,
    part_ds_raw: Any
) -> Any:

    lineitem_ds = nw.from_native(lineitem_ds_raw)
    part_ds = nw.from_native(part_ds_raw)
    
    var1 = "Brand#23"
    var2 = "MED BOX"
    
    query1 = (
        part_ds.filter(nw.col("p_brand") == var1)
        .filter(nw.col("p_container") == var2)
        .join(lineitem_ds, how="left", left_on="p_partkey", right_on="l_partkey")
    )
    
    final_query = (
        query1.group_by("p_partkey")
        .agg((0.2 * nw.col("l_quantity").mean()).alias("avg_quantity"))
        .select(nw.col("p_partkey").alias("key"), nw.col("avg_quantity"))
        .join(query1, left_on="key", right_on="p_partkey")
        .filter(nw.col("l_quantity") < nw.col("avg_quantity"))
        .select((nw.col("l_extendedprice").sum() / 7.0).round(2).alias("avg_yearly"))
    )


    return nw.to_native(final_query)

In [36]:
# Directory holding the parquet inputs, and the two tables the query reads.
dir_ = "/kaggle/input/tpc-h-data-parquet-s-2/"
lineitem = f"{dir_}lineitem.parquet"
part = f"{dir_}part.parquet"

In [37]:
def _read_pandas(x):
    """Read a parquet file with pandas via the pyarrow engine."""
    return pd.read_parquet(x, engine='pyarrow')


def _read_pandas_pyarrow(x):
    """Read a parquet file with pandas, requesting pyarrow-backed dtypes."""
    return pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow')


def _read_polars_eager(x):
    """Read a parquet file with Polars' read_parquet."""
    return pl.read_parquet(x)


def _scan_polars_lazy(x):
    """Open a parquet file with Polars' scan_parquet."""
    return pl.scan_parquet(x)


# Benchmark label -> single-argument parquet loader for that backend.
IO_FUNCS = {
    'pandas': _read_pandas,
    'pandas[pyarrow]': _read_pandas_pyarrow,
    'polars[eager]': _read_polars_eager,
    'polars[lazy]': _scan_polars_lazy,
}

In [38]:
# Collects, per backend label, the run timings recorded by the cells below.
results = dict()

## pandas via Narwhals

In [39]:
# Pick the pandas reader (pd.read_parquet with the pyarrow engine) from IO_FUNCS.
tool = 'pandas'
fn = IO_FUNCS[tool]
# Both fn(...) calls sit inside the timed statement, so every run measures the
# two parquet reads as well as the query. -o makes %timeit return its result
# object instead of only printing the summary.
timings = %timeit -o q17(fn(lineitem), fn(part))
# Store the timings of all runs under this backend's label.
results[tool] = timings.all_runs

  return agg_pandas(
  return agg_pandas(
  return agg_pandas(
  return agg_pandas(
  return agg_pandas(
  return agg_pandas(
  return agg_pandas(
  return agg_pandas(


6.91 s ± 108 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## pandas, pyarrow dtypes, via Narwhals

In [40]:
# Pick the pandas reader that requests dtype_backend='pyarrow' from IO_FUNCS.
tool = 'pandas[pyarrow]'
fn = IO_FUNCS[tool]
# Both fn(...) calls sit inside the timed statement, so every run measures the
# two parquet reads as well as the query. -o makes %timeit return its result
# object instead of only printing the summary.
timings = %timeit -o q17(fn(lineitem), fn(part))
# Store the timings of all runs under this backend's label.
results[tool] = timings.all_runs

  return agg_pandas(
  return agg_pandas(
  return agg_pandas(
  return agg_pandas(
  return agg_pandas(
  return agg_pandas(
  return agg_pandas(
  return agg_pandas(


5.39 s ± 99.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Polars read_parquet

In [41]:
# Pick the Polars reader based on pl.read_parquet from IO_FUNCS.
tool = 'polars[eager]'
fn = IO_FUNCS[tool]
# Both fn(...) calls sit inside the timed statement, so every run measures the
# two parquet reads as well as the query. -o makes %timeit return its result
# object instead of only printing the summary.
timings = %timeit -o q17(fn(lineitem), fn(part))
# Store the timings of all runs under this backend's label.
results[tool] = timings.all_runs

3.06 s ± 113 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Polars scan_parquet

In [42]:
# Pick the Polars reader based on pl.scan_parquet from IO_FUNCS.
tool = 'polars[lazy]'
fn = IO_FUNCS[tool]
# Unlike the other timing cells, .collect() is called on q17's return value
# inside the timed statement — presumably so the deferred scan_parquet query is
# actually executed within the measurement. -o makes %timeit return its result
# object instead of only printing the summary.
timings = %timeit -o q17(fn(lineitem), fn(part)).collect()
# Store the timings of all runs under this backend's label.
results[tool] = timings.all_runs

1.39 s ± 33.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Save

In [43]:
import json
# Write the collected timings to results.json in the current working directory.
with open('results.json', 'w') as results_file:
    json.dump(results, results_file)
