In [13]:
!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals 



In [14]:
import pandas as pd
import polars as pl

pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True

In [15]:
from typing import Any
import narwhals as nw

def q11(
    partsupp_ds_raw: Any,
    nation_ds_raw: Any,
    supplier_ds_raw: Any,
    nation_name: str = "GERMANY",
    fraction: float = 0.0001,
) -> Any:
    """Run TPC-H query 11 on frames wrapped through Narwhals.

    The query joins part-supplier rows to their supplier and nation, keeps the
    rows of one nation, and returns every part whose summed stock value
    (``ps_supplycost * ps_availqty``) exceeds ``fraction`` of that nation's
    total stock value.

    Args:
        partsupp_ds_raw: Native ``partsupp`` frame; must expose ``ps_partkey``,
            ``ps_suppkey``, ``ps_supplycost`` and ``ps_availqty``.
        nation_ds_raw: Native ``nation`` frame; must expose ``n_nationkey``
            and ``n_name``.
        supplier_ds_raw: Native ``supplier`` frame; must expose ``s_suppkey``
            and ``s_nationkey``.
        nation_name: Value of ``n_name`` to keep. Defaults to ``"GERMANY"``.
        fraction: Share of the nation's total stock value a part must exceed.
            Defaults to ``0.0001``. NOTE(review): the TPC-H specification
            defines this parameter as ``0.0001 / SF``; pass a scaled value when
            the data was generated with a scale factor other than 1.

    Returns:
        The native object produced by ``nw.to_native`` with columns
        ``ps_partkey`` and ``value``, sorted by ``value`` descending. For lazy
        inputs the result is still lazy and has to be collected by the caller.
    """
    nation_ds = nw.from_native(nation_ds_raw)
    partsupp_ds = nw.from_native(partsupp_ds_raw)
    supplier_ds = nw.from_native(supplier_ds_raw)

    # Stock value of one part-supplier row; used for both the total and the per-part sum.
    stock_value = nw.col("ps_supplycost") * nw.col("ps_availqty")

    # All part-supplier rows whose supplier belongs to the requested nation.
    national_stock = (
        partsupp_ds.join(supplier_ds, left_on="ps_suppkey", right_on="s_suppkey")
        .join(nation_ds, left_on="s_nationkey", right_on="n_nationkey")
        .filter(nw.col("n_name") == nation_name)
    )

    # One-row frame holding the cut-off: the rounded national total times `fraction`.
    threshold = national_stock.select(
        (stock_value.sum().round(2) * fraction).alias("tmp")
    )

    q_final = (
        national_stock.with_columns(stock_value.alias("value"))
        .group_by("ps_partkey")
        .agg(nw.sum("value"))
        # The cross join attaches the single cut-off value to every part row.
        .join(threshold, how="cross")
        .filter(nw.col("value") > nw.col("tmp"))
        .select("ps_partkey", "value")
        .sort("value", descending=True)
    )

    return nw.to_native(q_final)

In [16]:
# Directory holding the TPC-H parquet files, and the three tables q11 reads.
dir_ = "/kaggle/input/tpc-h-data-parquet-s-2/"
nation, supplier, partsupp = (
    f"{dir_}{table}.parquet" for table in ("nation", "supplier", "partsupp")
)

In [17]:
# Maps a backend label to a one-argument loader that reads a parquet path into
# that backend's frame type. Lambdas keep the library lookup deferred to call time.
IO_FUNCS = {
    'pandas': lambda path: pd.read_parquet(path, engine="pyarrow"),
    'pandas[pyarrow]': lambda path: pd.read_parquet(
        path, engine="pyarrow", dtype_backend="pyarrow"
    ),
    'polars[eager]': lambda path: pl.read_parquet(path),
    'polars[lazy]': lambda path: pl.scan_parquet(path),
}

In [18]:
# Collected timings, keyed by backend label; filled in by the benchmark cells below.
results = dict()

## pandas via Narwhals

In [19]:
tool = 'pandas'
fn = IO_FUNCS[tool]
timings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier))
results[tool] = timings.all_runs

1.66 s ± 118 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## pandas, pyarrow dtypes, via Narwhals

In [20]:
tool = 'pandas[pyarrow]'
fn = IO_FUNCS[tool]
timings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier))
results[tool] = timings.all_runs

1.67 s ± 86.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Polars read_parquet

In [21]:
tool = 'polars[eager]'
fn = IO_FUNCS[tool]
timings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier))
results[tool] = timings.all_runs

890 ms ± 13.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Polars scan_parquet

In [22]:
tool = 'polars[lazy]'
fn = IO_FUNCS[tool]
timings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier)).collect()
results[tool] = timings.all_runs

110 ms ± 12.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Save

In [23]:
import json
# Persist the collected timings as JSON in the current working directory.
with open('results.json', 'w') as fd:
    fd.write(json.dumps(results))


In [24]:
from pprint import pprint

# Pretty-print the timings stored per backend label for a quick visual check.
pprint(results)

{'pandas': [1.708278326000027,
            1.8040552429999934,
            1.8417796100000032,
            1.600905629999943,
            1.6415783779998492,
            1.5647700059998897,
            1.493057884000109],
 'pandas[pyarrow]': [1.6380957989999843,
                     1.5802785819998917,
                     1.5376337459999831,
                     1.7884727590001148,
                     1.7397616020000441,
                     1.7496962650000114,
                     1.6605698180001127],
 'polars[eager]': [0.9160442119998606,
                   0.8955544509999527,
                   0.8863846530000501,
                   0.8829364579999037,
                   0.8918134509999618,
                   0.8924379529998987,
                   0.8672452630000862],
 'polars[lazy]': [1.1258213609999075,
                  1.4064464999999018,
                  1.046419743999877,
                  1.0376091739999538,
                  1.043019643999969,
                  1.02965974