In [None]:
!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals 

In [None]:
import pandas as pd
import polars as pl

pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True

In [None]:
from typing import Any
import narwhals as nw

def q18(
    customer_ds_raw: Any,
    lineitem_ds_raw: Any,
    orders_ds_raw: Any
) -> Any:
    """Find the highest-priced orders whose total line-item quantity exceeds 300.

    Each input is wrapped with ``nw.from_native`` so the same query runs on
    any backend Narwhals accepts, and the result is unwrapped again with
    ``nw.to_native`` before being returned.

    Args:
        customer_ds_raw: Native frame providing ``c_custkey`` and ``c_name``.
        lineitem_ds_raw: Native frame providing ``l_orderkey`` and
            ``l_quantity``.
        orders_ds_raw: Native frame providing ``o_orderkey``, ``o_custkey``,
            ``o_orderdate`` and ``o_totalprice``.

    Returns:
        A native frame with the columns ``c_name``, ``c_custkey``,
        ``o_orderkey``, ``o_orderdat``, ``o_totalprice`` and ``col6`` (the
        summed ``l_quantity`` per group), sorted by ``o_totalprice``
        descending then ``o_orderdat`` ascending, limited to 100 rows.
    """
    customers = nw.from_native(customer_ds_raw)
    line_items = nw.from_native(lineitem_ds_raw)
    orders = nw.from_native(orders_ds_raw)

    # Only orders whose summed quantity is strictly above this value are kept.
    min_total_quantity = 300

    # Order keys whose line items add up to more than the threshold.
    large_order_keys = (
        line_items.group_by("l_orderkey")
        .agg(nw.col("l_quantity").sum().alias("sum_quantity"))
        .filter(nw.col("sum_quantity") > min_total_quantity)
    )

    # The semi join keeps the matching orders without adding columns.
    large_orders = orders.join(
        large_order_keys, left_on="o_orderkey", right_on="l_orderkey", how="semi"
    )

    # Attach line items and customer details to the surviving orders.
    with_line_items = large_orders.join(
        line_items, left_on="o_orderkey", right_on="l_orderkey"
    )
    with_customers = with_line_items.join(
        customers, left_on="o_custkey", right_on="c_custkey"
    )

    quantity_per_order = with_customers.group_by(
        "c_name", "o_custkey", "o_orderkey", "o_orderdate", "o_totalprice"
    ).agg(nw.col("l_quantity").sum().alias("col6"))

    # NOTE(review): the output name "o_orderdat" (no trailing "e") looks
    # deliberate, presumably to match a reference answer file -- confirm
    # before renaming, since callers see this column name.
    projected = quantity_per_order.select(
        nw.col("c_name"),
        nw.col("o_custkey").alias("c_custkey"),
        nw.col("o_orderkey"),
        nw.col("o_orderdate").alias("o_orderdat"),
        nw.col("o_totalprice"),
        nw.col("col6"),
    )

    top_orders = projected.sort(
        by=["o_totalprice", "o_orderdat"], descending=[True, False]
    ).head(100)

    return nw.to_native(top_orders)

In [None]:
# Directory holding the parquet tables, and the full path of each table that
# the benchmark cells read.
dir_ = "/kaggle/input/tpc-h-data-parquet-s-2/"
customer, lineitem, orders = (
    dir_ + file_name
    for file_name in ('customer.parquet', 'lineitem.parquet', 'orders.parquet')
)

In [None]:
def _read_pandas(x):
    """Read the parquet file at path ``x`` with pandas via the pyarrow engine."""
    return pd.read_parquet(x, engine='pyarrow')


def _read_pandas_pyarrow(x):
    """Read the parquet file at path ``x`` with pandas, using pyarrow-backed dtypes."""
    return pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow')


def _read_polars_eager(x):
    """Read the parquet file at path ``x`` with ``pl.read_parquet``."""
    return pl.read_parquet(x)


def _read_polars_lazy(x):
    """Open the parquet file at path ``x`` with ``pl.scan_parquet``."""
    return pl.scan_parquet(x)


# Backend label -> one-argument loader that turns a parquet path into that
# backend's native frame. The labels are reused as keys of the timing results.
IO_FUNCS = {
    'pandas': _read_pandas,
    'pandas[pyarrow]': _read_pandas_pyarrow,
    'polars[eager]': _read_polars_eager,
    'polars[lazy]': _read_polars_lazy,
}

In [None]:
# Timings collected by the benchmark cells, keyed by backend label; each value
# is the `all_runs` attribute of that cell's `%timeit -o` result.
results = dict()

## pandas via Narwhals

In [None]:
tool = 'pandas'
fn = IO_FUNCS[tool]
timings = %timeit -o q19(fn(lineitem), fn(part))
results[tool] = timings.all_runs

## pandas, pyarrow dtypes, via Narwhals

In [None]:
tool = 'pandas[pyarrow]'
fn = IO_FUNCS[tool]
# Time q18 on pandas frames read with dtype_backend='pyarrow'. The parquet
# reads happen inside the timed statement, so file loading is measured too.
timings = %timeit -o q18(fn(customer), fn(lineitem), fn(orders))
# Keep the raw per-run timings under this backend's label.
results[tool] = timings.all_runs

## Polars read_parquet

In [None]:
tool = 'polars[eager]'
fn = IO_FUNCS[tool]
# Time q18 on frames loaded with pl.read_parquet. The parquet reads happen
# inside the timed statement, so file loading is measured too.
timings = %timeit -o q18(fn(customer), fn(lineitem), fn(orders))
# Keep the raw per-run timings under this backend's label.
results[tool] = timings.all_runs

## Polars scan_parquet

In [None]:
tool = 'polars[lazy]'
fn = IO_FUNCS[tool]
# Time q18 on inputs opened with pl.scan_parquet. `.collect()` is part of the
# timed statement, so the measurement covers materializing the result rather
# than only building the query.
timings = %timeit -o q18(fn(customer), fn(lineitem), fn(orders)).collect()
# Keep the raw per-run timings under this backend's label.
results[tool] = timings.all_runs

## Save

In [None]:
import json
# Write the collected timings to results.json in the working directory.
with open('results.json', 'w') as out_file:
    out_file.write(json.dumps(results))
