In [None]:
!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals 

In [None]:
import pandas as pd
import polars as pl

pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True

In [None]:
from typing import Any
import narwhals as nw
from datetime import datetime

def q20(
    part_ds_raw: Any,
    partsupp_ds_raw: Any,
    nation_ds_raw: Any,
    lineitem_ds_raw: Any,
    supplier_ds_raw: Any
) -> Any:
    """Find suppliers in CANADA holding excess stock of "forest" parts.

    A supplier is kept when, for at least one part whose name starts with
    ``"forest"``, its ``ps_availqty`` is greater than half of the quantity it
    shipped for that part with ``l_shipdate`` in ``[1994-01-01, 1995-01-01)``.

    Args:
        part_ds_raw: Native frame providing ``p_partkey`` and ``p_name``.
        partsupp_ds_raw: Native frame providing ``ps_partkey``, ``ps_suppkey``
            and ``ps_availqty``.
        nation_ds_raw: Native frame providing ``n_nationkey`` and ``n_name``.
        lineitem_ds_raw: Native frame providing ``l_partkey``, ``l_suppkey``,
            ``l_quantity`` and ``l_shipdate``.
        supplier_ds_raw: Native frame providing ``s_suppkey``, ``s_nationkey``,
            ``s_name`` and ``s_address``.

    Returns:
        The result converted back with ``nw.to_native``: columns ``s_name`` and
        ``s_address`` sorted by ``s_name``. Lazy inputs give a lazy result that
        the caller still has to collect.
    """
    part_ds = nw.from_native(part_ds_raw)
    nation_ds = nw.from_native(nation_ds_raw)
    partsupp_ds = nw.from_native(partsupp_ds_raw)
    lineitem_ds = nw.from_native(lineitem_ds_raw)
    supplier_ds = nw.from_native(supplier_ds_raw)

    ship_from = datetime(1994, 1, 1)
    ship_until = datetime(1995, 1, 1)  # exclusive bound (closed="left")
    nation_name = "CANADA"
    part_name_prefix = "forest"

    # Half of the quantity shipped per (part, supplier) inside the date window.
    half_shipped = (
        lineitem_ds.filter(
            nw.col("l_shipdate").is_between(ship_from, ship_until, closed="left")
        )
        .group_by("l_partkey", "l_suppkey")
        .agg((nw.col("l_quantity").sum()).alias("sum_quantity"))
        .with_columns(sum_quantity=nw.col("sum_quantity") * 0.5)
    )
    target_nation = nation_ds.filter(nw.col("n_name") == nation_name)
    nation_suppliers = supplier_ds.join(
        target_nation, left_on="s_nationkey", right_on="n_nationkey"
    )

    # De-duplicate with the frame-level `unique` instead of
    # `select(nw.col(...).unique())`: a length-changing expression inside
    # `select` is not accepted for lazy frames by recent narwhals releases,
    # whereas `.unique(subset)` works for eager and lazy frames alike.
    result = (
        part_ds.filter(nw.col("p_name").str.starts_with(part_name_prefix))
        .select("p_partkey")
        .unique("p_partkey")
        .join(partsupp_ds, left_on="p_partkey", right_on="ps_partkey")
        .join(
            half_shipped,
            left_on=["ps_suppkey", "p_partkey"],
            right_on=["l_suppkey", "l_partkey"],
        )
        .filter(nw.col("ps_availqty") > nw.col("sum_quantity"))
        .select("ps_suppkey")
        .unique("ps_suppkey")
        .join(nation_suppliers, left_on="ps_suppkey", right_on="s_suppkey")
        .select("s_name", "s_address")
        .sort("s_name")
    )

    return nw.to_native(result)

In [None]:
# Directory holding the parquet inputs, and one path per table read below.
dir_ = "/kaggle/input/tpc-h-data-parquet-s-2/"
nation, lineitem, supplier, part, partsupp = (
    f"{dir_}{table_name}.parquet"
    for table_name in ("nation", "lineitem", "supplier", "part", "partsupp")
)

In [None]:
def _load_pandas(path):
    """Read a parquet file into pandas using the pyarrow engine."""
    return pd.read_parquet(path, engine='pyarrow')


def _load_pandas_pyarrow(path):
    """Read a parquet file into pandas with the pyarrow dtype backend."""
    return pd.read_parquet(path, engine='pyarrow', dtype_backend='pyarrow')


def _load_polars_eager(path):
    """Read a parquet file with ``pl.read_parquet``."""
    return pl.read_parquet(path)


def _load_polars_lazy(path):
    """Open a parquet file with ``pl.scan_parquet``."""
    return pl.scan_parquet(path)


# Backend label -> loader taking a single parquet path.
IO_FUNCS = {
    'pandas': _load_pandas,
    'pandas[pyarrow]': _load_pandas_pyarrow,
    'polars[eager]': _load_polars_eager,
    'polars[lazy]': _load_polars_lazy,
}

In [None]:
# Timings gathered by the benchmark cells below, keyed by backend label.
results = dict()

## pandas via Narwhals

In [None]:
# Benchmark q20 with the loader registered under 'pandas'. The parquet reads
# are part of the timed statement, so each run covers loading plus the query.
tool = 'pandas'
fn = IO_FUNCS[tool]
# -o makes %timeit hand back its result object so it can be stored below.
timings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))
results[tool] = timings.all_runs

## pandas, pyarrow dtypes, via Narwhals

In [None]:
# Benchmark q20 with the loader registered under 'pandas[pyarrow]'. The parquet
# reads are part of the timed statement, so each run covers loading plus the query.
tool = 'pandas[pyarrow]'
fn = IO_FUNCS[tool]
# -o makes %timeit hand back its result object so it can be stored below.
timings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))
results[tool] = timings.all_runs

## Polars read_parquet

In [None]:
# Benchmark q20 with the loader registered under 'polars[eager]'. The parquet
# reads are part of the timed statement, so each run covers loading plus the query.
tool = 'polars[eager]'
fn = IO_FUNCS[tool]
# -o makes %timeit hand back its result object so it can be stored below.
timings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))
results[tool] = timings.all_runs

## Polars scan_parquet

In [None]:
# Benchmark q20 with the loader registered under 'polars[lazy]'.
tool = 'polars[lazy]'
fn = IO_FUNCS[tool]
# .collect() is inside the timed statement so that the lazy query is actually
# executed in every run; -o makes %timeit hand back its result object.
timings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier)).collect()
results[tool] = timings.all_runs

## Save

In [None]:
import json
# Write the collected timings to results.json in the working directory.
with open('results.json', 'w') as out_file:
    out_file.write(json.dumps(results))
