In [None]:
!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals 

In [None]:
import pandas as pd
import polars as pl

pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True

In [None]:
from typing import Any
import narwhals as nw

def q22(
    customer_ds_raw: Any,
    orders_ds_raw: Any,
) -> Any:
    """Summarise high-balance customers without orders, per phone country code.

    Both inputs are wrapped with ``nw.from_native`` so the same query runs on
    every backend Narwhals supports; the result is unwrapped again with
    ``nw.to_native`` before being returned.

    Steps:
      1. Derive ``cntrycode`` from the first two characters of ``c_phone`` and
         keep the customers whose code matches one of
         13, 31, 23, 29, 30, 18, 17.
      2. Compute the mean ``c_acctbal`` of those customers with a positive balance.
      3. Drop customers whose ``c_custkey`` appears as an ``o_custkey`` in orders.
      4. Keep customers whose balance exceeds the mean from step 2, then count
         them and sum their balances (rounded to 2 decimals) per ``cntrycode``.

    Args:
        customer_ds_raw: Native customer frame with at least ``c_phone``,
            ``c_acctbal`` and ``c_custkey`` columns.
        orders_ds_raw: Native orders frame with at least an ``o_custkey`` column.

    Returns:
        The native result with columns ``cntrycode``, ``numcust`` and
        ``totacctbal``, sorted by ``cntrycode``.
    """
    customers_nw = nw.from_native(customer_ds_raw)
    orders_nw = nw.from_native(orders_ds_raw)

    # Column expressions that are used in several stages below.
    acctbal = nw.col("c_acctbal")
    order_custkey = nw.col("o_custkey")

    # Stage 1: customers in the selected country codes.
    with_code = customers_nw.with_columns(
        nw.col("c_phone").str.slice(0, 2).alias("cntrycode")
    )
    in_scope = with_code.filter(
        nw.col("cntrycode").str.contains("13|31|23|29|30|18|17")
    )
    candidates = in_scope.select("c_acctbal", "c_custkey", "cntrycode")

    # Stage 2: single-row frame holding the mean of the positive balances.
    positive_balances = candidates.filter(acctbal > 0.0)
    avg_balance = positive_balances.select(acctbal.mean().alias("avg_acctbal"))

    # Stage 3: distinct customer keys that have at least one order; the key is
    # duplicated as c_custkey so o_custkey survives the join as a null marker.
    distinct_keys = orders_nw.select(order_custkey.unique())
    customers_with_orders = distinct_keys.with_columns(
        order_custkey.alias("c_custkey")
    )

    # Stage 4: left join + null filter keeps only customers without orders.
    joined = candidates.join(
        customers_with_orders,
        left_on="c_custkey",
        right_on="c_custkey",
        how="left",
    )
    without_orders = joined.filter(order_custkey.is_null())

    # The cross join attaches the one-row average to every remaining customer.
    with_average = without_orders.join(avg_balance, how="cross")
    above_average = with_average.filter(acctbal > nw.col("avg_acctbal"))

    per_country = above_average.group_by("cntrycode").agg(
        acctbal.count().alias("numcust"),
        acctbal.sum().round(2).alias("totacctbal"),
    )
    ordered = per_country.sort("cntrycode")

    return nw.to_native(ordered)

In [None]:
# Directory holding the benchmark's parquet inputs, and the two tables q22 reads.
dir_ = "/kaggle/input/tpc-h-data-parquet-s-2/"
customer, orders = (f"{dir_}{table}.parquet" for table in ("customer", "orders"))

In [None]:
def _read_pandas(path):
    """Read a parquet file into a pandas DataFrame using the pyarrow engine."""
    return pd.read_parquet(path, engine='pyarrow')


def _read_pandas_pyarrow(path):
    """Read a parquet file into a pandas DataFrame backed by pyarrow dtypes."""
    return pd.read_parquet(path, engine='pyarrow', dtype_backend='pyarrow')


def _read_polars_eager(path):
    """Read a parquet file eagerly with Polars."""
    return pl.read_parquet(path)


def _scan_polars_lazy(path):
    """Create a lazy Polars scan over a parquet file."""
    return pl.scan_parquet(path)


# Backend label -> one-argument reader that turns a parquet path into that
# backend's native frame.
IO_FUNCS = {
    'pandas': _read_pandas,
    'pandas[pyarrow]': _read_pandas_pyarrow,
    'polars[eager]': _read_polars_eager,
    'polars[lazy]': _scan_polars_lazy,
}

In [None]:
# Collected measurements: backend label -> timings recorded by %timeit.
results = dict()

## pandas via Narwhals

In [None]:
tool = 'pandas'
fn = IO_FUNCS[tool]
timings = %timeit -o q22(fn(customer), fn(orders))
results[tool] = timings.all_runs

## pandas, pyarrow dtypes, via Narwhals

In [None]:
tool = 'pandas[pyarrow]'
fn = IO_FUNCS[tool]
timings = %timeit -o q22(fn(customer), fn(orders))
results[tool] = timings.all_runs

## Polars read_parquet

In [None]:
tool = 'polars[eager]'
fn = IO_FUNCS[tool]
timings = %timeit -o q22(fn(customer), fn(orders))
results[tool] = timings.all_runs

## Polars scan_parquet

In [None]:
tool = 'polars[lazy]'
fn = IO_FUNCS[tool]
timings = %timeit -o q22(fn(customer), fn(orders)).collect()
results[tool] = timings.all_runs

## Save

In [None]:
import json
# Persist the collected timings as JSON in the working directory.
with open('results.json', 'w') as results_file:
    json.dump(results, fp=results_file)
