In [None]:
!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals dask[dataframe]

In [None]:
import pandas as pd
import polars as pl

pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True

In [None]:
from datetime import datetime
from typing import Any

import narwhals as nw


@nw.narwhalify
def q1(lineitem_ds: Any) -> Any:
    """Build the pricing summary (query ``q1``) from a line-item frame.

    Rows whose ``l_shipdate`` is on or before 1998-09-02 are kept. For each
    remaining row a discounted price and a discounted-plus-tax charge are
    derived, and the rows are then summarised per
    (``l_returnflag``, ``l_linestatus``) pair.

    Args:
        lineitem_ds: Line-item table. ``nw.narwhalify`` wraps the native
            object, so inside the function it is used through the narwhals API.

    Returns:
        One row per (``l_returnflag``, ``l_linestatus``) group, sorted by those
        two columns, with the columns ``sum_qty``, ``sum_base_price``,
        ``sum_disc_price``, ``sum_charge``, ``avg_qty``, ``avg_price``,
        ``avg_disc`` and ``count_order``.
    """
    ship_cutoff = datetime(1998, 9, 2)

    # Column expressions that are reused in several places below.
    price = nw.col("l_extendedprice")
    discount = nw.col("l_discount")
    tax = nw.col("l_tax")
    quantity = nw.col("l_quantity")

    shipped = lineitem_ds.filter(nw.col("l_shipdate") <= ship_cutoff)

    # Per-row derived amounts that feed the aggregation.
    priced = shipped.with_columns(
        disc_price=price * (1 - discount),
        charge=price * (1.0 - discount) * (1.0 + tax),
    )

    summary = priced.group_by("l_returnflag", "l_linestatus").agg(
        quantity.sum().alias("sum_qty"),
        price.sum().alias("sum_base_price"),
        nw.col("disc_price").sum().alias("sum_disc_price"),
        nw.col("charge").sum().alias("sum_charge"),
        quantity.mean().alias("avg_qty"),
        price.mean().alias("avg_price"),
        discount.mean().alias("avg_disc"),
        nw.len().alias("count_order"),
    )
    return summary.sort("l_returnflag", "l_linestatus")

In [None]:
# Root folder of the Parquet dataset; it ends with a slash so file names can be
# appended directly.
dir_ = "/kaggle/input/tpc-h-data-parquet-s-2/"

# One Parquet file per table.
region = f"{dir_}region.parquet"
nation = f"{dir_}nation.parquet"
customer = f"{dir_}customer.parquet"
lineitem = f"{dir_}lineitem.parquet"
orders = f"{dir_}orders.parquet"
supplier = f"{dir_}supplier.parquet"
part = f"{dir_}part.parquet"
partsupp = f"{dir_}partsupp.parquet"

In [None]:
import dask.dataframe as dd
import pyarrow.parquet as pq

def _read_pandas(x):
    """Read a Parquet file with pandas using the pyarrow engine."""
    return pd.read_parquet(x, engine="pyarrow")


def _read_pandas_pyarrow(x):
    """Read a Parquet file with pandas, requesting the pyarrow dtype backend."""
    return pd.read_parquet(x, engine="pyarrow", dtype_backend="pyarrow")


def _read_polars_eager(x):
    """Read a Parquet file with ``pl.read_parquet``."""
    return pl.read_parquet(x)


def _scan_polars_lazy(x):
    """Open a Parquet file with ``pl.scan_parquet``."""
    return pl.scan_parquet(x)


def _read_pyarrow(x):
    """Read a Parquet file with ``pq.read_table``."""
    return pq.read_table(x)


def _read_dask(x):
    """Read a Parquet file with Dask, requesting the pyarrow dtype backend."""
    return dd.read_parquet(x, engine="pyarrow", dtype_backend="pyarrow")


# Backend label -> loader taking a Parquet path and returning that backend's
# native object.
IO_FUNCS = {
    "pandas": _read_pandas,
    "pandas[pyarrow]": _read_pandas_pyarrow,
    "polars[eager]": _read_polars_eager,
    "polars[lazy]": _scan_polars_lazy,
    "pyarrow": _read_pyarrow,
    "dask": _read_dask,
}

In [None]:
# Backend label -> the `all_runs` list of that backend's %timeit result.
results = dict()

## PyArrow Table

In [None]:
# Time q1 on the table returned by the "pyarrow" loader.
tool = "pyarrow"
fn = IO_FUNCS[tool]
# The Parquet read sits inside the timed statement, so I/O is part of the measurement.
# -o returns the timing result object; -q suppresses the printed summary.
timings = %timeit -o -q q1(fn(lineitem))
# Store the timings of all runs under this backend's label.
results[tool] = timings.all_runs

## pandas

In [None]:
# Time q1 on the frame returned by the "pandas" loader.
tool = "pandas"
fn = IO_FUNCS[tool]
# The Parquet read sits inside the timed statement, so I/O is part of the measurement.
# -o returns the timing result object; -q suppresses the printed summary.
timings = %timeit -o -q q1(lineitem_ds=fn(lineitem))
# Store the timings of all runs under this backend's label.
results[tool] = timings.all_runs

## pandas (PyArrow dtypes)

In [None]:
# Time q1 on the frame returned by the "pandas[pyarrow]" loader.
tool = "pandas[pyarrow]"
fn = IO_FUNCS[tool]
# The Parquet read sits inside the timed statement, so I/O is part of the measurement.
# -o returns the timing result object; -q suppresses the printed summary.
timings = %timeit -o -q q1(fn(lineitem))
# Store the timings of all runs under this backend's label.
results[tool] = timings.all_runs

## Polars read_parquet

In [None]:
# Time q1 on the frame returned by the "polars[eager]" loader.
tool = "polars[eager]"
fn = IO_FUNCS[tool]
# The Parquet read sits inside the timed statement, so I/O is part of the measurement.
# -o returns the timing result object; -q suppresses the printed summary.
timings = %timeit -o -q q1(fn(lineitem))
# Store the timings of all runs under this backend's label.
results[tool] = timings.all_runs

## Polars scan_parquet

In [None]:
# Time q1 on the lazy frame returned by the "polars[lazy]" loader.
tool = "polars[lazy]"
fn = IO_FUNCS[tool]
# `.collect()` is part of the timed statement so the lazy query is actually executed.
# -o returns the timing result object; -q suppresses the printed summary.
timings = %timeit -o -q q1(fn(lineitem)).collect()
# Store the timings of all runs under this backend's label.
results[tool] = timings.all_runs

## Dask DataFrame

In [None]:
tool = "dask"
fn = IO_FUNCS[tool]
timings = %timeit -o -q q1(fn(lineitem)).collect()
results[tool] = timings.all_runs

## Save

In [None]:
import json

# Serialise the collected timings to JSON and write them to `results.json`.
with open("results.json", "w") as out_file:
    out_file.write(json.dumps(results))