In [1]:
!pip install -U narwhals>=0.7.2

In [2]:
# Remove all conda packages
!find /opt/conda \( -name "cudf*" -o -name "libcudf*" -o -name "cuml*" -o -name "libcuml*" \
                   -o -name "cugraph*" -o -name "libcugraph*" -o -name "raft*" -o -name "libraft*" \
                   -o -name "pylibraft*" -o -name "libkvikio*" -o -name "*dask*" -o -name "rmm*"\
                   -o -name "librmm*" \) -exec rm -rf {} \; 2>/dev/null

# pip uninstall, just incase there are packages lying around
!pip uninstall cudf cuml dask-cudf cuml cugraph cupy cupy-cuda12x --y


!pip install \
    --extra-index-url=https://pypi.nvidia.com \
    cudf-cu12==24.2.* \
    dask-cudf-cu12==24.2.* \
    cuml-cu12==24.2.* \
    cugraph-cu12==24.2.*



[0mFound existing installation: cupy 13.0.0
Uninstalling cupy-13.0.0:
  Successfully uninstalled cupy-13.0.0
[0mLooking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cudf-cu12==24.2.*
  Downloading https://pypi.nvidia.com/cudf-cu12/cudf_cu12-24.2.2-cp310-cp310-manylinux_2_28_x86_64.whl (464.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m464.7/464.7 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dask-cudf-cu12==24.2.*
  Downloading https://pypi.nvidia.com/dask-cudf-cu12/dask_cudf_cu12-24.2.2-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cuml-cu12==24.2.*
  Downloading https://pypi.nvidia.com/cuml-cu12/cuml_cu12-24.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1074.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 GB[0m [31m973.2 kB/s[0m eta [36m0:

In [3]:
import cudf

In [4]:
from typing import Any
from datetime import datetime
import narwhals as nw

def q1(df_raw: Any) -> Any:
    """Summarise line items per (l_returnflag, l_linestatus) pair.

    Rows with ``l_shipdate`` on or before 1998-09-02 are kept, the discounted
    price and the charge (discounted price plus tax) are derived, and the
    sums, means and row count are aggregated per group.

    Args:
        df_raw: Native dataframe accepted by ``nw.from_native``. The query
            reads the columns ``l_shipdate``, ``l_extendedprice``,
            ``l_discount``, ``l_tax``, ``l_quantity``, ``l_returnflag`` and
            ``l_linestatus``.

    Returns:
        The aggregated frame, converted back with ``nw.to_native`` and sorted
        by ``l_returnflag`` then ``l_linestatus``.
    """
    ship_cutoff = datetime(1998, 9, 2)
    group_keys = ["l_returnflag", "l_linestatus"]

    line_items = nw.from_native(df_raw)
    shipped = line_items.filter(nw.col("l_shipdate") <= ship_cutoff)

    # Derived per-row amounts that the aggregation below sums up.
    priced = shipped.with_columns(
        disc_price=nw.col("l_extendedprice") * (1 - nw.col("l_discount")),
        charge=(
            nw.col("l_extendedprice")
            * (1.0 - nw.col("l_discount"))
            * (1.0 + nw.col("l_tax"))
        ),
    )

    aggregates = [
        nw.sum("l_quantity").alias("sum_qty"),
        nw.sum("l_extendedprice").alias("sum_base_price"),
        nw.sum("disc_price").alias("sum_disc_price"),
        nw.col("charge").sum().alias("sum_charge"),
        nw.mean("l_quantity").alias("avg_qty"),
        nw.mean("l_extendedprice").alias("avg_price"),
        nw.mean("l_discount").alias("avg_disc"),
        nw.len().alias("count_order"),
    ]
    summary = priced.group_by(group_keys).agg(aggregates).sort(group_keys)
    return nw.to_native(summary)

In [5]:


from typing import Any
from datetime import datetime
import narwhals as nw

def q2(
    region_ds_raw: Any,
    nation_ds_raw: Any,
    supplier_ds_raw: Any,
    part_ds_raw: Any,
    part_supp_ds_raw: Any,
) -> Any:
    """Find, per part, the supplier rows offering the minimum supply cost.

    Parts are restricted to ``p_size == 15``, a ``p_type`` ending in
    ``"BRASS"`` and suppliers whose region name is ``"EUROPE"``. For each
    remaining ``p_partkey`` the minimum ``ps_supplycost`` is computed and
    joined back to pick the rows that achieve it.

    Args:
        region_ds_raw: Native region table (``r_regionkey``, ``r_name``).
        nation_ds_raw: Native nation table (``n_nationkey``, ``n_regionkey``,
            ``n_name``).
        supplier_ds_raw: Native supplier table (``s_*`` columns).
        part_ds_raw: Native part table (``p_*`` columns).
        part_supp_ds_raw: Native part-supplier table (``ps_*`` columns).

    Returns:
        Native frame with the first 100 rows after sorting by ``s_acctbal``
        descending, then ``n_name``, ``s_name`` and ``p_partkey`` ascending.
    """
    wanted_size = 15
    type_suffix = "BRASS"
    region_name = "EUROPE"

    region_ds = nw.from_native(region_ds_raw)
    nation_ds = nw.from_native(nation_ds_raw)
    supplier_ds = nw.from_native(supplier_ds_raw)
    part_ds = nw.from_native(part_ds_raw)
    part_supp_ds = nw.from_native(part_supp_ds_raw)

    # part -> partsupp -> supplier -> nation -> region
    joined = (
        part_ds.join(part_supp_ds, left_on="p_partkey", right_on="ps_partkey")
        .join(supplier_ds, left_on="ps_suppkey", right_on="s_suppkey")
        .join(nation_ds, left_on="s_nationkey", right_on="n_nationkey")
        .join(region_ds, left_on="n_regionkey", right_on="r_regionkey")
    )
    candidates = (
        joined.filter(nw.col("p_size") == wanted_size)
        .filter(nw.col("p_type").str.ends_with(type_suffix))
        .filter(nw.col("r_name") == region_name)
    )

    # Cheapest offer per part, then join back to recover the supplier details.
    cheapest = candidates.group_by("p_partkey").agg(
        nw.min("ps_supplycost").alias("ps_supplycost")
    )
    cheapest_offers = cheapest.join(
        candidates,
        left_on=["p_partkey", "ps_supplycost"],
        right_on=["p_partkey", "ps_supplycost"],
    )

    output_cols = [
        "s_acctbal",
        "s_name",
        "n_name",
        "p_partkey",
        "p_mfgr",
        "s_address",
        "s_phone",
        "s_comment",
    ]
    ranked = cheapest_offers.select(output_cols).sort(
        by=["s_acctbal", "n_name", "s_name", "p_partkey"],
        descending=[True, False, False, False],
    )
    return nw.to_native(ranked.head(100))

In [6]:
from typing import Any
from datetime import datetime
import narwhals as nw

def q3(
    customer_ds_raw: Any,
    line_item_ds_raw: Any,
    orders_ds_raw: Any,
) -> Any:
    """Return the ten highest-revenue orders for the "BUILDING" segment.

    Customers with ``c_mktsegment == "BUILDING"`` are joined to their orders
    and line items. Only orders placed before 1995-03-15 with line items
    shipped after 1995-03-15 are kept. Revenue is
    ``l_extendedprice * (1 - l_discount)`` summed per
    (``o_orderkey``, ``o_orderdate``, ``o_shippriority``).

    Args:
        customer_ds_raw: Native customer table.
        line_item_ds_raw: Native line-item table.
        orders_ds_raw: Native orders table.

    Returns:
        Native frame with columns ``l_orderkey``, ``revenue``,
        ``o_orderdate`` and ``o_shippriority``; the first 10 rows after
        sorting by revenue descending, then order date ascending.
    """
    # The same date bounds both the order date (before) and ship date (after).
    cutoff = datetime(1995, 3, 15)
    segment = "BUILDING"

    customer_ds = nw.from_native(customer_ds_raw)
    line_item_ds = nw.from_native(line_item_ds_raw)
    orders_ds = nw.from_native(orders_ds_raw)

    revenue = (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("revenue")

    segment_items = (
        customer_ds.filter(nw.col("c_mktsegment") == segment)
        .join(orders_ds, left_on="c_custkey", right_on="o_custkey")
        .join(line_item_ds, left_on="o_orderkey", right_on="l_orderkey")
        .filter(nw.col("o_orderdate") < cutoff)
        .filter(nw.col("l_shipdate") > cutoff)
        .with_columns(revenue)
    )

    per_order = segment_items.group_by(
        ["o_orderkey", "o_orderdate", "o_shippriority"]
    ).agg([nw.sum("revenue")])

    # The order key is exposed under the line-item column name in the output.
    output_cols = [
        nw.col("o_orderkey").alias("l_orderkey"),
        "revenue",
        "o_orderdate",
        "o_shippriority",
    ]
    ranked = per_order.select(output_cols).sort(
        by=["revenue", "o_orderdate"], descending=[True, False]
    )
    return nw.to_native(ranked.head(10))

In [7]:


from typing import Any
from datetime import datetime
import narwhals as nw

def q4(
    lineitem_ds_raw: Any,
    orders_ds_raw: Any,
) -> Any:
    """Count, per order priority, the orders with a late-received line item.

    Line items are joined to their orders, restricted to orders dated in
    [1993-07-01, 1993-10-01) and to line items whose ``l_commitdate`` is
    earlier than ``l_receiptdate``. Each (``o_orderpriority``,
    ``l_orderkey``) pair is then counted once.

    Args:
        lineitem_ds_raw: Native line-item table.
        orders_ds_raw: Native orders table.

    Returns:
        Native frame with ``o_orderpriority`` and ``order_count`` (cast to
        ``Int64``), sorted by ``o_orderpriority``.
    """
    window_start = datetime(1993, 7, 1)
    window_end = datetime(1993, 10, 1)

    line_item_ds = nw.from_native(lineitem_ds_raw)
    orders_ds = nw.from_native(orders_ds_raw)

    # closed="left" keeps window_start and excludes window_end.
    in_window = nw.col("o_orderdate").is_between(window_start, window_end, closed="left")
    received_late = nw.col("l_commitdate") < nw.col("l_receiptdate")

    late_orders = (
        line_item_ds.join(orders_ds, left_on="l_orderkey", right_on="o_orderkey")
        .filter(in_window)
        .filter(received_late)
        .unique(subset=["o_orderpriority", "l_orderkey"])
    )
    counts = late_orders.group_by("o_orderpriority").agg(nw.len().alias("order_count"))
    ordered = counts.sort(by="o_orderpriority")
    typed = ordered.with_columns(nw.col("order_count").cast(nw.Int64))

    return nw.to_native(typed)

In [8]:
from typing import Any
from datetime import datetime
import narwhals as nw

def q5(
    region_ds_raw: Any,
    nation_ds_raw: Any,
    customer_ds_raw: Any,
    lineitem_ds_raw: Any,
    orders_ds_raw: Any,
    supplier_ds_raw: Any,
) -> Any:
    """Total revenue per nation for the "ASIA" region in 1994.

    Regions are joined through nations, customers, orders and line items to
    suppliers; the supplier join also requires the supplier's nation to equal
    the customer's nation. Rows are kept when ``r_name == "ASIA"`` and
    ``o_orderdate`` lies in [1994-01-01, 1995-01-01). Revenue is
    ``l_extendedprice * (1 - l_discount)`` summed per ``n_name``.

    Args:
        region_ds_raw: Native region table.
        nation_ds_raw: Native nation table.
        customer_ds_raw: Native customer table.
        lineitem_ds_raw: Native line-item table.
        orders_ds_raw: Native orders table.
        supplier_ds_raw: Native supplier table.

    Returns:
        Native frame with ``n_name`` and ``revenue``, sorted by revenue
        descending.
    """
    region_name = "ASIA"
    period_start = datetime(1994, 1, 1)
    period_end = datetime(1995, 1, 1)

    region_ds = nw.from_native(region_ds_raw)
    nation_ds = nw.from_native(nation_ds_raw)
    customer_ds = nw.from_native(customer_ds_raw)
    line_item_ds = nw.from_native(lineitem_ds_raw)
    orders_ds = nw.from_native(orders_ds_raw)
    supplier_ds = nw.from_native(supplier_ds_raw)

    joined = (
        region_ds.join(nation_ds, left_on="r_regionkey", right_on="n_regionkey")
        .join(customer_ds, left_on="n_nationkey", right_on="c_nationkey")
        .join(orders_ds, left_on="c_custkey", right_on="o_custkey")
        .join(line_item_ds, left_on="o_orderkey", right_on="l_orderkey")
        # Two-key join: the supplier must sit in the same nation as the customer.
        .join(
            supplier_ds,
            left_on=["l_suppkey", "n_nationkey"],
            right_on=["s_suppkey", "s_nationkey"],
        )
    )

    # closed="left" keeps period_start and excludes period_end.
    selected = joined.filter(
        nw.col("r_name") == region_name,
        nw.col("o_orderdate").is_between(period_start, period_end, closed="left"),
    )

    revenue = (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("revenue")
    per_nation = selected.with_columns(revenue).group_by("n_name").agg([nw.sum("revenue")])
    ranked = per_nation.sort(by="revenue", descending=True)

    return nw.to_native(ranked)

In [9]:
# Directory holding the parquet inputs; the trailing slash is part of the value.
dir_ = "/kaggle/input/tpc-h-data-parquet-s-2/"

# One path per table, each built as "<dir_><table>.parquet".
region, nation, customer, lineitem, orders, supplier, part, partsupp = (
    f"{dir_}{table_name}.parquet"
    for table_name in (
        "region",
        "nation",
        "customer",
        "lineitem",
        "orders",
        "supplier",
        "part",
        "partsupp",
    )
)

In [10]:
# Accumulator for benchmark timings, keyed by query name ('q1', 'q2', ...).
results = dict()

In [11]:
import cudf
# Reader used to load the parquet input with cuDF.
fn = cudf.read_parquet
# The parquet read sits inside the timed expression, so the measurement covers
# loading plus the query. `-o` makes %timeit return its result object.
timings = %timeit -o q1(fn(lineitem))
# Store the result object's `all_runs` list under this query's key.
results['q1'] = timings.all_runs

get_mempolicy: Operation not permitted


560 ms ± 7.35 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
import cudf
# Reader used to load the parquet inputs with cuDF.
fn = cudf.read_parquet
# All five parquet reads sit inside the timed expression, so the measurement
# covers loading plus the query. `-o` makes %timeit return its result object.
timings = %timeit -o q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))
# Store the result object's `all_runs` list under this query's key.
results['q2'] = timings.all_runs

538 ms ± 54.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
import cudf
# Reader used to load the parquet inputs with cuDF.
fn = cudf.read_parquet
# The three parquet reads sit inside the timed expression, so the measurement
# covers loading plus the query. `-o` makes %timeit return its result object.
timings = %timeit -o q3(fn(customer), fn(lineitem), fn(orders))
# Store the result object's `all_runs` list under this query's key.
results['q3'] = timings.all_runs

856 ms ± 8.02 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
import cudf
# Reader used to load the parquet inputs with cuDF.
fn = cudf.read_parquet
# Both parquet reads sit inside the timed expression, so the measurement
# covers loading plus the query. `-o` makes %timeit return its result object.
timings = %timeit -o q4(fn(lineitem), fn(orders))
# Store the result object's `all_runs` list under this query's key.
results['q4'] = timings.all_runs

806 ms ± 25.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
import cudf
# Reader used to load the parquet inputs with cuDF.
fn = cudf.read_parquet
# All six parquet reads sit inside the timed expression, so the measurement
# covers loading plus the query. `-o` makes %timeit return its result object.
timings = %timeit -o q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))
# Store the result object's `all_runs` list under this query's key.
results['q5'] = timings.all_runs

1.32 s ± 4.72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
import json
# Write the collected timings to results.json in the current working directory,
# overwriting any existing file.
with open('results.json', mode='w') as fd:
    json.dump(obj=results, fp=fd)