In [3]:
import ibis
import duckdb

SCALE_FACTOR = 1

# Connect to DuckDB
duckdb_con = duckdb.connect("tpch.duckdb")
con = ibis.connect("duckdb://tpch.duckdb")

In [5]:
# Generate TPC-H data at scale factor 1 (1 GB)
duckdb_con.sql(f"CALL dbgen(sf={SCALE_FACTOR})")

# Export each table to Parquet format
tables = ["lineitem", "orders", "customer", "supplier", "nation", "region", "part", "partsupp"]
for table in tables:
    duckdb_con.sql(f"COPY {table} TO 'data/{table}.parquet' (FORMAT PARQUET)")


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [14]:
from sqlframe import activate

# Activate SQLFrame to run directly on DuckDB
activate(engine="duckdb")

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql import Window
session = SparkSession.builder.getOrCreate()

In [9]:
session

<sqlframe.duckdb.session.DuckDBSession at 0xffff58399900>

# Q18 - Large Volume Customer Query

In [25]:

spark = SparkSession.builder.appName('TPCH Benchmark for Python').getOrCreate()
lineitem = spark.read.parquet("data/lineitem.parquet")
orders = spark.read.parquet("data/orders.parquet")
customers = spark.read.parquet("data/customer.parquet")

result = lineitem.groupBy(col("l_orderkey")) \
            .agg(F.sum(col("l_quantity")).alias("sum_quantity")) \
            .filter(col("sum_quantity") > 300) \
            .select(col("l_orderkey").alias("key"), col("sum_quantity")) \
            .join(orders, col("o_orderkey") == col("key")) \
            .join(lineitem, col("o_orderkey") == col("l_orderkey")) \
            .join(customers, col("c_custkey") == col("o_custkey")) \
            .select(col("l_quantity"), col("c_name"), col("c_custkey"), col("o_orderkey"), col("o_orderdate"),
                    col("o_totalprice")) \
            .groupBy(col("c_name"), col("c_custkey"), col("o_orderkey"), col("o_orderdate"), col("o_totalprice")) \
            .agg(F.sum(col("l_quantity"))) \
            .sort(col("o_totalprice").desc(), col("o_orderdate"))

In [29]:
print(result.sql())

WITH `t38591281` AS (
  SELECT
    CAST(`_q_0`.`l_orderkey` AS BIGINT) AS `l_orderkey`,
    SUM(CAST(`_q_0`.`l_quantity` AS DECIMAL(15, 2))) AS `sum_quantity`
  FROM READ_PARQUET(ARRAY('data/lineitem.parquet')) AS `_q_0`
  GROUP BY
    CAST(`_q_0`.`l_orderkey` AS BIGINT)
)
SELECT
  CAST(`_q_3`.`c_name` AS STRING) AS `c_name`,
  CAST(`_q_3`.`c_custkey` AS BIGINT) AS `c_custkey`,
  CAST(`_q_1`.`o_orderkey` AS BIGINT) AS `o_orderkey`,
  CAST(`_q_1`.`o_orderdate` AS DATE) AS `o_orderdate`,
  CAST(`_q_1`.`o_totalprice` AS DECIMAL(15, 2)) AS `o_totalprice`,
  SUM(CAST(`_q_2`.`l_quantity` AS DECIMAL(15, 2))) AS `_col_5`
FROM `t38591281` AS `t38591281`
JOIN READ_PARQUET(ARRAY('data/orders.parquet')) AS `_q_1`
  ON `t38591281`.`l_orderkey` = CAST(`_q_1`.`o_orderkey` AS BIGINT)
JOIN READ_PARQUET(ARRAY('data/customer.parquet')) AS `_q_3`
  ON CAST(`_q_1`.`o_custkey` AS BIGINT) = CAST(`_q_3`.`c_custkey` AS BIGINT)
JOIN READ_PARQUET(ARRAY('data/lineitem.parquet')) AS `_q_2`
  ON CAST(`_q_1`.`o_orde

In [32]:
result.toPandas().head()

  return read_sql_query(


Unnamed: 0,c_name,c_custkey,o_orderkey,o_orderdate,o_totalprice,sum(l_quantity)
0,Customer#000021433,21433,1750466,1992-11-30,555285.16,2400.0
1,Customer#000128120,128120,4722021,1994-04-07,544089.09,2584.0
2,Customer#000144617,144617,3043270,1997-02-12,530604.44,2536.0
3,Customer#000108931,108931,4576548,1997-12-26,525590.57,2360.0
4,Customer#000013940,13940,2232932,1997-04-13,522720.61,2432.0
