In [None]:
#!/usr/bin/env python
# coding: utf-8

# --- Experiment configuration ------------------------------------------------
# EXPERIMENT names this run; OUTPUT_DIR is the per-experiment output folder.
EXPERIMENT = 'pac-duckdb-q1'
OUTPUT_DIR = f'./outputs/{EXPERIMENT}'
# GENERATE: True -> generate new samples, False -> load saved output from files
# (see the message printed below).
GENERATE = False
# NOTE(review): not referenced in the visible cells — confirm where this flag is consumed.
USE_EVEN_NUMBER_OF_INPUT_ROWS = False

# Tell the reader which mode the notebook is running in.
print(
    "GENERATE = True, so we will generate new samples."
    if GENERATE
    else "GENERATE = False, so we will load saved output from files rather than recomputing."
)

import os
from typing import List
# Create the output directory (including missing parents). exist_ok=True avoids
# the check-then-create race and makes re-running this cell a no-op.
os.makedirs(OUTPUT_DIR, exist_ok=True)

import duckdb
import numpy as np
import polars as pl
import pyarrow as pa

# Work in an in-memory DuckDB database populated from the TPC-H parquet files.
# Alternative kept for reference: open the prebuilt database file read-only
#   con = duckdb.connect(database='data/tpch/tpch.duckdb', read_only=True)
con = duckdb.connect(database=':memory:')

# Only the tables needed here are loaded. Full list, for reference:
#   ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"]
tables = ["lineitem", "orders"]
for t in tables:
    # One DuckDB table per parquet file under data/tpch/.
    con.execute(
        f"CREATE TABLE {t} AS SELECT * FROM 'data/tpch/{t}.parquet'"
    )

# Materialise both tables as DataFrames via fetchdf().
lineitem_df = con.execute("SELECT * FROM lineitem").fetchdf()
orders_df = con.execute("SELECT * FROM orders").fetchdf()

# Number of rows in lineitem.
row_count = len(lineitem_df)

# Build the `random_samples` table: one (sample_id, row_id, random_binary) row
# for every sample and every row of TABLE_TO_SAMPLE, where random_binary is
# FLOOR(RANDOM() * 2), i.e. 0 or 1.
# To use sample s, join this table to the sampled table on rowid and keep the
# rows where sample_id = s and random_binary = 1.0; this selects roughly half
# of the table for each sample.
# Only SAMPLES//2 samples are drawn at random; sample SAMPLES//2 + i is the
# complement of sample i (its random_binary is 1 - random_binary).
# NOTE(review): no seed is set in this cell, so the samples presumably differ
# from run to run — confirm whether reproducibility is required.

SAMPLES = 6
TABLE_TO_SAMPLE = 'lineitem'
# Complementary pairs require an even number of samples.
assert SAMPLES % 2 == 0, "SAMPLES must be even to create complementary samples."

con.execute(f"""
CREATE TABLE random_samples AS
WITH sample_numbers AS (
    SELECT range AS sample_id FROM range({SAMPLES//2})
),
random_values AS (
    SELECT 
        sample_numbers.sample_id, 
        {TABLE_TO_SAMPLE}.rowid AS row_id,
        FLOOR(RANDOM() * 2) AS random_binary
    FROM sample_numbers
    JOIN {TABLE_TO_SAMPLE} ON TRUE  -- Cross join to duplicate rows for each sample
)
SELECT
    sample_id,
    row_id,
    random_binary
FROM random_values
UNION ALL
SELECT -- select the complementary samples too
    {SAMPLES//2} + sample_id,
    row_id,
    1 - random_binary  -- Inverse the random_binary to get the complementary sample
FROM random_values;
""")


# Prepared statement `count_orders`: aggregates over the lineitem rows selected
# by one sample (parameter $sample), grouped and ordered by
# (l_returnflag, l_linestatus).
# Only rows with random_binary = 1.0 for that sample are included. The sums and
# the count are multiplied by 2 — presumably to scale the roughly-half sample
# back to a full-table estimate — while the averages are left unscaled.
con.execute("""
PREPARE count_orders AS 
SELECT
    l_returnflag,
    l_linestatus,
    2*sum(l_quantity) AS sum_qty,
    2*sum(l_extendedprice) AS sum_base_price,
    2*sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    2*sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    avg(l_quantity) AS avg_qty,
    avg(l_extendedprice) AS avg_price,
    avg(l_discount) AS avg_disc,
    2*count(*) AS count_order
FROM
    lineitem
JOIN random_samples AS rs
    ON rs.row_id = lineitem.rowid
WHERE
    l_shipdate <= CAST('1998-09-02' AS date)
    AND rs.random_binary = 1.0
    AND rs.sample_id = $sample
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
""")

# Number of selected rows (random_binary = 1) in each sample.
# ORDER BY sample_id makes row i of the result correspond to sample i; GROUP BY
# alone does not guarantee any output order, so the printed row index could
# otherwise be mistaken for the sample id. The result schema is unchanged
# (a single `sample_size` column).
sample_sizes = con.execute("""
SELECT SUM(random_binary) AS sample_size
FROM random_samples
GROUP BY sample_id
ORDER BY sample_id;
""").fetchdf()

print(sample_sizes)

# Execute the prepared statement once per sample id and keep each result as a
# polars DataFrame; dfs[s] holds the aggregates for sample s.
dfs: List[pl.DataFrame] = [
    con.execute(f"EXECUTE count_orders(sample := {s});").pl()
    for s in range(SAMPLES)
]

GENERATE = False, so we will load saved output from files rather than recomputing.
   sample_size
0     300564.0
1     300536.0
2     299994.0
3     299561.0
4     299606.0
5     300441.0


In [12]:
# Inspect the column names and dtypes of the first sample's result.
dfs[0].schema

Schema([('l_returnflag', String),
        ('l_linestatus', String),
        ('sum_qty', Decimal(precision=38, scale=2)),
        ('sum_base_price', Decimal(precision=38, scale=2)),
        ('sum_disc_price', Decimal(precision=38, scale=4)),
        ('sum_charge', Decimal(precision=38, scale=6)),
        ('avg_qty', Float64),
        ('avg_price', Float64),
        ('avg_disc', Float64),
        ('count_order', Int64)])

In [None]:
# Grouping-key columns of every result frame.
INDEX_COLS = ['l_returnflag', 'l_linestatus']

# Aggregate (value) columns of every result frame, in query order.
OUTPUT_COLS = [
    'sum_qty',
    'sum_base_price',
    'sum_disc_price',
    'sum_charge',
    'avg_qty',
    'avg_price',
    'avg_disc',
    'count_order',
]

# Name -> dtype mapping of the output columns, taken from the first sample.
OUTPUT_SCHEMA = dfs[0].select(OUTPUT_COLS).collect_schema()

In [15]:
# Show only the aggregate columns (no grouping keys) of the first sample.
dfs[0].select(OUTPUT_COLS)

sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
3792218.0,5348873966.98,5080445037.0262,5284350579.05466,25.606139,36117.124924,0.0502,148098
98150.0,137392726.56,130621994.2802,135936189.560958,25.153767,35210.847401,0.049042,3902
7463696.0,10513055198.42,9986805179.4178,10386025457.583662,25.554477,35994.984759,0.050106,292070
3791476.0,5345510296.84,5079703360.5274,5282847288.13381,25.586616,36073.951606,0.049952,148182


In [None]:
def numpyify(df: pl.DataFrame) -> np.ndarray:
    """Return the OUTPUT_COLS of ``df`` as a flat 1-D NumPy array.

    The selected columns are converted with ``to_numpy()`` and flattened with
    NumPy's default ordering, so the values of the first row come first,
    followed by the second row, and so on.

    Args:
        df: A result frame containing every column listed in OUTPUT_COLS.

    Returns:
        A 1-D array with ``len(df) * len(OUTPUT_COLS)`` elements.
    """
    return df.select(OUTPUT_COLS).to_numpy().flatten()

In [34]:
# Convert the output columns of the first sample to a 2-D NumPy array (one row
# per group, one column per entry of OUTPUT_COLS) and remember its shape.
# NOTE(review): output_shape is presumably kept so a flattened array can be
# reshaped back later — confirm against later cells.
arr = dfs[0].select(OUTPUT_COLS).to_numpy()
output_shape = arr.shape
arr

array([[3.79221800e+06, 5.34887397e+09, 5.08044504e+09, 5.28435058e+09,
        2.56061392e+01, 3.61171249e+04, 5.02002728e-02, 1.48098000e+05],
       [9.81500000e+04, 1.37392727e+08, 1.30621994e+08, 1.35936190e+08,
        2.51537673e+01, 3.52108474e+04, 4.90415172e-02, 3.90200000e+03],
       [7.46369600e+06, 1.05130552e+10, 9.98680518e+09, 1.03860255e+10,
        2.55544767e+01, 3.59949848e+04, 5.01064128e-02, 2.92070000e+05],
       [3.79147600e+06, 5.34551030e+09, 5.07970336e+09, 5.28284729e+09,
        2.55866165e+01, 3.60739516e+04, 4.99516810e-02, 1.48182000e+05]])

In [74]:
# Show arr flattened to a 1-D array (row after row). The result is only
# displayed, not assigned, so arr itself stays 2-D.
arr.flatten()

array([3.79221800e+06, 5.34887397e+09, 5.08044504e+09, 5.28435058e+09,
       2.56061392e+01, 3.61171249e+04, 5.02002728e-02, 1.48098000e+05,
       9.81500000e+04, 1.37392727e+08, 1.30621994e+08, 1.35936190e+08,
       2.51537673e+01, 3.52108474e+04, 4.90415172e-02, 3.90200000e+03,
       7.46369600e+06, 1.05130552e+10, 9.98680518e+09, 1.03860255e+10,
       2.55544767e+01, 3.59949848e+04, 5.01064128e-02, 2.92070000e+05,
       3.79147600e+06, 5.34551030e+09, 5.07970336e+09, 5.28284729e+09,
       2.55866165e+01, 3.60739516e+04, 4.99516810e-02, 1.48182000e+05])

In [36]:
# Keep a separate copy of the first sample's result to experiment on.
# clone() is the public polars API for copying a DataFrame; calling the
# __copy__ dunder directly is equivalent but non-idiomatic.
og = dfs[0].clone()

In [39]:
# Display the first sample's full result (grouping keys and aggregates).
dfs[0]

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
"""A""","""F""",3792218.0,5348873966.98,5080445037.0262,5284350579.05466,25.606139,36117.124924,0.0502,148098
"""N""","""F""",98150.0,137392726.56,130621994.2802,135936189.560958,25.153767,35210.847401,0.049042,3902
"""N""","""O""",7463696.0,10513055198.42,9986805179.4178,10386025457.583662,25.554477,35994.984759,0.050106,292070
"""R""","""F""",3791476.0,5345510296.84,5079703360.5274,5282847288.13381,25.586616,36073.951606,0.049952,148182


In [42]:
# Overlay the output columns of sample 1 onto the copy of sample 0 and display
# the result. Only OUTPUT_COLS are passed, so the grouping-key columns keep
# og's values.
# NOTE(review): the result is not assigned; update() appears to return a new
# frame rather than modifying og — confirm if og is expected to change.
og.update(dfs[1].select(OUTPUT_COLS))

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
"""A""","""F""",3769864.0,5314301932.86,5047031111.3298,5249316193.779848,25.586502,36068.780171,0.050196,147338
"""N""","""F""",99496.0,139694331.64,132899346.776,138271285.165818,25.420542,35690.938079,0.049044,3914
"""N""","""O""",7479196.0,10536094852.04,10008590025.841,10409172954.410564,25.53254,35968.206699,0.05015,292928
"""R""","""F""",3776130.0,5322886356.96,5056940826.7526,5257875624.590406,25.476178,35911.580986,0.050009,148222


In [None]:
# Round-trip check: rebuild the output columns from the NumPy array `arr` and
# overlay them on dfs[0]. OUTPUT_SCHEMA (defined with OUTPUT_COLS) coerces the
# float array back to the original column dtypes; the grouping-key columns are
# untouched because the overlay frame only contains OUTPUT_COLS.
# update() returns a new DataFrame, which is displayed here; dfs[0] itself is
# not modified.
# NOTE(review): values pass through float64 on the way, so Decimal columns can
# come back off by one in the last digit (e.g. a sum_base_price ending in .98
# is shown as .97 after the round trip) — confirm whether that is acceptable.
dfs[0].update(
    pl.DataFrame(arr, schema=OUTPUT_SCHEMA)
)

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
"""A""","""F""",3792218.0,5348873966.97,5080445037.0262,5284350579.05466,25.606139,36117.124924,0.0502,148098
"""N""","""F""",98150.0,137392726.56,130621994.2802,135936189.560958,25.153767,35210.847401,0.049042,3902
"""N""","""O""",7463696.0,10513055198.42,9986805179.4178,10386025457.583662,25.554477,35994.984759,0.050106,292070
"""R""","""F""",3791476.0,5345510296.84,5079703360.5274,5282847288.13381,25.586616,36073.951606,0.049952,148182


In [67]:
# Display dfs[0] again to check whether the update() call above changed it.
dfs[0]

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
"""A""","""F""",3792218.0,5348873966.98,5080445037.0262,5284350579.05466,25.606139,36117.124924,0.0502,148098
"""N""","""F""",98150.0,137392726.56,130621994.2802,135936189.560958,25.153767,35210.847401,0.049042,3902
"""N""","""O""",7463696.0,10513055198.42,9986805179.4178,10386025457.583662,25.554477,35994.984759,0.050106,292070
"""R""","""F""",3791476.0,5345510296.84,5079703360.5274,5282847288.13381,25.586616,36073.951606,0.049952,148182


In [68]:
# Repeated display of dfs[0] (same expression as the previous cell).
dfs[0]

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
"""A""","""F""",3792218.0,5348873966.98,5080445037.0262,5284350579.05466,25.606139,36117.124924,0.0502,148098
"""N""","""F""",98150.0,137392726.56,130621994.2802,135936189.560958,25.153767,35210.847401,0.049042,3902
"""N""","""O""",7463696.0,10513055198.42,9986805179.4178,10386025457.583662,25.554477,35994.984759,0.050106,292070
"""R""","""F""",3791476.0,5345510296.84,5079703360.5274,5282847288.13381,25.586616,36073.951606,0.049952,148182
