## DuckDB Notebook

This notebook runs the query once per random sample and saves the raw, per-sample outputs (without applying PAC) to be consumed by a second stage.

```
 group by              output cols                        
 key cols ┌────────┬────────┬────────┬────────┐           
        │ │   A    │   B    │   C    │   D    │           
      ┌─▼─┼────────┼────────┼────────┼────────┤           
      │ 1 │   2    │        │        │        │           
      ├───┼───|────┼────────┼────────┼────────┤           
      │ 2 │   │    │        │        │        │           
      ├───┼───┼────┼────────┼────────┼────────┤           
      │ 3 │   │    │        │        │        │           
      └───┴───┼────┴────────┴────────┴────────┘           
              ▼                 A_1.json                  
       Sample 0:   A1=2        ┌─────────────────────────┐
       Sample 1:   A1=4  ───▶  │{                        │
             ...               │    col: A               │
       Sample 999: A1=3        │    row: 1               │
                               │    value: [2, 4, ... 3] │
                               │}                        │
                               └─────────────────────────┘
```

In [24]:
#!/usr/bin/env python
# coding: utf-8

import os
import pickle
import shutil

import duckdb
import polars as pl

In [25]:
# --- Experiment parameters -------------------------------------------------
# Name of the experiment; used to derive the output directory.
EXPERIMENT = "pac-duckdb-q1"
OUTPUT_DIR = f"./outputs/{EXPERIMENT}-step1"
# Total number of samples. Half are drawn at random and the other half are
# their complements, so this must be even.
SAMPLES = 1024
_HALF_SAMPLES = SAMPLES // 2

# --- SQL: build the `random_samples` table ---------------------------------
# One row per (sample_id, customer rowid) holding a random boolean. Sample ids
# in the upper half reuse the draws of the lower half with the boolean negated.
SAMPLE_STEP = f"""
DROP TABLE IF EXISTS random_samples;

CREATE TABLE random_samples AS
WITH sample_numbers AS MATERIALIZED (
    SELECT range AS sample_id FROM range({_HALF_SAMPLES})
), random_values AS MATERIALIZED (
    SELECT 
        sample_numbers.sample_id,
        customer.rowid AS row_id,
        (RANDOM() > 0.5)::BOOLEAN AS random_binary
    FROM sample_numbers
    JOIN customer ON TRUE  -- Cross join to duplicate rows for each sample
)
SELECT
    sample_id,
    row_id,
    random_binary
FROM random_values
UNION ALL
SELECT -- select the complementary samples too
    ({_HALF_SAMPLES}) + sample_id,
    row_id,
    NOT random_binary  -- Inverse the random_binary to get the complementary sample
FROM random_values
ORDER BY sample_id, row_id;
"""

# --- SQL: the query under study, as a prepared statement -------------------
# `$sample` selects which sample's customers are included. Sums and counts are
# multiplied by 2; averages are left unscaled.
PREPARE_STEP = """
DEALLOCATE PREPARE run_query;

PREPARE run_query AS 
SELECT
    l_returnflag,
    l_linestatus,
    2*sum(l_quantity) AS sum_qty,
    2*sum(l_extendedprice) AS sum_base_price,
    2*sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    2*sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    avg(l_quantity) AS avg_qty,
    avg(l_extendedprice) AS avg_price,
    avg(l_discount) AS avg_disc,
    2*count(*) AS count_order
FROM
    lineitem
JOIN orders ON lineitem.l_orderkey = orders.o_orderkey
JOIN customer ON orders.o_custkey = customer.c_custkey
JOIN random_samples AS rs
    ON rs.row_id = customer.rowid
WHERE
    l_shipdate <= CAST('1998-09-02' AS date)
    AND rs.random_binary = TRUE
    AND rs.sample_id = $sample
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
"""

# --- Result layout ---------------------------------------------------------
# Group-by key columns of the query result.
INDEX_COLS = ['l_returnflag', 'l_linestatus']
# Aggregate (non-key) columns of the query result.
OUTPUT_COLS = [
    'sum_qty',
    'sum_base_price',
    'sum_disc_price',
    'sum_charge',
    'avg_qty',
    'avg_price',
    'avg_disc',
    'count_order',
]

In [26]:
# Create the output directory (and any missing parents). `exist_ok=True` makes
# the cell safe to re-run and avoids the check-then-create race; it still
# raises if the path exists but is not a directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [27]:
# Open a temporary in-memory DuckDB database and create one table per entry in
# `tables`, each populated from the matching Parquet file under data/tpch/.
con = duckdb.connect(database=':memory:')
tables = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"]
for table in tables:
    load_sql = f"CREATE TABLE {table} AS SELECT * FROM 'data/tpch/{table}.parquet'"
    con.execute(load_sql)

In [28]:
# Construct the `random_samples` table: one (sample_id, row_id, random_binary) row
# for every sample and every row of the customer table.
# To use it, join it on customer.rowid = random_samples.row_id for a specific sample_id
# and filter to just the rows where random_binary = TRUE.
# Each customer is kept when RANDOM() > 0.5, i.e. roughly a 50% sample of customers per sample_id.

# The second half of the sample ids are the complements of the first half, hence the even requirement.
assert SAMPLES % 2 == 0, "SAMPLES must be even to create complementary samples."

# NOTE(review): this name holds whatever con.execute returns, not the table contents — confirm nothing relies on it.
random_samples = con.execute(SAMPLE_STEP)

The random choice of rows for each sample is saved to disk in `random_binary.json`. For each sample id there is an array with one entry per row of the `customer` table, where 1 means the row was chosen for that sample and 0 means it was not.

In [29]:
# Persist the sampling mask: one row per sample_id holding an array of 0/1 flags.
# The ORDER BY inside array_agg pins the array positions to ascending row_id, so
# entry k always refers to the same customer row; without it the element order of
# the aggregate is not guaranteed. The outer ORDER BY makes the file layout stable.
con.execute("""
SELECT sample_id, array_agg(random_binary::TINYINT ORDER BY row_id) as random_binary
FROM random_samples
GROUP BY sample_id
ORDER BY sample_id;
""").pl().write_json(f"{OUTPUT_DIR}/random_binary.json")

The query is specified as a prepared statement, which we then execute once per sample.

In [30]:
# Register the prepared statement `run_query` defined in PREPARE_STEP
con.execute(PREPARE_STEP)

# Run the query once for sample 0 to preview the output
dfs0 = con.execute("EXECUTE run_query(sample := 0);").pl()

# Save CSV copies of the first 5 samples (fewer if SAMPLES < 5, so that no
# non-existent sample id is queried). Sample 0 reuses the result computed above
# instead of executing the query a second time.
os.makedirs(f"{OUTPUT_DIR}/csv", exist_ok=True)
for s in range(min(5, SAMPLES)):
    sample_df = dfs0 if s == 0 else con.execute(f"EXECUTE run_query(sample := {s});").pl()
    sample_df.write_csv(f"{OUTPUT_DIR}/csv/sample_{s}.csv")

dfs0

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
"""A""","""F""",3809186.0,5371648642.66,5102510365.1342,5307051950.891734,25.457028,35899.06332,0.050161,149632
"""N""","""F""",95408.0,133762378.66,127156278.2264,132368210.513864,25.200211,35330.792039,0.049514,3786
"""N""","""O""",7494676.0,10551197026.22,10023030585.0778,10422883872.133736,25.559036,35982.665574,0.050124,293230
"""R""","""F""",3825432.0,5396344572.78,5127498194.524,5332948048.485192,25.545797,36036.171253,0.049927,149748


In [31]:
def _run_sample(sample_id: int) -> pl.DataFrame:
    """Execute the prepared query for one sample and tag the result.

    The returned frame is the query output with a leading `sample` column
    holding `sample_id`.
    """
    result = con.execute(f"EXECUTE run_query(sample := {sample_id});").pl()
    return result.insert_column(0, pl.lit(sample_id).alias("sample"))


# Execute the query once per sample and stack all results into one DataFrame
dfsdf: pl.DataFrame = pl.concat(_run_sample(s) for s in range(SAMPLES))
dfsdf

sample,l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
i32,str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
0,"""A""","""F""",3809186.00,5371648642.66,5102510365.1342,5307051950.891734,25.457028,35899.06332,0.050161,149632
0,"""N""","""F""",95408.00,133762378.66,127156278.2264,132368210.513864,25.200211,35330.792039,0.049514,3786
0,"""N""","""O""",7494676.00,10551197026.22,10023030585.0778,10422883872.133736,25.559036,35982.665574,0.050124,293230
0,"""R""","""F""",3825432.00,5396344572.78,5127498194.5240,5332948048.485192,25.545797,36036.171253,0.049927,149748
1,"""A""","""F""",3746276.00,5288848642.04,5023217128.3400,5224783644.529676,25.497012,35995.703002,0.05029,146930
…,…,…,…,…,…,…,…,…,…,…
1022,"""R""","""F""",3756330.00,5300136579.88,5035835218.1936,5237657834.962020,25.539714,36036.229619,0.049977,147078
1023,"""A""","""F""",3765786.00,5311192130.30,5044012263.4916,5245946823.471304,25.594957,36098.634747,0.050284,147130
1023,"""N""","""F""",94592.00,133935152.66,127304467.0980,132453063.579210,25.441635,36023.440737,0.049242,3718
1023,"""N""","""O""",7471686.00,10524772793.36,9997831438.1462,10397961712.202582,25.573602,36023.510061,0.050145,292164


In [32]:
# Number the rows of each sample 0, 1, 2, ... in their existing order and store
# that position as `rank`.
dfsdf = dfsdf.with_columns(rank=pl.int_range(pl.len()).over("sample"))

In [33]:
# INDEX_COLS (group-by keys) and OUTPUT_COLS (output columns) are defined in the
# parameters cell at the top of the notebook.

# Pickle both lists into the output directory for later use
pickle_targets = {
    f"{OUTPUT_DIR}/INDEX_COLS.pkl": INDEX_COLS,
    f"{OUTPUT_DIR}/OUTPUT_COLS.pkl": OUTPUT_COLS,
}
for pickle_path, column_names in pickle_targets.items():
    with open(pickle_path, 'wb') as f:
        pickle.dump(column_names, f)

In [34]:
# Collapse all samples into one table with a single row per group-by key.
# Every other cell becomes a list holding that group's value from each sample in
# which the group appeared (so its length is at most the number of samples).
# When no group-by keys are configured, the per-sample row position is used instead.
DEFAULT_INDEX_COLS = ["rank"]
group_keys = INDEX_COLS or DEFAULT_INDEX_COLS
without_sample = dfsdf.drop("sample")
listdf = without_sample.group_by(group_keys, maintain_order=True).all()
listdf

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order,rank
str,str,"list[decimal[38,2]]","list[decimal[38,2]]","list[decimal[38,4]]","list[decimal[38,6]]",list[f64],list[f64],list[f64],list[i64],list[i64]
"""A""","""F""","[3809186.00, 3746276.00, … 3765786.00]","[5371648642.66, 5288848642.04, … 5311192130.30]","[5102510365.1342, 5023217128.3400, … 5044012263.4916]","[5307051950.891734, 5224783644.529676, … 5245946823.471304]","[25.457028, 25.497012, … 25.594957]","[35899.06332, 35995.703002, … 36098.634747]","[0.050161, 0.05029, … 0.050284]","[149632, 146930, … 147130]","[0, 0, … 0]"
"""N""","""F""","[95408.00, 93212.00, … 94592.00]","[133762378.66, 130693844.30, … 133935152.66]","[127156278.2264, 124377742.1244, … 127304467.0980]","[132368210.513864, 129510188.269776, … 132453063.579210]","[25.200211, 25.178822, … 25.441635]","[35330.792039, 35303.577607, … 36023.440737]","[0.049514, 0.04873, … 0.049242]","[3786, 3702, … 3718]","[1, 1, … 1]"
"""N""","""O""","[7494676.00, 7414214.00, … 7471686.00]","[10551197026.22, 10452259510.18, … 10524772793.36]","[10023030585.0778, 9929969933.3542, … 9997831438.1462]","[10422883872.133736, 10326855530.964210, … 10397961712.202582]","[25.559036, 25.537899, … 25.573602]","[35982.665574, 36002.299206, … 36023.510061]","[0.050124, 0.050069, … 0.050145]","[293230, 290322, … 292164]","[2, 2, … 2]"
"""R""","""F""","[3825432.00, 3789280.00, … 3789672.00]","[5396344572.78, 5347223855.24, … 5348513969.42]","[5127498194.5240, 5081016249.4132, … 5082719085.9816]","[5332948048.485192, 5283469265.973486, … 5285678086.817628]","[25.545797, 25.54456, … 25.550992]","[36036.171253, 36047.080054, … 36061.125214]","[0.049927, 0.04998, … 0.049853]","[149748, 148340, … 148318]","[3, 3, … 3]"


In [35]:
# Enumerate every group-by key combination present in the results
key_columns = INDEX_COLS or DEFAULT_INDEX_COLS
allgroups: pl.DataFrame = listdf.select(key_columns)
allgroups.to_dicts()

[{'l_returnflag': 'A', 'l_linestatus': 'F'},
 {'l_returnflag': 'N', 'l_linestatus': 'F'},
 {'l_returnflag': 'N', 'l_linestatus': 'O'},
 {'l_returnflag': 'R', 'l_linestatus': 'F'}]

In [36]:
# Build the template of the final output table:
#   1. take the first row of each group across all samples (this fixes the schema),
#   2. replace the contents with one all-null row per group,
#   3. write the group-by key values back in, leaving the other columns null.
templatedf = (
    dfsdf.drop("sample")
    .group_by(INDEX_COLS or DEFAULT_INDEX_COLS, maintain_order=True)
    .first()
    .clear(n=len(allgroups))
    .with_columns(allgroups)
)

# Pickle the template into the output directory
with open(f"{OUTPUT_DIR}/template.pkl", "wb") as f:
    pickle.dump(templatedf, f)

In [37]:
# Write every entry (output column x group) of the output table to its own JSON
# file, named by a running number: <OUTPUT_DIR>/json/<i>.json.
# Each file is self-describing: it records the column name ("col"), the group-by
# key values ("row"), the element dtype, the configured number of samples and the
# list of per-sample values. No separate mapping file is written by this cell.
os.makedirs(f"{OUTPUT_DIR}/json", exist_ok=True)

# Column-major order: all groups of the first output column, then the next column, ...
entries = [
    (col, group)
    for col in OUTPUT_COLS
    for group in allgroups.iter_rows(named=True)
]

for i, (col, group) in enumerate(entries):
    # Select the single row of listdf whose key columns match this group
    group_filter = [pl.col(k).eq(v) for k, v in group.items()]
    values = listdf.filter(group_filter).select(col).to_series()
    entry_df = pl.DataFrame().with_columns([
        pl.lit(col).alias("col"),
        pl.lit(group).alias("row"),
        # dtype of the individual values, not of the enclosing list
        pl.lit(repr(values.explode().dtype)).alias("dtype"),
        pl.lit(SAMPLES).alias("samples"),
        values.alias("values"),
    ])
    entry_df.write_json(f"{OUTPUT_DIR}/json/{i}.json")

In [38]:
# Bundle the contents of OUTPUT_DIR into "<OUTPUT_DIR>.zip" next to the directory;
# the cell displays the path of the archive that was written.
shutil.make_archive(base_name=OUTPUT_DIR, format='zip', root_dir=OUTPUT_DIR)

'/Users/mayuri/Desktop/pacdb/outputs/pac-duckdb-q1-step1.zip'