## DuckDB Notebook

This notebook runs the query once per random sample and writes the raw results to disk, without applying PAC. These raw outputs are meant to be consumed by a second stage.

```
 group by              output cols                        
 key cols ┌────────┬────────┬────────┬────────┐           
        │ │   A    │   B    │   C    │   D    │           
      ┌─▼─┼────────┼────────┼────────┼────────┤           
      │ 1 │   2    │        │        │        │           
      ├───┼───|────┼────────┼────────┼────────┤           
      │ 2 │   │    │        │        │        │           
      ├───┼───┼────┼────────┼────────┼────────┤           
      │ 3 │   │    │        │        │        │           
      └───┴───┼────┴────────┴────────┴────────┘           
              ▼                 A_1.json                  
       Sample 0:   A1=2        ┌─────────────────────────┐
       Sample 1:   A1=4  ───▶  │{                        │
             ...               │    col: A               │
       Sample 999: A1=3        │    row: 1               │
                               │    value: [2, 4, ... 3] │
                               │}                        │
                               └─────────────────────────┘
```

In [1]:
#!/usr/bin/env python
# coding: utf-8

import os
import pickle
import shutil

import duckdb
import polars as pl

In [2]:
# Default configuration for this notebook.
# NOTE: the "# Parameters" cell that follows reassigns every name defined here,
# so these defaults only take effect when that cell is absent or not run.
EXPERIMENT = "pac-duckdb-q1"
OUTPUT_DIR = f"./outputs/{EXPERIMENT}-step1"
SAMPLES = 1024

# SQL that (re)creates the `random_samples` table.
# For each of SAMPLES//2 sample ids it draws one random boolean per `customer` row,
# then appends SAMPLES//2 complementary samples whose booleans are negated.
# SAMPLES is interpolated when this f-string is evaluated, so reassigning SAMPLES
# afterwards does not change the SQL text.
SAMPLE_STEP = f"""
DROP TABLE IF EXISTS random_samples;

CREATE TABLE random_samples AS
WITH sample_numbers AS MATERIALIZED (
    SELECT range AS sample_id FROM range({SAMPLES//2})
), random_values AS MATERIALIZED (
    SELECT 
        sample_numbers.sample_id,
        customer.rowid AS row_id,
        (RANDOM() > 0.5)::BOOLEAN AS random_binary
    FROM sample_numbers
    JOIN customer ON TRUE  -- Cross join to duplicate rows for each sample
)
SELECT
    sample_id,
    row_id,
    random_binary
FROM random_values
UNION ALL
SELECT -- select the complementary samples too
    ({SAMPLES//2}) + sample_id,
    row_id,
    NOT random_binary  -- Inverse the random_binary to get the complementary sample
FROM random_values
ORDER BY sample_id, row_id;
"""

# SQL that defines the prepared statement `run_query`, parameterized by $sample.
# The join on random_samples keeps only customers whose random_binary is TRUE for
# that sample. Sums and the count are multiplied by 2 while averages are not —
# presumably to compensate for the ~50% sampling; confirm with the second stage.
PREPARE_STEP = """
DEALLOCATE PREPARE run_query;

PREPARE run_query AS 
SELECT
    l_returnflag,
    l_linestatus,
    2*sum(l_quantity) AS sum_qty,
    2*sum(l_extendedprice) AS sum_base_price,
    2*sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    2*sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    avg(l_quantity) AS avg_qty,
    avg(l_extendedprice) AS avg_price,
    avg(l_discount) AS avg_disc,
    2*count(*) AS count_order
FROM
    lineitem
JOIN orders ON lineitem.l_orderkey = orders.o_orderkey
JOIN customer ON orders.o_custkey = customer.c_custkey
JOIN random_samples AS rs
    ON rs.row_id = customer.rowid
WHERE
    l_shipdate <= CAST('1998-09-02' AS date)
    AND rs.random_binary = TRUE
    AND rs.sample_id = $sample
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
"""

# Columns of the query result: INDEX_COLS are the group-by keys,
# OUTPUT_COLS are the aggregated values collected per sample.
INDEX_COLS = ['l_returnflag', 'l_linestatus']
OUTPUT_COLS = ['sum_qty', 'sum_base_price', 'sum_disc_price', 'sum_charge', 'avg_qty', 'avg_price', 'avg_disc', 'count_order']

In [3]:
# Parameters
# This cell reassigns every configuration name set in the previous cell.
# NOTE(review): presumably injected by a notebook parameterization tool — confirm.
# SAMPLE_STEP below hard-codes `1024 // 2` in the SQL text, so changing SAMPLES
# here alone does not change how many samples the SQL generates.
# PREPARE_STEP defines `run_query` ($sample) over the customers selected for that
# sample, grouped by customer attributes, ordered by revenue DESC, LIMIT 20.
EXPERIMENT = "ap-duckdb-q10-customer"
OUTPUT_DIR = "./outputs/ap-duckdb-q10-customer-step1"
SAMPLES = 1024
SAMPLE_STEP = "DROP TABLE IF EXISTS random_samples;\n\nCREATE TABLE random_samples AS\nWITH sample_numbers AS MATERIALIZED (\n    SELECT range AS sample_id FROM range(1024 // 2)\n), random_values AS MATERIALIZED (\n    SELECT \n        sample_numbers.sample_id,\n        customer.rowid AS row_id,\n        (RANDOM() > 0.5)::BOOLEAN AS random_binary\n    FROM sample_numbers\n    JOIN customer ON TRUE  -- Cross join to duplicate rows for each sample\n)\nSELECT\n    sample_id,\n    row_id,\n    random_binary\nFROM random_values\nUNION ALL\nSELECT -- select the complementary samples too\n    (1024 // 2) + sample_id,\n    row_id,\n    NOT random_binary  -- Inverse the random_binary to get the complementary sample\nFROM random_values\nORDER BY sample_id, row_id;"
PREPARE_STEP = "DEALLOCATE PREPARE run_query;\n\nPREPARE run_query AS \nSELECT\n    c_custkey,\n    c_name,\n    sum(l_extendedprice * (1 - l_discount)) AS revenue,\n    c_acctbal,\n    n_name,\n    c_address,\n    c_phone,\n    c_comment\nFROM\n    (SELECT * FROM customer\n        JOIN random_samples AS rs ON rs.row_id = customer.rowid\n        AND rs.random_binary = TRUE\n        AND rs.sample_id = $sample) AS customer,\n    orders,\n    lineitem,\n    nation\nWHERE\n    c_custkey = o_custkey\n    AND l_orderkey = o_orderkey\n    AND o_orderdate >= CAST('1993-10-01' AS date)\n    AND o_orderdate < CAST('1994-01-01' AS date)\n    AND l_returnflag = 'R'\n    AND c_nationkey = n_nationkey\nGROUP BY\n    c_custkey,\n    c_name,\n    c_acctbal,\n    c_phone,\n    n_name,\n    c_address,\n    c_comment\nORDER BY\n    revenue DESC\nLIMIT 20;"
# Group-by key columns and aggregated output column(s) of the query above.
INDEX_COLS = ["c_custkey", "c_name", "c_acctbal", "c_phone", "n_name", "c_address", "c_comment"]
OUTPUT_COLS = ["revenue"]


In [4]:
# Create the output directory (and any missing parents). `exist_ok=True` makes the
# cell safe to re-run and avoids the check-then-create race of a separate exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
# Open a temporary in-memory DuckDB database and materialize each table
# from its parquet file under data/tpch/.
con = duckdb.connect(database=":memory:")
tables = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"]
for table_name in tables:
    parquet_path = f"data/tpch/{table_name}.parquet"
    con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM '{parquet_path}'")

In [6]:
# Construct the `random_samples` table by running SAMPLE_STEP.
# With the SAMPLE_STEP definitions visible in this notebook, the table has one row per
# (sample_id, row_id) pair, where row_id is a `customer` rowid and random_binary is a boolean.
# To use a sample, join on row_id for a specific sample_id and keep only the rows where
# random_binary is TRUE. Each row is picked when RANDOM() > 0.5, i.e. roughly half of them.

# Samples are generated in complementary pairs (SAMPLES // 2 draws plus their negations),
# so an odd SAMPLES could not be produced.
assert SAMPLES % 2 == 0, "SAMPLES must be even to create complementary samples."

# NOTE(review): this binds the return value of con.execute, not the table contents;
# the name `random_samples` is not used again in the visible cells.
random_samples = con.execute(SAMPLE_STEP)

The random choice of rows for each sample is saved to disk in `random_binary.json`. For each sample number, the file holds an array with one entry per row of the sampled table, where 1 means the row was chosen and 0 means it was not.

In [7]:
# Persist the sampling mask: one row per sample_id with an array of 0/1 flags.
# array_agg is order-sensitive, so the explicit ORDER BY row_id pins element k of
# each array to the k-th row id; without it the element order is not guaranteed.
# ORDER BY sample_id likewise makes the row order of the file deterministic.
con.execute("""
SELECT
    sample_id,
    array_agg(random_binary::TINYINT ORDER BY row_id) AS random_binary
FROM random_samples
GROUP BY sample_id
ORDER BY sample_id;
""").pl().write_json(f"{OUTPUT_DIR}/random_binary.json")

The query is specified as a prepared statement, which is then executed once per sample.

In [8]:
# Register the prepared statement `run_query` on the connection
con.execute(PREPARE_STEP)

# Execute it for sample 0 so the result can be inspected at the end of the cell
dfs0 = con.execute(f"EXECUTE run_query(sample := {0});").pl()

# Keep CSV copies of the results for the first 5 samples
csv_dir = f"{OUTPUT_DIR}/csv"
os.makedirs(csv_dir, exist_ok=True)
for s in range(5):
    sample_result = con.execute(f"EXECUTE run_query(sample := {s});").pl()
    sample_result.write_csv(f"{csv_dir}/sample_{s}.csv")

dfs0

c_custkey,c_name,revenue,c_acctbal,n_name,c_address,c_phone,c_comment
i64,str,"decimal[38,4]","decimal[15,2]",str,str,str,str
8242,"""Customer#000008242""",622786.7297,6322.09,"""ETHIOPIA""","""cYDWDiJt06B8CYzXX2L8x2hn1VFG""","""15-792-676-1184""",""" regular theodolites affix. ca…"
2455,"""Customer#000002455""",481592.4053,2070.99,"""GERMANY""","""a5DZ199yfAcFhfi2uwBE PKo,Z""","""17-946-225-9977""","""pinto beans alongside of the f…"
1966,"""Customer#000001966""",444059.0382,1937.72,"""ALGERIA""","""IbwZr7j QVifqf9WizOIWx,UXV9Cqx…","""10-973-269-8886""","""odolites across the unusual ac…"
1565,"""Customer#000001565""",412506.0062,1820.03,"""BRAZIL""","""n4acVpG0Deyj5aIFAfSNg Iu9cUagw…","""12-402-178-2007""","""deposits; unusual, bold deposi…"
14398,"""Customer#000014398""",408575.3600,-602.24,"""UNITED STATES""","""l49oKjbjQHz6YZwjo5wPihM lyYO6G""","""34-814-111-5424""","""es haggle fluffily blithely fl…"
…,…,…,…,…,…,…,…
2614,"""Customer#000002614""",354357.0889,663.38,"""ROMANIA""","""b6u0xJUEv2tCRP6GZNNxyjEEyOc9uB…","""29-740-430-9216""","""efully pending foxes boost pla…"
12499,"""Customer#000012499""",353116.6148,-104.59,"""MOZAMBIQUE""","""00duMvAXCN3""","""26-607-342-9289""","""ar deposits cajole slyly after…"
5987,"""Customer#000005987""",352523.8773,-259.21,"""BRAZIL""","""GY8EJU6YLQ2hu,a4Z""","""12-181-374-9390""","""ronic platelets. quickly expre…"
11177,"""Customer#000011177""",345670.9094,135.99,"""CANADA""",""",w0jX6XnHzTkC2qRRtA hIL3GHz0HD""","""13-151-607-8040""","""eodolites cajole. furiously sp…"


In [9]:
# Execute the prepared query once per sample, tag each result with its sample number
# in a leading "sample" column, then stack all results into a single DataFrame.
per_sample_frames = [
    con.execute(f"EXECUTE run_query(sample := {sample_no});")
    .pl()
    .insert_column(0, pl.lit(sample_no).alias("sample"))
    for sample_no in range(SAMPLES)
]
dfsdf: pl.DataFrame = pl.concat(per_sample_frames)
dfsdf

sample,c_custkey,c_name,revenue,c_acctbal,n_name,c_address,c_phone,c_comment
i32,i64,str,"decimal[38,4]","decimal[15,2]",str,str,str,str
0,8242,"""Customer#000008242""",622786.7297,6322.09,"""ETHIOPIA""","""cYDWDiJt06B8CYzXX2L8x2hn1VFG""","""15-792-676-1184""",""" regular theodolites affix. ca…"
0,2455,"""Customer#000002455""",481592.4053,2070.99,"""GERMANY""","""a5DZ199yfAcFhfi2uwBE PKo,Z""","""17-946-225-9977""","""pinto beans alongside of the f…"
0,1966,"""Customer#000001966""",444059.0382,1937.72,"""ALGERIA""","""IbwZr7j QVifqf9WizOIWx,UXV9Cqx…","""10-973-269-8886""","""odolites across the unusual ac…"
0,1565,"""Customer#000001565""",412506.0062,1820.03,"""BRAZIL""","""n4acVpG0Deyj5aIFAfSNg Iu9cUagw…","""12-402-178-2007""","""deposits; unusual, bold deposi…"
0,14398,"""Customer#000014398""",408575.3600,-602.24,"""UNITED STATES""","""l49oKjbjQHz6YZwjo5wPihM lyYO6G""","""34-814-111-5424""","""es haggle fluffily blithely fl…"
…,…,…,…,…,…,…,…,…
1023,11011,"""Customer#000011011""",342392.6260,3204.07,"""CANADA""","""Lcqu0eiaNMF0TFGvN""","""13-454-800-6046""","""s. final accounts integrate re…"
1023,2426,"""Customer#000002426""",341024.9938,2708.47,"""MOROCCO""","""MpZVRy3ypUubfZvbTf4h1SF""","""25-410-418-8923""","""y ironic requests. even instru…"
1023,14872,"""Customer#000014872""",340677.1551,155.08,"""INDIA""","""gfq4d dr61cG6kZasU h9FhDLz8lQ""","""18-766-990-8843""",""" final pinto beans sleep furio…"
1023,5875,"""Customer#000005875""",340301.4495,4907.38,"""ETHIOPIA""","""aY9kl8Xt2wBqp4DF6qJqJUc9hw5cr7…","""15-605-672-5541""","""eep blithely bold ideas. blith…"


In [10]:
# INDEX_COLS (the group-by keys) and OUTPUT_COLS (the output columns) are defined
# in the parameters cell at the top of the notebook.

# Pickle both lists into OUTPUT_DIR as <NAME>.pkl for later use
for pkl_name, pkl_value in (("INDEX_COLS", INDEX_COLS), ("OUTPUT_COLS", OUTPUT_COLS)):
    with open(f"{OUTPUT_DIR}/{pkl_name}.pkl", "wb") as pkl_file:
        pickle.dump(pkl_value, pkl_file)

In [11]:
# Merge the per-sample results into one row per group-by key combination.
# Every non-key column becomes a list holding that group's values across the samples
# in which the group appeared, so a list may be shorter than the number of samples.
# If INDEX_COLS is empty, a constant literal aliased "" serves as the only key,
# which puts all rows into a single group.
group_keys = INDEX_COLS or pl.lit(0).alias("")
listdf = (
    dfsdf
    .drop("sample")
    .group_by(group_keys, maintain_order=True)
    .all()
)
listdf

c_custkey,c_name,c_acctbal,c_phone,n_name,c_address,c_comment,revenue
i64,str,"decimal[15,2]",str,str,str,str,"list[decimal[38,4]]"
8242,"""Customer#000008242""",6322.09,"""15-792-676-1184""","""ETHIOPIA""","""cYDWDiJt06B8CYzXX2L8x2hn1VFG""",""" regular theodolites affix. ca…","[622786.7297, 622786.7297, … 622786.7297]"
2455,"""Customer#000002455""",2070.99,"""17-946-225-9977""","""GERMANY""","""a5DZ199yfAcFhfi2uwBE PKo,Z""","""pinto beans alongside of the f…","[481592.4053, 481592.4053, … 481592.4053]"
1966,"""Customer#000001966""",1937.72,"""10-973-269-8886""","""ALGERIA""","""IbwZr7j QVifqf9WizOIWx,UXV9Cqx…","""odolites across the unusual ac…","[444059.0382, 444059.0382, … 444059.0382]"
1565,"""Customer#000001565""",1820.03,"""12-402-178-2007""","""BRAZIL""","""n4acVpG0Deyj5aIFAfSNg Iu9cUagw…","""deposits; unusual, bold deposi…","[412506.0062, 412506.0062, … 412506.0062]"
14398,"""Customer#000014398""",-602.24,"""34-814-111-5424""","""UNITED STATES""","""l49oKjbjQHz6YZwjo5wPihM lyYO6G""","""es haggle fluffily blithely fl…","[408575.3600, 408575.3600, … 408575.3600]"
…,…,…,…,…,…,…,…
12226,"""Customer#000012226""",1850.48,"""19-265-644-3796""","""INDONESIA""","""08fy8Pc0NmrqAiAYPZuOOD55dV3tbO…",""". quickly bold theodolites gro…","[323369.1017, 323369.1017, … 323369.1017]"
3541,"""Customer#000003541""",7052.19,"""26-551-286-8801""","""MOZAMBIQUE""","""ye4dLVD7hS2cbIL956lC""","""ular ideas wake bold, unusual …",[319976.9349]
11866,"""Customer#000011866""",3380.37,"""33-807-163-1247""","""UNITED KINGDOM""","""hdAdp5v,AYqoX24svKZw1UGEXmQ""","""ep at the furiously final requ…",[316999.3436]
9986,"""Customer#000009986""",-196.72,"""26-349-647-1183""","""MOZAMBIQUE""","""yKAZSS,DElfPsdFKAqaNz0I""","""quickly furiously regular requ…",[316228.2307]


In [12]:
# Every group-by key combination seen in any sample, one row per group
# (falls back to the constant "" key column when INDEX_COLS is empty).
key_selector = INDEX_COLS or pl.lit(0).alias("")
allgroups: pl.DataFrame = listdf.select(key_selector)
allgroups.to_dicts()

[{'c_custkey': 8242,
  'c_name': 'Customer#000008242',
  'c_acctbal': Decimal('6322.09'),
  'c_phone': '15-792-676-1184',
  'n_name': 'ETHIOPIA',
  'c_address': 'cYDWDiJt06B8CYzXX2L8x2hn1VFG',
  'c_comment': ' regular theodolites affix. carefully ironic packages cajole deposits; slyly ironic packages wake quickly. regular,'},
 {'c_custkey': 2455,
  'c_name': 'Customer#000002455',
  'c_acctbal': Decimal('2070.99'),
  'c_phone': '17-946-225-9977',
  'n_name': 'GERMANY',
  'c_address': 'a5DZ199yfAcFhfi2uwBE PKo,Z',
  'c_comment': 'pinto beans alongside of the furiously ironic asymptotes are quickly even platelets: express'},
 {'c_custkey': 1966,
  'c_name': 'Customer#000001966',
  'c_acctbal': Decimal('1937.72'),
  'c_phone': '10-973-269-8886',
  'n_name': 'ALGERIA',
  'c_address': 'IbwZr7j QVifqf9WizOIWx,UXV9CqxUyrwj',
  'c_comment': 'odolites across the unusual accounts hang carefully furiously bold excuses. regular pi'},
 {'c_custkey': 1565,
  'c_name': 'Customer#000001565',
  'c_acctb

In [13]:
# Build the template for the final output: one row per group-by group seen in any sample.
# 1. Take the first occurrence of each group across all samples (fixes schema and column order).
# 2. Replace it with an all-null frame with one row per group, same schema.
# 3. Write the group-by key values back, leaving the remaining columns null.
first_per_group = (
    dfsdf
    .drop("sample")
    .group_by(INDEX_COLS or pl.lit(0).alias(""), maintain_order=True)
    .first()
)
templatedf = first_per_group.clear(n=allgroups.height).with_columns(allgroups)

with open(f"{OUTPUT_DIR}/template.pkl", "wb") as template_file:
    pickle.dump(templatedf, template_file)

In [14]:
# Write every (output column, group) entry of the output table to its own JSON file,
# named by a running number: {OUTPUT_DIR}/json/<i>.json.
# Each file holds the column name ("col"), the group-by key values ("row"), the repr of
# the element dtype ("dtype") and the list of per-sample values ("values").
# NOTE(review): no code visible in this notebook writes reverse_map.json; the mapping
# from file number to table entry can be recovered from the "col"/"row" fields.
os.makedirs(f"{OUTPUT_DIR}/json", exist_ok=True)
i: int = 0
for col in OUTPUT_COLS:
    for group in allgroups.iter_rows(named=True):
        # eq_missing treats NULL as equal to NULL, so a group whose key contains a
        # NULL still matches its own row (plain eq would yield null and drop it).
        values = (
            listdf
            .filter(pl.col(k).eq_missing(v) for k, v in group.items())
            .select(col)
            .to_series()
        )
        j = pl.DataFrame().with_columns([
            pl.lit(col).alias("col"),
            pl.lit(group).alias("row"),
            pl.lit(repr(values.explode().dtype)).alias("dtype"),
            values.alias("values"),
        ])
        j.write_json(f"{OUTPUT_DIR}/json/{i}.json")
        i += 1

In [15]:
# Pack the contents of OUTPUT_DIR into a sibling archive named "<OUTPUT_DIR>.zip";
# the call returns the archive path, which is displayed as the cell output.
shutil.make_archive(base_name=OUTPUT_DIR, format="zip", root_dir=OUTPUT_DIR)

'/Users/michael/projects/dpdb/pacdb/outputs/ap-duckdb-q10-customer-step1.zip'