## DuckDB Notebook

This notebook generates a bunch of raw outputs, without applying PAC, to be consumed by a second stage.

```
 group by              output cols                        
 key cols ┌────────┬────────┬────────┬────────┐           
        │ │   A    │   B    │   C    │   D    │           
      ┌─▼─┼────────┼────────┼────────┼────────┤           
      │ 1 │   2    │        │        │        │           
      ├───┼───|────┼────────┼────────┼────────┤           
      │ 2 │   │    │        │        │        │           
      ├───┼───┼────┼────────┼────────┼────────┤           
      │ 3 │   │    │        │        │        │           
      └───┴───┼────┴────────┴────────┴────────┘           
              ▼                 A_1.json                  
       Sample 0:   A1=2        ┌─────────────────────────┐
       Sample 1:   A1=4  ───▶  │{                        │
             ...               │    col: A               │
       Sample 999: A1=3        │    row: 1               │
                               │    value: [2, 4, ... 3] │
                               │}                        │
                               └─────────────────────────┘
```

In [1]:
#!/usr/bin/env python
# coding: utf-8

import os
import pickle
import shutil

import duckdb
import polars as pl

In [2]:
EXPERIMENT = "pac-duckdb-q1"
OUTPUT_DIR = f"./outputs/{EXPERIMENT}-step1"
SAMPLES = 1024

SAMPLE_STEP = f"""
DROP TABLE IF EXISTS random_samples;

CREATE TABLE random_samples AS
WITH sample_numbers AS MATERIALIZED (
    SELECT range AS sample_id FROM range({SAMPLES//2})
), random_values AS MATERIALIZED (
    SELECT 
        sample_numbers.sample_id,
        customer.rowid AS row_id,
        (RANDOM() > 0.5)::BOOLEAN AS random_binary
    FROM sample_numbers
    JOIN customer ON TRUE  -- Cross join to duplicate rows for each sample
)
SELECT
    sample_id,
    row_id,
    random_binary
FROM random_values
UNION ALL
SELECT -- select the complementary samples too
    ({SAMPLES//2}) + sample_id,
    row_id,
    NOT random_binary  -- Inverse the random_binary to get the complementary sample
FROM random_values
ORDER BY sample_id, row_id;
"""

PREPARE_STEP = """
DEALLOCATE PREPARE run_query;

PREPARE run_query AS 
SELECT
    l_returnflag,
    l_linestatus,
    2*sum(l_quantity) AS sum_qty,
    2*sum(l_extendedprice) AS sum_base_price,
    2*sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    2*sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    avg(l_quantity) AS avg_qty,
    avg(l_extendedprice) AS avg_price,
    avg(l_discount) AS avg_disc,
    2*count(*) AS count_order
FROM
    lineitem
JOIN orders ON lineitem.l_orderkey = orders.o_orderkey
JOIN customer ON orders.o_custkey = customer.c_custkey
JOIN random_samples AS rs
    ON rs.row_id = customer.rowid
WHERE
    l_shipdate <= CAST('1998-09-02' AS date)
    AND rs.random_binary = TRUE
    AND rs.sample_id = $sample
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
"""

INDEX_COLS = ['l_returnflag', 'l_linestatus']
OUTPUT_COLS = ['sum_qty', 'sum_base_price', 'sum_disc_price', 'sum_charge', 'avg_qty', 'avg_price', 'avg_disc', 'count_order']

In [3]:
# Parameters
EXPERIMENT = "ap-duckdb-q1"
OUTPUT_DIR = "./outputs/ap-duckdb-q1-step1"
SAMPLES = 1024
SAMPLE_STEP = "DROP TABLE IF EXISTS random_samples;\n\nCREATE TABLE random_samples AS\nWITH sample_numbers AS MATERIALIZED (\n    SELECT range AS sample_id FROM range(512)\n), random_values AS MATERIALIZED (\n    SELECT \n        sample_numbers.sample_id,\n        customer.rowid AS row_id,\n        (RANDOM() > 0.5)::BOOLEAN AS random_binary\n    FROM sample_numbers\n    JOIN customer ON TRUE  -- Cross join to duplicate rows for each sample\n)\nSELECT\n    sample_id,\n    row_id,\n    random_binary\nFROM random_values\nUNION ALL\nSELECT -- select the complementary samples too\n    (512) + sample_id,\n    row_id,\n    NOT random_binary  -- Inverse the random_binary to get the complementary sample\nFROM random_values\nORDER BY sample_id, row_id;"
PREPARE_STEP = "DEALLOCATE PREPARE run_query;\n\nPREPARE run_query AS \nSELECT\n    l_returnflag,\n    l_linestatus,\n    2*sum(l_quantity) AS sum_qty,\n    2*sum(l_extendedprice) AS sum_base_price,\n    2*sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,\n    2*sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,\n    avg(l_quantity) AS avg_qty,\n    avg(l_extendedprice) AS avg_price,\n    avg(l_discount) AS avg_disc,\n    2*count(*) AS count_order\nFROM\n    lineitem\nJOIN orders ON lineitem.l_orderkey = orders.o_orderkey\nJOIN customer ON orders.o_custkey = customer.c_custkey\nJOIN random_samples AS rs\n    ON rs.row_id = customer.rowid\nWHERE\n    l_shipdate <= CAST('1998-09-02' AS date)\n    AND rs.random_binary = TRUE\n    AND rs.sample_id = $sample\nGROUP BY\n    l_returnflag,\n    l_linestatus\nORDER BY\n    l_returnflag,\n    l_linestatus;"
INDEX_COLS = ["l_returnflag", "l_linestatus"]
OUTPUT_COLS = ["sum_qty", "sum_base_price", "sum_disc_price", "sum_charge", "avg_qty", "avg_price", "avg_disc", "count_order"]


In [4]:
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [5]:
# duckdb load data/tpch/tpch.duckdb into the temporary in-memory database
con = duckdb.connect(database=':memory:')
tables = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"]
for t in tables:
    con.execute(f"CREATE TABLE {t} AS SELECT * FROM 'data/tpch/{t}.parquet'")

In [6]:
# Construct the table of random samples
# to use, join it with the lineitem table (for specific sample # s) and filter to just the
# rows where random_binary = 1.0
# This will give us a 50% sample of the lineitem table for each sample # s

assert SAMPLES % 2 == 0, "SAMPLES must be even to create complementary samples."

random_samples = con.execute(SAMPLE_STEP)

The randomness of what rows are chosen is saved to disk in `random_binary.json`. For each sample #, there is an array with one entry per row, where 1 means the row was chosen and 0 means it was not.

In [7]:
con.execute("""
SELECT sample_id, array_agg(random_binary::TINYINT) as random_binary
FROM random_samples
GROUP BY sample_id;
""").pl().write_json(f"{OUTPUT_DIR}/random_binary.json")

Query is specified as a prepared statement. We will then execute it once per sample.

In [8]:
# Query
con.execute(PREPARE_STEP)

# Run query to see output
dfs0 = con.execute(f"EXECUTE run_query(sample := {0});").pl()

# Save csv copies of the first 5 samples
os.makedirs(f"{OUTPUT_DIR}/csv", exist_ok=True)
for s in range(5):
    con.execute(f"EXECUTE run_query(sample := {s});").pl().write_csv(f"{OUTPUT_DIR}/csv/sample_{s}.csv")

dfs0

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
"""A""","""F""",3842098.0,5416596135.12,5145229170.086,5350995387.534556,25.55436,36026.578883,0.050143,150350
"""N""","""F""",99744.0,140795832.56,133840744.6392,139282165.85689,25.760331,36362.560062,0.049726,3872
"""N""","""O""",7634678.0,10761783537.96,10224255360.7336,10632709989.678114,25.562424,36032.596522,0.050008,298668
"""R""","""F""",3861686.0,5443865388.16,5173591839.9416,5380758912.449874,25.529445,35989.167205,0.049779,151264


In [9]:
# Run the query for each sample, but accumulate in a pl.DataFrame instead of a list
dfsdf: pl.DataFrame = pl.concat(
    con.execute(f"EXECUTE run_query(sample := {s});").pl().insert_column(0, pl.lit(s).alias("sample"))
    for s in range(SAMPLES)
)
dfsdf

sample,l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
i32,str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
0,"""A""","""F""",3842098.00,5416596135.12,5145229170.0860,5350995387.534556,25.55436,36026.578883,0.050143,150350
0,"""N""","""F""",99744.00,140795832.56,133840744.6392,139282165.856890,25.760331,36362.560062,0.049726,3872
0,"""N""","""O""",7634678.00,10761783537.96,10224255360.7336,10632709989.678114,25.562424,36032.596522,0.050008,298668
0,"""R""","""F""",3861686.00,5443865388.16,5173591839.9416,5380758912.449874,25.529445,35989.167205,0.049779,151264
1,"""A""","""F""",3829758.00,5406438475.32,5134768371.6780,5340452680.848228,25.558642,36080.928413,0.050236,149842
…,…,…,…,…,…,…,…,…,…,…
1022,"""R""","""F""",3733374.00,5262956829.20,5000971054.4154,5200691898.130638,25.514434,35967.830494,0.049923,146324
1023,"""A""","""F""",3758262.00,5295659653.84,5030523645.3272,5232137644.573534,25.60124,36073.975844,0.050096,146800
1023,"""N""","""F""",95808.00,135249217.72,128688773.1344,133977626.098436,25.480851,35970.536628,0.049016,3760
1023,"""N""","""O""",7408040.00,10436287443.80,9914870412.4462,10311215844.872888,25.571772,36024.96218,0.050026,289696


In [10]:
# Define which columns are the group-by keys (INDEX_COLS) and which are the output columns (OUTPUT_COLS)
# - moved to parameters cell at top of notebook

# Save these to disk for later use
with open(f"{OUTPUT_DIR}/INDEX_COLS.pkl", 'wb') as f:
    pickle.dump(INDEX_COLS, f)
with open(f"{OUTPUT_DIR}/OUTPUT_COLS.pkl", 'wb') as f:
    pickle.dump(OUTPUT_COLS, f)

In [11]:
# Combine all the samples into one table, grouped-by the group-by keys. Each cell contains an n <= # of samples length array of values.
listdf = dfsdf.drop("sample").group_by(INDEX_COLS, maintain_order=True).all()
listdf

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"list[decimal[38,2]]","list[decimal[38,2]]","list[decimal[38,4]]","list[decimal[38,6]]",list[f64],list[f64],list[f64],list[i64]
"""A""","""F""","[3842098.00, 3829758.00, … 3758262.00]","[5416596135.12, 5406438475.32, … 5295659653.84]","[5145229170.0860, 5134768371.6780, … 5030523645.3272]","[5350995387.534556, 5340452680.848228, … 5232137644.573534]","[25.55436, 25.558642, … 25.60124]","[36026.578883, 36080.928413, … 36073.975844]","[0.050143, 0.050236, … 0.050096]","[150350, 149842, … 146800]"
"""N""","""F""","[99744.00, 95772.00, … 95808.00]","[140795832.56, 134084171.84, … 135249217.72]","[133840744.6392, 127410397.6904, … 128688773.1344]","[139282165.856890, 132530678.683340, … 133977626.098436]","[25.760331, 25.15021, … 25.480851]","[36362.560062, 35211.17958, … 35970.536628]","[0.049726, 0.04989, … 0.049016]","[3872, 3808, … 3760]"
"""N""","""O""","[7634678.00, 7535866.00, … 7408040.00]","[10761783537.96, 10613995132.78, … 10436287443.80]","[10224255360.7336, 10083192277.3680, … 9914870412.4462]","[10632709989.678114, 10486254622.918048, … 10311215844.872888]","[25.562424, 25.58642, … 25.571772]","[36032.596522, 36037.548919, … 36024.96218]","[0.050008, 0.050039, … 0.050026]","[298668, 294526, … 289696]"
"""R""","""F""","[3861686.00, 3825876.00, … 3744198.00]","[5443865388.16, 5392912813.42, … 5279596842.16]","[5173591839.9416, 5124544370.5096, … 5016424743.9692]","[5380758912.449874, 5329511205.479190, … 5216391292.051106]","[25.529445, 25.534098, … 25.54302]","[35989.167205, 35992.583882, … 36017.551999]","[0.049779, 0.049899, … 0.049956]","[151264, 149834, … 146584]"


In [12]:
# What are all the possible group-by key combinations?
allgroups: pl.DataFrame = dfsdf.select(INDEX_COLS).unique()
allgroups.to_dicts()

[{'l_returnflag': 'A', 'l_linestatus': 'F'},
 {'l_returnflag': 'N', 'l_linestatus': 'F'},
 {'l_returnflag': 'R', 'l_linestatus': 'F'},
 {'l_returnflag': 'N', 'l_linestatus': 'O'}]

In [13]:
# Template for the final output, including all possible group-by groups
# Obtained by collecting all the samples in a big table and then keeping only the first occurrence of each groupby key.
# Then, fill all OUTPUT_COLS with nulls
templatedf = dfsdf.drop("sample").group_by(INDEX_COLS, maintain_order=True).first()
templatedf = templatedf.clear(n=len(allgroups)).with_columns(allgroups)
templatedf

with open(f"{OUTPUT_DIR}/template.pkl", "wb") as f:
    pickle.dump(templatedf, f)

In [14]:
# Write all table entries in the output table to their own JSON files. Each file has a number, the information of which file corresponds to which table entry
# is stored in reverse_map.json (as well as in the files themselves)
os.makedirs(f"{OUTPUT_DIR}/json", exist_ok=True)
i: int = 0
for col in OUTPUT_COLS:
    for group in allgroups.iter_rows(named=True):
        values = listdf.filter(pl.col(k).eq(v) for k, v in group.items()).select(col).to_series()
        j = pl.DataFrame().with_columns([
            pl.lit(col).alias("col"),
            pl.lit(group).alias("row"),
            pl.lit(values.explode().dtype.__repr__()).alias("dtype"),
            values.alias("values"),
        ])
        j.write_json(f"{OUTPUT_DIR}/json/{i}.json")
        i+=1

In [15]:
# zip the OUTPUT_DIR
shutil.make_archive(OUTPUT_DIR, 'zip', OUTPUT_DIR)

'/Users/michael/projects/dpdb/pacdb/outputs/ap-duckdb-q1-step1.zip'