## DuckDB Notebook

This notebook generates the raw per-sample query outputs, without applying PAC, so that they can be consumed by a second stage.

```
 group by              output cols                        
 key cols ┌────────┬────────┬────────┬────────┐           
        │ │   A    │   B    │   C    │   D    │           
      ┌─▼─┼────────┼────────┼────────┼────────┤           
      │ 1 │   2    │        │        │        │           
      ├───┼───│────┼────────┼────────┼────────┤           
      │ 2 │   │    │        │        │        │           
      ├───┼───┼────┼────────┼────────┼────────┤           
      │ 3 │   │    │        │        │        │           
      └───┴───┼────┴────────┴────────┴────────┘           
              ▼                 A_1.json                  
       Sample 0:   A1=2        ┌─────────────────────────┐
       Sample 1:   A1=4  ───▶  │{                        │
             ...               │    col: A               │
       Sample 999: A1=3        │    row: 1               │
                               │    value: [2, 4, ... 3] │
                               │}                        │
                               └─────────────────────────┘
```

In [1]:
#!/usr/bin/env python
# coding: utf-8

# --- Experiment configuration -------------------------------------------------
# EXPERIMENT names the run; every artifact is written below OUTPUT_DIR.
EXPERIMENT = 'pac-duckdb-q1'
OUTPUT_DIR = './outputs/' + EXPERIMENT
# NOTE(review): in the cells visible here GENERATE only selects the message below,
# and USE_EVEN_NUMBER_OF_INPUT_ROWS is not read at all -- confirm whether later
# cells depend on them.
GENERATE = False
USE_EVEN_NUMBER_OF_INPUT_ROWS = False

# Tell the reader which mode the notebook is running in.
print(
    "GENERATE = True, so we will generate new samples."
    if GENERATE
    else "GENERATE = False, so we will load saved output from files rather than recomputing."
)

import os
from typing import List
# Create the output directory (and any missing parents). exist_ok=True makes the
# call idempotent and avoids the check-then-create race of a separate exists() test;
# it also matches how the csv/json/pkl sub-directories are created further down.
os.makedirs(OUTPUT_DIR, exist_ok=True)

import numpy as np
import pickle

import duckdb
import polars as pl
import pyarrow as pa

# Work in an in-memory DuckDB database and materialise each TPC-H table from its
# parquet file under data/tpch/.
# (Disabled alternative: open the prebuilt database file read-only with
#  duckdb.connect(database='data/tpch/tpch.duckdb', read_only=True).)
con = duckdb.connect(database=':memory:')
tables = [
    "customer", "lineitem", "nation", "orders",
    "part", "partsupp", "region", "supplier",
]
# (Disabled alternative: load only ["lineitem", "orders"].)
for t in tables:
    create_table_sql = f"CREATE TABLE {t} AS SELECT * FROM 'data/tpch/{t}.parquet'"
    con.execute(create_table_sql)

# Pull the two fact tables into DataFrames via fetchdf().
lineitem_df, orders_df = (
    con.execute(f"SELECT * FROM {name}").fetchdf()
    for name in ("lineitem", "orders")
)

# Number of rows in the full lineitem table.
row_count = len(lineitem_df.index)

GENERATE = False, so we will load saved output from files rather than recomputing.


In [2]:
# Build the `random_samples` table that drives the subsampling.
# For every sample id and every row of `customer` it stores a boolean `random_binary`
# (RANDOM() > 0.5). The query cell joins this table to `customer` by rowid and keeps
# only rows with random_binary = TRUE, i.e. roughly half of the customers per sample.
# The second half of the sample ids reuses the same draws with the flag negated, so
# sample (SAMPLES/2 + k) is the complement of sample k -- hence SAMPLES must be even.

SAMPLES = 1024
assert SAMPLES % 2 == 0, "SAMPLES must be even to create complementary samples."

half_samples = SAMPLES // 2  # number of independently drawn samples

create_random_samples_sql = f"""
DROP TABLE IF EXISTS random_samples;

CREATE TABLE random_samples AS
WITH sample_numbers AS MATERIALIZED (
    SELECT range AS sample_id FROM range({half_samples})
), random_values AS MATERIALIZED (
    SELECT 
        sample_numbers.sample_id,
        customer.rowid AS row_id,
        (RANDOM() > 0.5)::BOOLEAN AS random_binary
    FROM sample_numbers
    JOIN customer ON TRUE  -- Cross join to duplicate rows for each sample
)
SELECT
    sample_id,
    row_id,
    random_binary
FROM random_values
UNION ALL
SELECT -- select the complementary samples too
    ({half_samples}) + sample_id,
    row_id,
    NOT random_binary  -- Inverse the random_binary to get the complementary sample
FROM random_values
ORDER BY sample_id, row_id;
"""
# NOTE(review): RANDOM() is not seeded here, so each run draws a different mask.
random_samples = con.execute(create_random_samples_sql)

The random choice of rows is saved to disk in `random_binary.json`. For each sample id, there is an array with one entry per `customer` row, where 1 means the row was chosen for that sample and 0 means it was not.

In [3]:
# Persist the sampling mask: one record per sample_id holding a 0/1 array with one
# entry per sampled row.
# array_agg is order-sensitive, so the ORDER BY inside the aggregate pins array
# position k to the k-th row_id; without it the element order is not guaranteed and
# a consumer could not map entries back to rows. The outer ORDER BY makes the record
# order deterministic as well.
con.execute("""
SELECT sample_id, array_agg(random_binary::TINYINT ORDER BY row_id) AS random_binary
FROM random_samples
GROUP BY sample_id
ORDER BY sample_id;
""").pl().write_json(f"{OUTPUT_DIR}/random_binary.json")

The query is specified as a prepared statement, which we then execute once per sample.

In [4]:
# Query: register the prepared statement `run_query`, parameterised by $sample.
# - DEALLOCATE runs first, so re-executing this cell replaces the old statement.
# - lineitem is joined to orders, then customer, then random_samples (by customer
#   rowid); only rows whose customer has random_binary = TRUE for the requested
#   sample_id are aggregated.
# - The sum_* columns and count_order are multiplied by 2, presumably to rescale the
#   ~50% subsample to full-table magnitude (TODO confirm); the avg_* columns are not.
# - Results are grouped and ordered by (l_returnflag, l_linestatus).
con.execute("""
DEALLOCATE PREPARE run_query;

PREPARE run_query AS 
SELECT
    l_returnflag,
    l_linestatus,
    2*sum(l_quantity) AS sum_qty,
    2*sum(l_extendedprice) AS sum_base_price,
    2*sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    2*sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    avg(l_quantity) AS avg_qty,
    avg(l_extendedprice) AS avg_price,
    avg(l_discount) AS avg_disc,
    2*count(*) AS count_order
FROM
    lineitem
JOIN orders ON lineitem.l_orderkey = orders.o_orderkey
JOIN customer ON orders.o_custkey = customer.c_custkey
JOIN random_samples AS rs
    ON rs.row_id = customer.rowid
WHERE
    l_shipdate <= CAST('1998-09-02' AS date)
    AND rs.random_binary = TRUE
    AND rs.sample_id = $sample
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
""")

# Execute the prepared statement for sample 0 so its result can be displayed below.
dfs0 = con.execute("EXECUTE run_query(sample := 0);").pl()

# Keep human-readable CSV copies of the first 5 samples for inspection.
csv_dir = f"{OUTPUT_DIR}/csv"
os.makedirs(csv_dir, exist_ok=True)
for s in range(5):
    sample_result = con.execute(f"EXECUTE run_query(sample := {s});").pl()
    sample_result.write_csv(f"{csv_dir}/sample_{s}.csv")

dfs0

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
"""A""","""F""",3872600.0,5455999234.44,5183002074.637,5390928871.288478,25.547888,35993.71452,0.050045,151582
"""N""","""F""",100320.0,140524548.0,133673902.6596,139097782.899638,25.683564,35976.58679,0.049037,3906
"""N""","""O""",7612938.0,10723688730.44,10187962703.2562,10595611689.229362,25.595902,36054.738392,0.050026,297428
"""R""","""F""",3883504.0,5480829269.88,5206187089.8862,5414018366.67911,25.530892,36032.011504,0.050182,152110


In [5]:
# Execute the prepared query once per sample id and stack all results into a single
# DataFrame; a leading "sample" column records which sample each row came from.
per_sample_frames = [
    con.execute(f"EXECUTE run_query(sample := {s});")
    .pl()
    .insert_column(0, pl.lit(s).alias("sample"))
    for s in range(SAMPLES)
]
dfsdf: pl.DataFrame = pl.concat(per_sample_frames)
dfsdf

sample,l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
i32,str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
0,"""A""","""F""",3872600.00,5455999234.44,5183002074.6370,5390928871.288478,25.547888,35993.71452,0.050045,151582
0,"""N""","""F""",100320.00,140524548.00,133673902.6596,139097782.899638,25.683564,35976.58679,0.049037,3906
0,"""N""","""O""",7612938.00,10723688730.44,10187962703.2562,10595611689.229362,25.595902,36054.738392,0.050026,297428
0,"""R""","""F""",3883504.00,5480829269.88,5206187089.8862,5414018366.679110,25.530892,36032.011504,0.050182,152110
1,"""A""","""F""",3814684.00,5371564822.48,5102349478.7372,5307162753.446800,25.575472,36013.548564,0.050089,149154
…,…,…,…,…,…,…,…,…,…,…
1022,"""R""","""F""",3816766.00,5380654177.90,5112668842.5692,5316650859.393206,25.536357,35999.666662,0.049963,149464
1023,"""A""","""F""",3735524.00,5263230946.96,4998594086.3996,5199040855.429432,25.566868,36022.879972,0.050322,146108
1023,"""N""","""F""",96504.00,135476490.56,128828550.0050,134060343.512812,25.557203,35878.307881,0.04901,3776
1023,"""N""","""O""",7383016.00,10405091665.52,9884850142.3378,10280838228.071774,25.579517,36049.931281,0.050046,288630


In [19]:
# INDEX_COLS are the group-by key columns of the query; OUTPUT_COLS are its aggregates.
# Both lists are pickled into OUTPUT_DIR as INDEX_COLS.pkl and OUTPUT_COLS.pkl.
INDEX_COLS = ['l_returnflag', 'l_linestatus']
OUTPUT_COLS = [
    'sum_qty', 'sum_base_price', 'sum_disc_price', 'sum_charge',
    'avg_qty', 'avg_price', 'avg_disc', 'count_order',
]
for pkl_stem, column_names in (("INDEX_COLS", INDEX_COLS), ("OUTPUT_COLS", OUTPUT_COLS)):
    with open(f"{OUTPUT_DIR}/{pkl_stem}.pkl", 'wb') as f:
        pickle.dump(column_names, f)

In [7]:
# Collapse the per-sample rows into one row per group-by key. Every output column
# becomes a list holding that group's value from each sample in which the group
# appeared, so a list can be shorter than the number of samples.
samples_by_group = dfsdf.group_by(INDEX_COLS, maintain_order=True)
listdf = samples_by_group.all().drop("sample")
listdf

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"list[decimal[38,2]]","list[decimal[38,2]]","list[decimal[38,4]]","list[decimal[38,6]]",list[f64],list[f64],list[f64],list[i64]
"""A""","""F""","[3872600.00, 3814684.00, … 3735524.00]","[5455999234.44, 5371564822.48, … 5263230946.96]","[5183002074.6370, 5102349478.7372, … 4998594086.3996]","[5390928871.288478, 5307162753.446800, … 5199040855.429432]","[25.547888, 25.575472, … 25.566868]","[35993.71452, 36013.548564, … 36022.879972]","[0.050045, 0.050089, … 0.050322]","[151582, 149154, … 146108]"
"""N""","""F""","[100320.00, 91692.00, … 96504.00]","[140524548.00, 128616635.28, … 135476490.56]","[133673902.6596, 122292640.8112, … 128828550.0050]","[139097782.899638, 127310635.683498, … 134060343.512812]","[25.683564, 24.90277, … 25.557203]","[35976.58679, 34931.188289, … 35878.307881]","[0.049037, 0.049745, … 0.04901]","[3906, 3682, … 3776]"
"""N""","""O""","[7612938.00, 7483198.00, … 7383016.00]","[10723688730.44, 10546727307.62, … 10405091665.52]","[10187962703.2562, 10019260010.9796, … 9884850142.3378]","[10595611689.229362, 10419724808.623714, … 10280838228.071774]","[25.595902, 25.545854, … 25.579517]","[36054.738392, 36004.01222, … 36049.931281]","[0.050026, 0.05005, … 0.050046]","[297428, 292932, … 288630]"
"""R""","""F""","[3883504.00, 3814518.00, … 3733208.00]","[5480829269.88, 5380328265.08, … 5264958039.52]","[5206187089.8862, 5112221125.1604, … 5003254201.9076]","[5414018366.679110, 5316421123.965584, … 5202537701.341728]","[25.530892, 25.492321, … 25.582884]","[36032.011504, 35956.589178, … 36079.643378]","[0.050182, 0.050035, … 0.049795]","[152110, 149634, … 145926]"


In [None]:
# The distinct group-by keys observed across all samples (one row per group).
allgroups: pl.DataFrame = listdf.select(pl.col(INDEX_COLS))
allgroups.rows(named=True)

[{'l_returnflag': 'A', 'l_linestatus': 'F'},
 {'l_returnflag': 'N', 'l_linestatus': 'F'},
 {'l_returnflag': 'N', 'l_linestatus': 'O'},
 {'l_returnflag': 'R', 'l_linestatus': 'F'}]

In [17]:
# Template for the final output table: one row per group-by key, with the query's
# output schema and every OUTPUT_COLS value null.
# Step 1: take the first row of each group purely to obtain a frame with the right
#         columns and dtypes (the "sample" column is dropped first).
# Step 2: clear() it to len(allgroups) all-null rows, then put the group keys back.
first_row_per_group = (
    dfsdf.drop("sample")
    .group_by(INDEX_COLS, maintain_order=True)
    .first()
)
templatedf = first_row_per_group.clear(n=allgroups.height).with_columns(allgroups)

with open(f"{OUTPUT_DIR}/template.pkl", "wb") as f:
    pickle.dump(templatedf, f)

In [18]:
# Export one file pair per (output column, group) cell of the result table:
#   json/{i}.json -> {"col": ..., "row": <group keys>, "values": [per-sample values]}
#   pkl/{i}.pkl   -> pickled numpy array of the same per-sample values
# reverse_map records which (column, group) each running index i refers to.
import json

os.makedirs(f"{OUTPUT_DIR}/json", exist_ok=True)
os.makedirs(f"{OUTPUT_DIR}/pkl", exist_ok=True)

i: int = 0
reverse_map = {}
for col in OUTPUT_COLS:
    for group in allgroups.iter_rows(named=True):
        # Select the single listdf row whose key columns all match this group.
        values = listdf.filter(pl.col(k).eq(v) for k, v in group.items()).select(col).to_series()
        j = pl.DataFrame().with_columns([
            pl.lit(col).alias("col"),
            pl.lit(group).alias("row"),
            values.alias("values"),
        ])
        reverse_map[i] = (col, group)
        j.write_json(f"{OUTPUT_DIR}/json/{i}.json")
        # Save a pickle of the numpy conversion of the values. The context manager
        # guarantees the file is flushed and closed before OUTPUT_DIR is archived.
        with open(f"{OUTPUT_DIR}/pkl/{i}.pkl", "wb") as f:
            pickle.dump(values.explode().to_numpy(), f)
        i += 1

# The JSON copy stores the integer keys as strings and the (col, group) tuples as
# lists; the pickle copy preserves the original Python types.
with open(f"{OUTPUT_DIR}/reverse_map.json", "w") as f:
    json.dump(reverse_map, f)
with open(f"{OUTPUT_DIR}/reverse_map.pkl", "wb") as f:
    pickle.dump(reverse_map, f)

In [11]:
# Print the variance (numpy .var(), default ddof) of the per-sample values for every
# (output column, group) cell.
for col in OUTPUT_COLS:
    for group in allgroups.iter_rows(named=True):
        key_matches = [pl.col(k).eq(v) for k, v in group.items()]
        values = listdf.filter(*key_matches).get_column(col).explode()
        print(col, group, values.to_numpy().var())

sum_qty {'l_returnflag': 'A', 'l_linestatus': 'F'} 1860348960.2500
sum_qty {'l_returnflag': 'N', 'l_linestatus': 'F'} 5787269.9375
sum_qty {'l_returnflag': 'N', 'l_linestatus': 'O'} 7620912534.0000
sum_qty {'l_returnflag': 'R', 'l_linestatus': 'F'} 1901432521.7500
sum_base_price {'l_returnflag': 'A', 'l_linestatus': 'F'} 3714742484253082.4505453125
sum_base_price {'l_returnflag': 'N', 'l_linestatus': 'F'} 11488706723513.5382578125
sum_base_price {'l_returnflag': 'N', 'l_linestatus': 'O'} 15211638773405307.9490640625
sum_base_price {'l_returnflag': 'R', 'l_linestatus': 'F'} 3792667125189579.1729640625
sum_disc_price {'l_returnflag': 'A', 'l_linestatus': 'F'} 3354208466817374.607160701875
sum_disc_price {'l_returnflag': 'N', 'l_linestatus': 'F'} 10391674132324.11949290570312
sum_disc_price {'l_returnflag': 'N', 'l_linestatus': 'O'} 13732849406732366.33934198312
sum_disc_price {'l_returnflag': 'R', 'l_linestatus': 'F'} 3429258028753074.150377986094
sum_charge {'l_returnflag': 'A', 'l_line

In [12]:
def _numpy_variance(samples):
    """Return numpy's .var() (default ddof) of one cell's list of per-sample values."""
    return samples.to_numpy().var()


# Same variances as printed above, laid out as a table: one row per group,
# one column per output column.
variance_exprs = [
    pl.col(name).map_elements(_numpy_variance, returns_scalar=True).alias(name)
    for name in OUTPUT_COLS
]
listdf.with_columns(variance_exprs)



l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[*,4]","decimal[*,10]","decimal[*,14]","decimal[*,14]",f64,f64,f64,f64
"""A""","""F""",1860348960.25,3714742484253081.5,3354208466817374.5,3628293529639753.5,0.00131,3137.748236,6.56e-09,2782300.0
"""N""","""F""",5787269.9375,11488706723513.537,10391674132324.12,11265797768775.123,0.053266,120294.500967,2.404e-07,7385.390625
"""N""","""O""",7620912534.0,1.5211638773405308e+16,1.3732849406732366e+16,1.4857462191709084e+16,0.000776,1717.037374,3.1061e-09,11427000.0
"""R""","""F""",1901432521.75,3792667125189579.0,3429258028753074.0,3707499794661229.5,0.001312,3466.831049,7.0248e-09,2858900.0


In [13]:
def numpyify(df: pl.DataFrame) -> np.ndarray:
    """Return the OUTPUT_COLS values of `df` as a flat 1-D numpy array.

    Only the columns named in OUTPUT_COLS are kept; the resulting 2-D array is
    flattened with numpy's default ordering.
    """
    output_matrix = df.select(OUTPUT_COLS).to_numpy()
    return output_matrix.flatten()

def tablify(arr: np.ndarray) -> pl.DataFrame:
    """Turn a flat array back into a table shaped like `templatedf`.

    `arr` is reshaped to OUTPUT_SHAPE and wrapped in a DataFrame with schema
    OUTPUT_SCHEMA; `templatedf` is then updated with those values and returned.

    NOTE(review): OUTPUT_SHAPE and OUTPUT_SCHEMA are module-level names that are not
    defined in the cells visible here -- they must be set before calling this.
    Presumably OUTPUT_SCHEMA covers only the output columns, so the group-key
    columns of the template are left unchanged; confirm against their definition.
    """
    # Reshape to the original table shape and coerce to the expected column schema.
    reshaped_values = pl.DataFrame(arr.reshape(OUTPUT_SHAPE), schema=OUTPUT_SCHEMA)
    # Put the values back into (a copy of) the template frame.
    return templatedf.update(reshaped_values)

In [14]:
# Bundle the contents of OUTPUT_DIR into "<OUTPUT_DIR>.zip" (written next to the
# directory, not inside it). The cell displays the path of the archive created.
import shutil
shutil.make_archive(base_name=OUTPUT_DIR, format='zip', root_dir=OUTPUT_DIR)

'/Users/michael/projects/dpdb/pacdb/outputs/pac-duckdb-q1.zip'

Save for later:

In [15]:
# Per-cell noise scale: variance of the per-sample values multiplied by 1 / (2 * mi).
# NOTE(review): `mi` is presumably the mutual-information budget -- confirm.
# NOTE(review): this uses polars' Series.var(), whereas the variance cells above use
# numpy's .var(); check that the two use the intended degrees-of-freedom convention.
mi = 1./4
variance_multiplier = 1./(2*mi)


def _scaled_variance(samples):
    """Return variance_multiplier times the Float64 variance of one cell's values."""
    return variance_multiplier * samples.cast(pl.Float64).var()


noise_scale_exprs = [
    pl.col(name).map_elements(_scaled_variance, return_dtype=pl.Float64).alias(name)
    for name in OUTPUT_COLS
]
noisescaledf = listdf.with_columns(noise_scale_exprs)
noisescaledf

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,f64,f64,f64,f64,f64,f64,f64,f64
"""A""","""F""",3724300000.0,7436700000000000.0,6715000000000000.0,7263700000000000.0,0.002623,6281.630877,1.3133e-08,5570000.0
"""N""","""F""",11586000.0,23000000000000.0,20804000000000.0,22554000000000.0,0.106636,240824.1818,4.8126e-07,14785.219941
"""N""","""O""",15257000000.0,3.0453e+16,2.7493e+16,2.9744e+16,0.001554,3437.431614,6.2183e-09,22876000.0
"""R""","""F""",3806600000.0,7592700000000000.0,6865200000000000.0,7422200000000000.0,0.002627,6940.439872,1.4063e-08,5723300.0


In [16]:
def _pick_one(samples):
    """Return one element of `samples`, chosen by np.random.choice."""
    return np.random.choice(samples)


# For every (group, output column) cell, pick one of the per-sample values at random.
# Each column keeps the dtype it has in templatedf.
# NOTE(review): numpy's global RNG is not seeded in the cells visible here, so the
# picks differ from run to run.
random_pick_exprs = [
    pl.col(name).map_elements(_pick_one, return_dtype=templatedf.schema[name]).alias(name)
    for name in OUTPUT_COLS
]
randomchoicedf = listdf.with_columns(random_pick_exprs)
randomchoicedf

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[*,2]","decimal[*,2]","decimal[*,4]","decimal[*,6]",f64,f64,f64,i64
"""A""","""F""",3777034.0,5292157623.02,5054003797.5806,5195964939.232922,25.570837,35995.457465,0.050011,149458
"""N""","""F""",94406.0,132758538.52,124272049.2462,130503485.029802,25.247755,35409.868184,0.0488,3760
"""N""","""O""",7519670.0,10502926615.42,10132418180.2374,10304718867.486025,25.606987,35969.400759,0.050065,297208
"""R""","""F""",3783334.0,5330816283.94,5041865160.4534,5211457050.016964,25.550229,36109.46539,0.049794,147052
