## DuckDB Notebook

This notebook runs TPC-H Q2 once per random subsample of the `supplier` table and saves the raw, per-sample query outputs (without applying PAC) to be consumed by a second stage.

In [None]:
#!/usr/bin/env python
# coding: utf-8

import csv
import os
import pickle
import shutil
from typing import List

import duckdb
import numpy as np
import polars as pl

# --- Experiment configuration ---
# Experiment name; it is also the name of the output sub-directory.
EXPERIMENT = 'pac-duckdb-q2'
# Every artifact written by this notebook goes under this directory.
OUTPUT_DIR = f'./outputs/{EXPERIMENT}'
# NOTE(review): GENERATE is only reported by the prints below; nothing in this cell
# branches on it to actually skip or load work.
GENERATE = False
# NOTE(review): not read anywhere in this cell.
USE_EVEN_NUMBER_OF_INPUT_ROWS = False

if GENERATE:
    print("GENERATE = True, so we will generate new samples.")
else:
    print("GENERATE = False, so we will load saved output from files rather than recomputing.")

# exist_ok=True makes this a no-op when the directory is already there, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Database setup ---
# duckdb load data/tpch/tpch.duckdb
#con = duckdb.connect(database='data/tpch/tpch.duckdb', read_only=True)
# Use an in-memory database and load each TPC-H table from its parquet file.
con = duckdb.connect(database=':memory:')
tables = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"]
#tables = ["lineitem", "orders"]
for t in tables:
    con.execute(f"CREATE TABLE {t} AS SELECT * FROM 'data/tpch/{t}.parquet'")

# Full copies of two tables as dataframes (fetchdf materializes the whole result).
lineitem_df = con.execute("SELECT * FROM lineitem").fetchdf()
orders_df = con.execute("SELECT * FROM orders").fetchdf()

# Number of rows in lineitem.
row_count = lineitem_df.shape[0]

GENERATE = False, so we will load saved output from files rather than recomputing.


In [2]:
# Construct the table of random samples.
# For every sample id, each row of the `supplier` table (identified by its rowid) is
# assigned a random boolean `random_binary`. Keeping only the rows where
# random_binary is TRUE gives a roughly 50% subsample of `supplier` for that sample id.
# The first SAMPLES/2 sample ids are drawn at random; the second SAMPLES/2 are their
# complements (random_binary negated), so sample k and sample k + SAMPLES/2 together
# cover every supplier row exactly once.

# Total number of samples, counting both the random samples and their complements.
SAMPLES = 1024
assert SAMPLES % 2 == 0, "SAMPLES must be even to create complementary samples."

# NOTE(review): this name is bound to whatever con.execute() returns, not to the
# table's contents; the table itself lives in the database as `random_samples`.
random_samples = con.execute(f"""
DROP TABLE IF EXISTS random_samples;

CREATE TABLE random_samples AS
WITH sample_numbers AS MATERIALIZED (
    SELECT range AS sample_id FROM range({SAMPLES//2})
), random_values AS MATERIALIZED (
    SELECT 
        sample_numbers.sample_id,
        supplier.rowid AS row_id,
        (RANDOM() > 0.5)::BOOLEAN AS random_binary
    FROM sample_numbers
    JOIN supplier ON TRUE  -- Cross join to duplicate rows for each sample
)
SELECT
    sample_id,
    row_id,
    random_binary
FROM random_values
UNION ALL
SELECT -- select the complementary samples too
    ({SAMPLES//2}) + sample_id,
    row_id,
    NOT random_binary  -- Inverse the random_binary to get the complementary sample
FROM random_values
ORDER BY sample_id, row_id;
""")

In [3]:
#sample_sizes = con.execute("""
#SELECT sample_id, SUM(random_binary) AS sample_size
#FROM random_samples
#GROUP BY sample_id;
#""").pl()

The randomness of what rows are chosen is saved to disk in `random_binary.json`. For each sample #, there is an array with one entry per row, where 1 means the row was chosen and 0 means it was not.

In [None]:
# Export the selection mask of every sample: one output row per sample_id, holding an
# array with one 0/1 entry per row of random_samples for that sample.
# The ORDER BY inside array_agg pins the entries to ascending row_id; without it the
# element order of the aggregated list is not guaranteed, so position k could not be
# mapped back to a row reliably. The outer ORDER BY keeps the sample order in the file
# stable as well.
con.execute("""
SELECT sample_id, array_agg(random_binary::TINYINT ORDER BY row_id) AS random_binary
FROM random_samples
GROUP BY sample_id
ORDER BY sample_id;
""").pl().write_json(f"{OUTPUT_DIR}/random_binary.json")

The query is specified as a prepared statement, which we then execute once per sample.

In [5]:
# Query
# Defines the per-sample query as a prepared statement named `run_query`.
# It takes one named parameter, $sample. Inside the min(ps_supplycost) subquery the
# suppliers are joined to random_samples and restricted to the rows with
# random_binary = TRUE for sample_id = $sample. The outer query's tables are not
# joined against random_samples.
# The leading DEALLOCATE removes an existing `run_query` before it is prepared again,
# presumably so that this cell can be re-run - TODO confirm it is harmless on a
# fresh connection.
con.execute("""
DEALLOCATE PREPARE run_query;

PREPARE run_query AS 
SELECT
    s_acctbal,
    s_name,
    n_name,
    p_partkey,
    p_mfgr,
    s_address,
    s_phone,
    s_comment
FROM
    part,
    supplier,
    partsupp,
    nation,
    region
WHERE
    p_partkey = ps_partkey
    AND s_suppkey = ps_suppkey
    AND p_size = 15
    AND p_type LIKE '%BRASS'
    AND s_nationkey = n_nationkey
    AND n_regionkey = r_regionkey
    AND r_name = 'EUROPE'
    AND ps_supplycost = (
        SELECT
            min(ps_supplycost)
        FROM
            partsupp,
            supplier,
            nation,
            region
        JOIN random_samples AS rs ON rs.row_id = supplier.rowid
        WHERE
            p_partkey = ps_partkey
            AND s_suppkey = ps_suppkey
            AND s_nationkey = n_nationkey
            AND n_regionkey = r_regionkey
            AND r_name = 'EUROPE'
            AND rs.random_binary = TRUE
            AND rs.sample_id = $sample
        )
ORDER BY
    s_acctbal DESC,
    n_name,
    s_name,
    p_partkey
LIMIT 100;
""")

# Smoke test: execute the prepared statement for sample 0 and display the result.
# The `{0}` placeholder in this f-string simply formats the literal 0.
con.execute(f"EXECUTE run_query(sample := {0});").pl()

s_acctbal,s_name,n_name,p_partkey,p_mfgr,s_address,s_phone,s_comment
"decimal[15,2]",str,str,i64,str,str,str,str
9508.37,"""Supplier#000000070""","""FRANCE""",3563,"""Manufacturer#1""","""jd4djZv0cc5KdnA0q9oOqvceaPUbNl…","""16-821-608-1166""","""n instructions are about the i…"
9508.37,"""Supplier#000000070""","""FRANCE""",17268,"""Manufacturer#4""","""jd4djZv0cc5KdnA0q9oOqvceaPUbNl…","""16-821-608-1166""","""n instructions are about the i…"
9453.01,"""Supplier#000000802""","""ROMANIA""",10021,"""Manufacturer#5""","""1Uj23QWxQjj7EyeqHWqGWTbN""","""29-342-882-6463""","""s according to the even deposi…"
9453.01,"""Supplier#000000802""","""ROMANIA""",13275,"""Manufacturer#4""","""1Uj23QWxQjj7EyeqHWqGWTbN""","""29-342-882-6463""","""s according to the even deposi…"
9198.31,"""Supplier#000000025""","""RUSSIA""",12238,"""Manufacturer#1""","""aoagce3elDACNssVvTLcQl55Up6EYA""","""32-431-945-3541""","""quickly pending accounts cajol…"
…,…,…,…,…,…,…,…
683.07,"""Supplier#000000651""","""RUSSIA""",4888,"""Manufacturer#4""","""D4MGIq5Uz0,K""","""32-181-426-4490""","""ve to are slyly ironic asympto…"
167.56,"""Supplier#000000290""","""FRANCE""",2037,"""Manufacturer#1""","""VpG,Ul5yv1RgAK,,""","""16-675-286-5102""",""" carefully furiously stealthy …"
91.39,"""Supplier#000000949""","""UNITED KINGDOM""",9430,"""Manufacturer#2""","""R06m0VD95FZLoBJHcCMyaZQHitqmhZ…","""33-332-697-2768""","""sual requests. carefully regul…"
-314.06,"""Supplier#000000510""","""ROMANIA""",17242,"""Manufacturer#4""","""6E3aFs0w2SiImzMDSewWtzOwdpLz2""","""29-207-852-3454""","""lyly regular accounts. deposit…"


We now generate the query result for each sample. The query output of each sample is saved to disk in multiple formats:
- `csv` contains one file per sample, with the table written in CSV format. This does not preserve data type information.
- `parquet` contains one file per sample, with the table written in Parquet format. This preserves data type information as apache arrow converted types.
- `dfs.pkl` contains the python list of polars dataframes in a binary format. This could be used to resume the notebook with the exact same previously-used randomness.

In [6]:
# Execute the prepared query once for every sample id, collecting results in sample order
dfs: List[pl.DataFrame] = [
    con.execute(f"EXECUTE run_query(sample := {s});").pl()
    for s in range(SAMPLES)
]

# Persist each per-sample result as CSV and as Parquet
os.makedirs(f"{OUTPUT_DIR}/csv", exist_ok=True)
os.makedirs(f"{OUTPUT_DIR}/parquet", exist_ok=True)
for i, sample_df in enumerate(dfs):
    sample_df.write_csv(f"{OUTPUT_DIR}/csv/sample_{i}.csv")
    sample_df.write_parquet(f"{OUTPUT_DIR}/parquet/sample_{i}.parquet")

# Also keep the complete list of dataframes in a single pickle file
with open(f"{OUTPUT_DIR}/dfs.pkl", "wb") as pkl_file:
    pickle.dump(dfs, pkl_file)

The samples have been generated and stored in `{OUTPUT_DIR}/csv/sample_{i}.csv` (and, in Parquet format, in `{OUTPUT_DIR}/parquet/sample_{i}.parquet`).

In [7]:
# Show the column names of the first sample's result table
dfs[0].columns

['s_acctbal',
 's_name',
 'n_name',
 'p_partkey',
 'p_mfgr',
 's_address',
 's_phone',
 's_comment']

In [10]:
# Column roles: no index columns for this query, every result column is an output column
INDEX_COLS = []
OUTPUT_COLS = [
    's_acctbal',
    's_name',
    'n_name',
    'p_partkey',
    'p_mfgr',
    's_address',
    's_phone',
    's_comment',
]

# The first sample serves as the reference for the output schema and shape
reference_table = dfs[0].select(OUTPUT_COLS)
OUTPUT_SCHEMA = reference_table.collect_schema()
OUTPUT_SHAPE = reference_table.to_numpy().shape

# Record the schema next to the other outputs, then report schema and shape
with open(f"{OUTPUT_DIR}/schema.txt", "w") as schema_file:
    schema_file.write(str(OUTPUT_SCHEMA))
print(f"Output schema: {OUTPUT_SCHEMA}")
print(f"Output shape: {OUTPUT_SHAPE}")

Output schema: Schema([('s_acctbal', Decimal(precision=15, scale=2)), ('s_name', String), ('n_name', String), ('p_partkey', Int64), ('p_mfgr', String), ('s_address', String), ('s_phone', String), ('s_comment', String)])
Output shape: (27, 8)


In [11]:
# Display the first sample's result restricted to the output columns
dfs[0].select(OUTPUT_COLS)

s_acctbal,s_name,n_name,p_partkey,p_mfgr,s_address,s_phone,s_comment
"decimal[15,2]",str,str,i64,str,str,str,str
9508.37,"""Supplier#000000070""","""FRANCE""",3563,"""Manufacturer#1""","""jd4djZv0cc5KdnA0q9oOqvceaPUbNl…","""16-821-608-1166""","""n instructions are about the i…"
9508.37,"""Supplier#000000070""","""FRANCE""",17268,"""Manufacturer#4""","""jd4djZv0cc5KdnA0q9oOqvceaPUbNl…","""16-821-608-1166""","""n instructions are about the i…"
9453.01,"""Supplier#000000802""","""ROMANIA""",10021,"""Manufacturer#5""","""1Uj23QWxQjj7EyeqHWqGWTbN""","""29-342-882-6463""","""s according to the even deposi…"
9453.01,"""Supplier#000000802""","""ROMANIA""",13275,"""Manufacturer#4""","""1Uj23QWxQjj7EyeqHWqGWTbN""","""29-342-882-6463""","""s according to the even deposi…"
9198.31,"""Supplier#000000025""","""RUSSIA""",12238,"""Manufacturer#1""","""aoagce3elDACNssVvTLcQl55Up6EYA""","""32-431-945-3541""","""quickly pending accounts cajol…"
…,…,…,…,…,…,…,…
683.07,"""Supplier#000000651""","""RUSSIA""",4888,"""Manufacturer#4""","""D4MGIq5Uz0,K""","""32-181-426-4490""","""ve to are slyly ironic asympto…"
167.56,"""Supplier#000000290""","""FRANCE""",2037,"""Manufacturer#1""","""VpG,Ul5yv1RgAK,,""","""16-675-286-5102""",""" carefully furiously stealthy …"
91.39,"""Supplier#000000949""","""UNITED KINGDOM""",9430,"""Manufacturer#2""","""R06m0VD95FZLoBJHcCMyaZQHitqmhZ…","""33-332-697-2768""","""sual requests. carefully regul…"
-314.06,"""Supplier#000000510""","""ROMANIA""",17242,"""Manufacturer#4""","""6E3aFs0w2SiImzMDSewWtzOwdpLz2""","""29-207-852-3454""","""lyly regular accounts. deposit…"


In [12]:
def numpyify(df: pl.DataFrame) -> np.ndarray:
    """Flatten the output columns of a result table into a 1-D numpy array.

    Only the columns listed in OUTPUT_COLS are kept; the resulting 2-D array is
    then flattened with numpy's default (row-major) order.
    """
    selected = df.select(OUTPUT_COLS)
    as_matrix = selected.to_numpy()
    return as_matrix.flatten()

In [13]:
# Display the full result table of the first sample
dfs[0]

s_acctbal,s_name,n_name,p_partkey,p_mfgr,s_address,s_phone,s_comment
"decimal[15,2]",str,str,i64,str,str,str,str
9508.37,"""Supplier#000000070""","""FRANCE""",3563,"""Manufacturer#1""","""jd4djZv0cc5KdnA0q9oOqvceaPUbNl…","""16-821-608-1166""","""n instructions are about the i…"
9508.37,"""Supplier#000000070""","""FRANCE""",17268,"""Manufacturer#4""","""jd4djZv0cc5KdnA0q9oOqvceaPUbNl…","""16-821-608-1166""","""n instructions are about the i…"
9453.01,"""Supplier#000000802""","""ROMANIA""",10021,"""Manufacturer#5""","""1Uj23QWxQjj7EyeqHWqGWTbN""","""29-342-882-6463""","""s according to the even deposi…"
9453.01,"""Supplier#000000802""","""ROMANIA""",13275,"""Manufacturer#4""","""1Uj23QWxQjj7EyeqHWqGWTbN""","""29-342-882-6463""","""s according to the even deposi…"
9198.31,"""Supplier#000000025""","""RUSSIA""",12238,"""Manufacturer#1""","""aoagce3elDACNssVvTLcQl55Up6EYA""","""32-431-945-3541""","""quickly pending accounts cajol…"
…,…,…,…,…,…,…,…
683.07,"""Supplier#000000651""","""RUSSIA""",4888,"""Manufacturer#4""","""D4MGIq5Uz0,K""","""32-181-426-4490""","""ve to are slyly ironic asympto…"
167.56,"""Supplier#000000290""","""FRANCE""",2037,"""Manufacturer#1""","""VpG,Ul5yv1RgAK,,""","""16-675-286-5102""",""" carefully furiously stealthy …"
91.39,"""Supplier#000000949""","""UNITED KINGDOM""",9430,"""Manufacturer#2""","""R06m0VD95FZLoBJHcCMyaZQHitqmhZ…","""33-332-697-2768""","""sual requests. carefully regul…"
-314.06,"""Supplier#000000510""","""ROMANIA""",17242,"""Manufacturer#4""","""6E3aFs0w2SiImzMDSewWtzOwdpLz2""","""29-207-852-3454""","""lyly regular accounts. deposit…"


In [14]:
def tablify(arr: np.ndarray) -> pl.DataFrame:
    """Turn a flat array (as produced by `numpyify`) back into a result table.

    The array is reshaped to OUTPUT_SHAPE, wrapped in a DataFrame with
    OUTPUT_SCHEMA, and used to update the first sample's table (`dfs[0]`).
    Returns whatever `dfs[0].update(...)` returns.
    """
    # Restore the original 2-D shape
    grid = arr.reshape(OUTPUT_SHAPE)
    # Passing the schema coerces the numpy values to the correct column types
    values = pl.DataFrame(grid, schema=OUTPUT_SCHEMA)
    # Put the values back into the original dataframe; index columns are left
    # unchanged because only the output columns are present in `values`
    return dfs[0].update(values)

Samples in numpy format are saved to disk in a variety of ways, all of which contain the same data:
- `npy` contains arrays saved in the Numpy format. See https://numpy.org/doc/stable/reference/generated/numpy.lib.format.html
- `npcsv` contains numpy arrays saved in the CSV format. These are 1D arrays of whatever data types are in the table (for this query a mix of decimals, strings and integers, not just floats).
- `nparr.npz` contains all the numpy arrays saved in the Numpy format for saving multiple arrays in one file. See https://numpy.org/doc/stable/reference/generated/numpy.savez.html
- `nparr.pkl` contains the python list of numpy arrays in a binary format, if you don't want to use the numpy format for some reason.

In [15]:
# Flatten every sample's result table into a 1-D numpy array
nparr = list(map(numpyify, dfs))

# Make sure the per-sample output folders exist
os.makedirs(f"{OUTPUT_DIR}/npy", exist_ok=True)
os.makedirs(f"{OUTPUT_DIR}/npcsv", exist_ok=True)

# npy: one binary file per sample
for i, sample_arr in enumerate(nparr):
    np.save(f"{OUTPUT_DIR}/npy/arr_{i}.npy", sample_arr)

# npcsv: one single-row CSV file per sample
for i, sample_arr in enumerate(nparr):
    with open(f"{OUTPUT_DIR}/npcsv/arr_{i}.csv", "w", newline='') as csv_file:
        csv.writer(csv_file).writerow(sample_arr)

# npz: all arrays in one archive, passed positionally
np.savez(f"{OUTPUT_DIR}/nparr.npz", *nparr)

# pkl: the plain Python list of arrays
with open(f"{OUTPUT_DIR}/nparr.pkl", "wb") as pkl_file:
    pickle.dump(nparr, pkl_file)

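Here is an example of how to load the `npz` file into a list of 'samples', where each sample is a 1D numpy array. `allow_pickle=True` is needed because the arrays hold Python objects (decimals and strings) rather than plain numeric data.
```python
test = np.load(f"{OUTPUT_DIR}/nparr.npz", allow_pickle=True)
npsamples = [test[f'arr_{i}'] for i in range(SAMPLES)]
```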

In [16]:
# Round-trip check: read the archive back and list the samples in order.
# np.load on an .npz file returns a lazily-loading archive object that keeps the file
# open; the `with` block closes it once every array has been read into `npsamples`.
# allow_pickle=True is passed because the saved arrays are not plain numeric arrays.
with np.load(f"{OUTPUT_DIR}/nparr.npz", allow_pickle=True) as test:
    npsamples = [test[f'arr_{i}'] for i in range(SAMPLES)]
# Shape of the first reloaded sample
npsamples[0].shape

(216,)

In [None]:
# Bundle everything under OUTPUT_DIR into "<OUTPUT_DIR>.zip"; the archive is created
# next to the directory rather than inside it
shutil.make_archive(base_name=OUTPUT_DIR, format="zip", root_dir=OUTPUT_DIR)

'/Users/michael/projects/dpdb/pacdb/outputs/pac-duckdb-q2.zip'