# PAC-DB

In [None]:
# Import your generated samples
# Archive of pickled query outputs; read by the loading cell further down.
ZIPFILE = './outputs/pac-duckdb-q1.zip'

# Experiment name; it also names the directory that results are written to.
EXPERIMENT = 'pac-q1-e2e'
OUTPUT_DIR = f'./outputs/{EXPERIMENT}'

In [None]:
import zipfile
import numpy as np
import pickle
import io

def load_numpy_outputs_from_zip(zip_path):
    """Unpickle every ``*nparr.pkl`` member of a zip archive.

    Members are visited in the order reported by ``ZipFile.namelist()``;
    members whose name does not end with ``nparr.pkl`` are ignored.

    NOTE: ``pickle.load`` can execute arbitrary code, so only point this at
    archives you produced yourself.

    Args:
        zip_path: Path (or file-like object) accepted by ``zipfile.ZipFile``.

    Returns:
        List with one unpickled object per matching member.
    """
    loaded = []

    with zipfile.ZipFile(zip_path, 'r') as archive:
        for member in archive.namelist():
            with archive.open(member) as handle:
                # Guard clause: anything that is not a pickled output is skipped.
                if not member.endswith('nparr.pkl'):
                    continue
                loaded.append(pickle.load(handle))

    return loaded


In [None]:
import pandas as pd
import zipfile

# Pull every pickled output out of the archive, then report how many objects
# came back and what the first one looks like.
out_np = load_numpy_outputs_from_zip(ZIPFILE)

print(f"Loaded {len(out_np)} arrays")
print(type(out_np[0]), getattr(out_np[0], 'shape', "No shape"))


In [None]:
# Show how many entries the first loaded object contains.
print(len(out_np[0]))
# Example of the structure (presumably copied from an earlier run):
# [array([[95549.5]]), 
# array([[95549.5]]), 
# NOTE(review): this rebinds out_np to its own first element, so re-running
# this cell unwraps one more level each time. Run it once per load.
out_np = out_np[0]
print(out_np)

In [None]:
from typing import Any, Callable, Dict, List, Tuple, Union

import numpy as np

import pandas as pd
from pandas import DataFrame
import concurrent.futures
import pyarrow as pa
import pyarrow.parquet as pq
import pickle
from numpy.random import laplace
from functools import reduce
import operator
from IPython.display import display, HTML
from datetime import date
from scipy import special

In [None]:
import os

# True: recompute the experiment and overwrite the saved outputs.
# False: reload the previously saved outputs instead.
GENERATE = True
# NOTE(review): the two flags below are not read anywhere in the visible part
# of this notebook; confirm they are still needed.
USE_EVEN_NUMBER_OF_INPUT_ROWS = False
SEED_RANDOM_NUMBER_GENERATOR = True

SAMPLING_METHOD = 'poisson' # 'poisson' or 'half'

if GENERATE:
    print("GENERATE = True, so we will generate new samples.")
else:
    print("GENERATE = False, so we will load saved output from files rather than recomputing.")

# exist_ok=True creates the directory if needed without a separate existence
# check, and fails loudly if the path exists but is not a directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
### Compute PAC Noise
def get_pac_noise_scale(out_np_raw: List[np.ndarray],
                           max_mi: float = 1./4) -> np.ndarray:
    """Estimate the per-dimension PAC noise level from repeated query outputs.

    Every element of ``out_np_raw`` is one sampled query output. Each sample
    is flattened before use, so 1-D vectors, row vectors ``(1, d)`` and column
    vectors ``(d, 1)`` are all accepted as long as every sample holds the same
    number of values.

    Args:
        out_np_raw: Non-empty sequence of sampled outputs.
        max_mi: MI budget the noise is calibrated for; must be positive.

    Returns:
        Array of shape ``(dimensions,)`` holding ``variance / (2 * max_mi)``
        for each output dimension. The experiment loop in this notebook passes
        ``np.sqrt`` of this value as the ``scale`` of ``np.random.normal``,
        i.e. it treats the result as a variance.

    Raises:
        ValueError: If ``out_np_raw`` is ``None`` or empty, if ``max_mi`` is
            not positive, or (from ``np.stack``) if the samples do not all
            have the same number of values.
    """
    if out_np_raw is None or len(out_np_raw) == 0:
        raise ValueError("Input list out_np cannot be empty.")
    # `not max_mi > 0` also rejects NaN, which would otherwise yield NaN noise.
    if not max_mi > 0:
        raise ValueError(f"max_mi must be positive, got {max_mi}.")

    # Flatten each sample so the layout of a single output (vector, row or
    # column) does not matter; only the number of values per sample does.
    flat_samples = [np.ravel(sample) for sample in out_np_raw]
    dimensions: int = flat_samples[0].size
    print(f'The dimensions are - {dimensions}')

    est_y: np.ndarray = np.stack(flat_samples, axis=-1)  # shape (dimensions, samples)
    print(f"est_y.shape: {est_y.shape}")
    print(f"est_y: {est_y}")

    # get the scale in each basis direction
    fin_var: np.ndarray = np.var(est_y, axis=1)  # shape (dimensions,)
    print(f"fin_var: {fin_var}")
    # Only reported for diagnostics; an earlier formula multiplied by this term.
    sqrt_total_var: np.floating[Any] = np.sum(np.sqrt(fin_var))
    print(f"sqrt_total_var: {sqrt_total_var}")

    # Current calibration: per-dimension variance divided by (2 * max_mi).
    pac_noise: np.ndarray = (1./(2*max_mi)) * fin_var
    print(f"For mi={max_mi}, we should add noise from a normal distribution with scale...")
    print(f"pac_noise: {pac_noise}")
    return pac_noise

In [None]:
# MI values to sweep; EXPERIMENTS noisy releases are produced for each one.
MI_OPTIONS = [0.001248318631131131, 1/64, 1/32, 1/16, 1/4, 1., 2., 4., 16.]
EXPERIMENTS = 1000
# Upper bound on how many of the pre-generated samples are drawn from.
SAMPLES = 1024
# One name per value of a sample / release, in order.
OUTPUT_COLS = ['A_F_sum_qty', 'A_F_sum_base_price', 'A_F_sum_disc_price', 'A_F_sum_charge', 'A_F_avg_qty', 'A_F_avg_price', 'A_F_avg_disc', 'A_F_count_order',
               'N_F_sum_qty', 'N_F_sum_base_price', 'N_F_sum_disc_price', 'N_F_sum_charge', 'N_F_avg_qty', 'N_F_avg_price', 'N_F_avg_disc', 'N_F_count_order',
               'N_O_sum_qty', 'N_O_sum_base_price', 'N_O_sum_disc_price', 'N_O_sum_charge', 'N_O_avg_qty', 'N_O_avg_price', 'N_O_avg_disc', 'N_O_count_order',
               'R_F_sum_qty', 'R_F_sum_base_price', 'R_F_sum_disc_price', 'R_F_sum_charge', 'R_F_avg_qty', 'R_F_avg_price', 'R_F_avg_disc', 'R_F_count_order'
               ]

if GENERATE:
    experiment_results = []
    saved_steps = []

    print(f"Generate samples... {len(out_np)} samples generated.")

    # Draw only from samples that actually exist: indexing with the fixed
    # SAMPLES constant would raise IndexError when fewer samples were loaded.
    n_available = min(SAMPLES, len(out_np))

    for mi in MI_OPTIONS:
        print("Getting PAC Noise...")
        scale = get_pac_noise_scale(out_np, mi) # estimate the stability of the query
        print(f"mi={mi}, scale={scale}")

        for e in range(EXPERIMENTS):
            # for each PAC release at this MI, we will choose a sample from the pre-generated out_np list and add noise to it
            steps = {
                "mi": mi,
                "scale": scale,
            }

            # choose our sample
            chosen_index = np.random.choice(n_available)
            chosen_sample = out_np[chosen_index].copy()
            steps["chosen_sample"] = chosen_sample

            # add noise to it; `scale` is used as a variance, hence the sqrt.
            # chosen_noise has one entry per entry of `scale`.
            chosen_noise = np.random.normal(loc=0, scale=np.sqrt(scale))
            steps["chosen_noise"] = chosen_noise

            print(f'Chosen Sample {chosen_sample}')
            release = chosen_sample + chosen_noise
            steps["release"] = release

            experiment_results.append([mi, *release])
            saved_steps.append(steps)

    df = pd.DataFrame(experiment_results, columns=['mi', *OUTPUT_COLS])

    # Save the new data to outputs/...
    df.to_parquet(f'{OUTPUT_DIR}/pac_results.parquet')
    with open(f'{OUTPUT_DIR}/experiment_results.pkl', 'wb') as f:
        pickle.dump(experiment_results, f)
    with open(f'{OUTPUT_DIR}/saved_steps.pkl', 'wb') as f:
        pickle.dump(saved_steps, f)
else:
    # Reload what a previous GENERATE run wrote to OUTPUT_DIR.
    df = pq.read_table(f"{OUTPUT_DIR}/pac_results.parquet").to_pandas()

    with open(f'{OUTPUT_DIR}/experiment_results.pkl', 'rb') as f:
        experiment_results = pickle.load(f)
    with open(f'{OUTPUT_DIR}/saved_steps.pkl', 'rb') as f:
        saved_steps = pickle.load(f)

# df.head()

In [None]:
# One row per recorded release; each non-'mi' cell holds that step's values.
saved_steps_df_temp = pd.DataFrame(data=saved_steps)
saved_steps_df_temp.head(n=5)

In [None]:
### Reconstruct Saved Steps
# Every key of a saved step except 'mi' is expanded into one column per output.
# Compare with `!=`: the previous `k not in ('mi')` was a substring test
# against the string 'mi' (the parentheses do not make a tuple).
steps = [k for k in saved_steps[0] if k != 'mi']  # pull keys from saved_steps[0]

saved_steps_df_temp = pd.DataFrame(saved_steps)

# Create expanded columns using comprehension.
# `.str[i]` picks element i out of the sequence stored in each row.
expanded = {
    'mi': saved_steps_df_temp['mi'],
    **{f'{step}_{col}': saved_steps_df_temp[step].str[i]
        for step in steps
        for i, col in enumerate(OUTPUT_COLS)}
}

# Create MultiIndex DataFrame using OUTPUT_COLS
saved_steps_df = pd.DataFrame(expanded)
# The tuples are generated in the same step/col order as the dict above, so
# each flat column is relabelled with its matching (step, query) pair.
saved_steps_df.columns = pd.MultiIndex.from_tuples([('mi',''), *[  # multiindex so that we can do things like saved_steps_df['release'][<aggregation>]
    (step, col) for step in steps for col in OUTPUT_COLS
]], names=["step", "query"])
saved_steps_df.head()

In [None]:
# Average every (step, query) column within each mi value.
saved_steps_df.groupby(by='mi').mean()

In [None]:
# Histogram of the chosen (pre-noise) A_F_count_order values for mi = 1/4.
(
    saved_steps_df[saved_steps_df['mi'].eq(1/4)]
    ['chosen_sample']['A_F_count_order']
    .hist()
)

In [None]:
# Only applies to experiments whose outputs include a 'mean' column.
if 'mean' in OUTPUT_COLS:
    print("Mean of chosen_sample['mean'] for mi = 1/32")
    (
        saved_steps_df[saved_steps_df['mi'].eq(1/32)]
        ['chosen_sample']['mean']
        .hist()
    )