# Modified TPC-H Q1 Microbenchmark
This file is adapted to use numpy only without Spark.

```sql
select
  sum(l_quantity) as sum_qty,
  sum(l_extendedprice) as sum_base_price,
  sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
  sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
  avg(l_quantity) as avg_qty,
  avg(l_extendedprice) as avg_price,
  avg(l_discount) as avg_disc,
  count(*) as count_order
from
  lineitem
where
  l_shipdate <= '1998-09-02'
  and l_returnflag = 'A'
  and l_linestatus = 'F'
```

In [1]:
EXPERIMENT = 'pac-q1-svd'
OUTPUT_DIR = f'./outputs/{EXPERIMENT}'
GENERATE = True
USE_EVEN_NUMBER_OF_INPUT_ROWS = False
SEED_RANDOM_NUMBER_GENERATOR = True

SAMPLING_METHOD = 'poisson' # 'poisson' or 'half'

if GENERATE:
    print("GENERATE = True, so we will generate new samples.")
else:
    print("GENERATE = False, so we will load saved output from files rather than recomputing.")

import os
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)


GENERATE = True, so we will generate new samples.


### Running PAC

In [2]:
from typing import Any, Callable, Dict, List, Tuple, Union

import numpy as np
if SEED_RANDOM_NUMBER_GENERATOR:
    np.random.seed(0)

import pandas as pd
from pandas import DataFrame
import concurrent.futures
import pyarrow as pa
import pyarrow.parquet as pq
import pickle
from numpy.random import laplace
from functools import reduce
import operator
from IPython.display import display, HTML
from datetime import date
from scipy import special

In [3]:
### Mayuri's conversion functions between DP epsilon and PAC MI using posterior advantage for equivalence
def calc_posterior(mi, prior=0.5, prec = 100000):
    test_vals = [x / prec for x in range(1, prec)]
    max_t = None
    for t in test_vals:
        if t*np.log(t/prior)+(1-t)*np.log((1-t)/(1-prior)) <= mi:
            if  max_t is None or t > max_t:
                max_t = t
    return max_t

def dp_epsilon_to_posterior_success(epsilon):
    return 1 - 1./(1+np.exp(epsilon))

def dp_ps_to_epsilon(ps):
    return np.log(ps / (1-ps))

# example usage:
# dp_ps_to_epsilon(calc_posterior(1/256.))

In [4]:
### Data Setup
#por_df = pq.read_table(f"./data/student_performance/student-por.parquet").to_pandas()
lineitem_df = pd.read_parquet('data/tpch/lineitem.parquet')

lineitem_df.shape

(600572, 16)

In [5]:
type(lineitem_df['l_linestatus'][0])

str

In [6]:
def runquery(lineitem_df: DataFrame) -> int:
    # 1. Filter lineitem rows where commit date is before receipt date.
    lineitem_filtered = lineitem_df[
        (lineitem_df['l_shipdate'] <= date(1998, 9, 2)) &
        (lineitem_df['l_returnflag'] == 'A') &
        (lineitem_df['l_linestatus'] == 'F')
    ]
    print(lineitem_filtered.shape)

    # 2. Pre-compute
    discounted_price = lineitem_filtered['l_extendedprice'] * (1 - lineitem_filtered['l_discount'])
    charged_price = discounted_price * (1 + lineitem_filtered['l_tax'])

    # 3. Get aggregations
    aggregated_result = [
        float(lineitem_filtered['l_quantity'].sum()),       # sum_qty
        float(lineitem_filtered['l_extendedprice'].sum()),  # sum_base_price
        float(discounted_price.sum()),                      # sum_disc_price
        float(charged_price.sum()),                         # sum_charge
        float(lineitem_filtered['l_quantity'].mean()),      # avg_qty
        float(lineitem_filtered['l_extendedprice'].mean()), # avg_price
        float(lineitem_filtered['l_discount'].mean()),      # avg_disc
        float(len(lineitem_filtered))                       # count_order (faster than .count())
    ]

    # print(aggregated_result)

    return aggregated_result
runquery(lineitem_df)

(147790, 16)


[3774200.0,
 5320753880.69,
 5054096266.6828,
 5256751331.449234,
 25.537587116854997,
 36002.12382901414,
 0.05014459706340077,
 147790.0]

In [7]:
### Query Setup
SAMPLES = 100
assert SAMPLES % 2 == 0, "We need an even number of samples for paired sampling."
number_of_pairs = SAMPLES // 2

OUTPUT_COLS = ['sum_qty', 'sum_base_price', 'sum_disc_price', 'sum_charge', 'avg_qty', 'avg_price', 'avg_disc', 'count_order']

true_result = np.array(runquery(lineitem_df)) # Save the true result of the query for later
#true_result = np.divide(true_result, 2) # manually correct count = count * 2

number_of_contributing_rows = 147790  # hardcode number of eligible rows

def poisson_paired_sample(df: DataFrame) -> Tuple[DataFrame, DataFrame]:
    """
    This will select a subset of indices, where each index is selected with probability 0.5.
    The first result is the dataframe composed of the selected rows.
    The second result is the complement / the dataframe composed of the rows that were not selected.
    """
    mask = np.random.random_sample(len(df)) < 0.5  # Generates a bitmask of length df.shape[0] where each bit is 1 with probability 0.5
    selected = df[mask]
    not_selected = df[~mask]
    return selected, not_selected

def half_paired_sample(df: DataFrame) -> Tuple[DataFrame, DataFrame]:
    """
    This will select half of the row indices from the dataframe at random.
    The first result is the dataframe composed of the selected rows.
    The second result is the complement / the dataframe composed of the rows that were not selected.
    """
    indices = np.random.choice(df.index, size=(df.shape[0] // 2), replace=False)
    not_indices = list(set(df.index) - set(indices))
    selected: DataFrame = df.loc[indices]
    not_selected: DataFrame = df.loc[not_indices]
    return (selected, not_selected)

def sample_using_chosen_method(df: DataFrame) -> Tuple[DataFrame, DataFrame]:
    if SAMPLING_METHOD == 'poisson':
        return poisson_paired_sample(df)
    elif SAMPLING_METHOD == 'half':
        return half_paired_sample(df)

def generate_samples(laplace_lambda: float = 1.0, alpha: int = 10) -> List[np.ndarray]:
    # Apply thresholding: if there are less than alpha samples
    # (with some Laplacian noise), then we don't subsample and return None.
    laplace_noise: float = np.random.laplace(scale=laplace_lambda)

    if number_of_contributing_rows + laplace_noise < alpha:  # if we don't get enough results from the query
        print("There are not enough rows contributing to the result for PAC to be meaningful.")
        return []
    
    number_of_pairs = SAMPLES // 2
    out_np: List[np.ndarray] = []
    for i in range(number_of_pairs):
        print(f'iteration : {i}')
        for temp_df in sample_using_chosen_method(lineitem_df.reset_index(drop=True)):  # reset index to sequential
            out = runquery(temp_df)
            # double the sums and counts, avg stays the same
            out[0] *= 2
            out[1] *= 2
            out[2] *= 2
            out[3] *= 2
            out[7] *= 2
            # for a 2d array, flatten it
            out_np.append(np.array(out))
    # Debug: SVD new (not needed I think??)
    # out_np = reduce(operator.iconcat, out_np, [])

    return out_np

(147790, 16)


In [8]:
out_np = generate_samples()

iteration : 0
(73684, 16)
(74106, 16)
iteration : 1
(73818, 16)
(73972, 16)
iteration : 2
(73858, 16)
(73932, 16)
iteration : 3
(74007, 16)
(73783, 16)
iteration : 4
(73832, 16)
(73958, 16)
iteration : 5
(73949, 16)
(73841, 16)
iteration : 6
(73827, 16)
(73963, 16)
iteration : 7
(73876, 16)
(73914, 16)
iteration : 8
(73943, 16)
(73847, 16)
iteration : 9
(74144, 16)
(73646, 16)
iteration : 10
(73870, 16)
(73920, 16)
iteration : 11
(73681, 16)
(74109, 16)
iteration : 12
(73801, 16)
(73989, 16)
iteration : 13
(73773, 16)
(74017, 16)
iteration : 14
(73555, 16)
(74235, 16)
iteration : 15
(73962, 16)
(73828, 16)
iteration : 16
(74054, 16)
(73736, 16)
iteration : 17
(73704, 16)
(74086, 16)
iteration : 18
(73663, 16)
(74127, 16)
iteration : 19
(73809, 16)
(73981, 16)
iteration : 20
(73772, 16)
(74018, 16)
iteration : 21
(74007, 16)
(73783, 16)
iteration : 22
(74058, 16)
(73732, 16)
iteration : 23
(73756, 16)
(74034, 16)
iteration : 24
(74167, 16)
(73623, 16)
iteration : 25
(74109, 16)
(73681, 

In [29]:
dimensions: int = len(out_np[0])

In [30]:
out_np_2darr = [np.atleast_2d(o) for o in out_np] # make sure all the DF -> np.ndarray conversions result in 2d arrays
est_y: np.ndarray = np.stack(out_np_2darr, axis=-1).reshape(dimensions, len(out_np))  # shape (dimensions, samples)

In [31]:
var = np.var(est_y, axis=1)
sqrt_total_var_no_svd = np.sum(np.sqrt(var))
sqrt_total_var_no_svd

49019433.347756885

In [32]:
cov = np.cov(est_y)
proj_var, u = np.linalg.eig(cov)
sqrt_total_var_svd = np.sum(np.sqrt(proj_var))
sqrt_total_var_svd

29144795.30056708

In [33]:
pac_noise_no_svd: np.ndarray = (1./(2*1)) * sqrt_total_var_no_svd * np.sqrt(var)

print(pac_noise_no_svd)

[2.75813242e+11 4.07374907e+14 3.88475985e+14 4.05314712e+14
 8.41970932e+05 1.42470334e+09 2.03540619e+03 9.57892659e+09]


In [37]:
pac_noise__svd: np.ndarray = (1./(2*1)) * sqrt_total_var_svd * np.sqrt(proj_var)
pac_noise__svd

array([4.14419368e+14, 6.90403678e+12, 3.34734599e+12, 3.61592933e+10,
       2.63436345e+09, 1.87041602e+06, 5.26883793e+02, 5.48510893e+02])

In [40]:
pac_noise_svd = np.matmul(np.matmul(u, pac_noise__svd), u.T)
pac_noise_svd

array([-2.43860957e+14, -2.22817366e+14,  2.49722739e+14, -1.50780853e+13,
        6.26991679e+06,  2.33996073e+12,  6.82470666e+08, -9.60567164e+12])

In [36]:
print(pac_noise_svd)

[-2.36347902e+14 -2.21343172e+14  2.41861135e+14 -8.95756810e+12
 -2.09459826e+07  2.26689913e+12  4.76797547e+09 -9.30992508e+12]


In [41]:
pac_noises_to_add_svd: np.ndarray = np.random.normal(loc=0, scale=np.sqrt(pac_noise_svd))

pac_noises_to_add_no_svd: np.ndarray = np.random.normal(loc=0, scale=np.sqrt(pac_noise_no_svd))

  pac_noises_to_add_svd: np.ndarray = np.random.normal(loc=0, scale=np.sqrt(pac_noise_svd))


In [42]:
print(pac_noises_to_add_svd)

[            nan             nan  1.16364308e+07             nan
 -3.04532894e+03 -1.89432949e+06 -2.28325949e+02             nan]


In [43]:
print(pac_noises_to_add_no_svd)

[-5.95477486e+05  1.99790528e+07 -1.98637366e+07  4.05011717e+06
 -3.65672263e+02  2.67589593e+04  8.54768233e+00  9.95381812e+04]


In [44]:
pac_noises_to_add_svd: np.ndarray = np.random.normal(loc=0, scale=np.sqrt(np.abs(pac_noise_svd)))

pac_noises_to_add_no_svd: np.ndarray = np.random.normal(loc=0, scale=np.sqrt(pac_noise_no_svd))

In [45]:
print(pac_noises_to_add_svd)

[ 5.40001579e+06 -7.17059789e+06  2.19019785e+07 -4.31642750e+06
  2.49810890e+03  1.02996487e+05  3.36043279e+04  6.92463163e+05]


In [46]:
print(pac_noises_to_add_no_svd)

[ 6.23422954e+04 -8.37506654e+05  1.00588299e+07 -2.25081621e+04
 -8.24821141e+02  2.48676321e+04  3.44536973e+01  5.84580801e+04]
