In [106]:
import os

# Name of this experiment; it is also the name of the output folder.
EXPERIMENT = 'dp-q1-count-order-N-O'
# Folder (relative to the notebook) that the CSV files of this notebook are written to.
OUTPUT_DIR = f'../dp/{EXPERIMENT}'

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [107]:
from typing import Any, Callable, Dict, List, Tuple, Union

import numpy as np

import pandas as pd
from pandas import DataFrame
import concurrent.futures
import pyarrow as pa
import pyarrow.parquet as pq
import pickle
from numpy.random import laplace
from functools import reduce
import operator
from IPython.display import display, HTML
from datetime import date

In [108]:
### Data Setup
# Load the TPC-H lineitem table from its parquet export.
lineitem_parquet_path = '../data/tpch/lineitem.parquet'
lineitem_df = pd.read_parquet(lineitem_parquet_path)

# Last expression of the cell: show (rows, columns) as a sanity check.
lineitem_df.shape

(600572, 16)

In [109]:
# Keep only line items shipped on or before 1998-09-02 whose return flag is
# 'N' and whose line status is 'O'.
shipped_by_cutoff = lineitem_df['l_shipdate'] <= date(1998, 9, 2)
return_flag_is_n = lineitem_df['l_returnflag'] == 'N'
line_status_is_o = lineitem_df['l_linestatus'] == 'O'

lineitem_filtered = lineitem_df[shipped_by_cutoff & return_flag_is_n & line_status_is_o]

In [110]:
### Mayuri's conversion functions between DP epsilon and PAC MI using posterior advantage for equivalence
def calc_posterior(mi, prior=0.5, prec = 100000):
    """Return the largest posterior success rate allowed by an MI budget.

    Scans the grid t = 1/prec, 2/prec, ..., (prec-1)/prec and returns the
    largest t whose binary KL divergence from ``prior``,
    ``t*log(t/prior) + (1-t)*log((1-t)/(1-prior))``, is at most ``mi``.

    Args:
        mi: Mutual-information budget, in nats (natural logarithms are used).
        prior: Prior success probability; must lie strictly between 0 and 1.
        prec: Grid resolution; the result is a multiple of ``1/prec``.

    Returns:
        The largest qualifying grid point as a float, or ``None`` when no
        grid point satisfies the bound (for instance when the grid is empty).

    Raises:
        ValueError: if ``prior`` is not strictly between 0 and 1.

    Note:
        As t approaches 1 the divergence approaches ``log(1/prior)``, so every
        ``mi`` at or above that value (about 0.693 for the default prior)
        yields the same saturated answer ``(prec-1)/prec``.
    """
    if not 0 < prior < 1:
        raise ValueError(f"prior must be strictly between 0 and 1, got {prior!r}")

    # Evaluate the divergence on the whole grid at once instead of looping
    # over 100k Python scalars.
    grid = np.arange(1, prec) / prec
    divergence = grid * np.log(grid / prior) + (1 - grid) * np.log((1 - grid) / (1 - prior))

    feasible = grid[divergence <= mi]
    if feasible.size == 0:
        return None
    # The grid is ascending, so the last feasible point is the maximum.
    return float(feasible[-1])

def dp_epsilon_to_posterior_success(epsilon):
    """Map a DP epsilon to ``1 - 1/(1 + exp(epsilon))``.

    This is the logistic function of ``epsilon``; it is the inverse of
    the log-odds transform ``log(ps / (1 - ps))``.
    """
    denominator = 1 + np.exp(epsilon)
    complement = 1. / denominator
    return 1 - complement

def dp_ps_to_epsilon(ps):
    """Return the log-odds ``log(ps / (1 - ps))`` of a posterior success rate."""
    odds = ps / (1 - ps)
    return np.log(odds)

# example usage:
# dp_ps_to_epsilon(calc_posterior(1/256.))

In [111]:
# Earlier MI grid, kept for reference:
# mi_list = [1/256., 1/128., 1/64., 1/16., 1/4., 1., 2., 4., 16.]
mi_list = [0.001248318631131131, 1/64, 1/32, 1/16, 1/4, 1., 2., 4., 16.]

# Translate each MI budget into a DP epsilon: MI -> posterior success -> log-odds.
eps_list = [dp_ps_to_epsilon(posterior) for posterior in map(calc_posterior, mi_list)]

In [112]:
import numpy as np
# Ground-truth query answer that the noisy DP results are compared against.
# NOTE(review): hard-coded rather than derived from lineitem_filtered — confirm
# it matches the query this notebook evaluates.
true_count = 3765
# For every epsilon, run_dp keeps 100 absolute noise values
# and 100 noisy results (true result + noise).
# For the epsilon-vs-noise graph, the absolute noise is averaged per epsilon;
# for the other graph, every individual point is plotted on the y-axis.

def run_dp(df, eps_list, prefix, true_result, n_samples=100, rng=None):
    """Apply the Laplace mechanism to ``true_result`` for every epsilon.

    Args:
        df: Unused; kept so existing calls keep working.
        eps_list: Iterable of privacy budgets (epsilon); one batch of noise
            draws is produced per entry.
        prefix: Query type. Only ``'count'`` (sensitivity 1) is supported;
            other query types need their own sensitivity and are not
            implemented here.
        true_result: Exact query answer that the noise is added to.
        n_samples: Number of independent noise draws per epsilon.
        rng: Optional object exposing ``laplace(loc, scale, size)``, e.g. a
            ``numpy.random.Generator``, for reproducible draws. Defaults to
            the global ``np.random`` state.

    Returns:
        ``(scales, dp_noise_list, dp_result_list)``: ``scales[k]`` is
        ``sensitivity / eps_list[k]``; the other two are lists of lists
        holding, per epsilon, the absolute noise values and the noisy
        results (``true_result + noise``).

    Raises:
        ValueError: if ``prefix`` is not a supported query type.
    """
    # The sensitivity depends only on the query type, so resolve it once and
    # fail loudly instead of hitting an unbound local further down.
    if prefix == 'count':
        sensitivity = 1  # Sensitivity for count queries is always 1
    else:
        raise ValueError(f"unsupported query type {prefix!r}; only 'count' is implemented")

    sampler = np.random if rng is None else rng

    scales = []
    dp_noise_list = []   # list of lists: absolute noise per epsilon
    dp_result_list = []  # list of lists: noisy results per epsilon
    for eps in eps_list:
        scale = sensitivity / eps  # Scale parameter for Laplace noise
        scales.append(scale)

        # Draw all samples for this epsilon in one call.
        noise = sampler.laplace(loc=0, scale=scale, size=n_samples)
        dp_noise_list.append(np.abs(noise).tolist())
        dp_result_list.append((true_result + noise).tolist())

    return scales, dp_noise_list, dp_result_list


In [113]:
# Run the count query through the Laplace mechanism for every epsilon.
scale, dp_noise_list, dp_result_list = run_dp(
    df=lineitem_filtered,
    eps_list=eps_list,
    prefix='count',
    true_result=true_count,
)

In [114]:
# One value of 1/epsilon per MI level.
# NOTE(review): despite the name, 1/epsilon is the Laplace scale b for a
# sensitivity-1 query; the variance of Laplace noise is 2*b**2. Confirm which
# quantity is intended before using this.
var_of_count = [1 / eps for eps in eps_list]

In [115]:
import os
from datetime import datetime

def save_results(list, mi_list, prefix: str, type_of_list: str):
    """Flatten per-MI sample lists into a long table and write it as CSV.

    Args:
        list: Sequence of sample lists, one inner list per entry of
            ``mi_list``. The name shadows the builtin but is kept because
            callers pass it by keyword.
        mi_list: MI level that labels each inner list, in the same order.
        prefix: Query-type tag used in the output file name.
        type_of_list: Tag describing the values; used in the output file name.

    Writes ``{OUTPUT_DIR}/dp_basic_{prefix}_{type_of_list}.csv`` with the
    columns ``mi`` and ``count_order`` and no index column.

    Raises:
        ValueError: if ``list`` and ``mi_list`` have different lengths, which
            would otherwise silently drop the unmatched entries.
    """
    if len(list) != len(mi_list):
        raise ValueError(
            f"got {len(list)} result lists for {len(mi_list)} MI values; "
            "they must match one-to-one"
        )

    # Repeat each MI label once per sample so both columns line up row by row.
    mi_column = []
    value_column = []
    for mi, samples in zip(mi_list, list):
        mi_column.extend([mi] * len(samples))
        value_column.extend(samples)

    # Create DataFrame
    df = pd.DataFrame({
        'mi': mi_column,
        'count_order': value_column
    })

    df.to_csv(f'{OUTPUT_DIR}/dp_basic_{prefix}_{type_of_list}.csv', index=False)

In [116]:
# Disabled: `scale` is a flat list of numbers, while save_results iterates over
# every entry it receives, so this call would not work as written.
# save_results(list=scale, mi_list=mi_list, prefix="count", type_of_list="variances")

# Persist the absolute noise values and the noisy results, one CSV each.
save_results(dp_noise_list, mi_list, "count", "noise")
save_results(dp_result_list, mi_list, "count", "results")

In [117]:
def absolute_scaled_error(est: np.ndarray, actual: np.ndarray) -> np.ndarray:
    """Return the element-wise absolute difference ``|est - actual|``.

    No scaling is applied, despite the function name.
    """
    difference = est - actual
    return np.abs(difference)
def relative_error_percent(est: np.ndarray, actual: np.ndarray) -> np.ndarray:
    """Return the relative error ``|est - actual| / |actual|`` in percent.

    The magnitude of ``actual`` is used as the denominator so the result is
    non-negative even when the ground truth is negative. A zero ``actual``
    still divides by zero, following NumPy's usual rules.
    """
    return (np.abs(est - actual) / np.abs(actual)) * 100

In [118]:
# Query output columns for which error metrics are computed.
OUTPUT_COLS = ['count_order']
# Error column names: every absolute-error column first, then every
# relative-error column, each in OUTPUT_COLS order.
ERROR_COLS = (
    [f'absolute error {col}' for col in OUTPUT_COLS]
    + [f'relative error {col}' for col in OUTPUT_COLS]
)

In [119]:
# Reload the noisy results that save_results wrote for prefix "count" and
# type "results".
dp_results_csv_path = f"{OUTPUT_DIR}/dp_basic_count_results.csv"
dp_results_df = pd.read_csv(dp_results_csv_path)

dp_results_df

Unnamed: 0,mi,count_order
0,0.001248,3763.156900
1,0.001248,3763.722172
2,0.001248,3769.203896
3,0.001248,3759.416009
4,0.001248,3797.651608
...,...,...
895,16.000000,3765.188697
896,16.000000,3764.873477
897,16.000000,3765.020582
898,16.000000,3764.910237


In [120]:
# Error metrics for every noisy result, computed on whole columns at once
# rather than row by row with iterrows().
noisy_values = dp_results_df[OUTPUT_COLS].to_numpy()
dp_errors_df = pd.DataFrame(
    np.column_stack([
        dp_results_df['mi'].to_numpy(),
        absolute_scaled_error(noisy_values, true_count),
        relative_error_percent(noisy_values, true_count),
    ]),
    columns=['mi', *ERROR_COLS],
)
# The row index is written as an unnamed first column (to_csv default), unlike
# the CSVs produced by save_results; kept as-is so existing readers of this
# file are unaffected.
dp_errors_df.to_csv(f"{OUTPUT_DIR}/dp-q1-errors.csv")

In [121]:
# Mean absolute and mean relative error at each MI level, with 'mi' restored
# as an ordinary column.
grouped_avg = (
    dp_errors_df
    .groupby('mi')[['absolute error count_order', 'relative error count_order']]
    .mean()
    .reset_index()
)

In [122]:
# Show the per-MI average errors as plain text.
# NOTE(review): in the recorded output the averages stop shrinking from
# mi = 1 upward; presumably calc_posterior saturates for those budgets so they
# all map to the same epsilon — confirm before interpreting the large-MI rows.
print(grouped_avg)

          mi  absolute error count_order  relative error count_order
0   0.001248                   10.507145                    0.279074
1   0.015625                    3.066360                    0.081444
2   0.031250                    2.263352                    0.060116
3   0.062500                    1.247529                    0.033135
4   0.250000                    0.547295                    0.014536
5   1.000000                    0.089819                    0.002386
6   2.000000                    0.090586                    0.002406
7   4.000000                    0.079863                    0.002121
8  16.000000                    0.084827                    0.002253
