### Setup

In [1]:
# import parquet files as dataframes
import pandas as pd
import pyarrow.parquet as pq
import os
import sys
import numpy as np

In [2]:
### Mayuri's conversion functions between DP epsilon and PAC MI using posterior advantage for equivalence
def calc_posterior(mi, prior=0.5, prec = 100000):
    """Return the largest posterior success rate allowed by an MI bound.

    Scans the grid ``t = 1/prec, 2/prec, ..., (prec-1)/prec`` and returns the
    largest ``t`` whose binary KL divergence from ``prior``,
    ``t*log(t/prior) + (1-t)*log((1-t)/(1-prior))`` (natural log, i.e. nats),
    is at most ``mi``.

    Parameters
    ----------
    mi : float
        Upper bound on the KL term (the mutual-information budget).
    prior : float, default 0.5
        Prior success rate; must lie strictly between 0 and 1.
    prec : int, default 100000
        Grid resolution; the result is always a multiple of ``1/prec``.

    Returns
    -------
    float or None
        The largest grid point satisfying the bound, or ``None`` when no grid
        point does (for example a negative ``mi``). The result can never
        exceed ``(prec-1)/prec``, so any ``mi`` at or above the largest KL
        value on the grid returns that same capped value.

    Raises
    ------
    ValueError
        If ``prior`` is not strictly between 0 and 1 (the KL term is undefined
        there).
    """
    if not 0 < prior < 1:
        raise ValueError("prior must lie strictly between 0 and 1")

    # Evaluate the whole grid in one vectorized pass instead of a Python loop
    # with ~prec scalar np.log calls.
    grid = np.arange(1, prec) / prec
    divergence = (
        grid * np.log(grid / prior)
        + (1 - grid) * np.log((1 - grid) / (1 - prior))
    )

    feasible = grid[divergence <= mi]
    if feasible.size == 0:
        return None
    # Convert back to a built-in float, matching the original return type.
    return float(feasible.max())

def dp_epsilon_to_posterior_success(epsilon):
    """Convert a DP epsilon into a posterior success rate.

    Evaluates ``1 - 1/(1 + exp(epsilon))``, which is the inverse of the
    log-odds mapping used by ``dp_ps_to_epsilon``.
    """
    odds = np.exp(epsilon)
    complement = 1. / (1 + odds)
    return 1 - complement

def dp_ps_to_epsilon(ps):
    """Convert a posterior success rate into the matching DP epsilon.

    Returns the natural-log odds ``log(ps / (1 - ps))``.
    """
    odds = ps / (1 - ps)
    return np.log(odds)

# example usage:
# dp_ps_to_epsilon(calc_posterior(1/256.))

In [3]:
# Name of this experiment; it also names the output sub-directory.
EXPERIMENT = 'dp-student_performance'
OUTPUT_DIR = f'./outputs/{EXPERIMENT}'
# True  -> later cells draw fresh samples and save them under OUTPUT_DIR.
# False -> later cells load the previously saved results instead.
GENERATE = True

if GENERATE:
    print("GENERATE = True, so we will generate new samples.")
else:
    print("GENERATE = False, so we will load saved output from files rather than recomputing.")

# `os` is already imported in the notebook's import cell. exist_ok=True makes
# the creation idempotent without a separate existence check.
os.makedirs(OUTPUT_DIR, exist_ok=True)

GENERATE = True, so we will generate new samples.


In [4]:
### Data Setup
# Read the student-por parquet file into a pandas DataFrame; the trailing
# expression displays its (rows, columns) shape.
por_df = pq.read_table(
    "./data/student_performance/student-por.parquet"
).to_pandas()
por_df.shape

(649, 33)

In [5]:
# Exact (noise-free) count, sum and mean of the `absences` column, stored as a
# NumPy array in that order so the true query result is available later.
true_result = (
    por_df['absences']
    .agg(['count', 'sum', 'mean'])
    .to_numpy()
)
true_result

array([ 649.        , 2375.        ,    3.65947612])

In [6]:
### Manually input DP sensitivity per query
# NOTE(review): the absences range below is read off the data itself rather
# than from a data-independent bound — confirm that is acceptable for the DP
# guarantee being evaluated.
n = len(por_df)  # number of rows in the table
min_absences = por_df['absences'].min()
max_absences = por_df['absences'].max()

sensitivity = dict(
    count=1,
    mean=(max_absences - min_absences) / n,  # global sensitivity
)

In [12]:
### Run trials of DP
# Mutual-information budgets to sweep; each is converted to a DP epsilon via
# the posterior-success equivalence (calc_posterior -> dp_ps_to_epsilon).
# NOTE: with the default prior of 0.5 the KL term inside calc_posterior never
# exceeds ln(2) ~= 0.693, so every MI at or above that (16, 4, 2 and 1 here)
# saturates at the same grid-limited epsilon and shares one `eps` value.
MI_OPTIONS = [16., 4., 2., 1., 1/4, 1/16, 1/32, 1/64]
EPS_OPTIONS = [dp_ps_to_epsilon(calc_posterior(mi)) for mi in MI_OPTIONS]
EXPERIMENTS = 100  # noisy releases drawn per (mi, eps) setting
DP_TRIALS_SEED = 0  # fixed seed so regenerating reproduces the same samples

OUTPUT_COLS = ['count', 'sum', 'mean']

# temp
mi_list = MI_OPTIONS
eps_list = EPS_OPTIONS

if GENERATE:
    # A dedicated, seeded generator keeps regenerated samples reproducible.
    dp_rng = np.random.default_rng(DP_TRIALS_SEED)

    # The exact statistics are the same for every trial, so read them once.
    # true_result is ordered as (count, sum, mean).
    true_count = true_result[0]
    true_mean = true_result[2]

    experiment_results = []

    for mi, eps in zip(MI_OPTIONS, EPS_OPTIONS):
        # Laplace scale = sensitivity / eps; it only depends on eps, so it is
        # computed once per setting rather than once per trial.
        # NOTE(review): count and mean are each perturbed with the full eps,
        # so under sequential composition the combined release would cost
        # 2*eps — confirm this is the intended accounting.
        count_scale = sensitivity['count'] / eps
        mean_scale = sensitivity['mean'] / eps

        for _ in range(EXPERIMENTS):
            count_result = true_count + dp_rng.laplace(loc=0, scale=count_scale)
            mean_result = true_mean + dp_rng.laplace(loc=0, scale=mean_scale)

            # sum is derived from the two noisy releases as count * mean
            sum_result = count_result * mean_result

            experiment_results.append([eps, mi, count_result, sum_result, mean_result])

    df = pd.DataFrame(experiment_results, columns=['eps', 'mi', *OUTPUT_COLS])

    # Save the new data to outputs/...
    df.to_parquet(f'{OUTPUT_DIR}/dp_results.parquet')
else:
    # Reuse the results saved by an earlier GENERATE = True run.
    df = pq.read_table(f"{OUTPUT_DIR}/dp_results.parquet").to_pandas()

df.head()

Unnamed: 0,eps,mi,count,sum,mean
0,11.512915,16.0,649.076103,2379.751741,3.666368
1,11.512915,16.0,648.93024,2374.700793,3.659408
2,11.512915,16.0,648.988753,2375.919468,3.660956
3,11.512915,16.0,649.105309,2379.460371,3.665754
4,11.512915,16.0,649.019945,2375.712173,3.660461


In [13]:
# Preview the first two trials recorded for every distinct epsilon value.
# MI settings that map to the same eps fall into the same group.
df.groupby(by='eps').head(n=2)

Unnamed: 0,eps,mi,count,sum,mean
0,11.512915,16.0,649.076103,2379.751741,3.666368
1,11.512915,16.0,648.93024,2374.700793,3.659408
400,1.642612,0.25,649.26528,2392.44573,3.684851
401,1.642612,0.25,648.538159,2385.24064,3.677872
500,0.730432,0.0625,649.851537,2346.816638,3.611312
501,0.730432,0.0625,653.898826,2359.887735,3.608949
600,0.508011,0.03125,648.369957,2549.399537,3.932014
601,0.508011,0.03125,645.848029,2337.129215,3.618698
700,0.356323,0.015625,645.572514,2358.728603,3.6537
701,0.356323,0.015625,654.341056,2479.280805,3.788973
