In [1]:
import datetime
import pandas
import numpy
import glob
import re

# local imports
from differential_privacy_parameters import get_query_point_sensitivity
from differential_privacy_parameters import get_query_row_sensitivity
from differential_privacy_parameters import get_query_gamma

from differential_privacy_mechanisms import gaussian_mechanism_matrix_sample

from model_evaluation import test_train_split
from model_evaluation import principle_component_RSS
from model_evaluation import root_mean_squared_error
from model_evaluation import record_result

from preprocessing import centered_sample_covariance_matrix
from preprocessing import scale_data

#  Data Processing and Setup

Import and concatenate all data

In [2]:
target_dir = 'data/'

# Collect every frame first and concatenate once at the end: calling
# pandas.concat inside the loop re-copies all previously loaded rows on
# each iteration.
# NOTE(review): pandas.read_pickle can execute arbitrary code while
# unpickling -- only point target_dir at trusted files.
loaded_frames = []
for file_name in glob.glob(target_dir + '*'):
    # Files whose name ends in '.data' are skipped; everything else is loaded.
    if re.search(r'\.data$', file_name):
        continue
    print('Loading...\t' + file_name)
    loaded_frames.append(pandas.read_pickle(file_name))

if not loaded_frames:
    # Nothing matched: keep the original "no data" sentinel.
    data_load = None
elif len(loaded_frames) == 1:
    data_load = loaded_frames[0]
else:
    data_load = pandas.concat(loaded_frames, sort=False)

Loading...	data/Florida_100000_20190227
Loading...	data/Ohio_100000_20190227
Loading...	data/Pennsylvania_100000_20190227
Loading...	data/Illinois_100000_20190227
Loading...	data/Texas_100000_20190227
Loading...	data/California_100000_20190227
Loading...	data/Georgia_100000_20190227
Loading...	data/New York_100000_20190227


In [3]:
# Summary statistics of the concatenated raw data (before any scaling).
data_load.describe()

Unnamed: 0,bmi,diastolic_blood_pressure,glucose,hdl_cholesterol,ldl_cholesterol,systolic_blood_pressure,total_cholesterol,triglycerides,age,framingham
count,361387.0,361387.0,361387.0,361387.0,361387.0,361387.0,361387.0,361387.0,361387.0,355256.0
mean,35.605546,88.660763,90.589581,62.73458,98.904787,137.677034,193.914573,157.565889,54.731738,10.585139
std,6.172461,13.520208,24.874843,14.752899,28.458834,26.827233,29.088694,82.483132,11.830316,5.425655
min,14.0,66.8,33.15,0.0,50.4,96.7,160.0,100.0,2.433949,-6.0
25%,31.6,78.0,74.7,59.6,78.7,116.4,173.2,116.3,46.053388,8.0
50%,35.1,83.6,85.3,66.2,91.8,127.9,186.4,132.4,55.96167,12.0
75%,39.0,100.6,96.1,73.1,109.6,161.4,199.4,148.5,64.221766,14.0
max,255.7,123.5,200.0,80.0,200.0,203.3,305.0,599.9,78.863792,26.0


Scale data and establish evaluation parameters

In [4]:
# Number of mechanism draws performed per evaluation loop.
evaluation_samples = 100

# Columns of the loaded data that take part in the evaluation.
evaluation_features = [
    'bmi',
    'diastolic_blood_pressure',
    'systolic_blood_pressure',
    'glucose',
    'hdl_cholesterol',
    'ldl_cholesterol',
    'total_cholesterol',
    'triglycerides',
    'age',
    'framingham'
]

# (lower, upper) bound per feature, handed to scale_data as `data_bounds`.
data_feature_bounds = {
    'bmi': (0, 400),
    'diastolic_blood_pressure': (60, 140),
    'systolic_blood_pressure': (90, 250),
    'glucose': (0, 2000),
    'hdl_cholesterol': (0, 1500),
    'ldl_cholesterol': (0, 2000),
    'total_cholesterol': (0, 2100),
    'triglycerides': (0, 3000),
    'age': (0, 120),
    'framingham': (-10, 37)
}
# Range every feature is rescaled into.
target_feature_bounds = (0, 1)

# Setup for estimation of framingham score
response = ['framingham']
predictors = [f for f in evaluation_features if f not in response]

# Column layout of the result tables built with record_result.
results_columns = [
    'mechanism',
    'query',
    'sample size',
    'iteration',
    'metric',
    'result',
    'mechanism runtime (s)',
    'total runtime (s)'
]
result_pickle_location = 'results/'

# Scale data 'data_feature_bounds' -> 'target_feature_bounds'.
# Rows with a missing value in any evaluation feature are dropped first.
data_scaled = scale_data(data_load[evaluation_features].dropna(),
                         target_bounds=target_feature_bounds,
                         data_bounds=data_feature_bounds)

# Sample data if needed
sample_size = 350000
# Fixed seed so that Restart & Run All draws the same subsample every time.
sample_seed = 0

if isinstance(sample_size, int) and sample_size < len(data_scaled):
    data = data_scaled.sample(sample_size, random_state=sample_seed)
else:
    data = data_scaled

# sample_size is later written into result rows and output file names, so
# make it reflect the number of rows actually used (it differs from the
# requested value whenever no subsampling took place).
sample_size = len(data)

Differential Privacy parameters

In [5]:
# Privacy parameters handed to every mechanism draw below.
epsilon = 1.0
# delta is the reciprocal of the number of observations (rows of `data`).
delta = data.shape[0] ** -1

In [6]:
# Summary statistics of the scaled (and possibly subsampled) evaluation data.
data.describe()

Unnamed: 0,bmi,diastolic_blood_pressure,systolic_blood_pressure,glucose,hdl_cholesterol,ldl_cholesterol,total_cholesterol,triglycerides,age,framingham
count,350000.0,350000.0,350000.0,350000.0,350000.0,350000.0,350000.0,350000.0,350000.0,350000.0
mean,0.079198,0.383522,0.391108,0.337285,0.725792,0.318408,0.225064,0.114429,0.591019,0.51832
std,0.025143,0.23782,0.256661,0.139,0.224468,0.186366,0.190235,0.150998,0.201168,0.169538
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.062709,0.195767,0.18906,0.248427,0.666667,0.187834,0.089655,0.035834,0.442527,0.4375
50%,0.077341,0.294533,0.297505,0.310758,0.773333,0.274064,0.17931,0.070777,0.613155,0.5625
75%,0.093227,0.592593,0.616123,0.374288,0.886667,0.385027,0.267586,0.106165,0.754477,0.625
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Evaluation of Sample Covariance Differential Privacy Methods

## Gaussian Mechanism with symmetric and identity sampling

### Symmetric Gaussian Mechanism

Evaluation setup

In [None]:
# Labels written into every result row of this evaluation.
mechanism = 'gaussian'
query_type = 'covariance'
metric = 'principle component RSS'

# Output file name:
#   <mechanism>_<query>_<evaluation samples>_<sample size>_<YYYYMMDD>
run_date = datetime.date.today().strftime("%Y%m%d")
name_parts = [mechanism,
              query_type,
              str(evaluation_samples),
              str(sample_size),
              run_date]
result_pickle_name = '_'.join(name_parts)

print(result_pickle_name)

Differential Privacy parameters

In [None]:
# Sensitivity handed to the Gaussian mechanism in the evaluation loop below;
# derived for the 'covariance' query from the scaled feature range and the
# shape of the evaluation data.
sensitivity = get_query_point_sensitivity(
    query_type='covariance',
    query_scale=target_feature_bounds,
    query_shape=data.shape,
)

Evaluation of the symmetric matrix Gaussian mechanism sample

In [None]:
# Non-private query answer; every noisy draw is scored against it.
query = centered_sample_covariance_matrix(X=data)

result = None
sample = {}  # draw index -> noisy matrix, kept for later inspection

for i in range(evaluation_samples):
    # Time the mechanism draw (symmetric noise added to the query answer).
    start_clock = datetime.datetime.now()
    sample[i] = gaussian_mechanism_matrix_sample(data=query,
                                                 epsilon=epsilon,
                                                 delta=delta,
                                                 sensitivity=sensitivity,
                                                 symmetric=True,
                                                 verbose=False)
    end_sample_clock = datetime.datetime.now()

    # No post-processing happens after the draw, so the mechanism runtime is
    # recorded for both runtime columns.
    mechanism_seconds = (end_sample_clock - start_clock).total_seconds()
    rss = principle_component_RSS(true=query, pred=sample[i])
    row = [mechanism,
           query_type,
           sample_size,
           i + 1,  # iterations are recorded 1-based
           metric,
           rss,
           mechanism_seconds,
           mechanism_seconds]

    result = record_result(results=result,
                           column_names=results_columns,
                           new_data=[row])

result.to_pickle(result_pickle_location + result_pickle_name)

In [None]:
# Summary of the recorded results.
# NOTE(review): assumes record_result returns a pandas DataFrame -- confirm.
result.describe()

### Identity Gaussian Mechanism

Evaluation setup

In [None]:
# Labels written into every result row of this evaluation.
mechanism = 'gaussian'
# Was misspelled 'itentity', which leaked into both the recorded 'query'
# column and the output file name.
query_type = 'identity'
metric = 'principle component RSS'

# Output file name:
#   <mechanism>_<query>_<evaluation samples>_<sample size>_<YYYYMMDD>
# Field order matches the symmetric-mechanism setup so that all result files
# can be parsed the same way.
result_pickle_name = '_'.join([mechanism,
                               query_type,
                               str(evaluation_samples),
                               str(sample_size),
                               datetime.date.today().strftime("%Y%m%d")])

print(result_pickle_name)

Differential Privacy parameters

In [None]:
# Sensitivity handed to the Gaussian mechanism in the evaluation loop below;
# derived for the 'identity' query from the scaled feature range and the
# shape of the evaluation data.
sensitivity = get_query_row_sensitivity(
    query_type='identity',
    query_scale=target_feature_bounds,
    query_shape=data.shape,
)

Evaluation of the identity query Gaussian mechanism sample

In [None]:
# Non-private covariance of the data; the reference each draw is scored against.
query = centered_sample_covariance_matrix(X=data)

result = None

for i in range(evaluation_samples):
    # Sample mechanism
    start_clock = datetime.datetime.now()
    # Identity query: add (non-symmetric) iid noise to the data itself.
    # The sensitivity above is computed for the identity query over
    # data.shape, and the covariance is taken from the noisy sample below,
    # so the mechanism must perturb `data`, not the covariance `query`.
    sample = gaussian_mechanism_matrix_sample(
                data=data,
                epsilon=epsilon,
                delta=delta,
                sensitivity=sensitivity,
                symmetric=False,
                verbose=False)
    end_sample_clock = datetime.datetime.now()

    # Post-processing: covariance of the privatised data. Its cost is
    # included in the total runtime but not in the mechanism runtime.
    sample_cov = centered_sample_covariance_matrix(X=sample)
    end_loop_clock = datetime.datetime.now()

    result = record_result(results=result,
                           column_names=results_columns,
                           new_data=[[mechanism,
                                      query_type,
                                      sample_size,
                                      i+1,  # iterations are recorded 1-based
                                      metric,
                                      principle_component_RSS(true=query, pred=sample_cov),
                                      (end_sample_clock - start_clock).total_seconds(),
                                      (end_loop_clock - start_clock).total_seconds()
                                     ]])

result.to_pickle(result_pickle_location + result_pickle_name)

In [None]:
# Summary of the recorded results.
# NOTE(review): assumes record_result returns a pandas DataFrame -- confirm.
result.describe()