In [45]:
import datetime
import pandas
import numpy
import glob
import re

# local imports
from differential_privacy_parameters import get_query_point_sensitivity
from differential_privacy_parameters import get_query_row_sensitivity
from differential_privacy_parameters import get_query_gamma

from differential_privacy_mechanisms import gaussian_mechanism_matrix_sample
from differential_privacy_mechanisms import matrixvariate_gaussian_mechanism_sample
from differential_privacy_mechanisms import MVGMechanism

from model_evaluation import test_train_split
from model_evaluation import principle_component_RSS
from model_evaluation import root_mean_squared_error
from model_evaluation import record_result

from preprocessing import centered_sample_covariance_matrix
from preprocessing import scale_data

from models import seq_nn_single_evaluation

#  Data Processing and Setup

Import and concatonate all data

In [2]:
target_dir = 'data/'

data_load = None
for file_name in glob.glob(target_dir + '*'):
    if not(re.search(r'\.data$',file_name)):
        print('Loading...\t' + file_name)
        if data_load is None:
            data_load = pandas.read_pickle(file_name)
        else:
            data_load = pandas.concat([data_load,
                                       pandas.read_pickle(file_name)], 
                                      sort=False)

Loading...	data\California_100000_20190227
Loading...	data\Florida_100000_20190227
Loading...	data\Georgia_100000_20190227
Loading...	data\Illinois_100000_20190227
Loading...	data\New York_100000_20190227
Loading...	data\Ohio_100000_20190227
Loading...	data\Pennsylvania_100000_20190227
Loading...	data\Texas_100000_20190227


In [3]:
data_load.describe()

Unnamed: 0,bmi,diastolic_blood_pressure,glucose,hdl_cholesterol,ldl_cholesterol,systolic_blood_pressure,total_cholesterol,triglycerides,age,framingham
count,361387.0,361387.0,361387.0,361387.0,361387.0,361387.0,361387.0,361387.0,361387.0,355256.0
mean,35.605546,88.660763,90.589581,62.73458,98.904787,137.677034,193.914573,157.565889,54.731738,10.585139
std,6.172461,13.520208,24.874843,14.752899,28.458834,26.827233,29.088694,82.483132,11.830316,5.425655
min,14.0,66.8,33.15,0.0,50.4,96.7,160.0,100.0,2.433949,-6.0
25%,31.6,78.0,74.7,59.6,78.7,116.4,173.2,116.3,46.053388,8.0
50%,35.1,83.6,85.3,66.2,91.8,127.9,186.4,132.4,55.96167,12.0
75%,39.0,100.6,96.1,73.1,109.6,161.4,199.4,148.5,64.221766,14.0
max,255.7,123.5,200.0,80.0,200.0,203.3,305.0,599.9,78.863792,26.0


Scale data and establish evaluation parameters

In [4]:
evaluation_samples = 100

evaluation_features = [
    'bmi',
    'diastolic_blood_pressure',
    'systolic_blood_pressure',
    'glucose',
    'hdl_cholesterol',
    'ldl_cholesterol',
    'total_cholesterol',
    'triglycerides',
    'age',
    'framingham'    
]

data_feature_bounds = {
    'bmi':(0,400),
    'diastolic_blood_pressure':(60,140),
    'systolic_blood_pressure':(90,250),
    'glucose':(0,2000),
    'hdl_cholesterol':(0,1500),
    'ldl_cholesterol':(0,2000),
    'total_cholesterol':(0,2100),
    'triglycerides':(0,3000),
    'age':(0,120),
    'framingham':(-10,37)
}
target_feature_bounds = (0,1)

# Setup for estimation of framingham score
response = ['framingham']
predictors = [ f for f in evaluation_features if f not in response]

results_columns = [
    'mechanism', 
    'query', 
    'sample size',
    'iteration', 
    'metric', 
    'result', 
    'mechanism runtime (s)', 
    'total runtime (s)'
]

# Scale data 'data_feature_bounds' -> 'target_feature_bounds'
data_scaled = scale_data(data_load[evaluation_features].dropna(),
                         target_bounds=target_feature_bounds,
                         data_bounds=data_feature_bounds)

# Sample data if needed
sample_size = 20000

if isinstance(sample_size, int) and sample_size < len(data_scaled):
    data = data_scaled.sample(sample_size)
else:
    data = data_scaled

Differential Privacy parameters

In [5]:
epsilon = 1.0 
# 1 / number of observations
delta = pow(data.shape[0], -1)

In [6]:
data.describe()

Unnamed: 0,bmi,diastolic_blood_pressure,systolic_blood_pressure,glucose,hdl_cholesterol,ldl_cholesterol,total_cholesterol,triglycerides,age,framingham
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,0.088702,0.356564,0.296782,0.044884,0.042263,0.049119,0.091807,0.050593,0.456524,0.43802
std,0.015016,0.167495,0.167375,0.011722,0.009029,0.013949,0.013147,0.022695,0.098096,0.113801
min,0.04475,0.10625,0.04875,0.017475,0.013333,0.02615,0.07619,0.033333,0.171526,0.085106
25%,0.079,0.225,0.164375,0.0374,0.039933,0.03935,0.082333,0.038767,0.383778,0.382979
50%,0.08775,0.295,0.23625,0.04265,0.0442,0.04575,0.088571,0.0441,0.466347,0.468085
75%,0.097,0.505,0.44375,0.04795,0.048733,0.05415,0.094726,0.049333,0.535181,0.510638
max,0.407,0.7775,0.6925,0.1,0.053333,0.1,0.145238,0.166133,0.655282,0.765957


# Evaluation of Sample Covariance Differential Privacy Methods

## Gaussian Mechanism with symmetric and identity sampling

### Symmetric Gaussian Mechanism

Evaluation setup

In [7]:
# labelling result values for mechanism
mechanism = 'gaussian'
query_type = 'covariance'
metric = 'principle component RSS'

result_pickle_location = 'results/'
result_pickle_name = '_'.join([mechanism, 
                               query_type, 
                               str(evaluation_samples), 
                               str(sample_size),
                               datetime.date.today().strftime("%Y%m%d")])

print(result_pickle_name)

gaussian_covariance_100_20000_20190227


Differential Privacy parameters

In [8]:
sensitivity = get_query_point_sensitivity(query_type='covariance',
                                          query_scale=target_feature_bounds,
                                          query_shape=data.shape)

Evaluation of symmetric matrix gaussian mechanism sample

In [9]:
query = centered_sample_covariance_matrix(X=data)

result = None
sample = dict()

for i in range(evaluation_samples): 
    # Sample mechanism
    start_clock = datetime.datetime.now()
    # Add symmetric iid noise
    sample[i] = gaussian_mechanism_matrix_sample(
                data=query,
                epsilon=epsilon,
                delta=delta,
                sensitivity=sensitivity,
                symmetric=True,
                verbose=False)
    end_sample_clock = datetime.datetime.now() 

    result = record_result(results=result, 
                           column_names=results_columns, 
                           new_data=[[mechanism, 
                                      query_type, 
                                      sample_size,
                                      i+1,
                                      metric, 
                                      principle_component_RSS(true=query, pred=sample[i]), 
                                      (end_sample_clock - start_clock).total_seconds(),
                                      (end_sample_clock - start_clock).total_seconds()
                                     ]])
    
result.to_pickle(result_pickle_location + result_pickle_name)


In [10]:
result.describe()

Unnamed: 0,sample size,iteration,result,mechanism runtime (s),total runtime (s)
count,100.0,100.0,100.0,100.0,100.0
mean,20000.0,50.5,0.076782,0.004936,0.004936
std,0.0,29.011492,0.019116,0.007252,0.007252
min,20000.0,1.0,0.041431,0.0,0.0
25%,20000.0,25.75,0.062701,0.0,0.0
50%,20000.0,50.5,0.074151,0.0,0.0
75%,20000.0,75.25,0.087489,0.0156,0.0156
max,20000.0,100.0,0.140329,0.0176,0.0176


## Identity Gaussian Mechanism

Evaluation setup

In [11]:
# labelling result values for mechanism
mechanism = 'gaussian'
query_type = 'itentity'
metric = 'principle component RSS'

result_pickle_name = '_'.join([mechanism, 
                               query_type, 
                               str(sample_size),
                               str(evaluation_samples), 
                               datetime.date.today().strftime("%Y%m%d")])

print(result_pickle_name)

gaussian_itentity_20000_100_20190227


Differential Privacy parameters

In [12]:
sensitivity = get_query_row_sensitivity(query_type='identity',
                                        query_scale=target_feature_bounds,
                                        query_shape=data.shape)

Evaluation of identity query guassian mechanism sample

In [13]:
query = centered_sample_covariance_matrix(X=data)

result = None

for i in range(evaluation_samples): 
    # Sample mechanism
    start_clock = datetime.datetime.now()
    # Add symmetric iid noise
    sample = gaussian_mechanism_matrix_sample(
                data=query,
                epsilon=epsilon,
                delta=delta,
                sensitivity=sensitivity,
                symmetric=False,
                verbose=False)
    end_sample_clock = datetime.datetime.now() 

    sample_cov = centered_sample_covariance_matrix(X=sample)
    end_loop_clock = datetime.datetime.now() 
    
    result = record_result(results=result, 
                           column_names=results_columns, 
                           new_data=[[mechanism, 
                                      query_type, 
                                      sample_size,
                                      i+1,
                                      metric, 
                                      principle_component_RSS(true=query, pred=sample_cov), 
                                      (end_sample_clock - start_clock).total_seconds(),
                                      (end_loop_clock - start_clock).total_seconds()
                                     ]])
    
result.to_pickle(result_pickle_location + result_pickle_name)

In [14]:
result.describe()

Unnamed: 0,sample size,iteration,result,mechanism runtime (s),total runtime (s)
count,100.0,100.0,100.0,100.0,100.0
mean,20000.0,50.5,0.396607,0.054264,0.07744
std,0.0,29.011492,0.008425,0.011342,0.01363
min,20000.0,1.0,0.37208,0.0312,0.0468
25%,20000.0,25.75,0.391469,0.0468,0.07495
50%,20000.0,50.5,0.396935,0.046801,0.078
75%,20000.0,75.25,0.401968,0.0624,0.078
max,20000.0,100.0,0.41321,0.1248,0.1404


#  Evaluation of Data Release Differential Privacy Methods

## Gaussian and Matrixvariate Gaussian Mechanisms by regression task

### Identity Query 

Training parameters

In [23]:
n_obs, n_features = data.shape

evaluation_test_split = 0.1
evalaution_test_size = int(n_obs*evaluation_test_split)
evaluation_train_size = n_obs - evalaution_test_size

evaluation_samples = 20

# Model parameters
model_params = dict(epochs=5, batch_size=32, verbose=0) 

### Baseline

Evaluation Setup

In [24]:
# labelling result values for mechanism
mechanism = 'baseline'
query_type = 'identity'
metric = 'rmse'

result_pickle_name = '_'.join([mechanism, 
                               query_type,
                               str(sample_size),
                               ''.join(response), 
                               str(evaluation_samples),
                               datetime.date.today().strftime("%Y%m%d")])



Evaluation of baseline model

In [25]:
result = None

for i in range(evaluation_samples):
    start_clock = datetime.datetime.now()
    # Train model and evaluate prediction metric on holdout set   
    metric_result = seq_nn_single_evaluation(train_data=data,
                                             test_data=data,
                                             test_holdout_p=evaluation_test_split,
                                             X_labels=predictors,
                                             y_label=response,
                                             fit_params=model_params)
    
    end_loop_clock = datetime.datetime.now() 
    
    result = record_result(results=result, 
                           column_names=results_columns, 
                           new_data=[[mechanism, 
                                      query_type, 
                                      sample_size,
                                      i+1,
                                      metric, 
                                      metric_result, 
                                      0,
                                      (end_loop_clock - start_clock).total_seconds()
                                     ]])
    
result.to_pickle(result_pickle_location + result_pickle_name)

In [26]:
result.describe()

Unnamed: 0,sample size,iteration,result,mechanism runtime (s),total runtime (s)
count,20.0,20.0,20.0,20.0,20.0
mean,20000.0,10.5,0.077935,0.0,15.09187
std,0.0,5.91608,0.015947,0.0,2.250145
min,20000.0,1.0,0.061336,0.0,11.6972
25%,20000.0,5.75,0.069028,0.0,13.4016
50%,20000.0,10.5,0.072984,0.0,14.3955
75%,20000.0,15.25,0.078328,0.0,16.74895
max,20000.0,20.0,0.114291,0.0,19.4386


### Identity Gaussian Mechanism

Evaluation Setup

In [48]:
# labelling result values for mechanism
mechanism = 'gaussian'
query_type = 'identity'
metric = 'rmse'

result_pickle_name = '_'.join([mechanism, 
                               query_type, 
                               str(sample_size),
                               ''.join(response),
                               str(evaluation_samples), 
                               datetime.date.today().strftime("%Y%m%d")])

print(result_pickle_name)

gaussian_identity_20000_framingham_20_20190227


Differential Privacy Parameters

In [34]:
epsilon = 1.0 
# 1 / number of observations
delta = pow(evaluation_train_size, -1)

sensitivity = get_query_point_sensitivity(query_type='identity',
                                          query_scale=target_feature_bounds,
                                          query_shape=(evaluation_train_size, n_features))

Evaluation of singleton gaussian sequential NN model

In [38]:
result = None

for i in range(evaluation_samples):
    
    start_clock = datetime.datetime.now()
    
    train_ind, test_ind = \
            test_train_split(len(data),
                             evaluation_test_split)
        
    sample = gaussian_mechanism_matrix_sample(
        data=data.iloc[train_ind],
        epsilon=epsilon,
        delta=delta,
        sensitivity=sensitivity,
        symmetric=False,
        verbose=False)
    
    end_sample_clock = datetime.datetime.now() 
        
    # Train model and evaluate prediction metric on holdout set   
    metric_result = seq_nn_single_evaluation(train_data=sample,
                                             test_data=data,
                                             X_labels=predictors,
                                             y_label=response,
                                             train_ind=train_ind, 
                                             test_ind=test_ind,
                                             fit_params=model_params)
    
    end_loop_clock = datetime.datetime.now() 
    
    result = record_result(results=result, 
                           column_names=results_columns, 
                           new_data=[[mechanism, 
                                      query_type, 
                                      sample_size,
                                      i+1,
                                      metric, 
                                      metric_result, 
                                      (end_sample_clock - start_clock).total_seconds(),
                                      (end_loop_clock - start_clock).total_seconds()
                                     ]])
    
result.to_pickle(result_pickle_location + result_pickle_name)

In [39]:
result.describe()

Unnamed: 0,sample size,iteration,result,mechanism runtime (s),total runtime (s)
count,20.0,20.0,20.0,20.0,20.0
mean,20000.0,10.5,0.116373,8.18543,29.38442
std,0.0,5.91608,0.003442,1.525898,3.694571
min,20000.0,1.0,0.111524,6.098,24.4002
25%,20000.0,5.75,0.114132,7.25605,26.8912
50%,20000.0,10.5,0.115754,7.7215,28.6247
75%,20000.0,15.25,0.116753,9.0141,31.60105
max,20000.0,20.0,0.125086,12.0838,36.4512


### Matrix-variate Gaussian Mechanism

Binary Allocation Strategy - Key features
   
    key features = ['age','total_cholesterol','framingham'] 
    
    'age' and 'cholesterol' important as contribute the largest scores to the total. 
    'framingham' important as the target variable.


Evaluation Setup

In [49]:
# labelling result values for mechanism
mechanism = 'MVG_binary_knowledge'
query_type = 'identity'
metric = 'rmse'

result_pickle_name = '_'.join([mechanism, 
                               query_type, 
                               str(sample_size),
                               ''.join(response),
                               str(evaluation_samples), 
                               datetime.date.today().strftime("%Y%m%d")])

print(result_pickle_name)

model_params = dict(epochs=10, batch_size=64, verbose=0)

MVG_binary_knowledge_identity_20000_framingham_20_20190227


Differential Privacy Parameters

In [50]:
sensitivity = get_query_row_sensitivity(query_type='identity',
                                        query_scale=target_feature_bounds,
                                        query_shape=(evaluation_train_size, n_features))

gamma = get_query_gamma(query_scale=target_feature_bounds, 
                        query_shape=(evaluation_train_size, n_features), 
                        query_type='identity')

# Allocation percentages in 'key_features_allocation' to key features 
# and remainder to all other features
key_features_binary_mvg = ['age','total_cholesterol','framingham']  
key_features_allocation = [0.45,0.55,0.65,0.75,0.85,0.95]

feature_allocations = dict()
for allocation in key_features_allocation:
    
    feature_allocations[allocation] = [ 
        allocation / len(key_features_binary_mvg)
        if feature in key_features_binary_mvg 
        else (1 - allocation) / (n_features - len(key_features_binary_mvg))
        for feature in evaluation_features 
    ]

Matrix-variate Gaussian Mechnaism Evaluation

In [None]:
result = None 
for key, allocation in feature_allocations.items():
    
    params = dict(
        epsilon=epsilon,
        delta=delta,
        sensitivity=sensitivity,
        gamma=gamma,
        precision_allocation=allocation,
        precision_direction=numpy.identity(n_features),
        covariance_direction='unimodal features',
        covariance_method='binary'
    )
    
    
    for i in range(evaluation_samples):
        start_clock = datetime.datetime.now() 
        train_ind, test_ind = \
            test_train_split(len(data),
                             evaluation_test_split)
            
        sample = matrixvariate_gaussian_mechanism_sample(data=data.iloc[train_ind],
                                                         **params)
        
        end_sample_clock = datetime.datetime.now() 
        
        metric_result = seq_nn_single_evaluation(train_data=sample,
                                                test_data=data,
                                                X_labels=predictors,
                                                y_label=response,
                                                train_ind=train_ind, 
                                                test_ind=test_ind,
                                                fit_params=model_params)
        
        end_loop_clock = datetime.datetime.now() 
        
        result = record_result(results=result, 
                               column_names=results_columns, 
                               new_data=[[mechanism + '_' + str(key), 
                                          query_type, 
                                          sample_size,
                                          i+1,
                                          metric, 
                                          metric_result, 
                                          (end_sample_clock - start_clock).total_seconds(),
                                          (end_loop_clock - start_clock).total_seconds()
                                         ]])
        
result.to_pickle(result_pickle_location + result_pickle_name)         

In [None]:
result.describe()

Binary Allocation Strategy - Key features
   
    Features allocations are proprotional to the singular values or explained directional variance
    
    Directions are equal to eigenvectors of the sample covariance. 
    These are the orthogonal primary axis of the variation in the sample covariance 

Evaluation setup

In [None]:
# labelling result values for mechanism
mechanism = 'MVG_binary_directed'
query_type = 'identity'
metric = 'rmse'

result_pickle_name = '_'.join([mechanism, 
                               query_type, 
                               str(sample_size),
                               ''.join(response),
                               str(evaluation_samples), 
                               datetime.date.today().strftime("%Y%m%d")])

print(result_pickle_name)

Differential Privacy parameters

In [None]:
# DP SVD parameters
svd_privacy_allocation = 0.2

svd_sensitivity = get_query_row_sensitivity(query_type='covariance',
                                            query_scale=target_feature_bounds,
                                            query_shape=data.shape)

# DP MVG parameters

sensitivity = get_query_row_sensitivity(query_type='identity',
                                        query_scale=target_feature_bounds,
                                        query_shape=(evaluation_train_size, n_features))

gamma = get_query_gamma(query_scale=target_feature_bounds, 
                        query_shape=(evaluation_train_size, n_features), 
                        query_type='identity')

In [None]:
query = centered_sample_covariance_matrix(X=data)

result = None

for i in range(evaluation_samples): 

    start_svd_clock = datetime.datetime.now()
    
    cov_sample = gaussian_mechanism_matrix_sample(
                data=query,
                epsilon=epsilon*svd_privacy_allocation,
                delta=delta*svd_privacy_allocation,
                sensitivity=svd_sensitivity,
                symmetric=True,
                verbose=False)

    precision_directions, singular_values, _ = numpy.linalg.svd(cov_sample, full_matrices=True)

    sv_proportions = singular_values / numpy.sum(singular_values)
    sv_allocations = [0.55,0.75,0.95]

    feature_allocations = dict()
    for allocation in sv_allocations:
        feature_allocations[allocation] = [ 
            ((1 - allocation) / len(sv_proportions)) + 
            (sv * allocation)
            for sv in sv_proportions
        ]    

    end_svd_clock = datetime.datetime.now()
    
    for key, allocation in feature_allocations.items():

        start_mvg_clock = datetime.datetime.now()
        params = dict(
            epsilon=epsilon*(1.0-svd_privacy_allocation),
            delta=delta*(1.0-svd_privacy_allocation),
            sensitivity=sensitivity,
            gamma=gamma,
            precision_allocation=allocation,
            precision_direction=precision_directions,
            covariance_direction='unimodal features',
            covariance_method='binary'
        )        
         
        train_ind, test_ind = \
            test_train_split(len(data),
                             evaluation_test_split)

        sample = matrixvariate_gaussian_mechanism_sample(data=data.iloc[train_ind],
                                                         **params)

        end_mvg_clock = datetime.datetime.now() 

        metric_result = seq_nn_single_evaluation(train_data=sample,
                                                 test_data=data,
                                                 X_labels=predictors,
                                                 y_label=response,
                                                 train_ind=train_ind, 
                                                 test_ind=test_ind,
                                                 fit_params=model_params)

        end_loop_clock = datetime.datetime.now() 
        
        sample_clock_dif = (end_svd_clock - start_svd_clock) + (end_mvg_clock - start_mvg_clock)
        total_clock_dif =  (end_loop_clock - end_mvg_clock) + sample_clock_dif
        
        result = record_result(results=result, 
                               column_names=results_columns, 
                               new_data=[[mechanism + '_' + str(key), 
                                          query_type, 
                                          sample_size,
                                          i+1,
                                          metric, 
                                          metric_result, 
                                          sample_clock_dif.total_seconds(),
                                          total_clock_dif.total_seconds()
                                         ]])

result.to_pickle(result_pickle_location + result_pickle_name) 

In [None]:
result.describe()