In [19]:
import datetime
import pandas
import numpy
import glob
import re

# local imports
from differential_privacy_parameters import get_query_point_sensitivity
from differential_privacy_parameters import get_query_row_sensitivity
from differential_privacy_parameters import get_query_gamma

from differential_privacy_mechanisms import gaussian_mechanism_matrix_sample
from differential_privacy_mechanisms import matrixvariate_gaussian_mechanism_sample
from differential_privacy_mechanisms import MVGMechanism

from model_evaluation import test_train_split
from model_evaluation import principle_component_RSS
from model_evaluation import root_mean_squared_error
from model_evaluation import record_result

from preprocessing import centered_sample_covariance_matrix
from preprocessing import scale_data

from models import seq_nn_single_evaluation

#  Data Processing and Setup

Import and concatenate all data

In [2]:
target_dir = 'data/'

# Load every file in `target_dir` whose name does NOT end in '.data' and stack
# them into a single frame.
#
# * Paths are visited in sorted order so the row order of `data_load` is the
#   same on every run (glob order is otherwise arbitrary).
# * Frames are collected first and concatenated once, instead of re-copying the
#   growing frame for every additional file.
# * NOTE(review): unpickling can execute arbitrary code; only point
#   `target_dir` at files from a trusted source.
loaded_frames = []
for file_name in sorted(glob.glob(target_dir + '*')):
    if not re.search(r'\.data$', file_name):
        print('Loading...\t' + file_name)
        loaded_frames.append(pandas.read_pickle(file_name))

# `data_load` stays None when no matching file was found, as before.
data_load = pandas.concat(loaded_frames, sort=False) if loaded_frames else None

# Drop the extra references so the individual frames can be garbage collected.
del loaded_frames

Loading...	data/Florida_100000_20190227
Loading...	data/Ohio_100000_20190227
Loading...	data/Pennsylvania_100000_20190227
Loading...	data/Illinois_100000_20190227
Loading...	data/Texas_100000_20190227
Loading...	data/California_100000_20190227
Loading...	data/Georgia_100000_20190227
Loading...	data/New York_100000_20190227


In [None]:
# Descriptive statistics of the raw concatenated data (before feature selection and scaling)
data_load.describe()

Scale data and establish evaluation parameters

In [3]:
# Number of mechanism draws per experiment
evaluation_samples = 100

# Columns kept for the evaluation, in the order used throughout the notebook
evaluation_features = [
    'bmi', 'diastolic_blood_pressure', 'systolic_blood_pressure',
    'glucose', 'hdl_cholesterol', 'ldl_cholesterol',
    'total_cholesterol', 'triglycerides', 'age', 'framingham',
]

# (lower, upper) bound per feature, handed to scale_data as `data_bounds`
data_feature_bounds = dict([
    ('bmi', (0, 400)),
    ('diastolic_blood_pressure', (60, 140)),
    ('systolic_blood_pressure', (90, 250)),
    ('glucose', (0, 2000)),
    ('hdl_cholesterol', (0, 1500)),
    ('ldl_cholesterol', (0, 2000)),
    ('total_cholesterol', (0, 2100)),
    ('triglycerides', (0, 3000)),
    ('age', (0, 120)),
    ('framingham', (-10, 37)),
])
# Common range every feature is scaled into
target_feature_bounds = (0, 1)

# Regression task: estimate the framingham score from all remaining features
response = ['framingham']
predictors = []
for name in evaluation_features:
    if name not in response:
        predictors.append(name)

# Column layout of every results frame built with record_result
results_columns = ['mechanism', 'query', 'iteration', 'metric', 'result',
                   'mechanism runtime (s)', 'total runtime (s)']

# Keep only complete rows of the evaluation features, then rescale
# 'data_feature_bounds' -> 'target_feature_bounds'
data = scale_data(
    data_load[evaluation_features].dropna(),
    data_bounds=data_feature_bounds,
    target_bounds=target_feature_bounds,
)

Differential Privacy parameters

In [11]:
# Privacy budget: epsilon is fixed, delta is the reciprocal of the
# number of observations (rows) in the scaled data
epsilon = 1.0
delta = data.shape[0] ** -1

In [None]:
# Descriptive statistics of `data`, the output of scale_data
data.describe()

# Evaluation of Sample Covariance Differential Privacy Methods

## Gaussian Mechanism with symmetric and identity sampling

### Symmetric Gaussian Mechanism

Evaluation setup

In [None]:
# labelling result values for mechanism
# Labels written into every result row of this experiment
mechanism, query_type, metric = 'gaussian', 'covariance', 'principle component RSS'

# Results are pickled to <location><mechanism>_<query>_<samples>_<YYYYMMDD>
result_pickle_location = 'results/'
result_pickle_name = '{0}_{1}_{2}_{3:%Y%m%d}'.format(
    mechanism, query_type, evaluation_samples, datetime.date.today())

Differential Privacy parameters

In [None]:
# Privacy budget: delta is the reciprocal of the number of observations
epsilon = 1.0
delta = data.shape[0] ** -1

# Row sensitivity of the covariance query over the scaled data
sensitivity = get_query_row_sensitivity(
    query_type='covariance',
    query_shape=data.shape,
    query_scale=target_feature_bounds,
)

Evaluation of symmetric matrix gaussian mechanism sample

In [None]:
# Non-private reference value of the covariance query
query = centered_sample_covariance_matrix(X=data)

result = None
# Every noisy draw is kept, keyed by its iteration index
sample = dict()

# `range` (not the Python-2-only `xrange`) so the cell also runs on Python 3
for i in range(evaluation_samples):
    # Sample mechanism
    start_clock = datetime.datetime.now()
    # Add symmetric iid noise
    sample[i] = gaussian_mechanism_matrix_sample(
                data=query,
                epsilon=epsilon,
                delta=delta,
                sensitivity=sensitivity,
                symmetric=True,
                verbose=False)
    end_sample_clock = datetime.datetime.now()

    # No post-processing happens here, so the mechanism runtime and the
    # total runtime recorded for the iteration are the same value.
    sample_seconds = (end_sample_clock - start_clock).total_seconds()

    result = record_result(results=result,
                           column_names=results_columns,
                           new_data=[[mechanism,
                                      query_type,
                                      i+1,  # iterations are reported 1-based
                                      metric,
                                      principle_component_RSS(true=query, pred=sample[i]),
                                      sample_seconds,
                                      sample_seconds
                                     ]])

# Persist all iterations once the loop has finished
result.to_pickle(result_pickle_location + result_pickle_name)


In [None]:
# Descriptive statistics of the results recorded by the symmetric Gaussian evaluation loop
result.describe()

## Identity Gaussian Mechanism

Evaluation setup

In [None]:
# labelling result values for mechanism
# Labels written into every result row of this experiment.
# The query label is spelled 'identity' (it was misspelled 'itentity'), which
# also corrects the name of the pickle file written for this experiment.
mechanism = 'gaussian'
query_type = 'identity'
metric = 'principle component RSS'

# File name: <mechanism>_<query>_<samples>_<YYYYMMDD>
result_pickle_name = '_'.join([mechanism,
                               query_type,
                               str(evaluation_samples),
                               datetime.date.today().strftime("%Y%m%d")])

Differential Privacy parameters

In [None]:
# Sensitivity used by the identity-query Gaussian mechanism below, computed
# for the shape and scaled range of `data`.
# NOTE(review): the later singleton Gaussian experiment uses
# get_query_point_sensitivity with query_type='singleton' - confirm the row
# variant is the intended one here.
sensitivity = get_query_row_sensitivity(query_type='singleton',
                                        query_scale=target_feature_bounds,
                                        query_shape=data.shape)

Evaluation of identity query Gaussian mechanism sample

In [None]:
# Non-private reference covariance that each private estimate is compared to
query = centered_sample_covariance_matrix(X=data)

result = None

# `range` (not the Python-2-only `xrange`) so the cell also runs on Python 3
for i in range(evaluation_samples):
    # Sample mechanism
    start_clock = datetime.datetime.now()
    # Identity query: add (non-symmetric) iid noise to the data itself.
    # The noise was previously added to the covariance matrix `query`, so the
    # step below computed the covariance of a noisy covariance matrix; that
    # did not match the sensitivity, which is built from `data.shape`.
    sample = gaussian_mechanism_matrix_sample(
                data=data,
                epsilon=epsilon,
                delta=delta,
                sensitivity=sensitivity,
                symmetric=False,
                verbose=False)
    end_sample_clock = datetime.datetime.now()

    # Post-processing: covariance of the privately released data
    sample_cov = centered_sample_covariance_matrix(X=sample)
    end_loop_clock = datetime.datetime.now()

    result = record_result(results=result,
                           column_names=results_columns,
                           new_data=[[mechanism,
                                      query_type,
                                      i+1,  # iterations are reported 1-based
                                      metric,
                                      principle_component_RSS(true=query, pred=sample_cov),
                                      (end_sample_clock - start_clock).total_seconds(),
                                      (end_loop_clock - start_clock).total_seconds()
                                     ]])

# Persist all iterations once the loop has finished
result.to_pickle(result_pickle_location + result_pickle_name)

In [None]:
# Descriptive statistics of the results recorded by the identity Gaussian evaluation loop
result.describe()

#  Evaluation of Data Release Differential Privacy Methods

## Gaussian and Matrixvariate Gaussian Mechanisms by regression task

### Identity Query 

Training parameters

In [5]:
# Dimensions of the scaled evaluation data
n_obs, n_features = data.shape

# Fraction of observations held out for testing, and the resulting split sizes
evaluation_test_split = 0.1
evalaution_test_size = int(evaluation_test_split * n_obs)
evaluation_train_size = n_obs - evalaution_test_size

# Number of repetitions for the (slower) regression experiments
evaluation_samples = 2

# Keyword arguments forwarded as `fit_params` to seq_nn_single_evaluation
model_params = {'epochs': 5, 'batch_size': 32, 'verbose': 0}

### Baseline

Evaluation Setup

In [4]:
# labelling result values for mechanism
# Labels written into every result row of the baseline experiment
mechanism, query_type, metric = 'baseline', 'identity', 'rmse'

# File name: <mechanism>_<query>_<response>_<samples>_<YYYYMMDD>
result_pickle_name = '{0}_{1}_{2}_{3}_{4:%Y%m%d}'.format(
    mechanism, query_type, ''.join(response), evaluation_samples,
    datetime.date.today())



Evaluation of baseline model

In [None]:
result = None

# `range` (not the Python-2-only `xrange`) so the cell also runs on Python 3
for i in range(evaluation_samples):
    start_clock = datetime.datetime.now()
    # Train model and evaluate prediction metric on holdout set.
    # The same non-private data is used for training and testing; the holdout
    # fraction is passed via `test_holdout_p`.
    metric_result = seq_nn_single_evaluation(train_data=data,
                                             test_data=data,
                                             test_holdout_p=evaluation_test_split,
                                             X_labels=predictors,
                                             y_label=response,
                                             fit_params=model_params)

    end_loop_clock = datetime.datetime.now()

    result = record_result(results=result,
                           column_names=results_columns,
                           new_data=[[mechanism,
                                      query_type,
                                      i+1,  # iterations are reported 1-based
                                      metric,
                                      metric_result,
                                      0,  # no privacy mechanism is run for the baseline
                                      (end_loop_clock - start_clock).total_seconds()
                                     ]])

# Persist all iterations once the loop has finished
result.to_pickle(result_pickle_location + result_pickle_name)

In [None]:
# Descriptive statistics of the baseline results
# (the method name was misspelled `decsribe`, which raised AttributeError)
result.describe()

### Singleton Gaussian Mechanism

Evaluation Setup

In [None]:
# labelling result values for mechanism
# Labels written into every result row of the singleton Gaussian experiment
mechanism, query_type, metric = 'gaussian', 'identity', 'rmse'

# File name: <mechanism>_<query>_<response>_<samples>_<YYYYMMDD>
result_pickle_name = '{0}_{1}_{2}_{3}_{4:%Y%m%d}'.format(
    mechanism, query_type, ''.join(response), evaluation_samples,
    datetime.date.today())

Differential Privacy Parameters

In [None]:
# Privacy budget: delta is the reciprocal of the number of training observations
epsilon = 1.0
delta = evaluation_train_size ** -1

# Point sensitivity of the singleton query over the training split
sensitivity = get_query_point_sensitivity(
    query_type='singleton',
    query_shape=(evaluation_train_size, n_features),
    query_scale=target_feature_bounds,
)

Evaluation of singleton gaussian sequential NN model

In [None]:
result = None

# `range` (not the Python-2-only `xrange`) so the cell also runs on Python 3
for i in range(evaluation_samples):

    start_clock = datetime.datetime.now()

    # Fresh train/test split for every iteration
    train_ind, test_ind = \
            test_train_split(len(data),
                             evaluation_test_split)

    # Privately release the training rows only
    sample = gaussian_mechanism_matrix_sample(
        data=data.iloc[train_ind],
        epsilon=epsilon,
        delta=delta,
        sensitivity=sensitivity,
        symmetric=False,
        verbose=False)

    end_sample_clock = datetime.datetime.now()

    # Train model on the noisy sample and evaluate prediction metric on the
    # holdout rows of the original data
    metric_result = seq_nn_single_evaluation(train_data=sample,
                                             test_data=data,
                                             X_labels=predictors,
                                             y_label=response,
                                             train_ind=train_ind,
                                             test_ind=test_ind,
                                             fit_params=model_params)

    end_loop_clock = datetime.datetime.now()

    result = record_result(results=result,
                           column_names=results_columns,
                           new_data=[[mechanism,
                                      query_type,
                                      i+1,  # iterations are reported 1-based
                                      metric,
                                      metric_result,
                                      (end_sample_clock - start_clock).total_seconds(),
                                      (end_loop_clock - start_clock).total_seconds()
                                     ]])

# Persist all iterations once the loop has finished
result.to_pickle(result_pickle_location + result_pickle_name)

In [None]:
# Descriptive statistics of the results recorded by the singleton Gaussian evaluation loop
result.describe()

### Matrix-variate Gaussian Mechanism

Binary Allocation Strategy - Key features
   
    key features = ['age','total_cholesterol','framingham'] 
    
    'age' and 'cholesterol' are important as they contribute the largest scores to the total. 
    'framingham' is important as it is the target variable.


Evaluation Setup

In [6]:
# labelling result values for mechanism
# Labels written into every result row of the key-feature MVG experiment
mechanism, query_type, metric = 'MVG_binary_knowledge', 'identity', 'rmse'

# File name: <mechanism>_<query>_<response>_<samples>_<YYYYMMDD>
result_pickle_name = '{0}_{1}_{2}_{3}_{4:%Y%m%d}'.format(
    mechanism, query_type, ''.join(response), evaluation_samples,
    datetime.date.today())

print(result_pickle_name)

# Replaces the model parameters set earlier for the experiments that follow
model_params = {'epochs': 10, 'batch_size': 16, 'verbose': 0}

MVG_binary_knowledge_identity_framingham_2_20190227


Differential Privacy Parameters

In [None]:
# Sensitivity and gamma of the identity query over the training split
sensitivity = get_query_row_sensitivity(
    query_type='identity',
    query_shape=(evaluation_train_size, n_features),
    query_scale=target_feature_bounds,
)
gamma = get_query_gamma(
    query_type='identity',
    query_shape=(evaluation_train_size, n_features),
    query_scale=target_feature_bounds,
)

# Each value in 'key_features_allocation' is the share given to the key
# features as a group; the remainder goes to all other features
key_features_binary_mvg = ['age', 'total_cholesterol', 'framingham']
key_features_allocation = [0.45, 0.55, 0.65, 0.75, 0.85, 0.95]

# Per-feature allocation vector (ordered like `evaluation_features`) for each
# share: the share is split evenly within the key group and the remainder is
# split evenly within the non-key group
feature_allocations = {
    share: [
        share / len(key_features_binary_mvg)
        if name in key_features_binary_mvg
        else (1 - share) / (n_features - len(key_features_binary_mvg))
        for name in evaluation_features
    ]
    for share in key_features_allocation
}

Matrix-variate Gaussian Mechanism Evaluation

In [None]:
result = None

# One experiment per key-feature share; `key` is the share and `allocation`
# the per-feature allocation vector built from it
for key, allocation in feature_allocations.items():

    params = dict(
        epsilon=epsilon,
        delta=delta,
        sensitivity=sensitivity,
        gamma=gamma,
        precision_allocation=allocation,
        # Identity directions over the features. This previously referenced
        # the undefined name `features`; `n_features` is the feature count.
        precision_direction=numpy.identity(n_features),
        covariance_direction='unimodal features',
        covariance_method='binary'
    )

    # `range` (not the Python-2-only `xrange`) so the cell also runs on Python 3
    for i in range(evaluation_samples):
        start_clock = datetime.datetime.now()
        # Fresh train/test split for every iteration
        train_ind, test_ind = \
            test_train_split(len(data),
                             evaluation_test_split)

        # Privately release the training rows only
        sample = matrixvariate_gaussian_mechanism_sample(data=data.iloc[train_ind],
                                                         **params)

        end_sample_clock = datetime.datetime.now()

        # Train on the noisy sample, evaluate on the holdout rows of the original data
        metric_result = seq_nn_single_evaluation(train_data=sample,
                                                test_data=data,
                                                X_labels=predictors,
                                                y_label=response,
                                                train_ind=train_ind,
                                                test_ind=test_ind,
                                                fit_params=model_params)

        end_loop_clock = datetime.datetime.now()

        result = record_result(results=result,
                               column_names=results_columns,
                               # the share is appended to the mechanism label
                               new_data=[[mechanism + '_' + str(key),
                                          query_type,
                                          i+1,  # iterations are reported 1-based
                                          metric,
                                          metric_result,
                                          (end_sample_clock - start_clock).total_seconds(),
                                          (end_loop_clock - start_clock).total_seconds()
                                         ]])

# Persist the results of all shares and iterations
result.to_pickle(result_pickle_location + result_pickle_name)

In [None]:
# Descriptive statistics of the results recorded by the MVG evaluation loop
result.describe()

Binary Allocation Strategy - Directed features
   
    Feature allocations are proportional to the singular values, i.e. the explained directional variance.
    
    Directions are equal to the eigenvectors of the sample covariance. 
    These are the orthogonal principal axes of the variation in the sample covariance.

Evaluation setup

In [16]:
# labelling result values for mechanism
# Labels written into every result row of the directed MVG experiment
mechanism, query_type, metric = 'MVG_binary_directed', 'identity', 'rmse'

# File name: <mechanism>_<query>_<response>_<samples>_<YYYYMMDD>
result_pickle_name = '{0}_{1}_{2}_{3}_{4:%Y%m%d}'.format(
    mechanism, query_type, ''.join(response), evaluation_samples,
    datetime.date.today())

print(result_pickle_name)

MVG_binary_directed_identity_framingham_2_20190227


Differential Privacy parameters

In [17]:
# DP SVD parameters
# --- DP SVD parameters ---
# Fraction of the (epsilon, delta) budget spent on the noisy covariance
svd_privacy_allocation = 0.2

# Row sensitivity of the covariance query over the full scaled data
svd_sensitivity = get_query_row_sensitivity(
    query_type='covariance',
    query_shape=data.shape,
    query_scale=target_feature_bounds,
)

# --- DP MVG parameters ---
# Sensitivity and gamma of the identity query over the training split
sensitivity = get_query_row_sensitivity(
    query_type='identity',
    query_shape=(evaluation_train_size, n_features),
    query_scale=target_feature_bounds,
)
gamma = get_query_gamma(
    query_type='identity',
    query_shape=(evaluation_train_size, n_features),
    query_scale=target_feature_bounds,
)

In [None]:
# Non-private covariance of the scaled data; only a noisy copy of it is used below
query = centered_sample_covariance_matrix(X=data)

# Shares of the allocation distributed in proportion to the singular values;
# the remainder is spread evenly. Loop-invariant, so defined once.
sv_allocations = [0.55,0.75,0.95]

result = None

# `range` (not the Python-2-only `xrange`) so the cell also runs on Python 3
for i in range(evaluation_samples):

    start_svd_clock = datetime.datetime.now()

    # Spend `svd_privacy_allocation` of the budget on a noisy covariance matrix
    cov_sample = gaussian_mechanism_matrix_sample(
                data=query,
                epsilon=epsilon*svd_privacy_allocation,
                delta=delta*svd_privacy_allocation,
                sensitivity=svd_sensitivity,
                symmetric=True,
                verbose=False)

    # Directions and singular values of the noisy covariance
    precision_directions, singular_values, _ = numpy.linalg.svd(cov_sample, full_matrices=True)

    # Singular values normalised to sum to one
    sv_proportions = singular_values / numpy.sum(singular_values)

    # For each share: an even split of the remainder plus the share
    # distributed proportionally to the singular values
    feature_allocations = dict()
    for allocation in sv_allocations:
        feature_allocations[allocation] = [
            ((1 - allocation) / len(sv_proportions)) +
            (sv * allocation)
            for sv in sv_proportions
        ]

    end_svd_clock = datetime.datetime.now()

    for key, allocation in feature_allocations.items():

        start_mvg_clock = datetime.datetime.now()
        # The MVG mechanism receives the budget left over after the SVD step
        params = dict(
            epsilon=epsilon*(1.0-svd_privacy_allocation),
            delta=delta*(1.0-svd_privacy_allocation),
            sensitivity=sensitivity,
            gamma=gamma,
            precision_allocation=allocation,
            precision_direction=precision_directions,
            covariance_direction='unimodal features',
            covariance_method='binary'
        )

        # Fresh train/test split for every share
        train_ind, test_ind = \
            test_train_split(len(data),
                             evaluation_test_split)

        # Privately release the training rows only
        sample = matrixvariate_gaussian_mechanism_sample(data=data.iloc[train_ind],
                                                         **params)

        end_mvg_clock = datetime.datetime.now()

        # Train on the noisy sample, evaluate on the holdout rows of the original data
        metric_result = seq_nn_single_evaluation(train_data=sample,
                                                 test_data=data,
                                                 X_labels=predictors,
                                                 y_label=response,
                                                 train_ind=train_ind,
                                                 test_ind=test_ind,
                                                 fit_params=model_params)

        end_loop_clock = datetime.datetime.now()

        # Mechanism time = shared SVD step + this share's MVG sampling;
        # total time additionally includes model training and evaluation
        sample_clock_dif = (end_svd_clock - start_svd_clock) + (end_mvg_clock - start_mvg_clock)
        total_clock_dif =  (end_loop_clock - end_mvg_clock) + sample_clock_dif

        result = record_result(results=result,
                               column_names=results_columns,
                               # the share is appended to the mechanism label
                               new_data=[[mechanism + '_' + str(key),
                                          query_type,
                                          i+1,  # iterations are reported 1-based
                                          metric,
                                          metric_result,
                                          sample_clock_dif.total_seconds(),
                                          total_clock_dif.total_seconds()
                                         ]])

# Persist the results of all iterations and shares
result.to_pickle(result_pickle_location + result_pickle_name)

result.to_pickle(result_pickle_location + result_pickle_name) 