In [1]:
from thundersvmScikit import SVR

import sklearn.metrics

import datetime
import pandas
import numpy
import glob
import re
from random import uniform


from preprocessing import scale_data
from preprocessing import centered_sample_covariance_matrix

from differential_privacy_parameters import get_query_point_sensitivity
from differential_privacy_parameters import get_query_row_sensitivity
from differential_privacy_parameters import get_query_gamma

from differential_privacy_mechanisms import gaussian_mechanism_matrix_sample
from differential_privacy_mechanisms import matrixvariate_gaussian_mechanism_sample
from differential_privacy_mechanisms import MVGMechanism

from model_evaluation import record_result
from model_evaluation import test_train_split

from models import svr_parameter_tuning

from metrics import mean_absolute_error


#  Data Processing and Setup

Import and concatonate all data

In [None]:
target_dir = 'data/'

data_load = None
for file_name in glob.glob(target_dir + '*'):
    if not(re.search(r'\.data$',file_name)):
        print('Loading...\t' + file_name)
        if data_load is None:
            data_load = pandas.read_pickle(file_name)
        else:
            data_load = pandas.concat([data_load,
                                       pandas.read_pickle(file_name)], 
                                      sort=False)        

In [None]:
data_load.describe()

## Feature Parameters

In [None]:
evaluation_features = [
    'bmi',
    'diastolic_blood_pressure',
    'systolic_blood_pressure',
    'glucose',
    'hdl_cholesterol',
    'ldl_cholesterol',
    'total_cholesterol',
    'triglycerides',
    'age',
    'framingham'    
]

data_feature_bounds = {
    'bmi':(0,400),
    'diastolic_blood_pressure':(60,140),
    'systolic_blood_pressure':(90,250),
    'glucose':(0,2000),
    'hdl_cholesterol':(0,1500),
    'ldl_cholesterol':(0,2000),
    'total_cholesterol':(0,2100),
    'triglycerides':(0,3000),
    'age':(0,120),
    'framingham':(-10,37)
}
target_feature_bounds = (0,1)

response = ['framingham']
predictors = [ f for f in evaluation_features if f not in response]

## Data Processing
### Scaling and Sampling

In [133]:
# Sample data if needed
sample_size = 100000

pickle_file = 'results/SVMR_15_' + str(sample_size) + '_global_sensitivity_20190302'
print(pickle_file)

# Scale data 'data_feature_bounds' -> 'target_feature_bounds'
if isinstance(sample_size, int) and sample_size < len(data_load):
    data = (scale_data(data_load[evaluation_features].dropna(),
                      target_bounds=target_feature_bounds,
                      data_bounds=data_feature_bounds)
            .sample(sample_size))
else:
    data = scale_data(data_load[evaluation_features].dropna(),
                      target_bounds=target_feature_bounds,
                      data_bounds=data_feature_bounds)      

results/SVMR_15_100000_global_sensitivity_20190302


### Experimental Configuration

In [151]:
n_obs, n_features = data.shape

evaluations = 25

# Set aside data for hyperparameter tuning
hyper_param_tuning_split = 0.15
hyper_param_tuning_size = int(sample_size*hyper_param_tuning_split)
hp_data = data.sample(hyper_param_tuning_size)

test_split = 0.15
test_size = int(n_obs*test_split)
train_size = (n_obs - hyper_param_tuning_size) - test_size

# Remove tuning data to create train/test set
data = data[~data.set_index(evaluation_features)
             .index
             .isin(hp_data
                   .set_index(evaluation_features)
                   .index)]

X = data[predictors].values
y = data[response].values

recorded_results = None
results_columns = [
    'query',
    'mechanism',
    'metric',
    'result',
    'sample size',
    'mechanism runtime (s)',
    'iteration',
    'sensitivity type',
    'sensitivity',
    'epsilon',
    'delta',
    'gamma',
    'training size',
    'test holdout size',
    'tuning holdout size',
    'model',
    'kernel',
    'global min',
    'global max',
    'response min',
    'response max',
    'response',
    'predictors'   
]

print(['data', data.shape, 
      'training', train_size, 
      'test', test_size, 
      'tuning', len(hp_data)])

['data', (69994, 10), 'training', 57249, 'test', 12749, 'tuning', 15000]


#  Evaluation of Data Release Differential Privacy Methods

## Gaussian and Matrixvariate Gaussian Mechanisms by Regression Experiments

### Identity Query 

### Support Vector Machine Regression

Mean Baseline Experiment

In [135]:
for i in range(evaluations):

    train_ind, test_ind = test_train_split(y_len=len(y), test_perc=test_split)
    
    result = [
        mean_absolute_error(y[test_ind], 
                            [numpy.mean(y[train_ind])]*len(test_ind), 200)
    ]
    
    to_record = {
        'query':'none',
        'mechanism':'mean',
        'metric':'MAE',
        'result':result,
        'sample size':sample_size,
        'mechanism runtime (s)':0,
        'iteration':i,
        'sensitivity type':'none',
        'sensitivity':0,
        'epsilon':0,
        'delta':0,
        'gamma':0,
        'training size':train_size,
        'test holdout size':test_size,
        'tuning holdout size':hyper_param_tuning_size,
        'model':'SVM Regression',
        'kernel':'rbf',
        'global min':target_feature_bounds[0],
        'global max':target_feature_bounds[1],
        'response min':min(y[test_ind]),
        'response max':max(y[test_ind]),
        'response':response,
        'predictors': ' '.join(predictors)
    }
    recorded_results = \
        record_result(results=recorded_results, 
                      new_data=to_record,
                      column_names=results_columns)

Unprivatised Data 

Baseline Experiment

In [137]:
params = svr_parameter_tuning(
    X=hp_data[predictors].values,
    y=hp_data[response].values,
    scoring_function=mean_absolute_error,
    rand_iters=200,
    search_range=(0.001,4.0),
    holdout_split=0.2,
    kernel='rbf')

for i in range(evaluations):
    
    train_ind, test_ind = test_train_split(y_len=len(y), test_perc=test_split)
    
    model = SVR(kernel='rbf', C=params['best_C'], gamma=params['best_gamma'])
    model.fit(X[train_ind], y[train_ind].ravel())

    result = [
        mean_absolute_error(y[test_ind], 
                            model.predict(X[test_ind]).reshape(-1,1), 
                            100)
    ]
    
    to_record = {
        'query':'none',
        'mechanism':'baseline',
        'metric':'MAE',
        'result':result,
        'sample size':sample_size,
        'mechanism runtime (s)':0,
        'iteration':i,
        'sensitivity type':'none',
        'sensitivity':0,
        'epsilon':0,
        'delta':0,
        'gamma':0,
        'training size':train_size,
        'test holdout size':test_size,
        'tuning holdout size':hyper_param_tuning_size,
        'model':'SVM Regression',
        'kernel':'rbf',
        'global min':target_feature_bounds[0],
        'global max':target_feature_bounds[1],
        'response min':min(y[test_ind]),
        'repsonse max':max(y[test_ind]),
        'response':response,
        'predictors': ' '.join(predictors)
    }
    
    recorded_results = \
        record_result(results=recorded_results, 
                      new_data=to_record,
                      column_names=results_columns)


In [138]:
recorded_results[recorded_results['mechanism'] == 'baseline']

iid Gaussian Matrix Mechanism 

Baseline Experiment

In [139]:
# differential privacy parameters
epsilon = 1.0 
# 1 / number of observations
delta = pow(train_size, -1)

sensitivity = get_query_point_sensitivity(query_type='identity',
                                          query_scale=target_feature_bounds,
                                          query_shape=(train_size, n_features))

print(epsilon, round(delta, 8), sensitivity)

(1.0, 1.429e-05, 1)


In [140]:
privacy_params = dict(
    epsilon=epsilon,
    delta=delta,
    sensitivity=sensitivity,    
    symmetric=False,
    verbose=False
)

params = svr_parameter_tuning(
    X=hp_data[predictors].values,
    y=hp_data[response].values,
    scoring_function=mean_absolute_error,
    rand_iters=200,
    search_range=(0.001,4.0),
    holdout_split=0.2,
    kernel='rbf',
    privacy_mechanism_sampler=gaussian_mechanism_matrix_sample,
    privacy_mechanism_params=privacy_params,
    privacy_sample_iterations=5)


for i in range(evaluations):
  
    train_ind, test_ind = test_train_split(y_len=len(data), 
                                           test_perc=test_split)
    
    start_clock = datetime.datetime.now()
    sample_gaus = gaussian_mechanism_matrix_sample(
            data=data.iloc[train_ind],
            **privacy_params)
    end_clock = datetime.datetime.now()
    
    model = SVR(kernel='rbf', C=params['best_C'], gamma=params['best_gamma'])
    model.fit(sample_gaus[predictors].values, 
              sample_gaus[response].values)
    
    result = [
        mean_absolute_error(y[test_ind], 
                            model.predict(X[test_ind]).reshape(-1,1), 
                            100)
    ] 
    
    to_record = {
        'query':'singleton',
        'mechanism':'matrix iid gaussian',
        'metric':'MAE',
        'result':result,
        'sample size':sample_size,
        'mechanism runtime (s)':(end_clock - start_clock).total_seconds(),
        'iteration':i,
        'sensitivity type':'global',
        'sensitivity':sensitivity,
        'epsilon':epsilon,
        'delta':delta,
        'gamma':0,
        'training size':train_size,
        'test holdout size':test_size,
        'tuning holdout size':hyper_param_tuning_size,
        'model':'SVM Regression',
        'kernel':'rbf',
        'global min':target_feature_bounds[0],
        'global max':target_feature_bounds[1],
        'response min':min(y[test_ind]),
        'response max':max(y[test_ind]),
        'response':response,
        'predictors': ' '.join(predictors)
    }
    
    recorded_results = \
        record_result(results=recorded_results, 
                      new_data=to_record,
                      column_names=results_columns)    

0.534689


In [141]:
recorded_results[recorded_results['mechanism'] == 'matrix iid gaussian']

Matrix-variate Gaussian Mechanism

Binary Allocation Strategy Key features
   
    key features = ['age','total_cholesterol','framingham'] 
    
    'age' and 'cholesterol' important as contribute the largest scores to the total. 
    'framingham' important as the target variable.

In [142]:
# MVG mechanism privacy parameters
sensitivity = get_query_row_sensitivity(query_scale=target_feature_bounds,
                                        query_shape=(train_size, n_features),
                                        query_type='identity')

gamma = get_query_gamma(query_scale=target_feature_bounds, 
                        query_shape=(train_size, n_features), 
                        query_type='identity')


# Allocation percentages in 'key_features_allocation' to key features 
# and remainder to all other features
key_features_binary_mvg = ['age','total_cholesterol','framingham']  
key_features_allocation =[0.65, 0.7, 0.75, 0.80, 0.85]

feature_allocations = dict()

for allocation in key_features_allocation:
    
    feature_allocations['mvg '+str(allocation)] = [ 
        allocation / len(key_features_binary_mvg)
        if feature in key_features_binary_mvg 
        else (1 - allocation) / (n_features - len(key_features_binary_mvg))
        for feature in evaluation_features 
    ]

(3.1622776601683795, 836.6600265340755)


In [143]:
for key, allocation in feature_allocations.items():
    
    privacy_params_mvg = dict(
        epsilon=epsilon,
        delta=delta,
        sensitivity=sensitivity,
        gamma=gamma,
        precision_allocation=allocation,
        precision_direction=numpy.identity(n_features),
        covariance_direction='unimodal features',
        covariance_method='binary'
    )
    
    params = svr_parameter_tuning(
        X=hp_data[predictors].values,
        y=hp_data[response].values,
        scoring_function=mean_absolute_error,
        rand_iters=200,
        search_range=(0.001,4.0),
        holdout_split=0.2,
        kernel='rbf',
        privacy_mechanism_sampler=matrixvariate_gaussian_mechanism_sample,
        privacy_mechanism_params=privacy_params_mvg,
        privacy_sample_iterations=5)
    
    for i in range(evaluations):
        
        start_clock = datetime.datetime.now()
        train_ind, test_ind = test_train_split(y_len=len(data), 
                                           test_perc=test_split)
    
        sample_mvg = matrixvariate_gaussian_mechanism_sample(
            data=data.iloc[train_ind],
            **privacy_params_mvg)

        end_clock = datetime.datetime.now()

        model = SVR(kernel='rbf', C=params['best_C'], gamma=params['best_gamma'])
        model.fit(sample_mvg[predictors], sample_mvg[response])

        result = [
            mean_absolute_error(y[test_ind], 
                                model.predict(X[test_ind]).reshape(-1,1), 
                                100)
        ]     
       
        to_record = {
            'query':'identity',
            'mechanism':key,
            'metric':'MAE',
            'result':result,
            'sample size':sample_size,
            'mechanism runtime (s)':(end_clock - start_clock).total_seconds(),
            'iteration':i,
            'sensitivity type':'global',
            'sensitivity':sensitivity,
            'epsilon':epsilon,
            'delta':delta,
            'gamma':gamma,
            'training size':train_size,
            'test holdout size':test_size,
            'tuning holdout size':hyper_param_tuning_size,
            'model':'SVM Regression',
            'kernel':'rbf',
            'global min':target_feature_bounds[0],
            'global max':target_feature_bounds[1],
            'response min':min(y[test_ind]),
            'response max':max(y[test_ind]),
            'response':response,
            'predictors': ' '.join(predictors)
        }
        
        recorded_results = \
            record_result(results=recorded_results, 
                          new_data=to_record,
                          column_names=results_columns)

2.086958
1.945335
2.06515
1.928468
2.01048


In [145]:
recorded_results[recorded_results['mechanism'].str.startswith('mvg', na=False)]

Matrix-variate Gaussian Mechanism

Derived Directional Noise

Features allocations are proprotional to the singular values or explained directional variance
    
    Directions are equal to eigenvectors of the sample covariance. 
    These are the orthogonal primary axis of the variation in the sample covariance 

In [146]:
# DP SVD parameters
svd_privacy_allocation = 0.15

svd_sensitivity = get_query_row_sensitivity(query_type='covariance',
                                            query_scale=target_feature_bounds,
                                            query_shape=data.shape)

# DP MVG parameters

sensitivity = get_query_row_sensitivity(query_type='identity',
                                        query_scale=target_feature_bounds,
                                        query_shape=(train_size, n_features))

gamma = get_query_gamma(query_scale=target_feature_bounds, 
                        query_shape=(train_size, n_features), 
                        query_type='identity')

print(sensitivity, gamma)  
print(svd_sensitivity)

(3.1622776601683795, 836.6600265340755)
0.00023529965411


In [147]:
# Allocation percentages in 'key_features_allocation' to key features 
# and remainder to all other features 
sv_allocations = [0.65, 0.7, 0.75, 0.80, 0.85]

tuning_query = centered_sample_covariance_matrix(X=hp_data)

for key in sv_allocations:
    
    start_svd_clock = datetime.datetime.now()
    cov_sample = gaussian_mechanism_matrix_sample(
                data=tuning_query,
                epsilon=epsilon*svd_privacy_allocation,
                delta=delta*svd_privacy_allocation,
                sensitivity=svd_sensitivity,
                symmetric=True,
                verbose=False)

    precision_directions, singular_values, _ = numpy.linalg.svd(cov_sample, full_matrices=True)

    sv_proportions = singular_values / numpy.sum(singular_values)
        
    feature_allocations = dict()
    for allocation in sv_allocations:
        feature_allocations[key] = [ 
            ((1 - allocation) / len(sv_proportions)) + 
            (sv * allocation)
            for sv in sv_proportions
        ] 
    end_svd_clock = datetime.datetime.now()
    
    privacy_params_mvg = dict(
        epsilon=epsilon,
        delta=delta,
        sensitivity=sensitivity,
        gamma=gamma,
        precision_allocation=feature_allocations[key],
        precision_direction=precision_directions,
        covariance_direction='unimodal features',
        covariance_method='binary'
    )
    
    params = svr_parameter_tuning(
        X=hp_data[predictors].values,
        y=hp_data[response].values,
        scoring_function=mean_absolute_error,
        rand_iters=200,
        search_range=(0.001,4.0),
        holdout_split=0.2,
        kernel='rbf',
        privacy_mechanism_sampler=matrixvariate_gaussian_mechanism_sample,
        privacy_mechanism_params=privacy_params_mvg,
        privacy_sample_iterations=5)
    
    for i in range(evaluations):
        
        start_clock = datetime.datetime.now()
        train_ind, test_ind = test_train_split(y_len=len(data), 
                                           test_perc=test_split)
        
        sample_mvg = matrixvariate_gaussian_mechanism_sample(
            data=data.iloc[train_ind],
            **privacy_params_mvg)

        end_clock = datetime.datetime.now()

        model = SVR(kernel='rbf', C=params['best_C'], gamma=params['best_gamma'])
        model.fit(sample_mvg[predictors], sample_mvg[response])

        result = [
            mean_absolute_error(y[test_ind], 
                                model.predict(X[test_ind]).reshape(-1,1), 
                                100)
        ]     
        
        sample_clock_dif = (end_svd_clock - start_svd_clock) + (end_clock - start_clock)
       
        to_record = {
            'query':'identity',
            'mechanism':'mvg directed ' + str(key),
            'metric':'MAE',
            'result':result,
            'sample size':sample_size,
            'mechanism runtime (s)':sample_clock_dif.total_seconds(),
            'iteration':i,
            'sensitivity type':'global',
            'sensitivity':sensitivity,
            'epsilon':epsilon,
            'delta':delta,
            'gamma':gamma,
            'training size':train_size,
            'test holdout size':test_size,
            'tuning holdout size':hyper_param_tuning_size,
            'model':'SVM Regression',
            'kernel':'rbf',
            'global min':target_feature_bounds[0],
            'global max':target_feature_bounds[1],
            'response min':min(y[test_ind]),
            'response max':max(y[test_ind]),
            'response':response,
            'predictors': ' '.join(predictors)
        }
        
        recorded_results = \
            record_result(results=recorded_results, 
                          new_data=to_record,
                          column_names=results_columns)
    
    print((end_clock - start_clock).total_seconds())

2.117705
1.970278
1.911625
1.884837
1.952032


In [None]:
recorded_results[recorded_results['mechanism'].str.startswith('mvg directed', na=False)]

In [149]:
recorded_results.to_pickle(pickle_file)