In [2]:
import datetime
import pandas
import numpy
import glob
import re

# local imports
from differential_privacy_parameters import get_query_point_sensitivity
from differential_privacy_parameters import get_query_row_sensitivity
from differential_privacy_parameters import get_query_gamma

from differential_privacy_mechanisms import gaussian_mechanism_matrix_sample
from differential_privacy_mechanisms import matrixvariate_gaussian_mechanism_sample
from differential_privacy_mechanisms import MVGMechanism

from model_evaluation import test_train_split
from model_evaluation import principle_component_RSS
from model_evaluation import root_mean_squared_error
from model_evaluation import record_result

from preprocessing import centered_sample_covariance_matrix
from preprocessing import scale_data

from models import seq_nn_single_evaluation

In [None]:
def evaluate_dp_mechanism_krr_cv(data,
                                 samples,
                                 krr_kernel,
                                 scoring_function,
                                 privacy_mechanism_sampler,
                                 mechanism_args):
    '''
    evaluate_dp_mechanism_krr_cv
    
    Evaluates differentially private mechanism by fitting a Kernel Ridge Regression (KRR) model
    returning its cross validation score (using the supplied scoring function)
    
        - Generates differentially private sample from supplied privacy mechanism
        - Tune for lowest score KRR hyperparameters
        - Return cross validation scores for model fit with tuned hyperparameters
    '''
    from sklearn.kernel_ridge import KernelRidge
    
    result = numpy.empty(e1_samples)
    
    test_X = data[:,:-1]
    test_y = data[:,-1]
    
    for i in xrange(e1_samples):
    
        mechanism_sample = privacy_mechanism_sampler(
            data=data,
            **mechanism_args)
        
        parameters = krr_private_param_rand_search(
            train_X=mechanism_sample['X'],
            train_y=mechanism_sample['y'],
            test_X=test_X,
            test_y=test_y,
            rand_iters=100,
            krr_kernel=krr_kernel,
            scoring_function=scoring_function)
        
        model = KernelRidge(kernel=krr_kernel, 
                            alpha=parameters['best_alpha'], 
                            gamma=parameters['best_gamma'])

        return krr_private_cross_validate(
            train_X=mechanism_sample['X'],
            train_y=mechanism_sample['y'],
            test_X=test_X,
            test_y=test_y,
            krr=model,
            score_summary_function=numpy.mean,
            scoring_function=scoring_function)

#  Data Processing and Setup

Import and concatonate all data

In [3]:
target_dir = 'data/'

data_load = None
for file_name in glob.glob(target_dir + '*'):
    if not(re.search(r'\.data$',file_name)):
        print('Loading...\t' + file_name)
        if data_load is None:
            data_load = pandas.read_pickle(file_name)
        else:
            data_load = pandas.concat([data_load,
                                       pandas.read_pickle(file_name)], 
                                      sort=False)

Loading...	data\California_100000_20190227
Loading...	data\Florida_100000_20190227
Loading...	data\Georgia_100000_20190227
Loading...	data\Illinois_100000_20190227
Loading...	data\New York_100000_20190227
Loading...	data\Ohio_100000_20190227
Loading...	data\Pennsylvania_100000_20190227
Loading...	data\Texas_100000_20190227


In [None]:
data_load.describe()

Scale data and establish evaluation parameters

In [25]:


evaluation_features = [
    'bmi',
    'diastolic_blood_pressure',
    'systolic_blood_pressure',
    'glucose',
    'hdl_cholesterol',
    'ldl_cholesterol',
    'total_cholesterol',
    'triglycerides',
    'age',
    'framingham'    
]

data_feature_bounds = {
    'bmi':(0,400),
    'diastolic_blood_pressure':(60,140),
    'systolic_blood_pressure':(90,250),
    'glucose':(0,2000),
    'hdl_cholesterol':(0,1500),
    'ldl_cholesterol':(0,2000),
    'total_cholesterol':(0,2100),
    'triglycerides':(0,3000),
    'age':(0,120),
    'framingham':(-10,37)
}
target_feature_bounds = (0,1)

# Setup for estimation of framingham score
response = ['framingham']
predictors = [ f for f in evaluation_features if f not in response]

results_columns = [
    'mechanism', 
    'query', 
    'sample size',
    'iteration', 
    'metric', 
    'result', 
    'mechanism runtime (s)', 
    'total runtime (s)'
]
result_pickle_location = 'results/'

# Scale data 'data_feature_bounds' -> 'target_feature_bounds'
data_scaled = scale_data(data_load[evaluation_features].dropna(),
                         target_bounds=target_feature_bounds,
                         data_bounds=data_feature_bounds)

# Sample data if needed
sample_size = 50000
evaluation_samples = 20

if isinstance(sample_size, int) and sample_size < len(data_scaled):
    data = data_scaled.sample(sample_size)
else:
    data = data_scaled

Differential Privacy parameters

In [26]:
epsilon = 1.0 
# 1 / number of observations
delta = pow(data.shape[0], -1)

In [None]:
data.describe()

In [6]:
import kernel_ridge_regression
from kernel_ridge_regression import kernel_ridge_param_search
from kernel_ridge_regression import kernel_ridge_cv
from kernel_ridge_regression import krr_private_param_rand_search
from kernel_ridge_regression import krr_private_cross_validate


import sklearn.metrics
import sklearn.preprocessing

import sklearn.model_selection
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score

In [7]:
def root_mean_squared_error(y_true, y_pred):
    '''
    Computation of evaluation metric RMSE
    '''
    return pow(sklearn.metrics.mean_squared_error(y_true, y_pred)/ len(y_true), 0.5)


def rmse_scorer():
    '''
    Wrapper function to create sklearn 'scorer' for RMSE metric
    '''
    return sklearn.metrics.make_scorer(root_mean_squared_error, 
                                       greater_is_better=False)

In [34]:
n_obs, n_features = data.shape

evaluation_test_split = 0.1
evalaution_test_size = int(n_obs*evaluation_test_split)
evaluation_train_size = n_obs - evalaution_test_size

evaluation_samples = 2

# Model parameters
model_params = dict(epochs=5, batch_size=128, verbose=0) 

# labelling result values for mechanism
mechanism = 'baseline'
query_type = 'identity'
metric = 'rmse'

result_pickle_name = '_'.join([mechanism, 
                               query_type,
                               str(sample_size),
                               ''.join(response), 
                               'krr',
                               str(evaluation_samples),
                               datetime.date.today().strftime("%Y%m%d")])

data_Xy = dict(X=data[predictors].values, y=data[response].values)

result_pickle_name

'baseline_identity_50000_framingham_krr_2_20190301'

In [None]:
krr_params = kernel_ridge_param_search(
    scoring=rmse_scorer(),
    kernel='rbf', 
    cv_folds=5,
    random=True,
    random_iters=10,
    **data_Xy)

print(krr_params)

# Combine params and data to pass as single dict
krr_params.update(data_Xy)

krr_result = kernel_ridge_cv(
    cv_folds=5,
    kernel='rbf', 
    scoring=rmse_scorer(),
    **krr_params)

In [None]:
result = record_result(results=result, 
                           column_names=results_columns, 
                           new_data=[[mechanism, 
                                      query_type, 
                                      sample_size,
                                      1,
                                      metric, 
                                      krr_result, 
                                      0,
                                      0
                                     ]])

result.to_pickle(result_pickle_location + result_pickle_name)

In [None]:
# labelling result values for mechanism
mechanism = 'gaussian'
query_type = 'identity'
metric = 'rmse'

result_pickle_name = '_'.join([mechanism, 
                               query_type, 
                               str(sample_size),
                               ''.join(response),
                               'krr',
                               str(evaluation_samples), 
                               datetime.date.today().strftime("%Y%m%d")])

print(result_pickle_name)

model_params = dict(epochs=4, batch_size=128, verbose=0)

epsilon = 1.0 
# 1 / number of observations
delta = pow(evaluation_train_size, -1)

sensitivity = get_query_point_sensitivity(query_type='identity',
                                          query_scale=target_feature_bounds,
                                          query_shape=(evaluation_train_size, n_features))

print(sensitivity, epsilon, delta)

In [None]:
result = None

for i in range(3):
    
    start_clock = datetime.datetime.now()

# train_ind, test_ind = \
#         test_train_split(len(data),
#                          evaluation_test_split)

    sample = gaussian_mechanism_matrix_sample(
        data=data,
        epsilon=epsilon,
        delta=delta,
        sensitivity=sensitivity,
        symmetric=False,
        verbose=False)

    end_sample_clock = datetime.datetime.now() 

    krr_params = krr_private_param_rand_search(
        train_X=sample[predictors].values,
        train_y=sample[response].values,
        test_X=data[predictors].values,
        test_y=data[response].values,
        scoring_function=root_mean_squared_error,
        krr_kernel='rbf', 
        rand_iters=10)

#     print(krr_params)

    # Combine params and data to pass as single dict    
    model = KernelRidge(kernel='rbf', 
                        alpha=krr_params['best_alpha'], 
                        gamma=krr_params['best_gamma'])

    metric_result = krr_private_cross_validate(
        train_X=sample[predictors].values,
        train_y=sample[response].values,
        test_X=data[predictors].values,
        test_y=data[response].values,
        krr=model,
        cv_folds=5, 
        scoring_function=root_mean_squared_error,
        score_summary_function=numpy.mean)

    end_loop_clock = datetime.datetime.now() 


    result = record_result(results=result, 
                           column_names=results_columns, 
                           new_data=[[mechanism, 
                                      query_type, 
                                      sample_size,
                                      i+1,
                                      metric, 
                                      metric_result, 
                                      (end_sample_clock - start_clock).total_seconds(),
                                      (end_loop_clock - start_clock).total_seconds()
                                     ]])
     print(result.iloc[i], start_clock, end_loop_clock)

result.to_pickle(result_pickle_location + result_pickle_name)

In [24]:
result.to_pickle(result_pickle_location + result_pickle_name)
result.result

0    [0.003661622475886985, 0.003613400244978182, 0...
0    [0.0037581649038623973, 0.0037162187113041396,...
0    [0.0036988310175313155, 0.003752336362770187, ...
Name: result, dtype: object

# Evaluation of Sample Covariance Differential Privacy Methods

## Gaussian Mechanism with symmetric and identity sampling

### Symmetric Gaussian Mechanism

Evaluation setup

In [None]:
# labelling result values for mechanism
mechanism = 'gaussian'
query_type = 'covariance'
metric = 'principle component RSS'

result_pickle_location = 'results/'
result_pickle_name = '_'.join([mechanism, 
                               query_type, 
                               str(evaluation_samples), 
                               str(sample_size),
                               datetime.date.today().strftime("%Y%m%d")])

print(result_pickle_name)

Differential Privacy parameters

In [None]:
sensitivity = get_query_point_sensitivity(query_type='covariance',
                                          query_scale=target_feature_bounds,
                                          query_shape=data.shape)

Evaluation of symmetric matrix gaussian mechanism sample

In [None]:
query = centered_sample_covariance_matrix(X=data)

result = None
sample = dict()

for i in range(evaluation_samples): 
    # Sample mechanism
    start_clock = datetime.datetime.now()
    # Add symmetric iid noise
    sample[i] = gaussian_mechanism_matrix_sample(
                data=query,
                epsilon=epsilon,
                delta=delta,
                sensitivity=sensitivity,
                symmetric=True,
                verbose=False)
    end_sample_clock = datetime.datetime.now() 

    result = record_result(results=result, 
                           column_names=results_columns, 
                           new_data=[[mechanism, 
                                      query_type, 
                                      sample_size,
                                      i+1,
                                      metric, 
                                      principle_component_RSS(true=query, pred=sample[i]), 
                                      (end_sample_clock - start_clock).total_seconds(),
                                      (end_sample_clock - start_clock).total_seconds()
                                     ]])
    
result.to_pickle(result_pickle_location + result_pickle_name)


In [None]:
result.describe()

## Identity Gaussian Mechanism

Evaluation setup

In [None]:
# labelling result values for mechanism
mechanism = 'gaussian'
query_type = 'itentity'
metric = 'principle component RSS'

result_pickle_name = '_'.join([mechanism, 
                               query_type, 
                               str(sample_size),
                               str(evaluation_samples), 
                               datetime.date.today().strftime("%Y%m%d")])

print(result_pickle_name)

Differential Privacy parameters

In [None]:
sensitivity = get_query_row_sensitivity(query_type='identity',
                                        query_scale=target_feature_bounds,
                                        query_shape=data.shape)

Evaluation of identity query guassian mechanism sample

In [None]:
query = centered_sample_covariance_matrix(X=data)

result = None

for i in range(evaluation_samples): 
    # Sample mechanism
    start_clock = datetime.datetime.now()
    # Add symmetric iid noise
    sample = gaussian_mechanism_matrix_sample(
                data=query,
                epsilon=epsilon,
                delta=delta,
                sensitivity=sensitivity,
                symmetric=False,
                verbose=False)
    end_sample_clock = datetime.datetime.now() 

    sample_cov = centered_sample_covariance_matrix(X=sample)
    end_loop_clock = datetime.datetime.now() 
    
    result = record_result(results=result, 
                           column_names=results_columns, 
                           new_data=[[mechanism, 
                                      query_type, 
                                      sample_size,
                                      i+1,
                                      metric, 
                                      principle_component_RSS(true=query, pred=sample_cov), 
                                      (end_sample_clock - start_clock).total_seconds(),
                                      (end_loop_clock - start_clock).total_seconds()
                                     ]])
    
result.to_pickle(result_pickle_location + result_pickle_name)

In [None]:
result.describe()

#  Evaluation of Data Release Differential Privacy Methods

## Gaussian and Matrixvariate Gaussian Mechanisms by regression task

### Identity Query 

Training parameters

In [None]:
n_obs, n_features = data.shape

evaluation_test_split = 0.1
evalaution_test_size = int(n_obs*evaluation_test_split)
evaluation_train_size = n_obs - evalaution_test_size

evaluation_samples = 20

# Model parameters
model_params = dict(epochs=5, batch_size=128, verbose=0) 

### Baseline

Evaluation Setup

In [None]:
# labelling result values for mechanism
mechanism = 'baseline'
query_type = 'identity'
metric = 'rmse'

result_pickle_name = '_'.join([mechanism, 
                               query_type,
                               str(sample_size),
                               ''.join(response), 
                               str(evaluation_samples),
                               datetime.date.today().strftime("%Y%m%d")])

result_pickle_name

Evaluation of baseline model

In [None]:
result = None

for i in range(evaluation_samples):
    start_clock = datetime.datetime.now()
    # Train model and evaluate prediction metric on holdout set   
    metric_result = seq_nn_single_evaluation(train_data=data,
                                             test_data=data,
                                             test_holdout_p=evaluation_test_split,
                                             X_labels=predictors,
                                             y_label=response,
                                             fit_params=model_params)
    
    end_loop_clock = datetime.datetime.now() 
    
    result = record_result(results=result, 
                           column_names=results_columns, 
                           new_data=[[mechanism, 
                                      query_type, 
                                      sample_size,
                                      i+1,
                                      metric, 
                                      metric_result, 
                                      0,
                                      (end_loop_clock - start_clock).total_seconds()
                                     ]])
    
result.to_pickle(result_pickle_location + result_pickle_name)

In [None]:
result.describe()

### Identity Gaussian Mechanism

Evaluation Setup

In [None]:
# labelling result values for mechanism
mechanism = 'gaussian'
query_type = 'identity'
metric = 'rmse'

result_pickle_name = '_'.join([mechanism, 
                               query_type, 
                               str(sample_size),
                               ''.join(response),
                               str(evaluation_samples), 
                               datetime.date.today().strftime("%Y%m%d")])

print(result_pickle_name)

model_params = dict(epochs=4, batch_size=128, verbose=0)

Differential Privacy Parameters

Evaluation of singleton gaussian sequential NN model

In [None]:
epsilon = 1.0 
# 1 / number of observations
delta = pow(evaluation_train_size, -1)

sensitivity = get_query_point_sensitivity(query_type='identity',
                                          query_scale=target_feature_bounds,
                                          query_shape=(evaluation_train_size, n_features))

result = None

for i in range(evaluation_samples):
    
    start_clock = datetime.datetime.now()
    
    train_ind, test_ind = \
            test_train_split(len(data),
                             evaluation_test_split)
        
    sample = gaussian_mechanism_matrix_sample(
        data=data.iloc[train_ind],
        epsilon=epsilon,
        delta=delta,
        sensitivity=sensitivity,
        symmetric=False,
        verbose=False)
    
    end_sample_clock = datetime.datetime.now() 
        
    # Train model and evaluate prediction metric on holdout set   
    metric_result = seq_nn_single_evaluation(train_data=sample,
                                             test_data=data,
                                             X_labels=predictors,
                                             y_label=response,
                                             train_ind=train_ind, 
                                             test_ind=test_ind,
                                             fit_params=model_params)
    
    end_loop_clock = datetime.datetime.now() 
    
    result = record_result(results=result, 
                           column_names=results_columns, 
                           new_data=[[mechanism, 
                                      query_type, 
                                      sample_size,
                                      i+1,
                                      metric, 
                                      metric_result, 
                                      (end_sample_clock - start_clock).total_seconds(),
                                      (end_loop_clock - start_clock).total_seconds()
                                     ]])
    print(result.iloc[i], start_clock, end_loop_clock)
    
result.to_pickle(result_pickle_location + result_pickle_name)

In [None]:
result.describe()

### Matrix-variate Gaussian Mechanism

Binary Allocation Strategy - Key features
   
    key features = ['age','total_cholesterol','framingham'] 
    
    'age' and 'cholesterol' important as contribute the largest scores to the total. 
    'framingham' important as the target variable.


Evaluation Setup

In [None]:
# labelling result values for mechanism
mechanism = 'MVG_binary_knowledge'
query_type = 'identity'
metric = 'rmse'

result_pickle_name = '_'.join([mechanism, 
                               query_type, 
                               str(sample_size),
                               ''.join(response),
                               str(evaluation_samples), 
                               datetime.date.today().strftime("%Y%m%d")])

print(result_pickle_name)

model_params = dict(epochs=5, batch_size=64, verbose=1)

Differential Privacy Parameters

In [None]:
sensitivity = get_query_row_sensitivity(query_type='identity',
                                        query_scale=target_feature_bounds,
                                        query_shape=(evaluation_train_size, n_features))

gamma = get_query_gamma(query_scale=target_feature_bounds, 
                        query_shape=(evaluation_train_size, n_features), 
                        query_type='identity')

# Allocation percentages in 'key_features_allocation' to key features 
# and remainder to all other features
key_features_binary_mvg = ['age','total_cholesterol','framingham']  
key_features_allocation = [0.45,0.55,0.65,0.75,0.85,0.95]

feature_allocations = dict()
for allocation in key_features_allocation:
    
    feature_allocations[allocation] = [ 
        allocation / len(key_features_binary_mvg)
        if feature in key_features_binary_mvg 
        else (1 - allocation) / (n_features - len(key_features_binary_mvg))
        for feature in evaluation_features 
    ]

Matrix-variate Gaussian Mechnaism Evaluation

In [None]:
result = None 
for key, allocation in feature_allocations.items():
    
    params = dict(
        epsilon=epsilon,
        delta=delta,
        sensitivity=sensitivity,
        gamma=gamma,
        precision_allocation=allocation,
        precision_direction=numpy.identity(n_features),
        covariance_direction='unimodal features',
        covariance_method='binary'
    )
    
    
    for i in range(evaluation_samples):
        start_clock = datetime.datetime.now() 
        train_ind, test_ind = \
            test_train_split(len(data),
                             evaluation_test_split)
            
        sample = matrixvariate_gaussian_mechanism_sample(data=data.iloc[train_ind],
                                                         **params)
        
        end_sample_clock = datetime.datetime.now() 
        
        metric_result = seq_nn_single_evaluation(train_data=sample,
                                                test_data=data,
                                                X_labels=predictors,
                                                y_label=response,
                                                train_ind=train_ind, 
                                                test_ind=test_ind,
                                                fit_params=model_params)
        
        end_loop_clock = datetime.datetime.now() 
        
        result = record_result(results=result, 
                               column_names=results_columns, 
                               new_data=[[mechanism + '_' + str(key), 
                                          query_type, 
                                          sample_size,
                                          i+1,
                                          metric, 
                                          metric_result, 
                                          (end_sample_clock - start_clock).total_seconds(),
                                          (end_loop_clock - start_clock).total_seconds()
                                         ]])
        
result.to_pickle(result_pickle_location + result_pickle_name)         

In [None]:
result.describe()

Binary Allocation Strategy - Key features
   
    Features allocations are proprotional to the singular values or explained directional variance
    
    Directions are equal to eigenvectors of the sample covariance. 
    These are the orthogonal primary axis of the variation in the sample covariance 

Evaluation setup

In [None]:
# labelling result values for mechanism
mechanism = 'MVG_binary_directed'
query_type = 'identity'
metric = 'rmse'

result_pickle_name = '_'.join([mechanism, 
                               query_type, 
                               str(sample_size),
                               ''.join(response),
                               str(evaluation_samples), 
                               datetime.date.today().strftime("%Y%m%d")])

print(result_pickle_name)

Differential Privacy parameters

In [None]:
# DP SVD parameters
svd_privacy_allocation = 0.2

svd_sensitivity = get_query_row_sensitivity(query_type='covariance',
                                            query_scale=target_feature_bounds,
                                            query_shape=data.shape)

# DP MVG parameters

sensitivity = get_query_row_sensitivity(query_type='identity',
                                        query_scale=target_feature_bounds,
                                        query_shape=(evaluation_train_size, n_features))

gamma = get_query_gamma(query_scale=target_feature_bounds, 
                        query_shape=(evaluation_train_size, n_features), 
                        query_type='identity')

In [None]:
query = centered_sample_covariance_matrix(X=data)

result = None

for i in range(evaluation_samples): 

    start_svd_clock = datetime.datetime.now()
    
    cov_sample = gaussian_mechanism_matrix_sample(
                data=query,
                epsilon=epsilon*svd_privacy_allocation,
                delta=delta*svd_privacy_allocation,
                sensitivity=svd_sensitivity,
                symmetric=True,
                verbose=False)

    precision_directions, singular_values, _ = numpy.linalg.svd(cov_sample, full_matrices=True)

    sv_proportions = singular_values / numpy.sum(singular_values)
    sv_allocations = [0.55,0.75,0.95]

    feature_allocations = dict()
    for allocation in sv_allocations:
        feature_allocations[allocation] = [ 
            ((1 - allocation) / len(sv_proportions)) + 
            (sv * allocation)
            for sv in sv_proportions
        ]    

    end_svd_clock = datetime.datetime.now()
    
    for key, allocation in feature_allocations.items():

        start_mvg_clock = datetime.datetime.now()
        params = dict(
            epsilon=epsilon*(1.0-svd_privacy_allocation),
            delta=delta*(1.0-svd_privacy_allocation),
            sensitivity=sensitivity,
            gamma=gamma,
            precision_allocation=allocation,
            precision_direction=precision_directions,
            covariance_direction='unimodal features',
            covariance_method='binary'
        )        
         
        train_ind, test_ind = \
            test_train_split(len(data),
                             evaluation_test_split)

        sample = matrixvariate_gaussian_mechanism_sample(data=data.iloc[train_ind],
                                                         **params)

        end_mvg_clock = datetime.datetime.now() 

        metric_result = seq_nn_single_evaluation(train_data=sample,
                                                 test_data=data,
                                                 X_labels=predictors,
                                                 y_label=response,
                                                 train_ind=train_ind, 
                                                 test_ind=test_ind,
                                                 fit_params=model_params)

        end_loop_clock = datetime.datetime.now() 
        
        sample_clock_dif = (end_svd_clock - start_svd_clock) + (end_mvg_clock - start_mvg_clock)
        total_clock_dif =  (end_loop_clock - end_mvg_clock) + sample_clock_dif
        
        result = record_result(results=result, 
                               column_names=results_columns, 
                               new_data=[[mechanism + '_' + str(key), 
                                          query_type, 
                                          sample_size,
                                          i+1,
                                          metric, 
                                          metric_result, 
                                          sample_clock_dif.total_seconds(),
                                          total_clock_dif.total_seconds()
                                         ]])

result.to_pickle(result_pickle_location + result_pickle_name) 

In [None]:
result.describe()

In [37]:
import numpy
import operator
import random

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.kernel_ridge import KernelRidge
from scipy.stats import uniform

import sklearn.metrics
import sklearn.preprocessing

def root_mean_squared_error(y_true, y_pred):
    '''
    Computation of evaluation metric RMSE
    '''
    return pow(sklearn.metrics.mean_squared_error(y_true, y_pred)/ len(y_true), 0.5)


def rmse_scorer():
    '''
    Wrapper function to create sklearn 'scorer' for RMSE metric
    '''
    return sklearn.metrics.make_scorer(root_mean_squared_error, 
                                       greater_is_better=False)

def kernel_ridge_param_search(
        X, y,
        scoring,
        kernel,
        cv_folds=5,
        random=True,
        max_alpha=3,
        max_gamma=3,
        random_iters=20):

    krr = KernelRidge(kernel=kernel)

    if random is True:
        # Randomised parameter search
        param_dist = dict(alpha=uniform(0.0001, max_alpha),
                          gamma=uniform(0.0001, max_gamma))
        n_iter_randomised = random_iters

        param_search = RandomizedSearchCV(
            estimator=krr,
            param_distributions=param_dist,
            n_iter=n_iter_randomised,
            scoring=scoring,
            n_jobs=-1,
            pre_dispatch='2*n_jobs',
            cv=cv_folds)
    else:
        # Grid parameter search
        param_grid = dict(alpha=numpy.arange(0.1, max_alpha, 0.1),
                          gamma=numpy.arange(0.1, max_gamma, 0.1))

        param_search = GridSearchCV(
            estimator=krr,
            param_grid=param_grid,
            scoring=scoring,
            cv=cv_folds)

    return param_search.fit(X, y).best_params_


def kernel_ridge_cv(X, y, scoring, cv_folds, **kwargs):

    # Apply Kernel Ridge Regression with empirically selected parameters
    KRR = KernelRidge(**kwargs)

    # Calculate metric RMSE using
    cv_score = cross_val_score(
        estimator=KRR,
        X=X,
        y=y,
        scoring=scoring,
        cv=cv_folds)

    return numpy.array(abs(cv_score))


def krr_private_cross_validate(train_X,
                               train_y,
                               test_X,
                               test_y,
                               krr,
                               scoring_function,
                               score_summary_function=None,
                               cv_folds=5):

    train_selected = [ i for i in range(len(train_y))]
    fold_sample_size = int(len(train_y) / cv_folds)

    res = numpy.empty(cv_folds)

    for fold in range(cv_folds):

        fold_sample = [
            train_selected.pop(random.randrange(len(train_selected)))
            for _ in range(fold_sample_size)]

        krr.fit(train_X[fold_sample],
                train_y[fold_sample])

        y_hat = krr.predict(test_X[fold_sample])
        
        res[fold] = scoring_function(y_true=test_y[fold_sample], y_pred=y_hat)

    if score_summary_function is not None:
        summary_res = score_summary_function(res)
    else:
        summary_res = res

    return summary_res


def krr_private_param_rand_search(train_X,
                                  train_y,
                                  test_X,
                                  test_y,
                                  rand_iters,
                                  krr_kernel,
                                  scoring_function,
                                  train_split=0.8):

    # Randomised parameter search
    param_selections = dict()
    param_dist = dict(alpha=uniform(0.0001, 4),
                      gamma=uniform(0.0001, 4))

    for _ in range(rand_iters):

        a = round(param_dist['alpha'].rvs(1)[0], 4)
        g = round(param_dist['gamma'].rvs(1)[0], 4)

        model = KernelRidge(kernel=krr_kernel, alpha=a, gamma=g)

        param_selections[(a, g)] = krr_private_cross_validate(
            train_X=train_X,
            train_y=train_y,
            test_X=test_X,
            test_y=test_y,
            krr=model,
            score_summary_function=numpy.mean,
            scoring_function=scoring_function)

    best_alpha, best_gamma = \
        min(param_selections.items(), key=operator.itemgetter(1))[0]

    return dict(
        best_alpha=best_alpha,
        best_gamma=best_gamma,
        best_rmse=min(param_selections.values()),
        results=param_selections)


In [None]:
def test_train_split(y_len, test_perc):
    if test_perc >= 0.0 and test_perc <= 1.0:
        selection_pool = [ i for i in range(y_len)]
        test_size = int(y_len * (1.0 - test_perc))

        selected = [
            selection_pool.pop(random.randrange(len(selection_pool)))
            for _ in range(test_size)
        ]

        return selected, selection_pool
    else:
        print('test_train_split_indicies: \tparameter "test_perc" must have ')
        print('\t\t\t\tvalue between 0 and 1, not {p}'.format(p=test_perc))