In [1]:
import numpy
import os
import glob
import random
import pandas
import math 
import pprint

from framingham10yr.framingham10yr import framingham_10year_risk


# Local imports
from calculate_framingham_risk_score import calculate_framingham_risk_score

import preprocessing
from preprocessing import scale_data
from preprocessing import centered_sample_covariance_matrix

import differential_privacy_mechanisms
from differential_privacy_mechanisms import gaussian_mechanism
from differential_privacy_mechanisms import gaussian_mechanism_matrix_sample

from differential_privacy_mechanisms import MVGMechanism
from differential_privacy_mechanisms import centered_covariance_query_sensitivity

import model_evaluation
from model_evaluation import principle_component_RSS
from model_evaluation import root_mean_squared_error

# My Utility Scripts
# from printd import printd
# from plots import plot_curves
# from plots import plot_boxes

In [2]:
# Column groups used to select subsets of the patient data frame.

# Key columns (presumably identify a single observation — confirm against the data loader).
new_key = ['date', 'encounter', 'patient']

# Numeric columns; 'framingham' is later split off as the prediction target.
new_num_features = [
    'bmi',
    'diastolic_blood_pressure', 'systolic_blood_pressure',
    'glucose',
    'hdl_cholesterol', 'ldl_cholesterol', 'total_cholesterol',
    'triglycerides',
    'age',
    'framingham',
]

# Categorical columns.
new_cat_features = ['sex', 'smoker', 'blood_pressure_med_treatment']

In [36]:
import datetime
import process_synthea_patient_data
from process_synthea_patient_data import process_synthea_patient_data

# Locations of the raw Synthea CSV export and of the processed data set.
data_location = 'synthea/output/csv/'
target_dir = 'data/'
target_file_name = 'Florida_100000_20190227'
# + datetime.date.today().strftime("%Y%m%d")

# Every later cell reads `data`, so it has to be bound here for the notebook
# to survive "Restart Kernel -> Run All". Reuse the processed file when it
# exists and only rebuild it from the raw CSVs otherwise.
processed_data_path = target_dir + target_file_name

if os.path.exists(processed_data_path):
    # Unpickling can execute arbitrary code: only load files produced locally.
    data = pandas.read_pickle(processed_data_path)
else:
    # NOTE(review): assumes process_synthea_patient_data returns the processed
    # DataFrame (the call previously commented out here chained .describe()
    # onto its result) — confirm against its implementation.
    data = process_synthea_patient_data(data_dir=data_location,
                                        data_save_dir=target_dir,
                                        data_save_name=target_file_name)


In [39]:
# Summary statistics of the numeric features, restricted to rows with no missing values.
data[new_num_features].dropna().describe()

Unnamed: 0,bmi,diastolic_blood_pressure,systolic_blood_pressure,glucose,hdl_cholesterol,ldl_cholesterol,total_cholesterol,triglycerides,age,framingham
count,180760.0,180760.0,180760.0,180760.0,180760.0,180760.0,180760.0,180760.0,180760.0,180760.0
mean,35.407962,88.490405,137.443018,89.580627,63.520117,98.072879,192.645247,151.715086,54.758556,10.569263
std,6.007346,13.445424,26.793899,23.388825,13.512145,27.857323,27.445294,68.380698,11.852553,5.442565
min,17.3,67.4,97.0,36.45,20.0,50.9,160.0,100.0,20.889802,-6.0
25%,31.5,77.9,116.3,74.6,59.9,78.5,173.0,116.1,46.053388,8.0
50%,35.0,83.5,127.7,85.0,66.5,91.4,186.0,132.1,56.095825,12.0
75%,38.7,100.2,161.0,95.7,73.2,108.35,198.9,147.9,64.301164,14.0
max,254.8,123.5,200.9,200.0,80.0,200.0,305.0,528.8,78.806297,26.0


In [4]:
#  Preprocess data
#    1. establish bounds for every numeric feature
#    2. clip observations to those bounds
#    3. centre and scale (bounds -> [-1,1])

# Gathered from anecdotal internet sources
# We require reasonable lower and upper bounds on values for Global Sensitivity calculation
bounds = {
    'bmi':(0,200),
    'diastolic_blood_pressure':(60,140),
    'systolic_blood_pressure':(90,250),
    'glucose':(0,1000),
    'hdl_cholesterol':(0,400),
    'ldl_cholesterol':(0,1000),
    'total_cholesterol':(0,1500),
    'triglycerides':(0,3000),
    'age':(0,120),
    'framingham':(-10,37)
}

feature_scale = (-1,1)
results = dict()

# Only complete rows are used.
numeric_complete = data[new_num_features].dropna()

# The bounds are assumptions, not properties of the data: the recorded summary
# of this data set shows bmi up to 254.8, above its upper bound of 200. A value
# outside its bounds would map outside `feature_scale`, so clip every column to
# its declared range first. (Harmless if scale_data already clips.)
lower_bounds = pandas.Series({name: bounds[name][0] for name in new_num_features})
upper_bounds = pandas.Series({name: bounds[name][1] for name in new_num_features})
numeric_bounded = numeric_complete.clip(lower=lower_bounds, upper=upper_bounds, axis=1)

X = scale_data(data=numeric_bounded, data_bounds=bounds, target_bounds=feature_scale)

# Number of observations (rows) and features (columns) of the scaled data.
obs, features = X.shape

In [None]:
# DP COVARIANCE ESTIMATION - SIMPLE GAUSSIAN NOISE TO COVARIANCE ESTIMATION QUERY
# The query is the covariance estimate f(X) = (1/n)*transpose(X)X; symmetric noise is added to it

# Non-private reference: centred sample covariance of the scaled data
query = centered_sample_covariance_matrix(X=X)

# Perturb the covariance estimate with symmetric iid Gaussian noise
sample_symmetric = gaussian_mechanism_matrix_sample(
    data=query,
    epsilon=epsilon, delta=delta,
    sensitivity=symmetric_unit_sensitivity,
    symmetric=True,
    verbose=False,
)

print('Symmetric Sample')
# NOTE(review): this iterates over the mechanism's return value, whereas the
# other cells pass that return value on as a single matrix — confirm which
# shape gaussian_mechanism_matrix_sample actually returns.
symmetric_rss = [
    principle_component_RSS(true=query, pred=noisy_cov)
    for noisy_cov in sample_symmetric
]
print(symmetric_rss)

In [None]:
# DP COVARIANCE ESTIMATION - SIMPLE GAUSSIAN NOISE TO IDENTITY QUERY
# The query is the scaled dataset itself, f(X) = X; noise is added to every entry
sample_identity = gaussian_mechanism_matrix_sample(
    data=X,
    epsilon=epsilon, delta=delta,
    sensitivity=unit_sensitivity,
    symmetric=False,
    verbose=False,
)

# Covariance estimate computed from the privatised data
sample_identity_cov = centered_sample_covariance_matrix(X=sample_identity)

print('Identity Sample')
# Compared against `query`, the non-private covariance from the previous cell
identity_rss = principle_component_RSS(true=query, pred=sample_identity_cov)
print(identity_rss)

In [12]:
# Setup for estimation of framingham score:
# the score is the target, every other numeric feature is a predictor
y_name = ['framingham']
X_names = [name for name in new_num_features if name != 'framingham']

In [None]:
# DP DATA RELEASE - BASELINE MODEL ON UNPRIVATISED DATA
# Cross-validate the sequential NN model with the scaled, non-private data
# used for both training and testing
results['baseline'] = []

# Keyword arguments forwarded as `fit_params`
nn_fit_params = {'epochs': 10, 'batch_size': 16, 'verbose': 0}

results['baseline'] = seq_nn_cross_validation(
    train_data=X,
    test_data=X,
    folds=10,
    X_labels=X_names,
    y_label=y_name,
    fit_params=nn_fit_params,
)

In [None]:
# Pretty-print the cross-validation results stored for the baseline model.
pprint.pprint(results['baseline'])

In [None]:
# DP DATA RELEASE - SIMPLE GAUSSIAN NOISE TO IDENTITY QUERY
# Draw a differentially private copy of the scaled data from the simple Gaussian Mechanism
X_gaus_dp = gaussian_mechanism_matrix_sample(
    data=X,
    epsilon=epsilon, delta=delta,
    sensitivity=unit_sensitivity,
    symmetric=False,
    verbose=False,
)

# Keyword arguments forwarded as `fit_params`
nn_fit_params = {'epochs': 10, 'batch_size': 16, 'verbose': 0}

# Train on the private release, test on the original scaled data
results['gaussian'] = seq_nn_cross_validation(
    train_data=X_gaus_dp,
    test_data=X,
    folds=5,
    X_labels=X_names,
    y_label=y_name,
    fit_params=nn_fit_params,
)

In [None]:
# Pretty-print the cross-validation results stored for the Gaussian-mechanism release.
pprint.pprint(results['gaussian'])

In [None]:
# DP DATA RELEASE - MATRIX-VARIATE GAUSSIAN (MVG) NOISE TO IDENTITY QUERY 
# Generate differentially private sample from Matrix-variate Gaussian mechanism
'''
    Binary Allocation Strategy
    
    key features = ['age','total_cholesterol','framingham'] 
    
    'age' and 'total_cholesterol' are important because they contribute the largest scores to the total. 
    'framingham' is important because it is the target variable.
'''

results['mvg binary allocation'] = {}

# Each value in 'key_features_allocation' is the total share given to the key
# features (split evenly between them); the remainder is split evenly between
# all other features
key_features_binary_mvg = ['age','total_cholesterol','framingham']
key_features_allocation = [0.45,0.55,0.65,0.75,0.85,0.95]

# Fit settings are identical for every allocation, so build them once
nn_fit_params = {'epochs': 10, 'batch_size': 16, 'verbose': 0}

n_key_features = len(key_features_binary_mvg)

feature_allocations = {}
for allocation in key_features_allocation:
    # One weight per numeric feature, in the order of new_num_features
    feature_allocations[allocation] = [
        (allocation / n_key_features)
        if feature in key_features_binary_mvg
        else ((1 - allocation) / (features - n_key_features))
        for feature in new_num_features
    ]

    params = {
        'epsilon': epsilon,
        'delta': delta,
        'sensitivity': obs_sensitivity,
        'gamma': gamma,
        'precision_allocation': feature_allocations[allocation],
        'precision_direction': numpy.identity(features),
        'covariance_direction': 'unimodal features',
        'covariance_method': 'binary',
    }

    # Private release of the scaled data for this allocation
    X_mvg_sdp = matrixvariate_gaussian_mechanism_sample(data=X, **params)

    # Train on the private release, test on the original scaled data
    results['mvg binary allocation'][allocation] = seq_nn_cross_validation(
        train_data=X_mvg_sdp,
        test_data=X,
        folds=5,
        X_labels=X_names,
        y_label=y_name,
        fit_params=nn_fit_params,
    )

In [None]:
# Display the results collected per key-feature allocation.
results['mvg binary allocation']

In [32]:
'''
    Directed Binary Allocation Strategy
    
    Feature allocations are proportional to the singular values (the explained directional variance).
    
    Directions are equal to the eigenvectors of the differentially private estimate of the sample covariance. 
    These are the orthogonal principal axes of the variation in the sample covariance.
'''

results['mvg directed binary allocation'] = dict()

# Privacy budget split. Estimating the directions from the data consumes part
# of (epsilon, delta); under sequential composition the budgets of the two
# mechanisms add up, so the data release may only use what is left. Releasing
# with the full (epsilon, delta) as well would spend 1.2x the stated budget.
direction_budget_share = 0.2
release_budget_share = 1 - direction_budget_share

# Differentially private estimate of the covariance, used only to pick
# the noise directions and their relative weights.
dp_sample_cov = gaussian_mechanism_matrix_sample(
            data=query,
            epsilon=epsilon*direction_budget_share,
            delta=delta*direction_budget_share,
            sensitivity=symmetric_unit_sensitivity,
            symmetric=True,
            verbose=False)

# Columns of Q are the directions; XX_sv are the matching singular values.
Q, XX_sv, Qt = numpy.linalg.svd(dp_sample_cov, full_matrices=True)

# Singular values normalised to sum to one.
sv_proportion = XX_sv / numpy.sum(XX_sv)
sv_allocation = [0.55,0.75,0.95]

# Fit settings do not depend on the allocation, so build them once.
nn_fit_params = dict(
    epochs=10,
    batch_size=16,
    verbose=0
)

feature_allocations = dict()
for allocation in sv_allocation:
    # A share `allocation` is distributed in proportion to the singular
    # values; the remaining (1 - allocation) is spread evenly over all
    # directions. The weights therefore sum to one.
    feature_allocations[allocation] = [
        ((1 - allocation) / len(sv_proportion)) +
        (sv*allocation)
        for sv in sv_proportion
    ]

    params = dict(
        epsilon=epsilon*release_budget_share,
        delta=delta*release_budget_share,
        sensitivity=obs_sensitivity,
        gamma=gamma,
        precision_allocation=feature_allocations[allocation],
        precision_direction=Q,
        covariance_direction='unimodal features',
        covariance_method='binary'
    )

    # Private release of the scaled data for this allocation.
    X_mvg_sdp = matrixvariate_gaussian_mechanism_sample(data=X,
                                                        **params)

    # Train on the private release, test on the original scaled data.
    results['mvg directed binary allocation'][allocation] = \
        seq_nn_cross_validation(train_data=X_mvg_sdp,
                                test_data=X,
                                folds=5,
                                X_labels=X_names,
                                y_label=y_name,
                                fit_params=nn_fit_params)

In [185]:
# Display the results collected per singular-value allocation.
results['mvg directed binary allocation']