In [23]:
import pandas as pd
import numpy as np
import collections
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
from tensorflow_privacy.privacy.optimizers.dp_optimizer import DPGradientDescentGaussianOptimizer,DPAdamGaussianOptimizer,DPRMSPropOptimizer
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import Callback
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)

In [2]:
path = 'creditcard.csv'
data = pd.read_csv(path)
# Time column is not necessary, drop it
data.drop(['Time'], axis=1)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [3]:
X_train, X_val = train_test_split(data, test_size=14807)

scaler = StandardScaler()

y_train = X_train.Class.to_numpy()
X_train = X_train.drop(['Class'], axis=1).to_numpy()
y_val= X_val.Class.to_numpy()
X_val = X_val.drop(['Class'], axis=1).to_numpy()

# Scale the dataset
X_train_scaled = scaler.fit_transform(X_train,y_train)
X_val_scaled = scaler.transform(X_val,y_val)




In [4]:
X_val, X_test, y_val, y_test = train_test_split(X_val_scaled, y_val, test_size=7807, random_state=0)

def fit_batchsize(X,y,batch_size):
    n_size = (len(X)//batch_size)*batch_size
    X = X[0:n_size]
    y = y[0:n_size]
    return X, y

batch_size = 1000

X_train, y_train = fit_batchsize(X_train_scaled,y_train, batch_size)
X_val, y_val = fit_batchsize(X_val_scaled, y_val, batch_size)

X_test,y_test = fit_batchsize(X_test,y_test, batch_size)

In [5]:
# Input layer to encoder
input_dim = X_train.shape[1]
inputs = Input(shape=(input_dim,))

# Encoder
latent_space = 2

x = Dense(64, activation='relu')(inputs)
x = Dense(16, activation='relu')(x)
mu = Dense(latent_space, activation='linear')(x)
log_sigma = Dense(latent_space, activation='linear')(x)

encoder = Model(inputs, mu)


In [6]:
def sample_z(args):
    mu, log_sigma = args
    eps = K.random_normal(shape=(batch_size, latent_space), mean=0., stddev=1.)
    return mu + K.exp(log_sigma / 2) * eps

z = Lambda(sample_z, name='sample_z', output_shape=(latent_space,))([mu, log_sigma])

In [11]:
# Decoder
decoder_input = Dense(latent_space, activation='relu')
decoder_hidden = Dense(16, activation='relu')
decoder_out = Dense(input_dim, activation='sigmoid')

x = decoder_input(z)
x = decoder_hidden(x)
outputs = decoder_out(x)

In [12]:
# Overall VAE model, for reconstruction and training
vae = Model(inputs, outputs)

In [13]:
# Generates new data points
d_in = Input(shape=(latent_space,))
d_input = decoder_input(d_in)
d_hidden = decoder_hidden(d_in)
d_out = decoder_out(d_hidden)
decoder = Model(d_in, d_out)

In [14]:
def vae_loss(y_true, y_pred):
    recon = K.sum(K.binary_crossentropy(y_pred, y_true), axis=1)
    kl = 0.5 * K.sum(K.exp(log_sigma) + K.square(mu) - 1. - log_sigma, axis=1)

    return recon + kl

In [15]:
# Hyperparameters
l2_norm_clip = 7
noise_multiplier = 1.3
num_microbatches = 1
learning_rate = .001
n_epoch = 200

In [16]:
# Use DPAdamGaussianOptimizer from Tensorflow Privacy to guarantee differential privacy
vae.compile(optimizer=DPAdamGaussianOptimizer(l2_norm_clip=l2_norm_clip,
    noise_multiplier=noise_multiplier,
    num_microbatches=num_microbatches,
    learning_rate=learning_rate), loss=vae_loss)

vae_history = vae.fit(X_train, X_train, batch_size=batch_size, shuffle=True, 
                      validation_data = (X_val, X_val),
                    epochs=n_epoch, callbacks = [EarlyStopping(monitor='loss',patience = 3)])

Train on 270000 samples, validate on 14000 samples
Epoch 1/200

W1201 09:12:22.532326 4632311232 deprecation.py:323] From /Users/mirayyuce/.virtualenvs/thesis/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2048: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200


Epoch 74/200
Epoch 75/200
Epoch 76/200


In [17]:
compute_dp_sgd_privacy.compute_dp_sgd_privacy(n=60000, batch_size=250, noise_multiplier=1.3, epochs=15, delta=1e-5)

DP-SGD with sampling rate = 0.417% and noise_multiplier = 1.3 iterated over 3600 steps satisfies differential privacy with eps = 1.18 and delta = 1e-05.
The optimal RDP order is 17.0.


(1.1799006739827, 17.0)

In [18]:
x_train_encoded = encoder.predict(X_train)

pred_train = decoder.predict(x_train_encoded)


In [19]:
x_val_encoded = encoder.predict(X_val)

pred = decoder.predict(x_val_encoded)


In [20]:
forest_real = RandomForestClassifier()
forest_real.fit(X_train, y_train)
y_pred = forest_real.predict(X_test)
print("acc", metrics.roc_auc_score(y_test, y_pred))



acc 0.8461538461538461


In [21]:
forest_synt = RandomForestClassifier()
forest_synt.fit(pred_train, y_train)
y_pred = forest_synt.predict(X_test)
print("acc", metrics.roc_auc_score(y_test, y_pred))



acc 0.5151600224592925


In [24]:
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("acc", metrics.roc_auc_score(y_test, y_pred))


acc 0.8075491847496999


In [25]:
knn_synt = KNeighborsClassifier(n_neighbors=10)
knn_synt.fit(pred_train, y_train)
y_pred = knn_synt.predict(X_test)
print("acc", metrics.roc_auc_score(y_test, y_pred))

acc 0.4446114212108201
