In [22]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib
import matplotlib.pyplot as plt
import random
import math
import tensorflow as tf
from tensorflow import keras as k
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, TensorBoard
from tensorflow.keras.layers import Conv1D, Input, Dense, Flatten, Add, Subtract, Activation, BatchNormalization, Reshape, Permute
from tensorflow.keras.layers import LayerNormalization

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

%matplotlib inline

tf.__version__

'1.14.0'

In [2]:
def tolist(f):
    """Load a tab-separated table and return its data columns as tuples.

    The first column of the file is used as the row index. Missing values
    are replaced by the mean of the column they occur in, the table is
    transposed, and each row of the transposed table (i.e. each data column
    of the file) is returned as a plain tuple without its label.

    Args:
        f: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        list of tuple: one tuple of values per data column of the input.
    """
    table = pd.read_csv(f, sep='\t', index_col=0)
    # Impute NaNs with the per-column mean before transposing.
    filled = table.fillna(table.mean())
    # index=False leaves out the row label; name=None yields plain tuples.
    return list(filled.T.itertuples(index=False, name=None))

In [16]:
# Read the Normal and Tumor sample files of the selected dataset.
# Dataset names noted by the author: 'BC-TCGA', 'GSE2034', 'GSE25066'
dataset = 'BC-TCGA'
path_template = '{0}/{0}-{1}.txt'
x_normal = tolist(path_template.format(dataset, 'Normal'))
x_tumor = tolist(path_template.format(dataset, 'Tumor'))

# Show the number of samples loaded for each class.
print(len(x_normal), len(x_tumor))

61 529


In [17]:
# Split each class into testing, validation and training subsets.
# Taking the same number of samples (n) from both classes keeps the testing
# and validation sets balanced between positive and negative samples.
n = 16

# This example runs without shuffling the samples; uncomment to shuffle.
# np.random.shuffle(x_tumor)
# np.random.shuffle(x_normal)

x_tumor_test = x_tumor[:n]
x_tumor_val = x_tumor[n:2 * n]
x_tumor_train = x_tumor[2 * n:]

x_normal_test = x_normal[:n]
x_normal_val = x_normal[n:2 * n]
x_normal_train = x_normal[2 * n:]

# There are far fewer Normal samples than Tumor samples, so oversample the
# Normal training samples (repeat the list, then truncate) until both classes
# contribute the same number of training samples.
n_repeats = math.ceil(len(x_tumor_train) / len(x_normal_train))
x_normal_train = (x_normal_train * n_repeats)[:len(x_tumor_train)]

In [18]:
# Assemble float32 feature matrices and 0/1 label vectors for every split.
# Normal samples come first (label 0), tumor samples follow (label 1).
x_train = np.float32(x_normal_train + x_tumor_train)
y_train = np.concatenate((np.zeros(len(x_normal_train)), np.ones(len(x_tumor_train))))

xval = np.float32(x_normal_val + x_tumor_val)
yval = np.concatenate((np.zeros(len(x_normal_val)), np.ones(len(x_tumor_val))))

xtest = np.float32(x_normal_test + x_tumor_test)
ytest = np.concatenate((np.zeros(len(x_normal_test)), np.ones(len(x_tumor_test))))

# Sample counts of the training, validation and testing sets.
len(x_train), len(xval), len(xtest)

(994, 32, 32)

In [19]:
# Number of features per sample, read from the first tumor validation sample.
dim = len(x_tumor_val[0])

def build(act='relu'):
    """Create the (uncompiled) 1-D convolutional classifier.

    The flat input of length ``dim`` (module-level) is reshaped to carry a
    single channel, passed through eight Conv1D -> BatchNormalization ->
    Activation stages (kernel size 9, stride 2, 'same' padding), then through
    a 1x1 convolution with a sigmoid, and finally averaged with
    GlobalAveragePooling1D.

    Args:
        act: Activation used after each BatchNormalization; anything
            accepted by the Keras ``Activation`` layer.

    Returns:
        A Keras ``Model`` mapping the input tensor to the pooled output.
    """
    inputs = Input(shape=(dim,))

    # Add a trailing channel axis of size 1 for the Conv1D layers.
    features = Reshape((-1, 1))(inputs)

    for filters in (8, 16, 32, 64, 128, 256, 512, 512):
        features = Conv1D(filters, kernel_size=9, padding='same', strides=2)(features)
        features = BatchNormalization()(features)
        features = Activation(act)(features)

    # Single-filter 1x1 convolution + sigmoid, then average over positions.
    scores = Conv1D(1, kernel_size=1, padding='same')(features)
    scores = Activation('sigmoid')(scores)
    outputs = k.layers.GlobalAveragePooling1D()(scores)

    return k.models.Model(inputs=inputs, outputs=outputs)

In [20]:
# warning, tensorflow_privacy works with tf 1.X only

from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy
from tensorflow_privacy.privacy.optimizers.dp_optimizer import DPGradientDescentGaussianOptimizer

In [23]:
# Set noise_multiplier to control the privacy level (it is not the privacy
# budget itself; the resulting epsilon is computed after training below).
# Refer to https://github.com/tensorflow/privacy
noise_multiplier = 0.5

# NOTE(review): with num_microbatches = 1 the optimizer presumably clips the
# gradient once per batch rather than once per example -- confirm that this
# matches the assumptions behind compute_dp_sgd_privacy.
num_microbatches = 1
l2_norm_clip = 1.5
learning_rate = 0.0005
batch_size = 32
epochs = 20

if batch_size % num_microbatches != 0:
    raise ValueError('Batch size should be an integer multiple of the number of microbatches')

# Pin model construction and training to the host CPU. TensorFlow registers
# the CPU as device index 0 by default; higher CPU indices exist only when
# additional CPU devices are configured explicitly.
with tf.device('/cpu:0'):
    opt = DPGradientDescentGaussianOptimizer(
        l2_norm_clip=l2_norm_clip,
        noise_multiplier=noise_multiplier,
        num_microbatches=num_microbatches,
        learning_rate=learning_rate)

    model = build()
    model.compile(optimizer=opt, metrics=['acc'], loss='binary_crossentropy') #binary_crossentropy

    # Keep only the weights of the epoch with the lowest validation loss.
    md_save = ModelCheckpoint('weight.hdf5', save_best_only=True, monitor='val_loss')
    # NOTE(review): rlt is constructed but not passed to fit() below, so it
    # currently has no effect on training.
    rlt = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, verbose=1)
    history = model.fit(x_train, y_train, validation_data = (xval, yval), verbose=1, epochs=epochs,
                        callbacks=[md_save], batch_size=batch_size)

    # Compute the epsilon spent by this training configuration at delta = 1e-5.
    # NOTE(review): n counts the rows of x_train, which include repeated
    # (oversampled) Normal samples -- confirm this is the intended population
    # size for the privacy accounting.
    budget, r = compute_dp_sgd_privacy.compute_dp_sgd_privacy(n=len(x_train), batch_size=batch_size,
               epochs=epochs, noise_multiplier=noise_multiplier, delta=1e-5)

    # Restore the best checkpoint before evaluating on the test set.
    model.load_weights('weight.hdf5')
    print('budget', budget, model.evaluate(xtest, ytest, verbose=0))

Train on 994 samples, validate on 32 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
DP-SGD with sampling rate = 3.22% and noise_multiplier = 0.5 iterated over 622 steps satisfies differential privacy with eps = 35.7 and delta = 1e-05.
The optimal RDP order is 1.75.
budget 35.6824198640258 [0.4644351601600647, 0.875]


In [10]:
# In this sample run, we get 0.875 accuracy (87.5%) on the test set with noise_multiplier = 0.5