# Tutorial PATE-GAN

## Generating Synthetic Data with Differential Privacy Guarantees 

This tutorial shows how to use [PATE-GAN](https://openreview.net/forum?id=S1zk9iRqF7) to generate synthetic data, using the UCI spam dataset as an example.

See [installation instructions](../../doc/install.md) to install the dependencies.

Load dataset and show the first five samples:

In [1]:
import pandas as pd
from collections import Counter
import initpath_alg
initpath_alg.init_sys_path()
import utilmlab

# Location of the spam CSV inside the directory returned by
# utilmlab.get_data_dir().
fn_csv = f'{utilmlab.get_data_dir()}/spambase.csv.gz'
target = 'label'  # name of the label column; later cells read it as a global
df = pd.read_csv(fn_csv)  # get UCI spam dataset

# Last expression of the cell: the notebook displays the first five rows.
df.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x48,x49,x50,x51,x52,x53,x54,x55,x56,label
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


Split the dataset and save the training and test data in separate csv files:

In [2]:
import numpy as np

train_ratio = 0.8
fn_train = 'train.csv'
fn_test = 'test.csv'

# Shuffle the row positions once, then cut the shuffled order at the
# train/test boundary. The split is random and unseeded, so it differs
# from run to run.
n_train = int(train_ratio * len(df))
idx = np.random.permutation(len(df))
train_idx, test_idx = idx[:n_train], idx[n_train:]

df_train, df_test = df.iloc[train_idx], df.iloc[test_idx]

# Persist both splits without the pandas index column.
df_train.to_csv(fn_train, index=False)
df_test.to_csv(fn_test, index=False)

Run PATE-GAN:

In [3]:
python_exe='python3'
niter=10000
fn_o_train = 'otrain.csv'
fn_o_test =  'otest.csv'
teachers = 50  # use a reduced number of teachers to limit the execution time
epsilon = 1
delta = 5

cmd_arg = '--iter {} --target {} --itrain {} --itest {} --otrain {} --otest {} --teachers {} --epsilon {} --delta {}'.format(niter, target, fn_train, fn_test, fn_o_train, fn_o_test, teachers, epsilon, delta)
!{python_exe} pategan.py {cmd_arg}

Traceback (most recent call last):
  File "pategan.py", line 6, in <module>
    from PATE_GAN import PATE_GAN
  File "/home/kamil/Dropbox/Current_research/privacy/DPDR/mvdschaar-mlforhealthlabpub-4fb84b06c83b/alg/pategan/PATE_GAN.py", line 7, in <module>
    import tensorflow as tf
ModuleNotFoundError: No module named 'tensorflow'


Train a classifier on the synthetic data, evaluate it on the original test data, and compare its AUC-ROC score with that of the same classifier trained on the original data.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from collections import Counter


def train_evaluate(df_trn, df_tst):
    """Fit a logistic regression on ``df_trn`` and score it on ``df_tst``.

    Every column of ``df_trn`` except the global ``target`` column is used
    as a feature; the same columns are then selected from ``df_tst``.

    Args:
        df_trn: DataFrame used for fitting; must contain the ``target`` column.
        df_tst: DataFrame used for scoring; must contain the same feature
            columns and the ``target`` column.

    Returns:
        The ROC AUC computed from the second column of ``predict_proba``
        against ``df_tst[target]``.
    """
    feature_cols = list(df_trn.columns)
    feature_cols.remove(target)

    clf = LogisticRegression(solver='lbfgs', max_iter=4000)
    clf.fit(df_trn[feature_cols], df_trn[target])

    # Column 1 of predict_proba is the score for the second class.
    scores = clf.predict_proba(df_tst[feature_cols])[:, 1]
    return metrics.roc_auc_score(df_tst[target], scores)

# Baseline: classifier fitted on the original training split.
auc = {'org': train_evaluate(df_train, df_test)}

# Same classifier fitted on the training file written by the PATE-GAN run
# (its --otrain output), scored on the same original test split.
df_pategan_train = pd.read_csv(fn_o_train)
auc['pategan'] = train_evaluate(df_pategan_train, df_test)

print('aucroc orignal data {:0.4f} auc synthetic data {:0.4f}'.format(auc['org'], auc['pategan']))