# Privacy


## Implementation

In [1]:
import os.path
import pandas as pd
import numpy as np
import faiss
from os import listdir
from os.path import isfile, join

def priv(trn, val, syn, is_val, c = 100):
    """Compare the distance-to-closest-record (DCR) of `syn` w.r.t. `trn` and `val`.

    All three frames are stacked, every non-text column (as typed in `trn`) is
    quantile-binned into at most `c` buckets, and the result is one-hot encoded
    (missing values get their own indicator). For every row of `syn` the nearest
    row of `trn` and the nearest row of `val` are then looked up with an exact
    faiss L2 index. The reported distance is the squared L2 distance halved,
    i.e. the number of columns in which the two one-hot encoded rows differ.

    Parameters
    ----------
    trn, val, syn : pandas.DataFrame
        Training data, holdout data and the data under test.
    is_val : bool
        Pass True when `syn` is the holdout set itself. The closest holdout row
        of each query would then be the query itself, so the second-closest
        holdout row is used instead.
    c : int, default 100
        Number of quantiles requested from `pd.qcut` for non-text columns.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns `n_syn`, `n_closer`, `n_further`,
        `n_equal`, `share`, `dists_trn_mean` and `dists_val_mean`. `share` is
        the fraction of `syn` rows that are closer to `trn` than to `val`,
        with ties credited in proportion to the size of `trn`.

    Raises
    ------
    ValueError
        If `trn` is empty, or `val` has fewer rows than the neighbour rank
        that has to be looked up (1, or 2 when `is_val` is true).
    """
    n_trn = trn.shape[0]
    n_val = val.shape[0]
    n_syn = syn.shape[0]
    # rank of the holdout neighbour to use: skip the self-match when syn is val
    k = 2 if is_val else 1
    # a k-NN lookup needs at least k reference rows; otherwise the distances
    # coming back from the index are placeholders, not real neighbours
    if n_trn < 1:
        raise ValueError('`trn` must contain at least one row')
    if n_val < k:
        raise ValueError('`val` must contain at least %d row(s)' % k)

    combined = pd.concat([trn, val, syn], ignore_index=True)

    def _is_text(dtype):
        # object columns and pandas string-dtype columns are both categorical text
        return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)

    # quantile binning of numerics
    num_cols = [col for col, dtype in trn.dtypes.items() if not _is_text(dtype)]
    for col in num_cols:
        numeric = pd.to_numeric(combined[col], errors='coerce')
        combined[col] = pd.qcut(numeric, q=c, duplicates='drop')

    # one hot encode
    hot = np.ascontiguousarray(
        pd.get_dummies(combined, dtype='float32', dummy_na=True).to_numpy()
    )
    trn_hot = hot[:n_trn, :]
    val_hot = hot[n_trn:(n_trn + n_val), :]
    syn_hot = hot[(n_trn + n_val):, :]

    def _dcr(reference, queries, rank):
        # distance of every query row to its `rank`-th closest reference row;
        # halving the squared L2 distance yields the count of differing columns
        index = faiss.IndexFlatL2(queries.shape[1])
        index.add(reference)
        dists, _ = index.search(queries, rank)
        return dists[:, rank - 1] / 2

    # calculate DCR of `syn` with respect to `trn`
    dists_trn = _dcr(trn_hot, syn_hot, 1)
    # calculate DCR of `syn` with respect to `val`
    dists_val = _dcr(val_hot, syn_hot, k)

    # results
    closer = dists_trn < dists_val
    further = dists_trn > dists_val
    equal = dists_trn == dists_val
    share = np.mean(closer) + (n_trn / (n_trn + n_val)) * np.mean(equal)
    return pd.DataFrame({
        'n_syn': [n_syn],
        'n_closer': [int(np.sum(closer))],
        'n_further': [int(np.sum(further))],
        'n_equal': [int(np.sum(equal))],
        'share': [share],
        'dists_trn_mean': [np.mean(dists_trn)],
        'dists_val_mean': [np.mean(dists_val)]
    })

## Test-Drive

In [2]:
# Smoke test on the `adult` dataset: load the training split, the holdout
# split and one synthetic sample, then score the synthetic sample.
trn, val, syn = (
    pd.read_csv(path)
    for path in (
        'data/adult_trn.csv.gz',
        'data/adult_val.csv.gz',
        'data/adult_mostly.csv.gz',
    )
)

# `syn` is not the holdout set here, so the plain nearest holdout row is used
priv(trn, val, syn, is_val=False, c=100)

Unnamed: 0,n_syn,n_closer,n_further,n_equal,share,dists_trn_mean,dists_val_mean
0,50000,9239,8608,32153,0.50631,2.3384,2.35196


## Benchmark

In [4]:
%%time

# Datasets to benchmark and, per dataset, the file-name suffixes of the
# candidate samples to score. 'val' is the holdout set itself.
datasets = ['adult', 'credit-default', 'bank-marketing', 'online-shoppers']
fns = ['mostly', 'copulagan', 'ctgan', 'tvae', 'gaussian_copula', 'gretel', 'synthpop',
       'mostly_e1', 'mostly_e2', 'mostly_e4', 'mostly_e8', 'mostly_e16',
       'flip10', 'flip20', 'flip30', 'flip40', 'flip50',
       'flip60', 'flip70', 'flip80', 'flip90',
       'val']

results = []
for dataset in datasets:
    trn = pd.read_csv('data/' + dataset + '_trn.csv.gz')
    val = pd.read_csv('data/' + dataset + '_val.csv.gz')
    for fn in fns:
        syn_fn = 'data/' + dataset + '_' + fn + '.csv.gz'
        print(syn_fn)
        # the holdout file is scored against itself, so priv() must skip self-matches
        is_val = fn == 'val'
        if not os.path.exists(syn_fn):
            # not every sample exists for every dataset; skip silently
            continue
        syn = pd.read_csv(syn_fn)
        pri = priv(trn, val, syn, is_val, c=100)
        pri['dataset'] = dataset
        pri['synthesizer'] = fn
        results.append(pri)

# each priv() result is a one-row frame indexed 0; renumber so that rows of the
# combined table carry distinct labels
x = pd.concat(results, ignore_index=True)
x.to_csv('privacy_c100.csv', index=False)
x

data/adult_mostly.csv.gz
data/adult_copulagan.csv.gz
data/adult_ctgan.csv.gz
data/adult_tvae.csv.gz
data/adult_gaussian_copula.csv.gz
data/adult_gretel.csv.gz
data/adult_synthpop.csv.gz
data/adult_mostly_e1.csv.gz
data/adult_mostly_e2.csv.gz
data/adult_mostly_e4.csv.gz
data/adult_mostly_e8.csv.gz
data/adult_mostly_e16.csv.gz
data/adult_flip10.csv.gz
data/adult_flip20.csv.gz
data/adult_flip30.csv.gz
data/adult_flip40.csv.gz
data/adult_flip50.csv.gz
data/adult_flip60.csv.gz
data/adult_flip70.csv.gz
data/adult_flip80.csv.gz
data/adult_flip90.csv.gz
data/adult_val.csv.gz
data/credit-default_mostly.csv.gz
data/credit-default_copulagan.csv.gz
data/credit-default_ctgan.csv.gz
data/credit-default_tvae.csv.gz
data/credit-default_gaussian_copula.csv.gz
data/credit-default_gretel.csv.gz




data/credit-default_synthpop.csv.gz
data/credit-default_mostly_e1.csv.gz
data/credit-default_mostly_e2.csv.gz
data/credit-default_mostly_e4.csv.gz
data/credit-default_mostly_e8.csv.gz
data/credit-default_mostly_e16.csv.gz
data/credit-default_flip10.csv.gz
data/credit-default_flip20.csv.gz
data/credit-default_flip30.csv.gz
data/credit-default_flip40.csv.gz
data/credit-default_flip50.csv.gz
data/credit-default_flip60.csv.gz
data/credit-default_flip70.csv.gz
data/credit-default_flip80.csv.gz
data/credit-default_flip90.csv.gz
data/credit-default_val.csv.gz
data/bank-marketing_mostly.csv.gz
data/bank-marketing_copulagan.csv.gz
data/bank-marketing_ctgan.csv.gz
data/bank-marketing_tvae.csv.gz
data/bank-marketing_gaussian_copula.csv.gz
data/bank-marketing_gretel.csv.gz
data/bank-marketing_synthpop.csv.gz
data/bank-marketing_mostly_e1.csv.gz
data/bank-marketing_mostly_e2.csv.gz
data/bank-marketing_mostly_e4.csv.gz
data/bank-marketing_mostly_e8.csv.gz
data/bank-marketing_mostly_e16.csv.gz
data/b



data/online-shoppers_synthpop.csv.gz
data/online-shoppers_mostly_e1.csv.gz
data/online-shoppers_mostly_e2.csv.gz
data/online-shoppers_mostly_e4.csv.gz
data/online-shoppers_mostly_e8.csv.gz
data/online-shoppers_mostly_e16.csv.gz
data/online-shoppers_flip10.csv.gz
data/online-shoppers_flip20.csv.gz
data/online-shoppers_flip30.csv.gz
data/online-shoppers_flip40.csv.gz
data/online-shoppers_flip50.csv.gz
data/online-shoppers_flip60.csv.gz
data/online-shoppers_flip70.csv.gz
data/online-shoppers_flip80.csv.gz
data/online-shoppers_flip90.csv.gz
data/online-shoppers_val.csv.gz
CPU times: user 1h 12min 51s, sys: 36.4 s, total: 1h 13min 27s
Wall time: 11min 39s


Unnamed: 0,n_syn,n_closer,n_further,n_equal,share,dists_trn_mean,dists_val_mean,dataset,synthesizer
0,50000,9239,8608,32153,0.506310,2.338400,2.351960,adult,mostly
0,50000,8343,8357,33300,0.499860,4.187960,4.187860,adult,copulagan
0,50000,8594,8180,33226,0.504140,4.492840,4.501100,adult,ctgan
0,50000,6305,6372,37323,0.499330,3.893400,3.892200,adult,tvae
0,50000,7356,7390,35254,0.499660,5.537340,5.536700,adult,gaussian_copula
...,...,...,...,...,...,...,...,...,...
0,50000,17168,5958,26874,0.612100,4.389520,4.736060,online-shoppers,flip60
0,50000,12370,7463,30167,0.549070,4.626080,4.761320,online-shoppers,flip70
0,50000,9985,8098,31917,0.518870,4.762340,4.807000,online-shoppers,flip80
0,50000,9071,8439,32490,0.506320,4.827220,4.840060,online-shoppers,flip90
