# Fidelity

## Implementation

In [1]:
import os.path
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join

In [2]:
# original implementations

def bin_data(dt1, dt2, c = 10):
    """Discretise two DataFrames using bins derived from `dt1` only.

    Original implementation, kept for reference. The notebook's `In [4]` cell
    shows this version raising ``TypeError: numpy boolean subtract ...`` on the
    online-shoppers data; a revised `bin_data` is defined further down.

    Parameters
    ----------
    dt1 : DataFrame that defines the bins (quantile breaks / top values).
    dt2 : DataFrame that is binned with the breaks / top values of `dt1`.
    c   : number of quantile bins for numeric columns and number of top
          values kept for categorical columns.

    Returns
    -------
    list
        ``[dt1_binned, dt2_binned]``; the inputs are copied first and the
        copies are what gets modified.
    """
    dt1 = dt1.copy()
    dt2 = dt2.copy()
    # quantile binning of numerics
    # Every column whose dtype is not 'object' is treated as numeric here.
    num_cols = dt1.dtypes[dt1.dtypes!='object'].index
    for col in num_cols:
        # determine breaks based on `dt1`
        # `.unique()` removes repeated quantile values so the bin edges are distinct.
        breaks = dt1[col].quantile(np.linspace(0, 1, c+1)).unique()
        dt1[col] = pd.cut(dt1[col], bins=breaks, include_lowest=True).astype(str)
        # Non-numeric entries in `dt2` are coerced to NaN before binning.
        dt2_vals = pd.to_numeric(dt2[col], 'coerce')
        dt2_bins = pd.cut(dt2_vals, bins=breaks, include_lowest=True).astype(str)
        # Values of `dt2` outside the range seen in `dt1` get a dedicated label.
        dt2_bins[dt2_vals < min(breaks)] = '_other_'
        dt2_bins[dt2_vals > max(breaks)] = '_other_'
        dt2[col] = dt2_bins
    # top-C binning of categoricals
    # NOTE(review): this selection runs after the numeric loop, at which point
    # the binned columns hold strings -- presumably they are 'object' dtype as
    # well and therefore also pass through this loop; confirm.
    cat_cols = dt1.dtypes[dt1.dtypes=='object'].index
    for col in cat_cols:
        # determine top values based on `dt1`
        top_vals = dt1[col].value_counts().head(c).index.tolist()
        # Every value that is not among the top `c` values of `dt1` is replaced by '_other_'.
        # NOTE(review): in-place replace on a column selection -- verify that it
        # still updates the frame under pandas copy-on-write.
        dt1[col].replace(np.setdiff1d(dt1[col].unique().tolist(), top_vals), '_other_', inplace=True)
        dt2[col].replace(np.setdiff1d(dt2[col].unique().tolist(), top_vals), '_other_', inplace=True)
    return [dt1, dt2]

def hellinger(p1, p2):
    """Hellinger distance between two discrete probability vectors.

    Computed as ``sqrt(1 - sum(sqrt(p1 * p2)))``.

    Parameters
    ----------
    p1, p2 : array-like (numpy array or pandas Series) of probabilities that
        are aligned element by element.

    Returns
    -------
    float
        0.0 for identical distributions, 1.0 for distributions with disjoint
        support.
    """
    overlap = np.sum(np.sqrt(p1 * p2))
    # Floating-point rounding can push `overlap` marginally above 1 for
    # (near-)identical inputs; without the clamp the radicand turns negative
    # and the result is NaN. `np.maximum` still propagates a genuine NaN input.
    return np.sqrt(np.maximum(0.0, 1 - overlap))

def kullback_leibler(p1, p2):
    """Kullback-Leibler divergence of `p1` from `p2` (natural logarithm).

    Only positions where `p1` is strictly positive contribute to the sum.
    """
    support = p1 > 0
    p_on_support = p1[support]
    q_on_support = p2[support]
    return np.sum(p_on_support * np.log(p_on_support / q_on_support))

def jensen_shannon(p1, p2):
    """Jensen-Shannon divergence between `p1` and `p2`.

    Average of the Kullback-Leibler divergences of each input from their
    equal-weight mixture.
    """
    mixture = 0.5 * (p1 + p2)
    divergence_1 = kullback_leibler(p1, mixture)
    divergence_2 = kullback_leibler(p2, mixture)
    return 0.5 * divergence_1 + 0.5 * divergence_2

def fidelity(dt1, dt2, c = 100, k = 1):
    """Distance metrics between the binned k-way marginals of two datasets.

    Both frames are discretised with `bin_data` (bins derived from `dt1`).
    For every combination of `k` columns the relative frequencies of the
    combined bin labels are computed in each frame and compared.

    Parameters
    ----------
    dt1 : reference DataFrame; defines the bins and the set of columns.
    dt2 : DataFrame compared against `dt1`.
    c   : bin count / top-value count passed on to `bin_data`.
    k   : order of the marginals to compare; 1, 2 or 3.

    Returns
    -------
    DataFrame
        One row per column combination with the columns ``k``, ``dim1``,
        ``dim2``, ``dim3``, ``tvd``, ``mae``, ``max``, ``l1d``, ``l2d``,
        ``hellinger`` and ``jensen_shannon``. For ``k < 3`` the unused
        dimensions repeat an earlier column name. Every row carries index 0.

    Raises
    ------
    ValueError
        If `k` is not 1, 2 or 3.
    """
    # Validate before any expensive work. The previous `raise('...')` tried to
    # raise a plain string, which Python 3 rejects with an unrelated TypeError.
    if k not in (1, 2, 3):
        raise ValueError('k must be 1, 2 or 3, got {!r} (k>3 not supported)'.format(k))

    [dt1_bin, dt2_bin] = bin_data(dt1, dt2, c = c)

    # build grid of all cross-combinations
    # The columns come from the `dt1` argument (an earlier revision referenced
    # the notebook-level `trn` here).
    cols = dt1.columns
    interactions = pd.DataFrame(np.array(np.meshgrid(cols, cols, cols)).reshape(3, len(cols)**3).T)
    interactions.columns = ['dim1', 'dim2', 'dim3']
    dim1, dim2, dim3 = interactions['dim1'], interactions['dim2'], interactions['dim3']
    if k == 1:
        keep = (dim1 == dim2) & (dim2 == dim3)
    elif k == 2:
        keep = (dim1 < dim2) & (dim2 == dim3)
    else:
        keep = (dim1 < dim2) & (dim2 < dim3)
    interactions = interactions.loc[keep]

    # Labels are joined with a separator so that different label combinations
    # cannot collapse into the same key (e.g. 'a' + 'bc' vs 'ab' + 'c').
    sep = '\x1f'
    results = []
    for row in interactions.itertuples(index=False):
        val1 = dt1_bin[row.dim1] + sep + dt1_bin[row.dim2] + sep + dt1_bin[row.dim3]
        val2 = dt2_bin[row.dim1] + sep + dt2_bin[row.dim2] + sep + dt2_bin[row.dim3]
        # NOTE(review): `value_counts` drops missing keys by default, so rows
        # whose combined label is NaN are excluded from the frequencies --
        # confirm this is intended.
        freq1 = val1.value_counts(normalize=True).to_frame(name='p1')
        freq2 = val2.value_counts(normalize=True).to_frame(name='p2')
        # Outer join: a key present in only one frame gets probability 0 in the other.
        freq = freq1.join(freq2, how='outer').fillna(0.0)
        p1 = freq['p1']
        p2 = freq['p2']
        abs_diff = np.abs(p1 - p2)
        l1 = np.sum(abs_diff)
        out = pd.DataFrame({
          'k': k,
          'dim1': [row.dim1], 'dim2': [row.dim2], 'dim3': [row.dim3],
          'tvd': [l1 / 2],
          'mae': [np.mean(abs_diff)],
          'max': [np.max(abs_diff)],
          'l1d': [l1],
          'l2d': [np.sqrt(np.sum((p1 - p2)**2))],
          'hellinger': [hellinger(p1, p2)],
          'jensen_shannon': [jensen_shannon(p1, p2)]})
        results.append(out)

    return pd.concat(results)
    

In [3]:
# Load the online-shoppers training data (`trn`) and the 'mostly' data set
# (`syn`) that the following cells compare against it.
trn = pd.read_csv('data/online-shoppers_trn.csv.gz')
syn = pd.read_csv('data/online-shoppers_mostly.csv.gz')

In [4]:
# bin_data fails
# Demonstration: with the `bin_data` defined above, this call raises the
# TypeError recorded in this cell's output.
bin_data(trn, syn, c=10)

TypeError: numpy boolean subtract, the `-` operator, is not supported, use the bitwise_xor, the `^` operator, or the logical_xor function instead.

In [5]:
# updated fidelity function (minor change)

def fidelity(dt1, dt2, c = 100, k = 1):
    """Distance metrics between the binned k-way marginals of two datasets.

    Both frames are discretised with `bin_data` (bins derived from `dt1`).
    For every combination of `k` columns the relative frequencies of the
    combined bin labels are computed in each frame and compared.

    Parameters
    ----------
    dt1 : reference DataFrame; defines the bins and the set of columns.
    dt2 : DataFrame compared against `dt1`.
    c   : bin count / top-value count passed on to `bin_data`.
    k   : order of the marginals to compare; 1, 2 or 3.

    Returns
    -------
    DataFrame
        One row per column combination with the columns ``k``, ``dim1``,
        ``dim2``, ``dim3``, ``tvd``, ``mae``, ``max``, ``l1d``, ``l2d``,
        ``hellinger`` and ``jensen_shannon``. For ``k < 3`` the unused
        dimensions repeat an earlier column name. Every row carries index 0.

    Raises
    ------
    ValueError
        If `k` is not 1, 2 or 3.
    """
    # Validate before any expensive work. The previous `raise('...')` tried to
    # raise a plain string, which Python 3 rejects with an unrelated TypeError.
    if k not in (1, 2, 3):
        raise ValueError('k must be 1, 2 or 3, got {!r} (k>3 not supported)'.format(k))

    [dt1_bin, dt2_bin] = bin_data(dt1, dt2, c = c)

    # build grid of all cross-combinations
    # The columns come from the `dt1` argument (an earlier revision referenced
    # the notebook-level `trn` here).
    cols = dt1.columns

    interactions = pd.DataFrame(np.array(np.meshgrid(cols, cols, cols)).reshape(3, len(cols)**3).T)
    interactions.columns = ['dim1', 'dim2', 'dim3']
    dim1, dim2, dim3 = interactions['dim1'], interactions['dim2'], interactions['dim3']
    if k == 1:
        keep = (dim1 == dim2) & (dim2 == dim3)
    elif k == 2:
        keep = (dim1 < dim2) & (dim2 == dim3)
    else:
        keep = (dim1 < dim2) & (dim2 < dim3)
    interactions = interactions.loc[keep]

    # Labels are joined with a separator so that different label combinations
    # cannot collapse into the same key (e.g. 'a' + 'bc' vs 'ab' + 'c').
    sep = '\x1f'
    results = []
    for row in interactions.itertuples(index=False):
        val1 = dt1_bin[row.dim1] + sep + dt1_bin[row.dim2] + sep + dt1_bin[row.dim3]
        val2 = dt2_bin[row.dim1] + sep + dt2_bin[row.dim2] + sep + dt2_bin[row.dim3]
        # NOTE(review): `value_counts` drops missing keys by default, so rows
        # whose combined label is NaN are excluded from the frequencies --
        # confirm this is intended.
        freq1 = val1.value_counts(normalize=True).to_frame(name='p1')
        freq2 = val2.value_counts(normalize=True).to_frame(name='p2')
        # Outer join: a key present in only one frame gets probability 0 in the other.
        freq = freq1.join(freq2, how='outer').fillna(0.0)
        p1 = freq['p1']
        p2 = freq['p2']
        abs_diff = np.abs(p1 - p2)
        l1 = np.sum(abs_diff)
        out = pd.DataFrame({
          'k': k,
          'dim1': [row.dim1], 'dim2': [row.dim2], 'dim3': [row.dim3],
          'tvd': [l1 / 2],
          'mae': [np.mean(abs_diff)],
          'max': [np.max(abs_diff)],
          'l1d': [l1],
          'l2d': [np.sqrt(np.sum((p1 - p2)**2))],
          'hellinger': [hellinger(p1, p2)],
          'jensen_shannon': [jensen_shannon(p1, p2)]})
        results.append(out)

    return pd.concat(results)

In [6]:
# updated implementation of bin_data
# changes:
# 1. use .select_dtypes method to more accurately subset cols
# 2. handle boolean columns separately
# 3. cast boolean cols to object (not int) so they can be processed correctly in the fidelity() function
# ---> specifically: we need to be able to sum values together in `val1 = dt1_bin[row.dim1] + dt1_bin[row.dim2] + dt1_bin[row.dim3]`

def _lump_rare_levels(values, keep):
    """Return `values` with every non-missing entry outside `keep` set to '_other_'."""
    # Work on plain objects so that a categorical column accepts the new label.
    values = values.astype(object)
    rare = values.notna() & ~values.isin(keep)
    return values.mask(rare, '_other_')


def bin_data(dt1, dt2, c = 10):
    """Discretise two DataFrames using bins derived from `dt1` only.

    * numeric columns  -> quantile bins (at most `c`), labels cast to str;
      values of `dt2` outside the range of `dt1` become '_other_'
    * object / category columns -> the `c` most frequent values of `dt1` are
      kept, every other non-missing value becomes '_other_'
    * boolean columns -> '1' / '0' strings, so that the labels of several
      columns can be concatenated by `fidelity`

    Parameters
    ----------
    dt1 : DataFrame that defines the bins (quantile breaks / top values).
    dt2 : DataFrame that is binned with the breaks / top values of `dt1`.
    c   : number of quantile bins and number of top values to keep.

    Returns
    -------
    list
        ``[dt1_binned, dt2_binned]``. The inputs are copied and left untouched.
    """
    dt1 = dt1.copy()
    dt2 = dt2.copy()

    # Resolve the column groups once, from the untouched dtypes of `dt1`.
    # Selecting the categoricals only after the numeric loop would also pick up
    # the freshly binned numeric columns, because their labels are strings.
    num_cols = dt1.select_dtypes(include='number').columns
    cat_cols = dt1.select_dtypes(include=['object', 'category']).columns
    bool_cols = dt1.select_dtypes(include=['bool']).columns

    # quantile binning of numerics
    for col in num_cols:
        # determine breaks based on `dt1`; `.unique()` drops repeated quantiles
        breaks = dt1[col].quantile(np.linspace(0, 1, c+1)).unique()
        dt1[col] = pd.cut(dt1[col], bins=breaks, include_lowest=True).astype(str)
        # Non-numeric entries in `dt2` are coerced to NaN before binning.
        dt2_vals = pd.to_numeric(dt2[col], errors='coerce')
        dt2_bins = pd.cut(dt2_vals, bins=breaks, include_lowest=True).astype(str)
        dt2_bins[dt2_vals < min(breaks)] = '_other_'
        dt2_bins[dt2_vals > max(breaks)] = '_other_'
        dt2[col] = dt2_bins

    # top-C binning of categoricals
    for col in cat_cols:
        # determine top values based on `dt1`
        top_vals = dt1[col].value_counts().head(c).index.tolist()
        # Assign the result back instead of calling `replace(..., inplace=True)`
        # on a column selection: that chained in-place form is deprecated and
        # does not modify the frame under pandas copy-on-write.
        dt1[col] = _lump_rare_levels(dt1[col], top_vals)
        dt2[col] = _lump_rare_levels(dt2[col], top_vals)

    # separate handling of booleans
    for col in bool_cols:
        dt1[col] = dt1[col].replace({True: '1', False: '0'})
        dt2[col] = dt2[col].replace({True: '1', False: '0'})
    return [dt1, dt2]

## Test Drive

In [7]:
# Same call as the failing cell above, now using the redefined `bin_data`;
# the output is the list of the two binned frames.
bin_data(trn, syn, c=10)

[     Administrative Administrative_Duration  Informational   
 0     (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]  \
 1     (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 2        (1.0, 2.0]             (5.5, 35.0]  (-0.001, 1.0]   
 3     (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 4     (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 ...             ...                     ...            ...   
 6160  (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 6161  (-0.001, 1.0]           (-0.001, 5.5]    (2.0, 24.0]   
 6162  (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 6163  (-0.001, 1.0]          (35.0, 68.667]  (-0.001, 1.0]   
 6164  (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 
      Informational_Duration ProductRelated ProductRelated_Duration   
 0            (-0.001, 65.5]    (9.0, 13.0]          (129.2, 242.8]  \
 1            (-0.001, 65.5]     (6.0, 9.0]          (129.2, 242.8]   
 2            (-0.001, 65.5] 

In [8]:
# fidelity with k=1
# One result row per column (dim1 == dim2 == dim3 for k=1).
fidelity(trn, syn, k=1, c=100)

Unnamed: 0,k,dim1,dim2,dim3,tvd,mae,max,l1d,l2d,hellinger,jensen_shannon
0,1,Administrative,Administrative,Administrative,0.023005,0.003286,0.011364,0.04601,0.016016,0.035256,0.001239
0,1,Administrative_Duration,Administrative_Duration,Administrative_Duration,0.027675,0.001064,0.005043,0.05535,0.010063,0.034553,0.001192
0,1,Informational,Informational,Informational,0.02249,0.006426,0.02249,0.04498,0.024972,0.02564,0.000656
0,1,Informational_Duration,Informational_Duration,Informational_Duration,0.026242,0.002762,0.024405,0.052483,0.025931,0.030479,0.000927
0,1,ProductRelated,ProductRelated,ProductRelated,0.044678,0.001441,0.004942,0.089357,0.014215,0.039778,0.001581
0,1,ProductRelated_Duration,ProductRelated_Duration,ProductRelated_Duration,0.064227,0.001352,0.006383,0.128455,0.017369,0.059901,0.003581
0,1,BounceRates,BounceRates,BounceRates,0.073715,0.002891,0.035519,0.14743,0.04054,0.073615,0.005383
0,1,ExitRates,ExitRates,ExitRates,0.103271,0.002347,0.007585,0.206541,0.027157,0.098609,0.009438
0,1,PageValues,PageValues,PageValues,0.015698,0.001427,0.007443,0.031397,0.010087,0.023769,0.000564
0,1,SpecialDay,SpecialDay,SpecialDay,0.01742,0.006968,0.017404,0.034839,0.020145,0.024891,0.000618


In [19]:
# fidelity with k=2
# One result row per column pair (dim1 < dim2, dim3 repeats dim2).
k2 = fidelity(trn, syn, k=2, c=100)
k2

Unnamed: 0,k,dim1,dim2,dim3,tvd,mae,max,l1d,l2d,hellinger,jensen_shannon
0,2,Administrative,Administrative_Duration,Administrative_Duration,0.109165,0.000417,0.014436,0.218329,0.018681,0.163814,0.022991
0,2,Administrative,Informational,Informational,0.038929,0.000786,0.007299,0.077859,0.013515,0.073273,0.004936
0,2,Administrative_Duration,Informational,Informational,0.073479,0.000470,0.011517,0.146959,0.016534,0.138536,0.015564
0,2,BounceRates,Informational,Informational,0.070235,0.000489,0.008273,0.140470,0.015446,0.134054,0.014363
0,2,ExitRates,Informational,Informational,0.105272,0.000460,0.005028,0.210543,0.016454,0.174552,0.023895
...,...,...,...,...,...,...,...,...,...,...,...
0,2,PageValues,Revenue,Revenue,0.027737,0.001233,0.010381,0.055474,0.013338,0.043873,0.001893
0,2,Month,Revenue,Revenue,0.018816,0.001882,0.005515,0.037632,0.010917,0.022917,0.000525
0,2,OperatingSystems,Revenue,Revenue,0.005353,0.001338,0.002433,0.010706,0.004142,0.014003,0.000196
0,2,Browser,Revenue,Revenue,0.007948,0.000994,0.003406,0.015896,0.005306,0.020556,0.000419


In [22]:
# Recompute the k=2 table and average every metric over all column pairs.
fidelity(trn, syn, k=2, c=100)[['k', 'tvd', 'mae', 'max', 'l1d', 'l2d','hellinger', 'jensen_shannon']].agg('mean')

k                 2.000000
tvd               0.083080
mae               0.000931
max               0.007765
l1d               0.166160
l2d               0.015566
hellinger         0.126455
jensen_shannon    0.022595
dtype: float64

## Benchmark

In [10]:
%%time

# benchmark all
# For every dataset and every synthesizer file that exists under data/,
# compute the fidelity tables for k=1, 2 and 3 and collect them in one frame.
# Note: this loop rebinds the notebook-level `trn` and `syn` variables.
datasets = ['adult', 'bank-marketing', 'credit-default', 'online-shoppers']
fns = ['mostly', 'copulagan', 'ctgan', 'tvae', 'gaussian_copula', 'gretel', 'synthpop',
       'mostly_e1', 'mostly_e2', 'mostly_e4', 'mostly_e8', 'mostly_e16',
       'flip10', 'flip20', 'flip30', 'flip40', 'flip50', 
       'flip60', 'flip70', 'flip80', 'flip90',
       'val']

results = []
for dataset in datasets:
    trn = pd.read_csv('data/' + dataset + '_trn.csv.gz')
    for fn in fns:
        syn_fn = 'data/' + dataset  + '_' + fn + '.csv.gz'
        # The path is printed before the existence check, so a printed path
        # does not imply that the file was found and processed.
        print(syn_fn)
        if (os.path.exists(syn_fn)):
            syn = pd.read_csv(syn_fn)
            # The bin count `c` is reduced as the order `k` grows (100 / 10 / 5).
            fid1 = fidelity(trn, syn, k=1, c=100)
            fid2 = fidelity(trn, syn, k=2, c=10)
            fid3 = fidelity(trn, syn, k=3, c=5)
            out = pd.concat([fid1, fid2, fid3])
            # Tag every row with its origin so the results can be grouped later.
            out['dataset'] = dataset
            out['synthesizer'] = fn
            results.append(out)

# Combine all per-file tables, persist them without the index, and display them.
x = pd.concat(results)
x.to_csv('fidelity.csv', index=False)
x

data/adult_mostly.csv.gz
data/adult_copulagan.csv.gz
data/adult_ctgan.csv.gz
data/adult_tvae.csv.gz
data/adult_gaussian_copula.csv.gz
data/adult_gretel.csv.gz
data/adult_synthpop.csv.gz
data/adult_mostly_e1.csv.gz
data/adult_mostly_e2.csv.gz
data/adult_mostly_e4.csv.gz
data/adult_mostly_e8.csv.gz
data/adult_mostly_e16.csv.gz
data/adult_flip10.csv.gz
data/adult_flip20.csv.gz
data/adult_flip30.csv.gz
data/adult_flip40.csv.gz
data/adult_flip50.csv.gz
data/adult_flip60.csv.gz
data/adult_flip70.csv.gz
data/adult_flip80.csv.gz
data/adult_flip90.csv.gz
data/adult_val.csv.gz
data/bank-marketing_mostly.csv.gz
data/bank-marketing_copulagan.csv.gz
data/bank-marketing_ctgan.csv.gz
data/bank-marketing_tvae.csv.gz
data/bank-marketing_gaussian_copula.csv.gz
data/bank-marketing_gretel.csv.gz
data/bank-marketing_synthpop.csv.gz
data/bank-marketing_mostly_e1.csv.gz
data/bank-marketing_mostly_e2.csv.gz
data/bank-marketing_mostly_e4.csv.gz
data/bank-marketing_mostly_e8.csv.gz
data/bank-marketing_mostly_e1



data/credit-default_synthpop.csv.gz
data/credit-default_mostly_e1.csv.gz
data/credit-default_mostly_e2.csv.gz
data/credit-default_mostly_e4.csv.gz
data/credit-default_mostly_e8.csv.gz
data/credit-default_mostly_e16.csv.gz
data/credit-default_flip10.csv.gz
data/credit-default_flip20.csv.gz
data/credit-default_flip30.csv.gz
data/credit-default_flip40.csv.gz
data/credit-default_flip50.csv.gz
data/credit-default_flip60.csv.gz
data/credit-default_flip70.csv.gz
data/credit-default_flip80.csv.gz
data/credit-default_flip90.csv.gz
data/credit-default_val.csv.gz
data/online-shoppers_mostly.csv.gz
data/online-shoppers_copulagan.csv.gz
data/online-shoppers_ctgan.csv.gz
data/online-shoppers_tvae.csv.gz
data/online-shoppers_gaussian_copula.csv.gz
data/online-shoppers_gretel.csv.gz




data/online-shoppers_synthpop.csv.gz
data/online-shoppers_mostly_e1.csv.gz
data/online-shoppers_mostly_e2.csv.gz
data/online-shoppers_mostly_e4.csv.gz
data/online-shoppers_mostly_e8.csv.gz
data/online-shoppers_mostly_e16.csv.gz
data/online-shoppers_flip10.csv.gz
data/online-shoppers_flip20.csv.gz
data/online-shoppers_flip30.csv.gz
data/online-shoppers_flip40.csv.gz
data/online-shoppers_flip50.csv.gz
data/online-shoppers_flip60.csv.gz
data/online-shoppers_flip70.csv.gz
data/online-shoppers_flip80.csv.gz
data/online-shoppers_flip90.csv.gz
data/online-shoppers_val.csv.gz
CPU times: user 18min 36s, sys: 13.4 s, total: 18min 49s
Wall time: 18min 51s


Unnamed: 0,k,dim1,dim2,dim3,tvd,mae,max,l1d,l2d,hellinger,jensen_shannon,dataset,synthesizer
0,1,age,age,age,0.028415,0.001114,0.005074,0.056831,0.010763,0.029035,0.000842,adult,mostly
0,1,workclass,workclass,workclass,0.005931,0.001186,0.003501,0.011862,0.005244,0.023622,0.000406,adult,mostly
0,1,fnlwgt,fnlwgt,fnlwgt,0.034485,0.000690,0.002471,0.068970,0.008635,0.030822,0.000949,adult,mostly
0,1,education,education,education,0.012151,0.001519,0.004540,0.024302,0.008049,0.013165,0.000173,adult,mostly
0,1,education-num,education-num,education-num,0.011926,0.001704,0.004540,0.023853,0.008105,0.013329,0.000178,adult,mostly
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,3,Browser,Revenue,Weekend,0.018978,0.004745,0.010543,0.037956,0.016885,0.017973,0.000323,online-shoppers,val
0,3,Region,Revenue,SpecialDay,0.012165,0.003041,0.006975,0.024331,0.010594,0.013020,0.000170,online-shoppers,val
0,3,Region,Revenue,TrafficType,0.030819,0.001926,0.007461,0.061638,0.014124,0.033558,0.001123,online-shoppers,val
0,3,Region,Revenue,VisitorType,0.015085,0.001371,0.005028,0.030170,0.009101,0.023520,0.000526,online-shoppers,val


In [11]:
# Demonstration of a failure: `x` still contains the string columns
# dim1/dim2/dim3, so `.mean()` over all remaining columns raises the TypeError
# recorded in this cell's output.
x.groupby(['dataset', 'synthesizer', 'k']).mean().head(20)

TypeError: Could not convert ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-countryincome to numeric

In [12]:
# ^^ this groupby fails because - again - pandas (>= 2.0) no longer silently drops object columns when computing a mean
# instead, select the numeric metric columns explicitly and rewrite it as:

In [18]:
# Average only the numeric metric columns per dataset / synthesizer / k.
x.groupby(['dataset', 'synthesizer', 'k'])[['tvd', 'mae', 'max', 'l1d', 'l2d', 'hellinger', 'jensen_shannon']].mean().head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tvd,mae,max,l1d,l2d,hellinger,jensen_shannon
dataset,synthesizer,k,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
adult,copulagan,1,0.131132,0.024476,0.088083,0.262265,0.127586,0.140999,0.035665
adult,copulagan,2,0.207133,0.027109,0.093471,0.414266,0.147127,0.215095,0.063381
adult,copulagan,3,0.264201,0.012318,0.082287,0.528402,0.142305,0.275851,0.092314
adult,ctgan,1,0.158024,0.032822,0.109902,0.316048,0.150377,0.156599,0.040073
adult,ctgan,2,0.209406,0.026458,0.097115,0.418813,0.148934,0.215927,0.058027
adult,ctgan,3,0.263186,0.011949,0.081526,0.526372,0.140765,0.272194,0.083775
adult,flip10,1,0.005298,0.000953,0.002204,0.010597,0.003359,0.006028,5.1e-05
adult,flip10,2,0.016747,0.001341,0.005056,0.033495,0.008773,0.023427,0.001432
adult,flip10,3,0.029523,0.001281,0.006973,0.059046,0.013089,0.043117,0.003421
adult,flip20,1,0.005175,0.000588,0.001766,0.01035,0.002896,0.005788,5.4e-05
