# Fidelity

## Implementation

In [1]:
import os.path
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join

In [2]:
# original implementations

def bin_data(dt1, dt2, c = 10):
    dt1 = dt1.copy()
    dt2 = dt2.copy()
    # quantile binning of numerics
    num_cols = dt1.dtypes[dt1.dtypes!='object'].index
    for col in num_cols:
        # determine breaks based on `dt1`
        breaks = dt1[col].quantile(np.linspace(0, 1, c+1)).unique()
        dt1[col] = pd.cut(dt1[col], bins=breaks, include_lowest=True).astype(str)
        dt2_vals = pd.to_numeric(dt2[col], 'coerce')
        dt2_bins = pd.cut(dt2_vals, bins=breaks, include_lowest=True).astype(str)
        dt2_bins[dt2_vals < min(breaks)] = '_other_'
        dt2_bins[dt2_vals > max(breaks)] = '_other_'
        dt2[col] = dt2_bins
    # top-C binning of categoricals
    cat_cols = dt1.dtypes[dt1.dtypes=='object'].index
    for col in cat_cols:
        # determine top values based on `dt1`
        top_vals = dt1[col].value_counts().head(c).index.tolist()
        dt1[col].replace(np.setdiff1d(dt1[col].unique().tolist(), top_vals), '_other_', inplace=True)
        dt2[col].replace(np.setdiff1d(dt2[col].unique().tolist(), top_vals), '_other_', inplace=True)
    return [dt1, dt2]

def hellinger(p1, p2):
  return np.sqrt(1 - np.sum(np.sqrt(p1*p2)))

def kullback_leibler(p1, p2):
  idx = p1>0
  return np.sum(p1[idx] * np.log(p1[idx]/p2[idx]))

def jensen_shannon(p1, p2):
  m = 0.5 * (p1 + p2)
  return 0.5 * kullback_leibler(p1, m) + 0.5 * kullback_leibler(p2, m)

def fidelity(dt1, dt2, c = 100, k = 1):
    [dt1_bin, dt2_bin] = bin_data(dt1, dt2, c = c)
    # build grid of all cross-combinations
    
    # >>> use correct naming to refer to training df
    # cols = trn.columns # original naming
    cols = dt1.columns
    interactions = pd.DataFrame(np.array(np.meshgrid(cols, cols, cols)).reshape(3, len(cols)**3).T)
    interactions.columns = ['dim1', 'dim2', 'dim3']
    if k == 1:
        interactions = interactions.loc[(interactions['dim1']==interactions['dim2']) & (interactions['dim2']==interactions['dim3'])]
    elif k == 2:
        interactions = interactions.loc[(interactions['dim1']<interactions['dim2']) & (interactions['dim2']==interactions['dim3'])]
    elif k == 3:
        interactions = interactions.loc[(interactions['dim1']<interactions['dim2']) & (interactions['dim2']<interactions['dim3'])]
    else:
        raise('k>3 not supported')

    results = []
    for idx in range(interactions.shape[0]):
        row = interactions.iloc[idx]
        val1 = dt1_bin[row.dim1] + dt1_bin[row.dim2] + dt1_bin[row.dim3]
        val2 = dt2_bin[row.dim1] + dt2_bin[row.dim2] + dt2_bin[row.dim3]
        freq1 = val1.value_counts(normalize=True).to_frame(name='p1')
        freq2 = val2.value_counts(normalize=True).to_frame(name='p2')
        freq = freq1.join(freq2, how='outer').fillna(0.0)
        p1 = freq['p1']
        p2 = freq['p2']
        out = pd.DataFrame({
          'k': k,
          'dim1': [row.dim1], 'dim2': [row.dim2], 'dim3': [row.dim3],
          'tvd': [np.sum(np.abs(p1 - p2)) / 2], 
          'mae': [np.mean(np.abs(p1 - p2))], 
          'max': [np.max(np.abs(p1 - p2))],
          'l1d': [np.sum(np.abs(p1 - p2))],
          'l2d': [np.sqrt(np.sum((p1 - p2)**2))],
          'hellinger': [hellinger(p1, p2)],
          'jensen_shannon': [jensen_shannon(p1, p2)]})
        results.append(out)

    return pd.concat(results)
    

In [3]:
trn = pd.read_csv('data/online-shoppers_trn.csv.gz')
syn = pd.read_csv('data/online-shoppers_mostly.csv.gz')

In [4]:
# bin_data fails
bin_data(trn, syn, c=10)

TypeError: numpy boolean subtract, the `-` operator, is not supported, use the bitwise_xor, the `^` operator, or the logical_xor function instead.

In [5]:
# updated fidelity function (minor change)

def fidelity(dt1, dt2, c = 100, k = 1):
    [dt1_bin, dt2_bin] = bin_data(dt1, dt2, c = c)
    # build grid of all cross-combinations
    
    # >>> use correct naming to refer to training df
    # cols = trn.columns # original naming
    cols = dt1.columns # correct naming
    
    interactions = pd.DataFrame(np.array(np.meshgrid(cols, cols, cols)).reshape(3, len(cols)**3).T)
    interactions.columns = ['dim1', 'dim2', 'dim3']
    if k == 1:
        interactions = interactions.loc[(interactions['dim1']==interactions['dim2']) & (interactions['dim2']==interactions['dim3'])]
    elif k == 2:
        interactions = interactions.loc[(interactions['dim1']<interactions['dim2']) & (interactions['dim2']==interactions['dim3'])]
    elif k == 3:
        interactions = interactions.loc[(interactions['dim1']<interactions['dim2']) & (interactions['dim2']<interactions['dim3'])]
    else:
        raise('k>3 not supported')

    results = []
    for idx in range(interactions.shape[0]):
        row = interactions.iloc[idx]
        val1 = dt1_bin[row.dim1] + dt1_bin[row.dim2] + dt1_bin[row.dim3]
        val2 = dt2_bin[row.dim1] + dt2_bin[row.dim2] + dt2_bin[row.dim3]
        freq1 = val1.value_counts(normalize=True).to_frame(name='p1')
        freq2 = val2.value_counts(normalize=True).to_frame(name='p2')
        freq = freq1.join(freq2, how='outer').fillna(0.0)
        p1 = freq['p1']
        p2 = freq['p2']
        out = pd.DataFrame({
          'k': k,
          'dim1': [row.dim1], 'dim2': [row.dim2], 'dim3': [row.dim3],
          'tvd': [np.sum(np.abs(p1 - p2)) / 2], 
          'mae': [np.mean(np.abs(p1 - p2))], 
          'max': [np.max(np.abs(p1 - p2))],
          'l1d': [np.sum(np.abs(p1 - p2))],
          'l2d': [np.sqrt(np.sum((p1 - p2)**2))],
          'hellinger': [hellinger(p1, p2)],
          'jensen_shannon': [jensen_shannon(p1, p2)]})
        results.append(out)

    return pd.concat(results)

In [6]:
# updated implementation of bin_data
# changes:
# 1. use .select_dtypes method to more accurately subset cols
# 2. handle boolean columns separatelt
# 3. cast boolean cols to object (no int) so they can be processed correctly in the fidelity() function
# ---> specifically: we need to be able to sum values together in `val1 = dt1_bin[row.dim1] + dt1_bin[row.dim2] + dt1_bin[row.dim3]`

def bin_data(dt1, dt2, c = 10):
    dt1 = dt1.copy()
    dt2 = dt2.copy()
    # quantile binning of numerics
    
    # define all non-object dtypes (so incl. boolean) as numeric
    # num_cols = dt1.dtypes[dt1.dtypes!='object'].index
    num_cols = dt1.select_dtypes(include='number').columns
    
    for col in num_cols:
        # determine breaks based on `dt1`
        breaks = dt1[col].quantile(np.linspace(0, 1, c+1)).unique()
        dt1[col] = pd.cut(dt1[col], bins=breaks, include_lowest=True).astype(str)
        dt2_vals = pd.to_numeric(dt2[col], 'coerce')
        dt2_bins = pd.cut(dt2_vals, bins=breaks, include_lowest=True).astype(str)
        dt2_bins[dt2_vals < min(breaks)] = '_other_'
        dt2_bins[dt2_vals > max(breaks)] = '_other_'
        dt2[col] = dt2_bins
        # for debugging
        # print('column ' + col + ' processing has finished.')
    
    # top-C binning of categoricals
    
    #cat_cols = dt1.dtypes[dt1.dtypes=='object'].index 
    cat_cols = dt1.select_dtypes(include=['object', 'category'])
    for col in cat_cols:
        # determine top values based on `dt1`
        top_vals = dt1[col].value_counts().head(c).index.tolist()
        dt1[col].replace(np.setdiff1d(dt1[col].unique().tolist(), top_vals), '_other_', inplace=True)
        dt2[col].replace(np.setdiff1d(dt2[col].unique().tolist(), top_vals), '_other_', inplace=True)
        # for debugging
        # print('column ' + col + ' processing has finished.')

    # add separate binning of booleans
    bool_cols = dt1.select_dtypes(include=['bool'])
    for col in bool_cols:
        dt1[col] = dt1[col].replace({True: '1', False: '0'})
        dt2[col] = dt2[col].replace({True: '1', False: '0'})
        # for debugging
        # print('column ' + col + ' processing has finished.')
    return [dt1, dt2]

In [7]:
bin_data(trn, syn, c=10)

[     Administrative Administrative_Duration  Informational   
 0     (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]  \
 1     (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 2        (1.0, 2.0]             (5.5, 35.0]  (-0.001, 1.0]   
 3     (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 4     (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 ...             ...                     ...            ...   
 6160  (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 6161  (-0.001, 1.0]           (-0.001, 5.5]    (2.0, 24.0]   
 6162  (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 6163  (-0.001, 1.0]          (35.0, 68.667]  (-0.001, 1.0]   
 6164  (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 
      Informational_Duration ProductRelated ProductRelated_Duration   
 0            (-0.001, 65.5]    (9.0, 13.0]          (129.2, 242.8]  \
 1            (-0.001, 65.5]     (6.0, 9.0]          (129.2, 242.8]   
 2            (-0.001, 65.5] 

In [8]:
# fidelity with k=1
fidelity(trn, syn, k=1, c=100)

Unnamed: 0,k,dim1,dim2,dim3,tvd,mae,max,l1d,l2d,hellinger,jensen_shannon
0,1,Administrative,Administrative,Administrative,0.023005,0.003286,0.011364,0.04601,0.016016,0.035256,0.001239
0,1,Administrative_Duration,Administrative_Duration,Administrative_Duration,0.027675,0.001064,0.005043,0.05535,0.010063,0.034553,0.001192
0,1,Informational,Informational,Informational,0.02249,0.006426,0.02249,0.04498,0.024972,0.02564,0.000656
0,1,Informational_Duration,Informational_Duration,Informational_Duration,0.026242,0.002762,0.024405,0.052483,0.025931,0.030479,0.000927
0,1,ProductRelated,ProductRelated,ProductRelated,0.044678,0.001441,0.004942,0.089357,0.014215,0.039778,0.001581
0,1,ProductRelated_Duration,ProductRelated_Duration,ProductRelated_Duration,0.064227,0.001352,0.006383,0.128455,0.017369,0.059901,0.003581
0,1,BounceRates,BounceRates,BounceRates,0.073715,0.002891,0.035519,0.14743,0.04054,0.073615,0.005383
0,1,ExitRates,ExitRates,ExitRates,0.103271,0.002347,0.007585,0.206541,0.027157,0.098609,0.009438
0,1,PageValues,PageValues,PageValues,0.015698,0.001427,0.007443,0.031397,0.010087,0.023769,0.000564
0,1,SpecialDay,SpecialDay,SpecialDay,0.01742,0.006968,0.017404,0.034839,0.020145,0.024891,0.000618


In [9]:
# fidelity with k=2
fidelity(trn, syn, k=2, c=100)

Unnamed: 0,k,dim1,dim2,dim3,tvd,mae,max,l1d,l2d,hellinger,jensen_shannon
0,2,Administrative,Administrative_Duration,Administrative_Duration,0.116980,0.000344,0.011328,0.233960,0.017906,0.180625,0.026962
0,2,Administrative,Informational,Informational,0.038380,0.000783,0.017241,0.076760,0.019715,0.057321,0.003092
0,2,Administrative_Duration,Informational,Informational,0.059199,0.000325,0.006151,0.118399,0.011867,0.108637,0.009696
0,2,BounceRates,Informational,Informational,0.098630,0.000580,0.040036,0.197260,0.043749,0.133837,0.015139
0,2,ExitRates,Informational,Informational,0.136042,0.000467,0.007161,0.272083,0.025334,0.173313,0.025106
...,...,...,...,...,...,...,...,...,...,...,...
0,2,PageValues,Revenue,Revenue,0.037563,0.001707,0.020544,0.075127,0.025543,0.045158,0.002034
0,2,Month,Revenue,Revenue,0.027127,0.002713,0.010333,0.054253,0.016586,0.028866,0.000829
0,2,OperatingSystems,Revenue,Revenue,0.009755,0.002439,0.007337,0.019510,0.009730,0.012037,0.000145
0,2,Browser,Revenue,Revenue,0.017780,0.002223,0.012824,0.035560,0.015822,0.025054,0.000624


## Benchmark

In [None]:
%%time

# benchmark all
datasets = ['adult', 'bank-marketing', 'credit-default', 'online-shoppers']
fns = ['mostly', 'copulagan', 'ctgan', 'tvae', 'gaussian_copula', 'gretel', 'synthpop',
       'mostly_e1', 'mostly_e2', 'mostly_e4', 'mostly_e8', 'mostly_e16',
       'flip10', 'flip20', 'flip30', 'flip40', 'flip50', 
       'flip60', 'flip70', 'flip80', 'flip90',
       'val']

results = []
for dataset in datasets:
    trn = pd.read_csv('data/' + dataset + '_trn.csv.gz')
    for fn in fns:
        syn_fn = 'data/' + dataset  + '_' + fn + '.csv.gz'
        print(syn_fn)
        if (os.path.exists(syn_fn)):
            syn = pd.read_csv(syn_fn)
            fid1 = fidelity(trn, syn, k=1, c=100)
            fid2 = fidelity(trn, syn, k=2, c=10)
            fid3 = fidelity(trn, syn, k=3, c=5)
            out = pd.concat([fid1, fid2, fid3])
            out['dataset'] = dataset
            out['synthesizer'] = fn
            results.append(out)

x = pd.concat(results)
x.to_csv('fidelity.csv', index=False)
x

data/adult_mostly.csv.gz


In [15]:
# x.groupby(['dataset', 'synthesizer', 'k']).mean().head(20)

dataset          synthesizer
adult            copulagan      2.765217
                 ctgan          2.765217
                 mostly         2.765217
bank-marketing   copulagan      2.795918
                 ctgan          2.795918
                 mostly         2.795918
credit-default   copulagan      2.860585
                 ctgan          2.860585
                 mostly         2.860585
online-shoppers  copulagan      2.808511
                 ctgan          2.808511
                 mostly         2.808511
Name: k, dtype: float64

In [None]:
# ^^ this groupby doesn't work because - again - pandas no longer supports doing mean over object cols
# instead rewrite as:

In [16]:
x.groupby(['dataset', 'synthesizer']).k.mean().head(20)

dataset          synthesizer
adult            copulagan      2.765217
                 ctgan          2.765217
                 mostly         2.765217
bank-marketing   copulagan      2.795918
                 ctgan          2.795918
                 mostly         2.795918
credit-default   copulagan      2.860585
                 ctgan          2.860585
                 mostly         2.860585
online-shoppers  copulagan      2.808511
                 ctgan          2.808511
                 mostly         2.808511
Name: k, dtype: float64

## Code for debugging

In [None]:
# so code above isn't working
# fidelity() with k>1
# let's debug step-by-step

In [7]:
dt1 = debug_trn
dt2 = debug_syn

In [8]:
# create binned dataframes
[dt1_bin, dt2_bin] = bin_data(dt1, dt2, c = 10)

column Administrative processing has finished.
column Administrative_Duration processing has finished.
column Informational processing has finished.
column Informational_Duration processing has finished.
column ProductRelated processing has finished.
column ProductRelated_Duration processing has finished.
column BounceRates processing has finished.
column ExitRates processing has finished.
column PageValues processing has finished.
column SpecialDay processing has finished.
column OperatingSystems processing has finished.
column Browser processing has finished.
column Region processing has finished.
column TrafficType processing has finished.
column Administrative processing has finished.
column Administrative_Duration processing has finished.
column Informational processing has finished.
column Informational_Duration processing has finished.
column ProductRelated processing has finished.
column ProductRelated_Duration processing has finished.
column BounceRates processing has finished

In [9]:
dt1_bin.head(3)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,"(-0.001, 1.0]","(-0.001, 5.5]","(-0.001, 1.0]","(-0.001, 65.5]","(9.0, 13.0]","(129.2, 242.8]","(0.0129, 0.0231]","(0.06, 0.1]","(-0.001, 2.517]","(0.2, 1.0]",May,"(0.999, 2.0]","(4.0, 13.0]","(0.999, 2.0]","(3.0, 4.0]",Returning_Visitor,0,0
1,"(-0.001, 1.0]","(-0.001, 5.5]","(-0.001, 1.0]","(-0.001, 65.5]","(6.0, 9.0]","(129.2, 242.8]","(-0.001, 0.00316]","(0.0206, 0.0253]","(-0.001, 2.517]","(-0.001, 0.2]",May,"(0.999, 2.0]","(0.999, 2.0]","(0.999, 2.0]","(0.999, 2.0]",Returning_Visitor,0,0
2,"(1.0, 2.0]","(5.5, 35.0]","(-0.001, 1.0]","(-0.001, 65.5]","(23.0, 32.0]","(242.8, 388.3]","(0.0129, 0.0231]","(0.0417, 0.06]","(-0.001, 2.517]","(-0.001, 0.2]",Nov,"(0.999, 2.0]","(0.999, 2.0]","(2.0, 3.0]","(4.0, 6.0]",Returning_Visitor,0,0


In [12]:
# build grid of all cross-combinations
cols = dt1.columns
interactions = pd.DataFrame(np.array(np.meshgrid(cols, cols, cols)).reshape(3, len(cols)**3).T)
interactions.columns = ['dim1', 'dim2', 'dim3']
interactions

Unnamed: 0,dim1,dim2,dim3
0,Administrative,Administrative,Administrative
1,Administrative,Administrative,Administrative_Duration
2,Administrative,Administrative,Informational
3,Administrative,Administrative,Informational_Duration
4,Administrative,Administrative,ProductRelated
...,...,...,...
5827,Revenue,Revenue,Region
5828,Revenue,Revenue,TrafficType
5829,Revenue,Revenue,VisitorType
5830,Revenue,Revenue,Weekend


In [13]:
# if k=2
interactions = interactions.loc[(interactions['dim1']<interactions['dim2']) & (interactions['dim2']==interactions['dim3'])]
interactions

Unnamed: 0,dim1,dim2,dim3
325,Administrative,Administrative_Duration,Administrative_Duration
650,Administrative,Informational,Informational
668,Administrative_Duration,Informational,Informational
758,BounceRates,Informational,Informational
776,ExitRates,Informational,Informational
...,...,...,...
5669,PageValues,Revenue,Revenue
5705,Month,Revenue,Revenue
5723,OperatingSystems,Revenue,Revenue
5741,Browser,Revenue,Revenue


In [14]:
# create empty list to store results
results = []

In [15]:
row = interactions.iloc[61]
row

dim1       Revenue
dim2    SpecialDay
dim3    SpecialDay
Name: 3231, dtype: object

In [16]:
val1 = dt1_bin[row.dim1] + dt1_bin[row.dim2] + dt1_bin[row.dim3]

In [17]:
val1

0             0(0.2, 1.0](0.2, 1.0]
1       0(-0.001, 0.2](-0.001, 0.2]
2       0(-0.001, 0.2](-0.001, 0.2]
3       0(-0.001, 0.2](-0.001, 0.2]
4       0(-0.001, 0.2](-0.001, 0.2]
                   ...             
6160          0(0.2, 1.0](0.2, 1.0]
6161    1(-0.001, 0.2](-0.001, 0.2]
6162    0(-0.001, 0.2](-0.001, 0.2]
6163    0(-0.001, 0.2](-0.001, 0.2]
6164    1(-0.001, 0.2](-0.001, 0.2]
Length: 6165, dtype: object

In [20]:
dt2_bin.dtypes

Administrative             object
Administrative_Duration    object
Informational              object
Informational_Duration     object
ProductRelated             object
ProductRelated_Duration    object
BounceRates                object
ExitRates                  object
PageValues                 object
SpecialDay                 object
Month                      object
OperatingSystems           object
Browser                    object
Region                     object
TrafficType                object
VisitorType                object
Weekend                      bool
Revenue                      bool
dtype: object

In [18]:
val2 = dt2_bin[row.dim1] + dt2_bin[row.dim2] + dt2_bin[row.dim3]

TypeError: unsupported operand type(s) for +: 'bool' and 'str'

In [None]:
# interesting, this seems to work fine when i run it row by row
# let's see what happens when we run the for loop

In [36]:
# iterate over each row of interactions
for idx in range(interactions.shape[0]):
    row = interactions.iloc[idx]
    val1 = dt1_bin[row.dim1] + dt1_bin[row.dim2] + dt1_bin[row.dim3]
    val2 = dt2_bin[row.dim1] + dt2_bin[row.dim2] + dt2_bin[row.dim3]
    print('Row ' + str(idx) + ' completed.')    

Row 0 completed.
Row 1 completed.
Row 2 completed.
Row 3 completed.
Row 4 completed.
Row 5 completed.
Row 6 completed.
Row 7 completed.
Row 8 completed.
Row 9 completed.
Row 10 completed.
Row 11 completed.
Row 12 completed.
Row 13 completed.
Row 14 completed.
Row 15 completed.
Row 16 completed.
Row 17 completed.
Row 18 completed.
Row 19 completed.
Row 20 completed.
Row 21 completed.
Row 22 completed.
Row 23 completed.
Row 24 completed.
Row 25 completed.
Row 26 completed.
Row 27 completed.
Row 28 completed.
Row 29 completed.
Row 30 completed.
Row 31 completed.
Row 32 completed.
Row 33 completed.
Row 34 completed.
Row 35 completed.
Row 36 completed.
Row 37 completed.
Row 38 completed.
Row 39 completed.
Row 40 completed.
Row 41 completed.
Row 42 completed.
Row 43 completed.
Row 44 completed.
Row 45 completed.
Row 46 completed.
Row 47 completed.
Row 48 completed.
Row 49 completed.
Row 50 completed.
Row 51 completed.
Row 52 completed.
Row 53 completed.
Row 54 completed.
Row 55 completed.
Ro

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [None]:
# ok so problem arises in row 61
# let's see what's going on there

In [48]:
row = interactions.iloc[60]
row

dim1        Region
dim2    SpecialDay
dim3    SpecialDay
Name: 3159, dtype: object

In [49]:
dt1_bin[row.dim1]

0       (0.999, 2.0]
1       (0.999, 2.0]
2         (2.0, 3.0]
3       (0.999, 2.0]
4         (4.0, 6.0]
            ...     
6160    (0.999, 2.0]
6161    (0.999, 2.0]
6162      (4.0, 6.0]
6163    (0.999, 2.0]
6164    (0.999, 2.0]
Name: Region, Length: 6165, dtype: object

In [50]:
dt1_bin[row.dim2]

0          (0.2, 1.0]
1       (-0.001, 0.2]
2       (-0.001, 0.2]
3       (-0.001, 0.2]
4       (-0.001, 0.2]
            ...      
6160       (0.2, 1.0]
6161    (-0.001, 0.2]
6162    (-0.001, 0.2]
6163    (-0.001, 0.2]
6164    (-0.001, 0.2]
Name: SpecialDay, Length: 6165, dtype: object

In [51]:
val1 = dt1_bin[row.dim1] + dt1_bin[row.dim2] + dt1_bin[row.dim3]

In [52]:
val1

0             (0.999, 2.0](0.2, 1.0](0.2, 1.0]
1       (0.999, 2.0](-0.001, 0.2](-0.001, 0.2]
2         (2.0, 3.0](-0.001, 0.2](-0.001, 0.2]
3       (0.999, 2.0](-0.001, 0.2](-0.001, 0.2]
4         (4.0, 6.0](-0.001, 0.2](-0.001, 0.2]
                         ...                  
6160          (0.999, 2.0](0.2, 1.0](0.2, 1.0]
6161    (0.999, 2.0](-0.001, 0.2](-0.001, 0.2]
6162      (4.0, 6.0](-0.001, 0.2](-0.001, 0.2]
6163    (0.999, 2.0](-0.001, 0.2](-0.001, 0.2]
6164    (0.999, 2.0](-0.001, 0.2](-0.001, 0.2]
Length: 6165, dtype: object

In [None]:
# OK looks like Revenue is still coded as int (because boolean)
# while other cols are object dtype
# and that's causing a problem
# so let's see what happens when we turn boolean columns into object
# cast True to '1' instead of 1

## Test Drive

## Debugging

In [2]:
pd.__version__

'2.1.0.dev0+625.g829444a28'

In [3]:
trn = pd.read_csv('data/credit-default_trn.csv.gz')
syn = pd.read_csv('data/credit-default_mostly.csv.gz')
#syn = pd.read_csv('data/credit-default_synthpop.csv.gz')

In [4]:
# # this fails because pandas no longer supports calculating mean over columns with object dtype
# # see https://github.com/pandas-dev/pandas/pull/52281
# # this cell does run with pandas=1.2.4
# fidelity(trn, syn, k=1, c=100).agg('mean')

k                 1.000000
tvd               0.037641
mae               0.003392
max               0.013368
l1d               0.075281
l2d               0.019235
hellinger         0.034161
jensen_shannon    0.001531
dtype: float64

In [4]:
fidelity(trn, syn, k=1, c=100)

Unnamed: 0,k,dim1,dim2,dim3,tvd,mae,max,l1d,l2d,hellinger,jensen_shannon
0,1,LIMIT_BAL,LIMIT_BAL,LIMIT_BAL,0.043467,0.00212,0.016267,0.086933,0.021778,0.039704,0.001574
0,1,SEX,SEX,SEX,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1,EDUCATION,EDUCATION,EDUCATION,0.0427,0.01708,0.04008,0.0854,0.05427,0.031402,0.000986
0,1,MARRIAGE,MARRIAGE,MARRIAGE,0.004167,0.002778,0.004167,0.008333,0.005883,0.002964,9e-06
0,1,AGE,AGE,AGE,0.039253,0.002122,0.006647,0.078507,0.016334,0.034161,0.001166
0,1,PAY_0,PAY_0,PAY_0,0.018627,0.006209,0.018627,0.037253,0.021301,0.015799,0.00025
0,1,PAY_2,PAY_2,PAY_2,0.014427,0.005771,0.014033,0.028853,0.016967,0.013923,0.000194
0,1,PAY_3,PAY_3,PAY_3,0.028087,0.011235,0.028087,0.056173,0.033761,0.023746,0.000563
0,1,PAY_4,PAY_4,PAY_4,0.022033,0.008813,0.017773,0.044067,0.024462,0.021033,0.000441
0,1,PAY_5,PAY_5,PAY_5,0.01042,0.004168,0.0103,0.02084,0.012767,0.013395,0.000179


In [5]:
# cols dim1/2/3 are the problem so let's subset those out

In [6]:
fidelity(trn, syn, k=1, c=100)[['k',
 'tvd',
 'mae',
 'max',
 'l1d',
 'l2d',
 'hellinger',
 'jensen_shannon']].agg('mean')

k                 1.000000
tvd               0.037641
mae               0.003392
max               0.013368
l1d               0.075281
l2d               0.019235
hellinger         0.034161
jensen_shannon    0.001531
dtype: float64

In [None]:
# results are same as in old pandas version
# we're good to continue

## Debugging

In [2]:
debug_trn = pd.read_csv('data/online-shoppers_trn.csv.gz')
debug_trn.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,10,234.0,0.022222,0.088889,0.0,0.4,May,2,5,1,4,Returning_Visitor,False,False
1,0,0.0,0,0.0,8,134.0,0.0,0.025,0.0,0.0,May,2,2,1,2,Returning_Visitor,False,False
2,2,25.0,0,0.0,25,243.166667,0.016667,0.057284,0.0,0.0,Nov,2,2,3,6,Returning_Visitor,False,False
3,0,0.0,0,0.0,11,667.916667,0.0,0.009091,0.0,0.0,Dec,2,5,1,2,Returning_Visitor,False,True
4,0,0.0,0,0.0,4,157.0,0.0,0.05,0.0,0.0,May,2,5,6,6,Returning_Visitor,True,False


In [3]:
debug_trn.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
Revenue                       bool
dtype: object

In [4]:
# Weekend and Revenue cols are boolean
# let's try running the benchmark without that to see if that works
# first let's just run this single test normally to make sure it's failing at this dataset

In [5]:
debug_syn = pd.read_csv('data/online-shoppers_mostly.csv.gz')
debug_syn.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,48,5911.5,0.03349,0.058219,0.0,0.4,May,3,2,7,3,Returning_Visitor,False,False
1,7,45.34,3,161.7,55,1054.554233,0.012707,0.029531,9.245312,0.0,Dec,2,2,1,2,Returning_Visitor,False,False
2,5,47.525,0,0.0,15,795.569667,0.0,0.007333,0.0,0.0,May,3,2,1,3,New_Visitor,False,False
3,15,146.04252,4,210.666667,135,4280.098943,0.021478,0.013256,5.273737,0.0,Dec,2,2,1,2,Returning_Visitor,False,False
4,6,307.0,2,53.0,30,411.666667,0.0,0.01,0.0,0.0,Mar,2,5,9,10,New_Visitor,False,False


In [None]:
fid_debug = fidelity(debug_trn, debug_syn, k=1, c=100)

In [19]:
# OK, yes this fidelity measure operation fails with the online-shoppers dataset
# let's just confirm it works on credit-default
# it does
# problem seems to be with boolean columns
# it's probably in how booleans are being converted to categorical

In [23]:
# get all non-object and non-boolean cols
num_cols = debug_trn.select_dtypes(include='number').columns
num_cols

Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay',
       'OperatingSystems', 'Browser', 'Region', 'TrafficType'],
      dtype='object')

In [37]:
def bin_data(dt1, dt2, c = 10):
    dt1 = dt1.copy()
    dt2 = dt2.copy()
    # quantile binning of numerics
    
    # define all non-object dtypes (so incl. boolean) as numeric
    # num_cols = dt1.dtypes[dt1.dtypes!='object'].index
    
    num_cols = dt1.select_dtypes(include='number').columns
    
    for col in num_cols:
        # determine breaks based on `dt1`
        breaks = dt1[col].quantile(np.linspace(0, 1, c+1)).unique()
        dt1[col] = pd.cut(dt1[col], bins=breaks, include_lowest=True).astype(str)
        dt2_vals = pd.to_numeric(dt2[col], 'coerce')
        dt2_bins = pd.cut(dt2_vals, bins=breaks, include_lowest=True).astype(str)
        dt2_bins[dt2_vals < min(breaks)] = '_other_'
        dt2_bins[dt2_vals > max(breaks)] = '_other_'
        dt2[col] = dt2_bins
        print('column ' + col + ' processing has finished.')
    
    # top-C binning of categoricals
    #cat_cols = dt1.dtypes[dt1.dtypes=='object'].index
    
    cat_cols = dt1.select_dtypes(include=['object', 'category'])
    for col in cat_cols:
        # determine top values based on `dt1`
        top_vals = dt1[col].value_counts().head(c).index.tolist()
        dt1[col].replace(np.setdiff1d(dt1[col].unique().tolist(), top_vals), '_other_', inplace=True)
        dt2[col].replace(np.setdiff1d(dt2[col].unique().tolist(), top_vals), '_other_', inplace=True)
        print('column ' + col + ' processing has finished.')
    
    # add separate binning of booleans
    bool_cols = dt1.select_dtypes(include=['bool'])
    for col in bool_cols:
        dt1[col] = dt1[col].replace({True: 1, False: 0})
        dt1[col] = dt2[col].replace({True: 1, False: 0})
        print('column ' + col + ' processing has finished.')
    return [dt1, dt2]

In [33]:
bin_data(debug_trn, debug_syn, c=10)

column Administrative processing has finished.
column Administrative_Duration processing has finished.
column Informational processing has finished.
column Informational_Duration processing has finished.
column ProductRelated processing has finished.
column ProductRelated_Duration processing has finished.
column BounceRates processing has finished.
column ExitRates processing has finished.
column PageValues processing has finished.
column SpecialDay processing has finished.
column OperatingSystems processing has finished.
column Browser processing has finished.
column Region processing has finished.
column TrafficType processing has finished.


[     Administrative Administrative_Duration  Informational   
 0     (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]  \
 1     (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 2        (1.0, 2.0]             (5.5, 35.0]  (-0.001, 1.0]   
 3     (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 4     (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 ...             ...                     ...            ...   
 6160  (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 6161  (-0.001, 1.0]           (-0.001, 5.5]    (2.0, 24.0]   
 6162  (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 6163  (-0.001, 1.0]          (35.0, 68.667]  (-0.001, 1.0]   
 6164  (-0.001, 1.0]           (-0.001, 5.5]  (-0.001, 1.0]   
 
      Informational_Duration ProductRelated ProductRelated_Duration   
 0            (-0.001, 65.5]    (9.0, 13.0]          (129.2, 242.8]  \
 1            (-0.001, 65.5]     (6.0, 9.0]          (129.2, 242.8]   
 2            (-0.001, 65.5] 

In [None]:
# OK this works now after making sure num_cols really only contains numerics
# ( did this by using .select_dtypes() )
# let's see if this break the fidelity measure
# YES
# now also converted boolean to integers

In [34]:
fid_debug = fidelity(debug_trn, debug_syn, k=1, c=100)

column Administrative processing has finished.
column Administrative_Duration processing has finished.
column Informational processing has finished.
column Informational_Duration processing has finished.
column ProductRelated processing has finished.
column ProductRelated_Duration processing has finished.
column BounceRates processing has finished.
column ExitRates processing has finished.
column PageValues processing has finished.
column SpecialDay processing has finished.
column OperatingSystems processing has finished.
column Browser processing has finished.
column Region processing has finished.
column TrafficType processing has finished.


In [None]:
# OK this works now on a single dataset

In [38]:
fid_debug2 = fidelity(debug_trn, debug_syn, k=2, c=100)

column Administrative processing has finished.
column Administrative_Duration processing has finished.
column Informational processing has finished.
column Informational_Duration processing has finished.
column ProductRelated processing has finished.
column ProductRelated_Duration processing has finished.
column BounceRates processing has finished.
column ExitRates processing has finished.
column PageValues processing has finished.
column SpecialDay processing has finished.
column OperatingSystems processing has finished.
column Browser processing has finished.
column Region processing has finished.
column TrafficType processing has finished.
column Administrative processing has finished.
column Administrative_Duration processing has finished.
column Informational processing has finished.
column Informational_Duration processing has finished.
column ProductRelated processing has finished.
column ProductRelated_Duration processing has finished.
column BounceRates processing has finished

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [None]:
# now it's failing when we set k=2