### This notebook benchmarks the KDECopula model on several datasets from SDGym. Categorical features are integer encoded, except in the one-hot-encoded and frequency-mapping variants defined below.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from sdgym import benchmark
from sdgym import load_dataset
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from synthsonic.models.kde_copula_nn_pdf import KDECopulaNNPdf
from synthsonic.models.categorical_utils import categorical_round, vec_translate, categorical_frequency_mapping, \
            categorical_frequency_inverse_mapping, encode_one_hot, decode_one_hot

In [3]:
def set_min_max(data, nf):
    """Return the per-column minima and maxima of the first ``nf`` columns.

    Parameters
    ----------
    data : 2-D array indexed as ``data[:, i]``.
    nf : number of leading columns to inspect.

    Returns
    -------
    (x_min, x_max) : two lists of length ``nf`` holding, for each column,
        its smallest and largest value respectively.
    """
    column_ids = range(nf)
    lows = [data[:, col].min() for col in column_ids]
    highs = [data[:, col].max() for col in column_ids]
    return lows, highs


In [4]:
def KDECopulaNNPdf_w_RhoMinMax(real_data, categorical_columns, ordinal_columns):
    """Synthesizer: KDE copula model with rho=0.5 and explicit per-column bounds.

    Fits ``KDECopulaNNPdf`` (with an ``MLPClassifier``) on ``real_data`` cast
    to float64, using the observed per-column min/max as ``x_min``/``x_max``,
    then draws as many rows as the input has via ``kde.sample``.

    Parameters
    ----------
    real_data : 2-D array of shape (n_samples, n_features).
    categorical_columns, ordinal_columns : sequences of column indices whose
        generated values are rounded to the nearest integer.

    Returns
    -------
    float32 array of generated rows.
    """
    # Accept any sequence type (list, tuple, ...) for both index arguments.
    discrete_columns = list(categorical_columns) + list(ordinal_columns)
    data = np.float64(real_data)

    n_samples = data.shape[0]
    n_features = data.shape[1]
    x_min, x_max = set_min_max(data, n_features)

    kde = KDECopulaNNPdf(x_min=x_min, x_max=x_max, rho=0.5, clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True))
    kde = kde.fit(data)

    # The sample weights returned alongside the samples are not used here.
    X_gen, _ = kde.sample(n_samples)

    # Integer-encoded columns must come back as whole numbers.
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    return np.float32(X_gen)

def KDECopulaNNPdf_woKDE(real_data, categorical_columns, ordinal_columns):
    """Synthesizer: ``KDECopulaNNPdf`` with ``use_KDE=False`` and an MLP classifier.

    Fits the model on ``real_data`` cast to float64 and draws as many rows as
    the input has via ``kde.sample``.

    Parameters
    ----------
    real_data : 2-D array of shape (n_samples, n_features).
    categorical_columns, ordinal_columns : sequences of column indices whose
        generated values are rounded to the nearest integer.

    Returns
    -------
    float32 array of generated rows.
    """
    # Accept any sequence type (list, tuple, ...) for both index arguments.
    discrete_columns = list(categorical_columns) + list(ordinal_columns)
    data = np.float64(real_data)
    n_samples = data.shape[0]

    # No x_min/x_max bounds are passed to the model in this variant.
    # NOTE(review): confirm that is intended when use_KDE=False.
    kde = KDECopulaNNPdf(use_KDE=False, clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True))
    kde = kde.fit(data)

    # The sample weights returned alongside the samples are not used here.
    X_gen, _ = kde.sample(n_samples)

    # Integer-encoded columns must come back as whole numbers.
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    return np.float32(X_gen)

def KDECopulaNNPdf_woKDE_XGBoost(real_data, categorical_columns, ordinal_columns):
    """Synthesizer: ``KDECopulaNNPdf`` with ``use_KDE=False`` and an XGBoost classifier.

    Fits the model on ``real_data`` cast to float64 and draws as many rows as
    the input has via ``kde.sample``.

    Parameters
    ----------
    real_data : 2-D array of shape (n_samples, n_features).
    categorical_columns, ordinal_columns : sequences of column indices whose
        generated values are rounded to the nearest integer.

    Returns
    -------
    float32 array of generated rows.
    """
    # Accept any sequence type (list, tuple, ...) for both index arguments.
    discrete_columns = list(categorical_columns) + list(ordinal_columns)
    data = np.float64(real_data)
    n_samples = data.shape[0]

    # No x_min/x_max bounds are passed to the model in this variant.
    # NOTE(review): confirm that is intended when use_KDE=False.
    kde = KDECopulaNNPdf(use_KDE=False, clf=XGBClassifier(random_state=7))
    kde = kde.fit(data)

    # The sample weights returned alongside the samples are not used here.
    X_gen, _ = kde.sample(n_samples)

    # Integer-encoded columns must come back as whole numbers.
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    return np.float32(X_gen)

def KDECopulaNNPdf_w_RhoMinMax_sampleNoWeight(real_data, categorical_columns, ordinal_columns):
    """Synthesizer: KDE copula model (rho=0.5, explicit bounds) sampled without weights.

    Same model configuration as ``KDECopulaNNPdf_w_RhoMinMax``, but rows are
    drawn with ``kde.sample_no_weights(..., random_state=42, mode='expensive')``
    instead of ``kde.sample``.

    Parameters
    ----------
    real_data : 2-D array of shape (n_samples, n_features).
    categorical_columns, ordinal_columns : sequences of column indices whose
        generated values are rounded to the nearest integer.

    Returns
    -------
    float32 array of generated rows.
    """
    # Accept any sequence type (list, tuple, ...) for both index arguments.
    discrete_columns = list(categorical_columns) + list(ordinal_columns)
    data = np.float64(real_data)

    n_samples = data.shape[0]
    n_features = data.shape[1]
    x_min, x_max = set_min_max(data, n_features)

    kde = KDECopulaNNPdf(x_min=x_min, x_max=x_max, rho=0.5, clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True))
    kde = kde.fit(data)

    X_gen = kde.sample_no_weights(n_samples, random_state=42, mode='expensive')

    # Integer-encoded columns must come back as whole numbers.
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    return np.float32(X_gen)

In [5]:
def KDECopulaNNPdf_woKDE_OneHotEncoded(real_data, categorical_columns, ordinal_columns):
    """Synthesizer: ``KDECopulaNNPdf`` (``use_KDE=False``) on one-hot-encoded categoricals.

    The categorical columns are one-hot encoded with ``encode_one_hot`` and
    appended AFTER the numerical and ordinal columns, so the model sees the
    layout ``[numerical..., ordinal..., one-hot...]``. After sampling, the
    categorical columns are rebuilt with ``decode_one_hot`` and the
    numerical/ordinal values are copied back to their original positions.

    Parameters
    ----------
    real_data : 2-D array of shape (n_samples, n_features).
    categorical_columns : column indices to one-hot encode.
    ordinal_columns : column indices passed through unchanged together with
        the numerical columns.

    Returns
    -------
    The array returned by ``decode_one_hot``, with the numerical and ordinal
    columns overwritten by the generated values.
    """
    n_samples = real_data.shape[0]
    n_features = real_data.shape[1]

    # Sorted so that the pass-through column order is deterministic
    # (set iteration order is not guaranteed).
    numerical_features = sorted(
        set(range(n_features)) - set(categorical_columns) - set(ordinal_columns)
    )
    passthrough_columns = numerical_features + list(ordinal_columns)
    n_passthrough = len(passthrough_columns)

    ## One hot encode the categorical features
    unique_values, ohe = encode_one_hot(real_data, categorical_columns)
    categorical_np = np.array(ohe)

    ## Append the categorical one hot encoded data to numerical and ordinal
    data = np.float64(np.hstack((real_data[:, passthrough_columns], categorical_np)))

    kde = KDECopulaNNPdf(use_KDE=False, clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True))
    kde = kde.fit(data)
    # The sample weights returned alongside the samples are not used here.
    X_gen, _ = kde.sample(n_samples)
    X_gen = np.float32(X_gen)

    X_final = decode_one_hot(X_gen, categorical_columns, unique_values, n_features)
    # In X_gen the pass-through columns occupy the FIRST n_passthrough positions
    # (see the hstack above); indexing X_gen with the original column ids would
    # pick up the wrong columns, including one-hot indicator columns.
    X_final[:, passthrough_columns] = X_gen[:, :n_passthrough]
    # NOTE(review): unlike the integer-encoded variants, ordinal columns are not
    # rounded here — confirm whether that is intended.
    return X_final

def KDECopulaNNPdf_woKDE_FreqMapping(real_data, categorical_columns, ordinal_columns):
    """Synthesizer: ``KDECopulaNNPdf`` (``use_KDE=False``) on frequency-mapped categoricals.

    The categorical columns of ``real_data`` (cast to float64) are re-encoded
    with ``categorical_frequency_mapping`` before fitting; generated
    categorical values are rounded and mapped back with
    ``categorical_frequency_inverse_mapping``.

    Parameters
    ----------
    real_data : 2-D array of shape (n_samples, n_features).
    categorical_columns : column indices to frequency-map.
    ordinal_columns : accepted for interface compatibility with the other
        synthesizers; not used by this variant.

    Returns
    -------
    The array returned by ``categorical_frequency_inverse_mapping``.
    """
    data = np.float64(real_data)
    n_samples = data.shape[0]

    data, inv_mappings = categorical_frequency_mapping(data, categorical_columns)

    kde = KDECopulaNNPdf(use_KDE=False, clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True))
    kde = kde.fit(data)
    # The sample weights returned alongside the samples are not used here.
    X_gen, _ = kde.sample(n_samples)

    # Round to whole codes before handing them to the inverse mapping.
    # NOTE(review): ordinal columns are not rounded in this variant, unlike the
    # integer-encoded synthesizers — confirm whether that is intended.
    X_gen[:, categorical_columns] = np.round(X_gen[:, categorical_columns])
    return categorical_frequency_inverse_mapping(X_gen, categorical_columns, inv_mappings)

In [6]:
# Benchmark the four integer-encoded synthesizer variants on the 'grid' and 'gridr' datasets.
gaussian_mixtures_scores = benchmark(
    datasets=['grid', 'gridr'],
    synthesizers=[
        KDECopulaNNPdf_w_RhoMinMax,
        KDECopulaNNPdf_woKDE,
        KDECopulaNNPdf_woKDE_XGBoost,
        KDECopulaNNPdf_w_RhoMinMax_sampleNoWeight,
    ],
)
gaussian_mixtures_scores

Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 2
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 2
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 2
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 2
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 2
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 2
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 2
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 2
Transforming variables.
Fitting 

Unnamed: 0,grid/syn_likelihood,grid/test_likelihood,gridr/syn_likelihood,gridr/test_likelihood,timestamp
CLBNSynthesizer,-3.885928,-5.274841,-4.06621,-10.287411,2020-04-12 09:41:35.096775
CTGANSynthesizer,-9.162882,-5.066747,-8.653293,-5.086304,2020-04-12 09:41:35.096775
IdentitySynthesizer,-3.476662,-3.503242,-3.607534,-3.635514,2020-04-12 09:41:35.096775
IndependentSynthesizer,-3.544136,-3.469971,-5.033312,-4.03767,2020-04-12 09:41:35.096775
MedganSynthesizer,-6.833268,-84.380587,-7.747477,-160.899159,2020-04-12 09:41:35.096775
TableganSynthesizer,-6.777964,-4.931756,-7.080974,-5.047245,2020-04-12 09:41:35.096775
TVAESynthesizer,-3.388274,-5.190146,-3.820569,-3.724633,2020-04-12 09:41:35.096775
UniformSynthesizer,-7.294052,-4.534827,-7.227006,-4.54956,2020-04-12 09:41:35.096775
VEEGANSynthesizer,-8.646858,-423.573276,-11.458546,-8.908475,2020-04-12 09:41:35.096775
KDECopulaNNPdf_w_RhoMinMax,-6.883709,-4.370678,-6.611463,-4.427587,2020-09-23 07:24:41.779810


In [7]:
# Benchmark the four integer-encoded synthesizer variants on the 'asia' and 'alarm' datasets.
bayesian_scores = benchmark(
    datasets=['asia', 'alarm'],
    synthesizers=[
        KDECopulaNNPdf_w_RhoMinMax,
        KDECopulaNNPdf_woKDE,
        KDECopulaNNPdf_woKDE_XGBoost,
        KDECopulaNNPdf_w_RhoMinMax_sampleNoWeight,
    ],
)
bayesian_scores

Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 36
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 36
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 36
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitti

Unnamed: 0,alarm/syn_likelihood,alarm/test_likelihood,asia/syn_likelihood,asia/test_likelihood,timestamp
CLBNSynthesizer,-12.385436,-11.18165,-2.406589,-2.27184,2020-04-12 09:41:35.096775
CTGANSynthesizer,-15.625477,-12.915224,-3.886903,-2.442375,2020-04-12 09:41:35.096775
IdentitySynthesizer,-10.233935,-10.301955,-2.250748,-2.241122,2020-04-12 09:41:35.096775
IndependentSynthesizer,-18.230315,-15.80523,-4.933314,-3.001627,2020-04-12 09:41:35.096775
MedganSynthesizer,-8.169461,-13.02043,-1.519422,-5.197981,2020-04-12 09:41:35.096775
TableganSynthesizer,-12.744195,-11.56523,-3.331052,-2.68449,2020-04-12 09:41:35.096775
TVAESynthesizer,-11.357112,-10.754935,-2.353327,-2.267334,2020-04-12 09:41:35.096775
UniformSynthesizer,-18.420681,-18.420681,-14.301042,-5.551594,2020-04-12 09:41:35.096775
VEEGANSynthesizer,-18.386073,-18.210907,-11.492287,-5.952712,2020-04-12 09:41:35.096775
KDECopulaNNPdf_w_RhoMinMax,-18.416334,-18.111406,-10.323669,-3.16969,2020-09-23 07:32:16.935641


In [6]:
# Benchmark all six synthesizer variants (including the one-hot and
# frequency-mapping encodings) on the 'adult' dataset.
real_world_scores = benchmark(
    datasets=['adult'],
    synthesizers=[
        KDECopulaNNPdf_w_RhoMinMax,
        KDECopulaNNPdf_woKDE,
        KDECopulaNNPdf_woKDE_XGBoost,
        KDECopulaNNPdf_w_RhoMinMax_sampleNoWeight,
        KDECopulaNNPdf_woKDE_OneHotEncoded,
        KDECopulaNNPdf_woKDE_FreqMapping,
    ],
)
real_world_scores

Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 91
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 91
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 91
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




Unnamed: 0,adult/accuracy,adult/f1,timestamp
CLBNSynthesizer,0.760117,0.291326,2020-04-12 09:41:35.096775
CTGANSynthesizer,0.784217,0.59742,2020-04-12 09:41:35.096775
IdentitySynthesizer,0.825608,0.6616,2020-04-12 09:41:35.096775
IndependentSynthesizer,0.654425,0.174515,2020-04-12 09:41:35.096775
MedganSynthesizer,0.593683,0.276167,2020-04-12 09:41:35.096775
TableganSynthesizer,0.79885,0.426258,2020-04-12 09:41:35.096775
TVAESynthesizer,0.810817,0.630175,2020-04-12 09:41:35.096775
UniformSynthesizer,0.500242,0.301878,2020-04-12 09:41:35.096775
VEEGANSynthesizer,0.718792,0.162142,2020-04-12 09:41:35.096775
KDECopulaNNPdf_w_RhoMinMax,0.704075,0.522094,2020-09-23 13:25:58.692414


In [None]:
# Benchmark all six synthesizer variants on the 'covtype' dataset.
# Note: this rebinds `real_world_scores`, replacing the 'adult' results
# assigned to the same name in an earlier cell.
real_world_scores = benchmark(
    datasets=['covtype'],
    synthesizers=[
        KDECopulaNNPdf_w_RhoMinMax,
        KDECopulaNNPdf_woKDE,
        KDECopulaNNPdf_woKDE_XGBoost,
        KDECopulaNNPdf_w_RhoMinMax_sampleNoWeight,
        KDECopulaNNPdf_woKDE_OneHotEncoded,
        KDECopulaNNPdf_woKDE_FreqMapping,
    ],
)
real_world_scores

Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 51




Transforming variables.
Fitting and calibrating classifier.
