### This notebook benchmarks the KDECopula model on several datasets from SDGym. Categorical features are integer-encoded.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from sdgym import benchmark
from sdgym import load_dataset
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from src.models.kde_copula_nn_pdf import KDECopulaNNPdf

In [3]:
def set_min_max(data, nf):
    """Collect the per-column minimum and maximum of the first ``nf`` columns.

    Parameters
    ----------
    data : 2-D array supporting ``data[:, i]`` indexing
        The values to scan.
    nf : int
        Number of leading columns to inspect.

    Returns
    -------
    tuple of (list, list)
        ``(x_min, x_max)``, each a list of length ``nf`` holding the
        column-wise minima and maxima respectively.
    """
    columns = [data[:, col] for col in range(nf)]
    lower = [values.min() for values in columns]
    upper = [values.max() for values in columns]
    return lower, upper

def kde_copula_nn_pdf_synthesizer(real_data, categorical_columns, ordinal_columns):
    """Fit a KDE copula model to numerical-only data and return a synthetic sample.

    Parameters
    ----------
    real_data : 2-D array
        Training data; one synthetic row is drawn per input row.
    categorical_columns, ordinal_columns : sequence
        Must both be empty; this variant does not handle discrete columns.

    Returns
    -------
    The generated data as returned by ``KDECopulaNNPdf.sample``.

    Raises
    ------
    AssertionError
        If any categorical or ordinal column is declared.
    """
    row_count, column_count = real_data.shape[0], real_data.shape[1]

    # Discrete columns are not supported by this variant, so refuse them outright.
    assert len(categorical_columns) == 0
    assert len(ordinal_columns) == 0

    lower_bounds, upper_bounds = set_min_max(real_data, column_count)

    # Seeded classifier so the fit is repeatable.
    classifier = MLPClassifier(random_state=0, max_iter=500, early_stopping=True)
    model = KDECopulaNNPdf(x_min=lower_bounds, x_max=upper_bounds, rho=0.5, clf=classifier)
    model = model.fit(real_data)

    # sample() returns a second value (the sample weights) that is not needed here.
    synthetic, _ = model.sample(row_count)
    return synthetic

In [4]:
# Run the SDGym benchmark for the KDE copula synthesizer on the 'grid' dataset.
scores = benchmark(synthesizers=[kde_copula_nn_pdf_synthesizer], datasets=['grid'])

Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 2
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 2
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 2


In [5]:
# Show the 'grid' benchmark results (the table also lists other synthesizers,
# presumably SDGym's stored reference scores; verify against the SDGym version used).
scores

Unnamed: 0,grid/syn_likelihood,grid/test_likelihood,timestamp
CLBNSynthesizer,-3.885928,-5.274841,2020-04-12 09:41:35.096775
CTGANSynthesizer,-9.162882,-5.066747,2020-04-12 09:41:35.096775
IdentitySynthesizer,-3.476662,-3.503242,2020-04-12 09:41:35.096775
IndependentSynthesizer,-3.544136,-3.469971,2020-04-12 09:41:35.096775
MedganSynthesizer,-6.833268,-84.380587,2020-04-12 09:41:35.096775
TableganSynthesizer,-6.777964,-4.931756,2020-04-12 09:41:35.096775
TVAESynthesizer,-3.388274,-5.190146,2020-04-12 09:41:35.096775
UniformSynthesizer,-7.294052,-4.534827,2020-04-12 09:41:35.096775
VEEGANSynthesizer,-8.646858,-423.573276,2020-04-12 09:41:35.096775
kde_copula_nn_pdf_synthesizer,-6.883709,-4.370678,2020-09-22 06:54:40.222742


In [6]:
def _kde_copula_synthesize(real_data, categorical_columns, ordinal_columns, clf, use_weights=True):
    """Shared implementation of the KDE copula synthesizer variants below.

    Parameters
    ----------
    real_data : 2-D array-like
        Training data; it is copied and cast to float64 before fitting.
    categorical_columns, ordinal_columns : sequence of int
        Column indices whose generated values are rounded to the nearest integer.
    clf : classifier instance
        Passed through to ``KDECopulaNNPdf`` as ``clf``.
    use_weights : bool, default True
        If True, draw with ``kde.sample`` (the returned sample weights are
        discarded); otherwise draw with ``kde.sample_no_weights``.

    Returns
    -------
    numpy.ndarray of float32
        One generated row per row in ``real_data``.
    """
    data = np.array(real_data, dtype=np.float64)

    n_samples, n_features = data.shape[0], data.shape[1]
    x_min, x_max = set_min_max(data, n_features)
    print(x_min, x_max)

    kde = KDECopulaNNPdf(x_min=x_min, x_max=x_max, rho=0.5, clf=clf)
    kde = kde.fit(data)

    if use_weights:
        X_gen, _ = kde.sample(n_samples)
    else:
        X_gen = kde.sample_no_weights(n_samples)

    # Build the index list explicitly so tuples/arrays of indices work too
    # (``+`` on two lists concatenates, but not on arrays).
    discrete_columns = list(categorical_columns) + list(ordinal_columns)
    if discrete_columns:
        # Integer-encoded columns must come back as whole numbers.
        X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])

    return np.asarray(X_gen, dtype=np.float32)


def kde_copula_nn_pdf_synthesizer(real_data, categorical_columns, ordinal_columns):
    """Max's KDE copula model with default parameters (MLP classifier, ``kde.sample``).

    Redefines (shadows) the function of the same name from the earlier cell,
    which rejected categorical/ordinal columns; this version rounds them instead.

    NOTE(review): unlike the earlier definition, the MLP is created without
    ``random_state``, so the fit is not reproducible between runs.

    Parameters and return value: see ``_kde_copula_synthesize``.
    """
    return _kde_copula_synthesize(
        real_data,
        categorical_columns,
        ordinal_columns,
        clf=MLPClassifier(max_iter=500, early_stopping=True),
    )


def kde_copula_nn_pdf_synthesizer_xgboost(real_data, categorical_columns, ordinal_columns):
    """Max's KDE copula model with an ``XGBClassifier`` in place of the MLP.

    Parameters and return value: see ``_kde_copula_synthesize``.
    """
    return _kde_copula_synthesize(
        real_data,
        categorical_columns,
        ordinal_columns,
        clf=XGBClassifier(),
    )


def kde_copula_nn_pdf_synthesizer_no_weight(real_data, categorical_columns, ordinal_columns):
    """Max's KDE copula model sampled with ``kde.sample_no_weights``.

    Uses the same MLP classifier settings as ``kde_copula_nn_pdf_synthesizer``.

    Parameters and return value: see ``_kde_copula_synthesize``.
    """
    return _kde_copula_synthesize(
        real_data,
        categorical_columns,
        ordinal_columns,
        clf=MLPClassifier(max_iter=500, early_stopping=True),
        use_weights=False,
    )

In [7]:
# Load the 'adult' dataset and run the synthesizer on it once, directly
# (outside the benchmark harness).
df, categorical_columns, ordinal_columns = load_dataset('adult')
X_gen = kde_copula_nn_pdf_synthesizer(df, categorical_columns, ordinal_columns)

[17.0, 0.0, 12285.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0] [90.0, 8.0, 1484705.0, 15.0, 16.0, 6.0, 14.0, 5.0, 4.0, 1.0, 99999.0, 4356.0, 99.0, 40.0, 1.0]
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14


In [8]:
# Benchmark the three KDE copula variants (default MLP, no-weights sampling,
# XGBoost classifier) on the 'adult' dataset.
scores = benchmark(synthesizers=[kde_copula_nn_pdf_synthesizer, kde_copula_nn_pdf_synthesizer_no_weight, kde_copula_nn_pdf_synthesizer_xgboost], 
                   datasets=['adult'])

[17.0, 0.0, 12285.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0] [90.0, 8.0, 1484705.0, 15.0, 16.0, 6.0, 14.0, 5.0, 4.0, 1.0, 99999.0, 4356.0, 99.0, 40.0, 1.0]
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




[17.0, 0.0, 12285.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0] [90.0, 8.0, 1484705.0, 15.0, 16.0, 6.0, 14.0, 5.0, 4.0, 1.0, 99999.0, 4356.0, 99.0, 40.0, 1.0]
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




[17.0, 0.0, 12285.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0] [90.0, 8.0, 1484705.0, 15.0, 16.0, 6.0, 14.0, 5.0, 4.0, 1.0, 99999.0, 4356.0, 99.0, 40.0, 1.0]
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




[17.0, 0.0, 12285.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0] [90.0, 8.0, 1484705.0, 15.0, 16.0, 6.0, 14.0, 5.0, 4.0, 1.0, 99999.0, 4356.0, 99.0, 40.0, 1.0]
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




[17.0, 0.0, 12285.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0] [90.0, 8.0, 1484705.0, 15.0, 16.0, 6.0, 14.0, 5.0, 4.0, 1.0, 99999.0, 4356.0, 99.0, 40.0, 1.0]
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




[17.0, 0.0, 12285.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0] [90.0, 8.0, 1484705.0, 15.0, 16.0, 6.0, 14.0, 5.0, 4.0, 1.0, 99999.0, 4356.0, 99.0, 40.0, 1.0]
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




[17.0, 0.0, 12285.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0] [90.0, 8.0, 1484705.0, 15.0, 16.0, 6.0, 14.0, 5.0, 4.0, 1.0, 99999.0, 4356.0, 99.0, 40.0, 1.0]
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




[17.0, 0.0, 12285.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0] [90.0, 8.0, 1484705.0, 15.0, 16.0, 6.0, 14.0, 5.0, 4.0, 1.0, 99999.0, 4356.0, 99.0, 40.0, 1.0]
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




[17.0, 0.0, 12285.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0] [90.0, 8.0, 1484705.0, 15.0, 16.0, 6.0, 14.0, 5.0, 4.0, 1.0, 99999.0, 4356.0, 99.0, 40.0, 1.0]
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 14




In [9]:
# Show the 'adult' benchmark results (accuracy and F1 per synthesizer).
scores

Unnamed: 0,adult/accuracy,adult/f1,timestamp
CLBNSynthesizer,0.760117,0.291326,2020-04-12 09:41:35.096775
CTGANSynthesizer,0.784217,0.59742,2020-04-12 09:41:35.096775
IdentitySynthesizer,0.825608,0.6616,2020-04-12 09:41:35.096775
IndependentSynthesizer,0.654425,0.174515,2020-04-12 09:41:35.096775
MedganSynthesizer,0.593683,0.276167,2020-04-12 09:41:35.096775
TableganSynthesizer,0.79885,0.426258,2020-04-12 09:41:35.096775
TVAESynthesizer,0.810817,0.630175,2020-04-12 09:41:35.096775
UniformSynthesizer,0.500242,0.301878,2020-04-12 09:41:35.096775
VEEGANSynthesizer,0.718792,0.162142,2020-04-12 09:41:35.096775
kde_copula_nn_pdf_synthesizer,0.7181,0.532038,2020-09-22 07:01:33.603263
