### This notebook benchmarks the KDECopula model on several datasets from SDGym. Categorical features are integer-encoded.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
from sdgym import benchmark
from sdgym import load_dataset
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from synthsonic.models.kde_copula_nn_pdf import KDECopulaNNPdf

In [None]:
def set_min_max(data, nf):
    """Return the per-column minima and maxima of the first ``nf`` columns.

    Parameters
    ----------
    data : 2-D array
        Must support ``data[:, i]`` column indexing, and each column must
        provide ``.min()`` and ``.max()``.
    nf : int
        Number of leading columns to scan.

    Returns
    -------
    tuple of (list, list)
        ``(minima, maxima)``, each a list of length ``nf`` ordered by column.
    """
    columns = [data[:, idx] for idx in range(nf)]
    lows = [col.min() for col in columns]
    highs = [col.max() for col in columns]
    return lows, highs

def kde_copula_nn_pdf_synthesizer(real_data, categorical_columns, ordinal_columns):
    """Fit a KDECopulaNNPdf model on purely numerical data and sample from it.

    Parameters
    ----------
    real_data : 2-D array
        Training data; one row per sample, one column per feature.
    categorical_columns, ordinal_columns : sequence
        Must both be empty; this version only handles numerical columns.

    Returns
    -------
    The generated samples, as many rows as ``real_data`` has.

    Raises
    ------
    AssertionError
        If any categorical or ordinal column is declared.
    """
    n_rows, n_cols = real_data.shape[0], real_data.shape[1]

    # This variant does no encoding or rounding, so only numerical data is accepted.
    assert len(categorical_columns) == 0
    assert len(ordinal_columns) == 0

    lower, upper = set_min_max(real_data, n_cols)

    classifier = MLPClassifier(random_state=0, max_iter=500, early_stopping=True)
    model = KDECopulaNNPdf(x_min=lower, x_max=upper, rho=0.5, clf=classifier)
    model = model.fit(real_data)

    # sample() also returns per-sample weights; they are not used here.
    synthetic, _weights = model.sample(n_rows)
    return synthetic

In [None]:
# Run the SDGym benchmark for the synthesizer defined above on the 'grid' dataset.
scores = benchmark(
    synthesizers=[kde_copula_nn_pdf_synthesizer],
    datasets=['grid'],
)

In [None]:
# Show the result returned by benchmark() as the cell output.
scores

In [None]:
def kde_copula_nn_pdf_synthesizer(real_data, categorical_columns, ordinal_columns):
    """Synthesize data with Max's KDE copula model, using an MLP classifier.

    Every column is cast to float64 and modelled as numeric. After sampling,
    the categorical and ordinal columns are rounded to the nearest integer.

    Parameters
    ----------
    real_data : 2-D array
        Training data; one row per sample, one column per feature.
    categorical_columns, ordinal_columns : list of int
        Column indices whose generated values are rounded.

    Returns
    -------
    numpy.ndarray
        float32 array of generated samples with as many rows as ``real_data``.
    """
    data = np.array(real_data, dtype=np.float64)
    n_samples, n_features = data.shape[0], data.shape[1]

    x_min, x_max = set_min_max(data, n_features)
    # Echo the per-column bounds handed to the model.
    print(x_min, x_max)

    kde = KDECopulaNNPdf(
        x_min=x_min,
        x_max=x_max,
        rho=0.5,
        clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True),
    )
    kde = kde.fit(data)

    # sample() also returns per-sample weights; they are not used here.
    X_gen, _sample_weight = kde.sample(n_samples)

    # Round the discrete columns so they land on integer values again.
    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    return X_gen.astype(np.float32)

def kde_copula_nn_pdf_synthesizer_wo_kde(real_data, categorical_columns, ordinal_columns):
    """Synthesize data with Max's KDE copula model and ``use_inverse_qt=True``.

    Identical to the MLP-based synthesizer except that ``use_inverse_qt=True``
    is passed to ``KDECopulaNNPdf``.
    NOTE(review): the ``_wo_kde`` suffix suggests this skips the KDE step;
    confirm against the ``KDECopulaNNPdf`` implementation.

    Every column is cast to float64 and modelled as numeric. After sampling,
    the categorical and ordinal columns are rounded to the nearest integer.

    Parameters
    ----------
    real_data : 2-D array
        Training data; one row per sample, one column per feature.
    categorical_columns, ordinal_columns : list of int
        Column indices whose generated values are rounded.

    Returns
    -------
    numpy.ndarray
        float32 array of generated samples with as many rows as ``real_data``.
    """
    data = np.array(real_data, dtype=np.float64)
    n_samples, n_features = data.shape[0], data.shape[1]

    x_min, x_max = set_min_max(data, n_features)
    # Echo the per-column bounds handed to the model.
    print(x_min, x_max)

    kde = KDECopulaNNPdf(
        x_min=x_min,
        x_max=x_max,
        rho=0.5,
        use_inverse_qt=True,
        clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True),
    )
    kde = kde.fit(data)

    # sample() also returns per-sample weights; they are not used here.
    X_gen, _sample_weight = kde.sample(n_samples)

    # Round the discrete columns so they land on integer values again.
    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    return X_gen.astype(np.float32)

def kde_copula_nn_pdf_synthesizer_xgboost(real_data, categorical_columns, ordinal_columns):
    """Synthesize data with Max's KDE copula model, using an XGBoost classifier.

    Every column is cast to float64 and modelled as numeric. After sampling,
    the categorical and ordinal columns are rounded to the nearest integer.

    Parameters
    ----------
    real_data : 2-D array
        Training data; one row per sample, one column per feature.
    categorical_columns, ordinal_columns : list of int
        Column indices whose generated values are rounded.

    Returns
    -------
    numpy.ndarray
        float32 array of generated samples with as many rows as ``real_data``.
    """
    data = np.array(real_data, dtype=np.float64)
    n_samples, n_features = data.shape[0], data.shape[1]

    x_min, x_max = set_min_max(data, n_features)
    # Echo the per-column bounds handed to the model.
    print(x_min, x_max)

    kde = KDECopulaNNPdf(
        x_min=x_min,
        x_max=x_max,
        rho=0.5,
        clf=XGBClassifier(random_state=0),
    )
    kde = kde.fit(data)

    # sample() also returns per-sample weights; they are not used here.
    X_gen, _sample_weight = kde.sample(n_samples)

    # Round the discrete columns so they land on integer values again.
    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    return X_gen.astype(np.float32)

def kde_copula_nn_pdf_synthesizer_no_weight(real_data, categorical_columns, ordinal_columns):
    """Synthesize data with Max's KDE copula model via ``sample_no_weights``.

    Uses the same MLP-based model configuration as the weighted synthesizer,
    but draws the samples with ``KDECopulaNNPdf.sample_no_weights`` instead of
    ``sample``.

    Every column is cast to float64 and modelled as numeric. After sampling,
    the categorical and ordinal columns are rounded to the nearest integer.

    Parameters
    ----------
    real_data : 2-D array
        Training data; one row per sample, one column per feature.
    categorical_columns, ordinal_columns : list of int
        Column indices whose generated values are rounded.

    Returns
    -------
    numpy.ndarray
        float32 array of generated samples with as many rows as ``real_data``.
    """
    data = np.array(real_data, dtype=np.float64)
    n_samples, n_features = data.shape[0], data.shape[1]

    x_min, x_max = set_min_max(data, n_features)
    # Echo the per-column bounds handed to the model.
    print(x_min, x_max)

    kde = KDECopulaNNPdf(
        x_min=x_min,
        x_max=x_max,
        rho=0.5,
        clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True),
    )
    kde = kde.fit(data)

    X_gen = kde.sample_no_weights(n_samples)

    # Round the discrete columns so they land on integer values again.
    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    return X_gen.astype(np.float32)

In [None]:
# Load the 'adult' dataset together with its categorical and ordinal column lists.
df, categorical_columns, ordinal_columns = load_dataset('adult')
# Run the synthesizer once by hand, outside the benchmark harness.
X_gen = kde_copula_nn_pdf_synthesizer(df, categorical_columns, ordinal_columns)

In [None]:
# Benchmark all four synthesizer variants on the 'adult', 'census' and 'covtype' datasets.
scores = benchmark(
    synthesizers=[
        kde_copula_nn_pdf_synthesizer,
        kde_copula_nn_pdf_synthesizer_no_weight,
        kde_copula_nn_pdf_synthesizer_wo_kde,
        kde_copula_nn_pdf_synthesizer_xgboost,
    ],
    datasets=['adult', 'census', 'covtype'],
)

In [None]:
# Show the result returned by benchmark() as the cell output.
scores