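### This notebook benchmarks KDECopulaNNPdf with per-dataset classifier settings (from grid search where available) on the Gaussian mixture, Bayesian network (including asia) and real-world datasets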

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
from sdgym import benchmark
from sdgym import load_dataset
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from synthsonic.models.kde_copula_nn_pdf import KDECopulaNNPdf
from synthsonic.models.categorical_utils import categorical_round, vec_translate, categorical_frequency_mapping, \
            categorical_frequency_inverse_mapping, encode_one_hot, decode_one_hot
from timeit import default_timer as timer
%matplotlib inline

In [None]:
from functools import partial

### Gaussian Mixtures datasets

In [None]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns, times=None):
    """Generate synthetic rows with KDECopulaNNPdf backed by an MLP classifier.

    The model is created with ``use_KDE=False`` and an ``MLPClassifier``
    (``alpha=0.1``, ``random_state=0``, ``max_iter=1000``,
    ``early_stopping=True``), fitted on ``real_data`` and asked for
    ``real_data.shape[0]`` samples.

    Parameters
    ----------
    real_data : array-like
        Data to fit on; converted to float64 first.
        Assumed to be 2-D (rows x columns) -- TODO confirm against the caller.
    categorical_columns, ordinal_columns : list of int
        Column indices whose sampled values are rounded with ``np.round``.
        Assumed to be lists, because they are joined with ``+``.
    times : list, optional
        If a list is given, the elapsed seconds of this call are appended to it.

    Returns
    -------
    numpy.ndarray
        The sampled data cast to float32.
    """
    start = timer()
    data = np.float64(real_data)

    model = KDECopulaNNPdf(
        use_KDE=False,
        clf=MLPClassifier(alpha=0.1, random_state=0, max_iter=1000, early_stopping=True),
    )
    model = model.fit(data)

    # sample() returns a pair; the second item (sample weights) is not used here.
    X_gen, _ = model.sample(data.shape[0])

    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    X_gen = np.float32(X_gen)

    elapsed = timer() - start
    if isinstance(times, list):
        times.append(elapsed)
    return X_gen

In [None]:
# Benchmark the synthesizer on the 'grid' dataset and record its mean run time.
grid_times = list()
# partial() binds whichever KDECopulaNNPdf_RoundCategorical definition was executed last.
grid_thing = partial(KDECopulaNNPdf_RoundCategorical, times=grid_times)
# partial objects carry no __name__; copy it over (presumably benchmark labels rows by it -- verify).
setattr(grid_thing, '__name__', KDECopulaNNPdf_RoundCategorical.__name__)
grid_scores = benchmark(synthesizers=[grid_thing], datasets=['grid'])
del grid_scores['timestamp']
# Assumes the score frame has ten rows with this synthesizer's row last -- TODO confirm.
exec_time = ['N/A'] * 9
exec_time.append(round(np.mean(grid_times), 2))
grid_scores['grid/exec_time(s)'] = exec_time

In [None]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns, times=None):
    """Generate synthetic rows with KDECopulaNNPdf backed by an XGBoost classifier.

    The model is created with ``use_KDE=False`` and an ``XGBClassifier``
    (``n_estimators=50``, ``reg_lambda=1``, ``gamma=0``, ``max_depth=2``),
    fitted on ``real_data`` and asked for ``real_data.shape[0]`` samples.

    Parameters
    ----------
    real_data : array-like
        Data to fit on; converted to float64 first.
        Assumed to be 2-D (rows x columns) -- TODO confirm against the caller.
    categorical_columns, ordinal_columns : list of int
        Column indices whose sampled values are rounded with ``np.round``.
        Assumed to be lists, because they are joined with ``+``.
    times : list, optional
        If a list is given, the elapsed seconds of this call are appended to it.

    Returns
    -------
    numpy.ndarray
        The sampled data cast to float32.
    """
    start = timer()
    data = np.float64(real_data)

    model = KDECopulaNNPdf(
        use_KDE=False,
        clf=XGBClassifier(n_estimators=50, reg_lambda=1, gamma=0, max_depth=2),
    )
    model = model.fit(data)

    # sample() returns a pair; the second item (sample weights) is not used here.
    X_gen, _ = model.sample(data.shape[0])

    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    X_gen = np.float32(X_gen)

    elapsed = timer() - start
    if isinstance(times, list):
        times.append(elapsed)
    return X_gen

In [None]:
# Benchmark the synthesizer on the 'gridr' dataset and record its mean run time.
gridr_times = list()
# partial() binds whichever KDECopulaNNPdf_RoundCategorical definition was executed last.
gridr_thing = partial(KDECopulaNNPdf_RoundCategorical, times=gridr_times)
# partial objects carry no __name__; copy it over (presumably benchmark labels rows by it -- verify).
setattr(gridr_thing, '__name__', KDECopulaNNPdf_RoundCategorical.__name__)
gridr_scores = benchmark(synthesizers=[gridr_thing], datasets=['gridr'])
del gridr_scores['timestamp']
# Assumes the score frame has ten rows with this synthesizer's row last -- TODO confirm.
exec_time = ['N/A'] * 9
exec_time.append(round(np.mean(gridr_times), 2))
gridr_scores['gridr/exec_time(s)'] = exec_time

In [None]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns, times=None):
    """Fit KDECopulaNNPdf (XGBoost classifier) and return rounded synthetic samples.

    Model settings: ``use_KDE=False`` with
    ``XGBClassifier(n_estimators=50, reg_lambda=1, gamma=0, max_depth=2)``.
    The number of requested samples equals ``real_data.shape[0]``.

    Parameters
    ----------
    real_data : array-like
        Training data, cast to float64 before fitting.
        Assumed to be 2-D (rows x columns) -- TODO confirm against the caller.
    categorical_columns, ordinal_columns : list of int
        Indices of the columns that are rounded with ``np.round`` after
        sampling. Assumed to be lists, because they are joined with ``+``.
    times : list, optional
        Receives the elapsed seconds of this call when a list is supplied.

    Returns
    -------
    numpy.ndarray
        The sampled data cast to float32.
    """
    start = timer()
    data = np.float64(real_data)

    classifier = XGBClassifier(n_estimators=50, reg_lambda=1, gamma=0, max_depth=2)
    model = KDECopulaNNPdf(use_KDE=False, clf=classifier).fit(data)

    # Only the samples are needed; the weights returned alongside them are dropped.
    X_gen, _ = model.sample(data.shape[0])

    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    X_gen = np.float32(X_gen)

    elapsed = timer() - start
    if isinstance(times, list):
        times.append(elapsed)
    return X_gen

In [None]:
# Benchmark the synthesizer on the 'ring' dataset and record its mean run time.
ring_times = list()
# partial() binds whichever KDECopulaNNPdf_RoundCategorical definition was executed last.
ring_thing = partial(KDECopulaNNPdf_RoundCategorical, times=ring_times)
# partial objects carry no __name__; copy it over (presumably benchmark labels rows by it -- verify).
setattr(ring_thing, '__name__', KDECopulaNNPdf_RoundCategorical.__name__)
ring_scores = benchmark(synthesizers=[ring_thing], datasets=['ring'])
del ring_scores['timestamp']
# Assumes the score frame has ten rows with this synthesizer's row last -- TODO confirm.
exec_time = ['N/A'] * 9
exec_time.append(round(np.mean(ring_times), 2))
ring_scores['ring/exec_time(s)'] = exec_time

In [None]:
# Place the three Gaussian-mixture score tables side by side (column-wise concat) and show them.
gm_scores = pd.concat(
    objs=[grid_scores, gridr_scores, ring_scores],
    axis='columns',
)
gm_scores

### Bayesian Networks datasets

In [None]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns, times=None):
    """Generate synthetic rows with KDECopulaNNPdf backed by an MLP classifier.

    The model is created with ``use_KDE=False`` and an ``MLPClassifier``
    (``hidden_layer_sizes=(100,)``, ``alpha=0.05``, ``max_iter=500``,
    ``early_stopping=True``, ``random_state=0``), fitted on ``real_data``
    and asked for ``real_data.shape[0]`` samples.

    Parameters
    ----------
    real_data : array-like
        Data to fit on; converted to float64 first.
        Assumed to be 2-D (rows x columns) -- TODO confirm against the caller.
    categorical_columns, ordinal_columns : list of int
        Column indices whose sampled values are rounded with ``np.round``.
        Assumed to be lists, because they are joined with ``+``.
    times : list, optional
        If a list is given, the elapsed seconds of this call are appended to it.

    Returns
    -------
    numpy.ndarray
        The sampled data cast to float32.
    """
    start = timer()
    data = np.float64(real_data)

    classifier = MLPClassifier(
        hidden_layer_sizes=(100,),
        alpha=0.05,
        max_iter=500,
        early_stopping=True,
        random_state=0,
    )
    model = KDECopulaNNPdf(use_KDE=False, clf=classifier)
    model = model.fit(data)

    # sample() returns a pair; the second item (sample weights) is not used here.
    X_gen, _ = model.sample(data.shape[0])

    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    X_gen = np.float32(X_gen)

    elapsed = timer() - start
    if isinstance(times, list):
        times.append(elapsed)
    return X_gen

In [None]:
# Benchmark the synthesizer on the 'asia' dataset and record its mean run time.
asia_times = list()
# partial() binds whichever KDECopulaNNPdf_RoundCategorical definition was executed last.
asia_thing = partial(KDECopulaNNPdf_RoundCategorical, times=asia_times)
# partial objects carry no __name__; copy it over (presumably benchmark labels rows by it -- verify).
asia_thing.__name__ = KDECopulaNNPdf_RoundCategorical.__name__
asia_scores = benchmark(synthesizers=[asia_thing], datasets=['asia'])
asia_scores = asia_scores.drop(columns=['timestamp'])
# Assumes the score frame has ten rows with this synthesizer's row last -- TODO confirm.
exec_time = [*['N/A'] * 9, round(np.mean(asia_times), 2)]
asia_scores['asia/exec_time(s)'] = exec_time

In [None]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns, times=None):
    """Generate synthetic rows with KDECopulaNNPdf backed by an XGBoost classifier.

    The model is created with ``force_uncorrelated=True``, ``use_KDE=False``
    and ``XGBClassifier(max_depth=3)``, fitted on ``real_data`` and asked for
    ``real_data.shape[0]`` samples.

    Parameters
    ----------
    real_data : array-like
        Data to fit on; converted to float64 first.
        Assumed to be 2-D (rows x columns) -- TODO confirm against the caller.
    categorical_columns, ordinal_columns : list of int
        Column indices whose sampled values are rounded with ``np.round``.
        Assumed to be lists, because they are joined with ``+``.
    times : list, optional
        If a list is given, the elapsed seconds of this call are appended to it.

    Returns
    -------
    numpy.ndarray
        The sampled data cast to float32.
    """
    start = timer()
    data = np.float64(real_data)

    model = KDECopulaNNPdf(
        force_uncorrelated=True,
        use_KDE=False,
        clf=XGBClassifier(max_depth=3),
    )
    model = model.fit(data)

    # sample() returns a pair; the second item (sample weights) is not used here.
    X_gen, _ = model.sample(data.shape[0])

    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    X_gen = np.float32(X_gen)

    elapsed = timer() - start
    if isinstance(times, list):
        times.append(elapsed)
    return X_gen

In [None]:
# Benchmark the synthesizer on the 'alarm' dataset and record its mean run time.
alarm_times = list()
# partial() binds whichever KDECopulaNNPdf_RoundCategorical definition was executed last.
alarm_thing = partial(KDECopulaNNPdf_RoundCategorical, times=alarm_times)
# partial objects carry no __name__; copy it over (presumably benchmark labels rows by it -- verify).
alarm_thing.__name__ = KDECopulaNNPdf_RoundCategorical.__name__
alarm_scores = benchmark(synthesizers=[alarm_thing], datasets=['alarm'])
alarm_scores = alarm_scores.drop(columns=['timestamp'])
# Assumes the score frame has ten rows with this synthesizer's row last -- TODO confirm.
exec_time = [*['N/A'] * 9, round(np.mean(alarm_times), 2)]
alarm_scores['alarm/exec_time(s)'] = exec_time

In [None]:
# TODO: grid search could not be run for the child and insurance datasets,
# so the classifier settings below have not been tuned for them.
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns, times=None):
    """Generate synthetic rows with KDECopulaNNPdf backed by an MLP classifier.

    The model is created with ``use_KDE=False`` and
    ``MLPClassifier(random_state=0, max_iter=500, early_stopping=True)``,
    fitted on ``real_data`` and asked for ``real_data.shape[0]`` samples.

    Parameters
    ----------
    real_data : array-like
        Data to fit on; converted to float64 first.
        Assumed to be 2-D (rows x columns) -- TODO confirm against the caller.
    categorical_columns, ordinal_columns : list of int
        Column indices whose sampled values are rounded with ``np.round``.
        Assumed to be lists, because they are joined with ``+``.
    times : list, optional
        If a list is given, the elapsed seconds of this call are appended to it.

    Returns
    -------
    numpy.ndarray
        The sampled data cast to float32.
    """
    start = timer()
    data = np.float64(real_data)

    model = KDECopulaNNPdf(
        use_KDE=False,
        clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True),
    )
    model = model.fit(data)

    # sample() returns a pair; the second item (sample weights) is not used here.
    X_gen, _ = model.sample(data.shape[0])

    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    X_gen = np.float32(X_gen)

    elapsed = timer() - start
    if isinstance(times, list):
        times.append(elapsed)
    return X_gen

In [None]:
# Benchmark the synthesizer on the 'child' dataset and record its mean run time.
child_times = list()
# partial() binds whichever KDECopulaNNPdf_RoundCategorical definition was executed last.
child_thing = partial(KDECopulaNNPdf_RoundCategorical, times=child_times)
# partial objects carry no __name__; copy it over (presumably benchmark labels rows by it -- verify).
child_thing.__name__ = KDECopulaNNPdf_RoundCategorical.__name__
child_scores = benchmark(synthesizers=[child_thing], datasets=['child'])
child_scores = child_scores.drop(columns=['timestamp'])
# Assumes the score frame has ten rows with this synthesizer's row last -- TODO confirm.
exec_time = [*['N/A'] * 9, round(np.mean(child_times), 2)]
child_scores['child/exec_time(s)'] = exec_time

In [None]:
# Benchmark the synthesizer on the 'insurance' dataset and record its mean run time.
insurance_times = list()
# No new definition precedes this cell, so when run in order it reuses the same
# KDECopulaNNPdf_RoundCategorical as the 'child' benchmark.
insurance_thing = partial(KDECopulaNNPdf_RoundCategorical, times=insurance_times)
# partial objects carry no __name__; copy it over (presumably benchmark labels rows by it -- verify).
setattr(insurance_thing, '__name__', KDECopulaNNPdf_RoundCategorical.__name__)
insurance_scores = benchmark(synthesizers=[insurance_thing], datasets=['insurance'])
del insurance_scores['timestamp']
# Assumes the score frame has ten rows with this synthesizer's row last -- TODO confirm.
exec_time = ['N/A'] * 9
exec_time.append(round(np.mean(insurance_times), 2))
insurance_scores['insurance/exec_time(s)'] = exec_time

In [None]:
# Place the four Bayesian-network score tables side by side (column-wise concat) and show them.
bn_scores = pd.concat(
    objs=[asia_scores, alarm_scores, child_scores, insurance_scores],
    axis='columns',
)
bn_scores
# Optional view that hides the columns ending in 'syn_likelihood':
#bn_scores.loc[:, ~bn_scores.columns.str.endswith('syn_likelihood')]

### Real world datasets

In [None]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns, times=None):
    """Generate synthetic rows with KDECopulaNNPdf backed by an XGBoost classifier.

    The model is created with ``use_KDE=False`` and an ``XGBClassifier``
    (``random_state=0``, ``n_estimators=100``, ``reg_lambda=1``, ``gamma=0``,
    ``max_depth=3``), fitted on ``real_data`` and asked for
    ``real_data.shape[0]`` samples.

    Parameters
    ----------
    real_data : array-like
        Data to fit on; converted to float64 first.
        Assumed to be 2-D (rows x columns) -- TODO confirm against the caller.
    categorical_columns, ordinal_columns : list of int
        Column indices whose sampled values are rounded with ``np.round``.
        Assumed to be lists, because they are joined with ``+``.
    times : list, optional
        If a list is given, the elapsed seconds of this call are appended to it.

    Returns
    -------
    numpy.ndarray
        The sampled data cast to float32.
    """
    start = timer()
    data = np.float64(real_data)

    classifier = XGBClassifier(
        random_state=0,
        n_estimators=100,
        reg_lambda=1,
        gamma=0,
        max_depth=3,
    )
    model = KDECopulaNNPdf(use_KDE=False, clf=classifier)
    model = model.fit(data)

    # sample() returns a pair; the second item (sample weights) is not used here.
    X_gen, _ = model.sample(data.shape[0])

    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    X_gen = np.float32(X_gen)

    elapsed = timer() - start
    if isinstance(times, list):
        times.append(elapsed)
    return X_gen

In [None]:
# Benchmark the synthesizer on the 'adult' dataset and record its mean run time.
adult_times = list()
# partial() binds whichever KDECopulaNNPdf_RoundCategorical definition was executed last.
adult_thing = partial(KDECopulaNNPdf_RoundCategorical, times=adult_times)
# partial objects carry no __name__; copy it over (presumably benchmark labels rows by it -- verify).
setattr(adult_thing, '__name__', KDECopulaNNPdf_RoundCategorical.__name__)
adult_scores = benchmark(synthesizers=[adult_thing], datasets=['adult'])
del adult_scores['timestamp']
# Assumes the score frame has ten rows with this synthesizer's row last -- TODO confirm.
exec_time = ['N/A'] * 9
exec_time.append(round(np.mean(adult_times), 2))
adult_scores['adult/exec_time(s)'] = exec_time

In [None]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns, times=None):
    """Generate synthetic rows with KDECopulaNNPdf backed by an XGBoost classifier.

    The model is created with ``force_uncorrelated=False``, ``use_KDE=False``,
    ``n_nonlinear_vars=36`` and ``XGBClassifier(max_depth=3, n_estimators=250)``,
    fitted on ``real_data`` and asked for ``real_data.shape[0]`` samples.

    Parameters
    ----------
    real_data : array-like
        Data to fit on; converted to float64 first.
        Assumed to be 2-D (rows x columns) -- TODO confirm against the caller.
    categorical_columns, ordinal_columns : list of int
        Column indices whose sampled values are rounded with ``np.round``.
        Assumed to be lists, because they are joined with ``+``.
    times : list, optional
        If a list is given, the elapsed seconds of this call are appended to it.

    Returns
    -------
    numpy.ndarray
        The sampled data cast to float32.
    """
    start = timer()
    data = np.float64(real_data)

    model = KDECopulaNNPdf(
        force_uncorrelated=False,
        use_KDE=False,
        n_nonlinear_vars=36,
        clf=XGBClassifier(max_depth=3, n_estimators=250),
    )
    model = model.fit(data)

    # sample() returns a pair; the second item (sample weights) is not used here.
    X_gen, _ = model.sample(data.shape[0])

    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    X_gen = np.float32(X_gen)

    elapsed = timer() - start
    if isinstance(times, list):
        times.append(elapsed)
    return X_gen

In [None]:
# Benchmark the synthesizer on the 'intrusion' dataset and record its mean run time.
intrusion_times = list()
# partial() binds whichever KDECopulaNNPdf_RoundCategorical definition was executed last.
intrusion_thing = partial(KDECopulaNNPdf_RoundCategorical, times=intrusion_times)
# partial objects carry no __name__; copy it over (presumably benchmark labels rows by it -- verify).
intrusion_thing.__name__ = KDECopulaNNPdf_RoundCategorical.__name__
intrusion_scores = benchmark(synthesizers=[intrusion_thing], datasets=['intrusion'])
intrusion_scores = intrusion_scores.drop(columns=['timestamp'])
# Assumes the score frame has ten rows with this synthesizer's row last -- TODO confirm.
exec_time = [*['N/A'] * 9, round(np.mean(intrusion_times), 2)]
intrusion_scores['intrusion/exec_time(s)'] = exec_time

In [None]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns, times=None):
    """Fit KDECopulaNNPdf (MLP classifier) and return rounded synthetic samples.

    Model settings: ``use_KDE=False`` with
    ``MLPClassifier(random_state=0, max_iter=500, early_stopping=True)``.
    The number of requested samples equals ``real_data.shape[0]``.

    Parameters
    ----------
    real_data : array-like
        Training data, cast to float64 before fitting.
        Assumed to be 2-D (rows x columns) -- TODO confirm against the caller.
    categorical_columns, ordinal_columns : list of int
        Indices of the columns that are rounded with ``np.round`` after
        sampling. Assumed to be lists, because they are joined with ``+``.
    times : list, optional
        Receives the elapsed seconds of this call when a list is supplied.

    Returns
    -------
    numpy.ndarray
        The sampled data cast to float32.
    """
    start = timer()
    data = np.float64(real_data)

    classifier = MLPClassifier(random_state=0, max_iter=500, early_stopping=True)
    model = KDECopulaNNPdf(use_KDE=False, clf=classifier).fit(data)

    # Only the samples are needed; the weights returned alongside them are dropped.
    X_gen, _ = model.sample(data.shape[0])

    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    X_gen = np.float32(X_gen)

    elapsed = timer() - start
    if isinstance(times, list):
        times.append(elapsed)
    return X_gen

In [None]:
# Benchmark the synthesizer on the 'census' dataset and record its mean run time.
census_times = []
# partial() binds whichever KDECopulaNNPdf_RoundCategorical definition was executed last.
census_thing = partial(KDECopulaNNPdf_RoundCategorical, times=census_times)
# partial objects carry no __name__; copy it over (presumably benchmark labels rows by it -- verify).
census_thing.__name__ = KDECopulaNNPdf_RoundCategorical.__name__
census_scores = benchmark(synthesizers=[census_thing], datasets=['census'])
census_scores.drop(columns=['timestamp'], inplace=True)
# Assumes the score frame has ten rows with this synthesizer's row last -- TODO confirm.
exec_time = ['N/A'] * 9 + [round(np.mean(census_times), 2)]
# Label the timing column with this dataset's own name; it previously reused the
# 'intrusion/' prefix, which produced a duplicate column once the real-world
# score tables were concatenated.
census_scores['census/exec_time(s)'] = exec_time

In [None]:
# Place the three real-world score tables side by side (column-wise concat) and show them.
rw_scores = pd.concat(
    objs=[adult_scores, intrusion_scores, census_scores],
    axis='columns',
)
rw_scores