In [1]:
%load_ext autoreload
%autoreload 2

### This notebook is used to run kdecopulannpdf categorical with the lastest version of sdgym (0.3.0)

In [2]:
import numpy as np
import pandas as pd
from sdgym import benchmark
from sdgym import load_dataset
from sdgym.datasets import load_tables
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from synthsonic.models.kde_copula_nn_pdf import KDECopulaNNPdf
from synthsonic.models.categorical_utils import categorical_round, vec_translate, categorical_frequency_mapping, \
            categorical_frequency_inverse_mapping, encode_one_hot, decode_one_hot

In [3]:
def KDECopulaNNPdf_woKDE_OneHotEncoded(real_data, metadata):
    table_name = metadata.get_tables()[0]
    fields =  metadata.get_fields(table_name)
    real_data = real_data[table_name]
    col = real_data.columns
    
    categorical_columns = [i for i, key in enumerate(fields.keys()) if fields[key]['type'] == 'categorical']
    ordinal_columns = []
    
    le = LabelEncoder()
    for c in categorical_columns:
        real_data.iloc[:, c] = le.fit_transform(real_data.iloc[:, c])
        
    real_data = real_data.values    
        
    all_features = list(range(real_data.shape[1]))
    numerical_features = list(set(all_features) - set(categorical_columns + ordinal_columns))
    
    ## One hot encode the categorical features
    unique_values, ohe = encode_one_hot(real_data, categorical_columns)
    categorical_np = np.array(ohe)
    
    n_samples = real_data.shape[0]
    n_features = real_data.shape[1]
    
    ## Append the categorical one hot encoded data to numerical and ordinal
    data = np.float64(np.hstack((real_data[:, numerical_features+ordinal_columns], categorical_np)))

    kde = KDECopulaNNPdf(use_KDE=False, clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True))
    kde = kde.fit(data)
    X_gen, sample_weight = kde.sample(n_samples)
    X_gen = np.float32(X_gen)
    
    
    X_final = decode_one_hot(X_gen, categorical_columns, unique_values, n_features)
    X_final[:, numerical_features+ordinal_columns] = X_gen[:, numerical_features+ordinal_columns]
    print(X_final.shape)
    return {table_name: pd.DataFrame(X_final, columns=col)}

In [4]:
scores = benchmark.run(synthesizers=[KDECopulaNNPdf_woKDE_OneHotEncoded],  datasets=['adult'])
scores

Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 108
(32561, 15)


Unnamed: 0,synthesizer,dataset,modality,iteration,metric,error,score,metric_time,model_time,run_id
0,KDECopulaNNPdf_woKDE_OneHotEncoded,adult,single-table,0,BinaryDecisionTreeClassifier,,0.390493,0.350298,50.91998,e030e5fb-6
1,KDECopulaNNPdf_woKDE_OneHotEncoded,adult,single-table,0,BinaryAdaBoostClassifier,,0.332439,1.256983,50.91998,e030e5fb-6
2,KDECopulaNNPdf_woKDE_OneHotEncoded,adult,single-table,0,BinaryLogisticRegression,,0.165171,1.177456,50.91998,e030e5fb-6
3,KDECopulaNNPdf_woKDE_OneHotEncoded,adult,single-table,0,BinaryMLPClassifier,,0.36533,2.148088,50.91998,e030e5fb-6


In [5]:
def KDECopulaNNPdf_woKDE_FreqMapping(real_data, metadata):
    table_name = metadata.get_tables()[0]
    fields =  metadata.get_fields(table_name)
    real_data = real_data[table_name]
    col = real_data.columns
    
    categorical_columns = [i for i, key in enumerate(fields.keys()) if fields[key]['type'] == 'categorical']
    ordinal_columns = []
    
    le = LabelEncoder()
    for c in categorical_columns:
        real_data.iloc[:, c] = le.fit_transform(real_data.iloc[:, c])
        
    data = np.float64(real_data.values)
    
    n_samples = data.shape[0]
    n_features = data.shape[1]
    
    data, inv_mappings = categorical_frequency_mapping(data, categorical_columns)

    kde = KDECopulaNNPdf(use_KDE=False, clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True))
    kde = kde.fit(data)
    X_gen, sample_weight = kde.sample(n_samples)
    
    X_gen[:, categorical_columns] = np.round(X_gen[:, categorical_columns])
    X_final = categorical_frequency_inverse_mapping(X_gen, categorical_columns, inv_mappings)
    return {table_name: pd.DataFrame(X_final, columns=col)}

In [6]:
def _load_dataset(name):
    metadata = load_dataset(name)
    return load_tables(metadata), metadata

In [7]:
df, metadata = _load_dataset('adult')
X_gen = KDECopulaNNPdf_woKDE_FreqMapping(df, metadata)

Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 15


In [8]:
scores = benchmark.run(synthesizers=[KDECopulaNNPdf_woKDE_FreqMapping],  datasets=['adult'])
scores

Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 15


Unnamed: 0,synthesizer,dataset,modality,iteration,metric,error,score,metric_time,model_time,run_id
0,KDECopulaNNPdf_woKDE_FreqMapping,adult,single-table,0,BinaryDecisionTreeClassifier,,0.526071,0.387109,31.475345,eefb1a4b-a
1,KDECopulaNNPdf_woKDE_FreqMapping,adult,single-table,0,BinaryAdaBoostClassifier,,0.37264,1.673859,31.475345,eefb1a4b-a
2,KDECopulaNNPdf_woKDE_FreqMapping,adult,single-table,0,BinaryLogisticRegression,,0.405898,0.652167,31.475345,eefb1a4b-a
3,KDECopulaNNPdf_woKDE_FreqMapping,adult,single-table,0,BinaryMLPClassifier,,0.306124,1.580301,31.475345,eefb1a4b-a
