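### This notebook benchmarks categorical-encoding strategies for KDECopulaNNPdf on the covtype dataset and runs a classifier grid search (the saved grid-search and final benchmark outputs are from the asia dataset)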

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from sdgym import benchmark
from sdgym import load_dataset
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from synthsonic.models.kde_copula_nn_pdf import KDECopulaNNPdf
from synthsonic.models.categorical_utils import categorical_round, vec_translate, categorical_frequency_mapping, \
            categorical_frequency_inverse_mapping, encode_one_hot, decode_one_hot
from pandas_profiling import ProfileReport
%matplotlib inline

### EDA

In [4]:
# Load the covtype dataset; the call is unpacked into the data plus the lists of
# categorical and ordinal columns. `df` is reused further down as grid-search input.
df, categorical_columns, ordinal_columns = load_dataset('covtype')
# Wrap the data in a DataFrame so pandas_profiling can summarise it.
explore_df = pd.DataFrame(df)
profile = ProfileReport(explore_df, title="EDA for covtype dataset")
# Bare last expression: renders the report inline in the notebook.
profile

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=69.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…






### Observations:
* There are 10 numerical features and 45 categorical features, so it's worth trying all of the categorical encoding strategies:
    * Treat categorical features as ordinal (sample them as numbers, then round)
    * One-hot encode categorical features
    * Frequency mapping

### MLP classifier

In [10]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns):
    """Synthesize data with KDECopulaNNPdf, rounding the discrete columns afterwards.

    The model is fitted on the raw values (cast to float64) with ``use_KDE=False``
    and an ``MLPClassifier`` (``random_state=0``, ``max_iter=500``, early stopping).
    As many rows as ``real_data`` has are sampled, and the categorical and ordinal
    columns of the sample are rounded to the nearest integer.

    Args:
        real_data: 2-D array of training data (rows are samples).
        categorical_columns: list of column indices holding categorical features.
        ordinal_columns: list of column indices holding ordinal features.

    Returns:
        float32 array of generated samples with ``real_data.shape[0]`` rows.
    """
    discrete_columns = categorical_columns + ordinal_columns
    data = np.float64(real_data)
    n_samples = data.shape[0]

    classifier = MLPClassifier(random_state=0, max_iter=500, early_stopping=True)
    kde = KDECopulaNNPdf(use_KDE=False, clf=classifier).fit(data)

    # sample() also returns per-sample weights; they are not used here.
    X_gen, _ = kde.sample(n_samples)

    # Snap the discrete columns back onto integer values.
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    return np.float32(X_gen)

def KDECopulaNNPdf_woKDE_OneHotEncoded(real_data, categorical_columns, ordinal_columns):
    """Synthesize data with KDECopulaNNPdf after one-hot encoding the categoricals.

    The model is fitted on a matrix laid out as
    ``[numerical + ordinal columns | one-hot encoded categorical columns]``,
    sampled, and the sample is decoded back to the original column layout.

    Args:
        real_data: 2-D array of training data (rows are samples).
        categorical_columns: list of column indices to one-hot encode.
        ordinal_columns: list of column indices holding ordinal features; these
            are passed through unencoded together with the numerical columns.

    Returns:
        The array produced by ``decode_one_hot`` with the numerical and ordinal
        columns filled in from the generated sample.
    """
    n_samples, n_features = real_data.shape[0], real_data.shape[1]

    # Columns that are neither categorical nor ordinal; sorted so the layout of
    # the fitted matrix is deterministic.
    numerical_features = sorted(
        set(range(n_features)) - set(categorical_columns + ordinal_columns)
    )
    passthrough_columns = numerical_features + ordinal_columns

    ## One hot encode the categorical features
    unique_values, ohe = encode_one_hot(real_data, categorical_columns)
    categorical_np = np.array(ohe)

    ## Append the categorical one hot encoded data to numerical and ordinal
    data = np.float64(np.hstack((real_data[:, passthrough_columns], categorical_np)))

    classifier = MLPClassifier(random_state=0, max_iter=500, early_stopping=True)
    kde = KDECopulaNNPdf(use_KDE=False, clf=classifier).fit(data)
    X_gen, _ = kde.sample(n_samples)
    X_gen = np.float32(X_gen)

    X_final = decode_one_hot(X_gen, categorical_columns, unique_values, n_features)
    # In the fitted matrix the pass-through columns occupy the FIRST
    # len(passthrough_columns) positions, so read them from there rather than
    # from their original-layout indices (the two only coincide when those
    # columns are the leading columns of real_data).
    # Assumes sample() keeps the column order of the matrix given to fit() -- TODO confirm.
    X_final[:, passthrough_columns] = X_gen[:, :len(passthrough_columns)]
    print(X_final.shape)
    return X_final
    
def KDECopulaNNPdf_woKDE_FreqMapping(real_data, categorical_columns, ordinal_columns):
    """Synthesize data with KDECopulaNNPdf using frequency-mapped categoricals.

    The categorical columns are transformed with ``categorical_frequency_mapping``
    before fitting. After sampling they are rounded and mapped back with
    ``categorical_frequency_inverse_mapping``.

    Args:
        real_data: 2-D array of training data (rows are samples).
        categorical_columns: list of column indices to frequency-map.
        ordinal_columns: not used by this strategy; kept so the signature matches
            the other synthesizer functions passed to ``benchmark``.

    Returns:
        The array returned by ``categorical_frequency_inverse_mapping``.
    """
    data = np.float64(real_data)
    n_samples = data.shape[0]

    data, inv_mappings = categorical_frequency_mapping(data, categorical_columns)

    classifier = MLPClassifier(random_state=0, max_iter=500, early_stopping=True)
    kde = KDECopulaNNPdf(use_KDE=False, clf=classifier).fit(data)
    # sample() also returns per-sample weights; they are not used here.
    X_gen, _ = kde.sample(n_samples)

    # Round the generated categorical values before applying the inverse mapping.
    X_gen[:, categorical_columns] = np.round(X_gen[:, categorical_columns])
    return categorical_frequency_inverse_mapping(X_gen, categorical_columns, inv_mappings)

In [None]:
# Benchmark the three MLP-based encoding strategies defined above on covtype.
# The synthesizer functions are passed by reference, so this uses whichever
# definitions of these names are current in the kernel when the cell runs.
covtype_scores_mlp = benchmark(synthesizers=[KDECopulaNNPdf_RoundCategorical,
                                         KDECopulaNNPdf_woKDE_OneHotEncoded,
                                         KDECopulaNNPdf_woKDE_FreqMapping], datasets=['covtype'])
covtype_scores_mlp

Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 53




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 53




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 53




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 102
(481012, 55)




Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 102
(481012, 55)




Transforming variables.
Fitting and calibrating classifier.


In [19]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns):
    """XGBoost variant: fit KDECopulaNNPdf on raw values and round discrete columns.

    Uses ``use_KDE=False`` and an ``XGBClassifier`` (``random_state=42``,
    ``max_depth=6``, ``alpha=0.2``, ``subsample=0.5``). One generated row is
    drawn per input row, and the categorical and ordinal columns of the sample
    are rounded to the nearest integer.

    Args:
        real_data: 2-D array of training data (rows are samples).
        categorical_columns: list of column indices holding categorical features.
        ordinal_columns: list of column indices holding ordinal features.

    Returns:
        float32 array of generated samples with ``real_data.shape[0]`` rows.
    """
    discrete_columns = categorical_columns + ordinal_columns
    data = np.float64(real_data)
    n_samples = data.shape[0]

    classifier = XGBClassifier(random_state=42, max_depth=6, alpha=0.2, subsample=0.5)
    kde = KDECopulaNNPdf(use_KDE=False, clf=classifier).fit(data)

    # The second value returned by sample() (sample weights) is discarded.
    X_gen, _ = kde.sample(n_samples)

    # Snap the discrete columns back onto integer values.
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    return np.float32(X_gen)

def KDECopulaNNPdf_woKDE_OneHotEncoded(real_data, categorical_columns, ordinal_columns):
    """XGBoost variant: fit KDECopulaNNPdf on one-hot encoded categoricals.

    The fitted matrix is laid out as
    ``[numerical + ordinal columns | one-hot encoded categorical columns]``.
    The generated sample is decoded back to the original column layout with
    ``decode_one_hot``.

    Args:
        real_data: 2-D array of training data (rows are samples).
        categorical_columns: list of column indices to one-hot encode.
        ordinal_columns: list of column indices holding ordinal features; these
            are passed through unencoded together with the numerical columns.

    Returns:
        The array produced by ``decode_one_hot`` with the numerical and ordinal
        columns filled in from the generated sample.
    """
    n_samples, n_features = real_data.shape[0], real_data.shape[1]

    # Columns that are neither categorical nor ordinal; sorted so the layout of
    # the fitted matrix is deterministic.
    numerical_features = sorted(
        set(range(n_features)) - set(categorical_columns + ordinal_columns)
    )
    passthrough_columns = numerical_features + ordinal_columns

    ## One hot encode the categorical features
    unique_values, ohe = encode_one_hot(real_data, categorical_columns)
    categorical_np = np.array(ohe)

    ## Append the categorical one hot encoded data to numerical and ordinal
    data = np.float64(np.hstack((real_data[:, passthrough_columns], categorical_np)))

    classifier = XGBClassifier(random_state=42, max_depth=6, alpha=0.2, subsample=0.5)
    kde = KDECopulaNNPdf(use_KDE=False, clf=classifier).fit(data)
    X_gen, _ = kde.sample(n_samples)
    X_gen = np.float32(X_gen)

    X_final = decode_one_hot(X_gen, categorical_columns, unique_values, n_features)
    # The pass-through columns sit at the front of the fitted matrix, so they
    # are read from the first len(passthrough_columns) positions of the sample,
    # not from their indices in the original layout (which match only when
    # those columns are the leading columns of real_data).
    # Assumes sample() keeps the column order of the matrix given to fit() -- TODO confirm.
    X_final[:, passthrough_columns] = X_gen[:, :len(passthrough_columns)]
    print(X_final.shape)
    return X_final
    
def KDECopulaNNPdf_woKDE_FreqMapping(real_data, categorical_columns, ordinal_columns):
    """XGBoost variant: fit KDECopulaNNPdf on frequency-mapped categoricals.

    ``categorical_frequency_mapping`` transforms the categorical columns before
    fitting. The generated categorical values are rounded and then restored
    with ``categorical_frequency_inverse_mapping``.

    Args:
        real_data: 2-D array of training data (rows are samples).
        categorical_columns: list of column indices to frequency-map.
        ordinal_columns: not used by this strategy; kept so the signature matches
            the other synthesizer functions passed to ``benchmark``.

    Returns:
        The array returned by ``categorical_frequency_inverse_mapping``.
    """
    data = np.float64(real_data)
    n_samples = data.shape[0]

    data, inv_mappings = categorical_frequency_mapping(data, categorical_columns)

    classifier = XGBClassifier(random_state=42, max_depth=6, alpha=0.2, subsample=0.5)
    kde = KDECopulaNNPdf(use_KDE=False, clf=classifier).fit(data)
    # The second value returned by sample() (sample weights) is discarded.
    X_gen, _ = kde.sample(n_samples)

    # Round the generated categorical values before applying the inverse mapping.
    X_gen[:, categorical_columns] = np.round(X_gen[:, categorical_columns])
    return categorical_frequency_inverse_mapping(X_gen, categorical_columns, inv_mappings)

In [20]:
# Benchmark the three XGBoost-based encoding strategies on covtype.
# NOTE(review): the output saved below this cell lists asia/* score columns,
# so it appears to come from an earlier run against 'asia' -- re-run to refresh.
covtype_scores_xgboost = benchmark(synthesizers=[KDECopulaNNPdf_RoundCategorical,
                                         KDECopulaNNPdf_woKDE_OneHotEncoded,
                                         KDECopulaNNPdf_woKDE_FreqMapping], datasets=['covtype'])
covtype_scores_xgboost

Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 16
(10000, 8)
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 16
(10000, 8)
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 16
(10000, 8)
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables

Unnamed: 0,asia/syn_likelihood,asia/test_likelihood,timestamp
CLBNSynthesizer,-2.406589,-2.27184,2020-04-12 09:41:35.096775
CTGANSynthesizer,-3.886903,-2.442375,2020-04-12 09:41:35.096775
IdentitySynthesizer,-2.250748,-2.241122,2020-04-12 09:41:35.096775
IndependentSynthesizer,-4.933314,-3.001627,2020-04-12 09:41:35.096775
MedganSynthesizer,-1.519422,-5.197981,2020-04-12 09:41:35.096775
TableganSynthesizer,-3.331052,-2.68449,2020-04-12 09:41:35.096775
TVAESynthesizer,-2.353327,-2.267334,2020-04-12 09:41:35.096775
UniformSynthesizer,-14.301042,-5.551594,2020-04-12 09:41:35.096775
VEEGANSynthesizer,-11.492287,-5.952712,2020-04-12 09:41:35.096775
KDECopulaNNPdf_RoundCategorical,-4.499812,-2.631797,2020-09-23 15:31:10.111131


In [21]:
# Tag every row of each score table with the classifier used for that run.
covtype_scores_mlp['Classifier'] = 'MLP'
covtype_scores_xgboost['Classifier'] = 'XGBoost'
# The first nine rows are the reference synthesizers reported by the benchmark
# (see the saved score table), which do not use either classifier.
# Assign through a single .iloc call: the chained form
# `df.iloc[0:9]['Classifier'] = ...` writes to a temporary copy and triggers
# SettingWithCopyWarning instead of reliably updating the frame.
classifier_col = covtype_scores_mlp.columns.get_loc('Classifier')
covtype_scores_mlp.iloc[0:9, classifier_col] = 'N/A'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asia_scores_mlp.iloc[0:9]['Classifier'] = 'N/A'


In [22]:
# Combine the MLP score table with the last three rows of the XGBoost table
# (the three KDECopulaNNPdf strategies passed to benchmark) into one frame.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
covtype_scores = pd.concat(
    [
        covtype_scores_mlp.reset_index(),
        covtype_scores_xgboost.reset_index().iloc[-3:],
    ],
    ignore_index=True,
)
covtype_scores

Unnamed: 0,index,asia/syn_likelihood,asia/test_likelihood,timestamp,Classifier
0,CLBNSynthesizer,-2.406589,-2.27184,2020-04-12 09:41:35.096775,
1,CTGANSynthesizer,-3.886903,-2.442375,2020-04-12 09:41:35.096775,
2,IdentitySynthesizer,-2.250748,-2.241122,2020-04-12 09:41:35.096775,
3,IndependentSynthesizer,-4.933314,-3.001627,2020-04-12 09:41:35.096775,
4,MedganSynthesizer,-1.519422,-5.197981,2020-04-12 09:41:35.096775,
5,TableganSynthesizer,-3.331052,-2.68449,2020-04-12 09:41:35.096775,
6,TVAESynthesizer,-2.353327,-2.267334,2020-04-12 09:41:35.096775,
7,UniformSynthesizer,-14.301042,-5.551594,2020-04-12 09:41:35.096775,
8,VEEGANSynthesizer,-11.492287,-5.952712,2020-04-12 09:41:35.096775,
9,KDECopulaNNPdf_RoundCategorical,-4.499812,-2.631797,2020-09-23 15:18:56.123862,MLP


### Grid search

In [24]:
# Float64 version of the dataset held in `df`, used as input to the grid search below.
# NOTE(review): in this notebook `df` is assigned from load_dataset('covtype'), but the
# saved grid-search log reports 8 variables, matching the asia runs -- confirm which
# dataset is intended before re-running.
data = np.float64(df)

In [25]:
# Build a throwaway model only to list the tunable parameter names; classifier
# options are exposed with a `clf__` prefix (see the output below).
kde = KDECopulaNNPdf(use_KDE=False, clf=MLPClassifier())
kde.get_params().keys()

dict_keys(['clf__activation', 'clf__alpha', 'clf__batch_size', 'clf__beta_1', 'clf__beta_2', 'clf__early_stopping', 'clf__epsilon', 'clf__hidden_layer_sizes', 'clf__learning_rate', 'clf__learning_rate_init', 'clf__max_fun', 'clf__max_iter', 'clf__momentum', 'clf__n_iter_no_change', 'clf__nesterovs_momentum', 'clf__power_t', 'clf__random_state', 'clf__shuffle', 'clf__solver', 'clf__tol', 'clf__validation_fraction', 'clf__verbose', 'clf__warm_start', 'clf', 'copy', 'do_PCA', 'force_uncorrelated', 'min_mutual_information', 'min_pca_variance', 'mirror_left', 'mirror_right', 'n_adaptive', 'n_nonlinear_vars', 'n_quantiles', 'ordering', 'random_state', 'rho', 'use_KDE', 'use_inverse_qt', 'x_max', 'x_min'])

In [29]:
# Grid search over the classifier hyperparameters; every classifier option is
# addressed with the clf__ prefix.
from sklearn.model_selection import GridSearchCV
parameters = {
    'clf__hidden_layer_sizes': [(10,),(20,),(50,),(100,)],
    'clf__activation': ['tanh', 'relu'],
    'clf__solver': ['sgd', 'adam'],
    # 'clf__alpha' used to be listed twice in this dict; Python keeps only the
    # last value for a repeated key, so the earlier `10.0 ** -np.arange(1, 3)`
    # entry was never searched. Only the effective grid is kept here.
    'clf__alpha': [0.0001, 0.05],
    'clf__learning_rate': ['constant','adaptive'],
}

# NOTE(review): no `clf` is passed, so the clf__* options apply to whatever
# classifier KDECopulaNNPdf uses by default -- confirm it is the intended one.
grid = GridSearchCV(KDECopulaNNPdf(use_KDE=False), parameters, cv=5)
grid.fit(data)
print (grid.best_params_)

Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting 

In [30]:
# Show the best parameter combination found by the grid search again.
print (grid.best_params_)

{'clf__activation': 'relu', 'clf__alpha': 0.05, 'clf__hidden_layer_sizes': (100,), 'clf__learning_rate': 'constant', 'clf__solver': 'adam'}


In [44]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns):
    """Tuned-MLP variant: fit KDECopulaNNPdf on raw values and round discrete columns.

    The ``MLPClassifier`` uses ``hidden_layer_sizes=(100,)`` and ``alpha=0.05``
    (the values reported in ``grid.best_params_``) together with
    ``max_iter=500``, early stopping and ``random_state=1``. The model is fitted
    with ``use_KDE=False``. One generated row is drawn per input row, and the
    categorical and ordinal columns of the sample are rounded to the nearest
    integer.

    Args:
        real_data: 2-D array of training data (rows are samples).
        categorical_columns: list of column indices holding categorical features.
        ordinal_columns: list of column indices holding ordinal features.

    Returns:
        float32 array of generated samples with ``real_data.shape[0]`` rows.
    """
    discrete_columns = categorical_columns + ordinal_columns
    data = np.float64(real_data)
    n_samples = data.shape[0]

    tuned_clf = MLPClassifier(hidden_layer_sizes=(100,), alpha=0.05,
                              max_iter=500, early_stopping=True, random_state=1)
    kde = KDECopulaNNPdf(clf=tuned_clf, use_KDE=False).fit(data)

    # sample() also returns per-sample weights; they are not needed here.
    X_gen, _ = kde.sample(n_samples)

    # Snap the discrete columns back onto integer values.
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    return np.float32(X_gen)

In [45]:
# Benchmark the tuned RoundCategorical synthesizer (defined in the previous cell) on asia.
asia_scores = benchmark(synthesizers=[KDECopulaNNPdf_RoundCategorical], datasets=['asia'])
asia_scores

Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8
Transforming variables.
Fitting and calibrating classifier.
Model = rho: 0.5, number of selected non-linear variables: 8


Unnamed: 0,asia/syn_likelihood,asia/test_likelihood,timestamp
CLBNSynthesizer,-2.406589,-2.27184,2020-04-12 09:41:35.096775
CTGANSynthesizer,-3.886903,-2.442375,2020-04-12 09:41:35.096775
IdentitySynthesizer,-2.250748,-2.241122,2020-04-12 09:41:35.096775
IndependentSynthesizer,-4.933314,-3.001627,2020-04-12 09:41:35.096775
MedganSynthesizer,-1.519422,-5.197981,2020-04-12 09:41:35.096775
TableganSynthesizer,-3.331052,-2.68449,2020-04-12 09:41:35.096775
TVAESynthesizer,-2.353327,-2.267334,2020-04-12 09:41:35.096775
UniformSynthesizer,-14.301042,-5.551594,2020-04-12 09:41:35.096775
VEEGANSynthesizer,-11.492287,-5.952712,2020-04-12 09:41:35.096775
KDECopulaNNPdf_RoundCategorical,-4.499812,-2.631797,2020-09-23 18:30:47.336991


In [46]:
# Order the synthesizers by test likelihood, ascending (sort_values default).
asia_scores.sort_values('asia/test_likelihood')

Unnamed: 0,asia/syn_likelihood,asia/test_likelihood,timestamp
VEEGANSynthesizer,-11.492287,-5.952712,2020-04-12 09:41:35.096775
UniformSynthesizer,-14.301042,-5.551594,2020-04-12 09:41:35.096775
MedganSynthesizer,-1.519422,-5.197981,2020-04-12 09:41:35.096775
IndependentSynthesizer,-4.933314,-3.001627,2020-04-12 09:41:35.096775
TableganSynthesizer,-3.331052,-2.68449,2020-04-12 09:41:35.096775
KDECopulaNNPdf_RoundCategorical,-4.499812,-2.631797,2020-09-23 18:30:47.336991
CTGANSynthesizer,-3.886903,-2.442375,2020-04-12 09:41:35.096775
CLBNSynthesizer,-2.406589,-2.27184,2020-04-12 09:41:35.096775
TVAESynthesizer,-2.353327,-2.267334,2020-04-12 09:41:35.096775
IdentitySynthesizer,-2.250748,-2.241122,2020-04-12 09:41:35.096775
