### This notebook runs a grid search on the asia dataset

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
from sdgym import benchmark
from sdgym import load_dataset
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from synthsonic.models.kde_copula_nn_pdf import KDECopulaNNPdf
from synthsonic.models.categorical_utils import categorical_round, vec_translate, categorical_frequency_mapping, \
            categorical_frequency_inverse_mapping, encode_one_hot, decode_one_hot
from pandas_profiling import ProfileReport
%matplotlib inline

### EDA

In [None]:
# Load the asia dataset. load_dataset returns the data plus two column
# collections, bound here as categorical_columns and ordinal_columns.
df, categorical_columns, ordinal_columns = load_dataset('asia')
# Wrap the data in a DataFrame so pandas-profiling can build the EDA report.
explore_df = pd.DataFrame(df)
profile = ProfileReport(explore_df, title="EDA for asia dataset")
# Bare expression at the end of the cell: the notebook renders the report.
profile

### Observations:
* All 8 features in this dataset are categorical, so it's worth trying all the categorical encoding strategies
    * Treat categorical features as ordinal (round the generated values)
    * One hot encode categorical features
    * Frequency mapping

### MLP classifier

In [None]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns):
    """Synthesize data with KDECopulaNNPdf (MLP classifier), rounding discrete columns.

    All columns are passed to the model as floats. After sampling, the
    categorical and ordinal columns are rounded to the nearest integer.

    Args:
        real_data: array of training rows; converted with ``np.float64``.
        categorical_columns: list of column indices of categorical features.
        ordinal_columns: list of column indices of ordinal features.

    Returns:
        The rows drawn by ``kde.sample(n_samples)`` (``n_samples`` being the
        number of input rows), cast to ``np.float32``.
    """
    data = np.float64(real_data)
    n_samples = data.shape[0]

    kde = KDECopulaNNPdf(
        use_KDE=False,
        clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True),
    )
    kde = kde.fit(data)

    # sample() returns a (samples, weights) pair; the weights are not used here.
    X_gen, _ = kde.sample(n_samples)

    # Snap the discrete columns back onto integer codes.
    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    return np.float32(X_gen)

def KDECopulaNNPdf_woKDE_OneHotEncoded(real_data, categorical_columns, ordinal_columns):
    """Synthesize data with KDECopulaNNPdf (MLP classifier) on one-hot encoded categoricals.

    The categorical columns are one-hot encoded and appended after the
    numerical and ordinal columns before fitting. The generated sample is
    decoded back with ``decode_one_hot``.

    Args:
        real_data: 2-D array of training rows.
        categorical_columns: list of column indices of categorical features.
        ordinal_columns: list of column indices of ordinal features.

    Returns:
        The decoded synthetic array produced by ``decode_one_hot``.
    """
    n_samples = real_data.shape[0]
    n_features = real_data.shape[1]

    # Columns that are not one-hot encoded: numerical ones, then ordinal ones.
    column_ids = list(range(n_features))
    numerical_columns = list(set(column_ids) - set(categorical_columns + ordinal_columns))
    passthrough_columns = numerical_columns + ordinal_columns

    unique_values, one_hot = encode_one_hot(real_data, categorical_columns)

    # Model input layout: pass-through columns first, one-hot block after them.
    model_input = np.float64(
        np.hstack((real_data[:, passthrough_columns], np.array(one_hot)))
    )

    kde = KDECopulaNNPdf(
        use_KDE=False,
        clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True),
    )
    kde = kde.fit(model_input)
    X_gen, _ = kde.sample(n_samples)
    X_gen = np.float32(X_gen)

    X_final = decode_one_hot(X_gen, categorical_columns, unique_values, n_features)
    # NOTE(review): X_gen holds the pass-through columns at positions
    # 0..len(passthrough_columns)-1 (see the hstack above), yet it is indexed
    # here with the original column indices. This is harmless when the list is
    # empty; confirm against decode_one_hot for datasets with numerical or
    # ordinal columns.
    X_final[:, passthrough_columns] = X_gen[:, passthrough_columns]
    print(X_final.shape)
    return X_final
    
def KDECopulaNNPdf_woKDE_FreqMapping(real_data, categorical_columns, ordinal_columns):
    """Synthesize data with KDECopulaNNPdf (MLP classifier) on frequency-mapped categoricals.

    The categorical columns are re-coded with ``categorical_frequency_mapping``
    before fitting. After sampling they are rounded and mapped back with
    ``categorical_frequency_inverse_mapping``.

    Args:
        real_data: array of training rows; converted with ``np.float64``.
        categorical_columns: list of column indices of categorical features.
        ordinal_columns: not used by this synthesizer; kept so the signature
            matches the other synthesizer functions in this notebook.

    Returns:
        The synthetic array returned by ``categorical_frequency_inverse_mapping``.
    """
    data = np.float64(real_data)
    n_samples = data.shape[0]

    data, inv_mappings = categorical_frequency_mapping(data, categorical_columns)

    kde = KDECopulaNNPdf(
        use_KDE=False,
        clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True),
    )
    kde = kde.fit(data)

    # sample() returns a (samples, weights) pair; the weights are not used here.
    X_gen, _ = kde.sample(n_samples)

    # Only the categorical columns are rounded before the inverse mapping.
    X_gen[:, categorical_columns] = np.round(X_gen[:, categorical_columns])
    return categorical_frequency_inverse_mapping(X_gen, categorical_columns, inv_mappings)

In [None]:
# Benchmark the three encoding strategies on the asia dataset.
# The function names are reused by later cells, so this call picks up
# whichever definitions were executed most recently (the MLP versions when
# the notebook is run top to bottom).
asia_scores_mlp = benchmark(synthesizers=[KDECopulaNNPdf_RoundCategorical,
                                         KDECopulaNNPdf_woKDE_OneHotEncoded,
                                         KDECopulaNNPdf_woKDE_FreqMapping], datasets=['asia'])
# Bare expression at the end of the cell: the notebook renders the scores.
asia_scores_mlp

In [None]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns):
    """Synthesize data with KDECopulaNNPdf (XGBoost classifier), rounding discrete columns.

    This redefines the MLP-based function of the same name from an earlier
    cell. All columns are passed to the model as floats. After sampling, the
    categorical and ordinal columns are rounded to the nearest integer.

    Args:
        real_data: array of training rows; converted with ``np.float64``.
        categorical_columns: list of column indices of categorical features.
        ordinal_columns: list of column indices of ordinal features.

    Returns:
        The rows drawn by ``kde.sample(n_samples)`` (``n_samples`` being the
        number of input rows), cast to ``np.float32``.
    """
    data = np.float64(real_data)
    n_samples = data.shape[0]

    kde = KDECopulaNNPdf(
        use_KDE=False,
        clf=XGBClassifier(random_state=42, max_depth=6, alpha=0.2, subsample=0.5),
    )
    kde = kde.fit(data)

    # sample() returns a (samples, weights) pair; the weights are not used here.
    X_gen, _ = kde.sample(n_samples)

    # Snap the discrete columns back onto integer codes.
    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    return np.float32(X_gen)

def KDECopulaNNPdf_woKDE_OneHotEncoded(real_data, categorical_columns, ordinal_columns):
    """Synthesize data with KDECopulaNNPdf (XGBoost classifier) on one-hot encoded categoricals.

    This redefines the MLP-based function of the same name from an earlier
    cell. The categorical columns are one-hot encoded and appended after the
    numerical and ordinal columns before fitting. The generated sample is
    decoded back with ``decode_one_hot``.

    Args:
        real_data: 2-D array of training rows.
        categorical_columns: list of column indices of categorical features.
        ordinal_columns: list of column indices of ordinal features.

    Returns:
        The decoded synthetic array produced by ``decode_one_hot``.
    """
    n_samples = real_data.shape[0]
    n_features = real_data.shape[1]

    # Columns that are not one-hot encoded: numerical ones, then ordinal ones.
    column_ids = list(range(n_features))
    numerical_columns = list(set(column_ids) - set(categorical_columns + ordinal_columns))
    passthrough_columns = numerical_columns + ordinal_columns

    unique_values, one_hot = encode_one_hot(real_data, categorical_columns)

    # Model input layout: pass-through columns first, one-hot block after them.
    model_input = np.float64(
        np.hstack((real_data[:, passthrough_columns], np.array(one_hot)))
    )

    kde = KDECopulaNNPdf(
        use_KDE=False,
        clf=XGBClassifier(random_state=42, max_depth=6, alpha=0.2, subsample=0.5),
    )
    kde = kde.fit(model_input)
    X_gen, _ = kde.sample(n_samples)
    X_gen = np.float32(X_gen)

    X_final = decode_one_hot(X_gen, categorical_columns, unique_values, n_features)
    # NOTE(review): X_gen holds the pass-through columns at positions
    # 0..len(passthrough_columns)-1 (see the hstack above), yet it is indexed
    # here with the original column indices. This is harmless when the list is
    # empty; confirm against decode_one_hot for datasets with numerical or
    # ordinal columns.
    X_final[:, passthrough_columns] = X_gen[:, passthrough_columns]
    print(X_final.shape)
    return X_final
    
def KDECopulaNNPdf_woKDE_FreqMapping(real_data, categorical_columns, ordinal_columns):
    """Synthesize data with KDECopulaNNPdf (XGBoost classifier) on frequency-mapped categoricals.

    This redefines the MLP-based function of the same name from an earlier
    cell. The categorical columns are re-coded with
    ``categorical_frequency_mapping`` before fitting. After sampling they are
    rounded and mapped back with ``categorical_frequency_inverse_mapping``.

    Args:
        real_data: array of training rows; converted with ``np.float64``.
        categorical_columns: list of column indices of categorical features.
        ordinal_columns: not used by this synthesizer; kept so the signature
            matches the other synthesizer functions in this notebook.

    Returns:
        The synthetic array returned by ``categorical_frequency_inverse_mapping``.
    """
    data = np.float64(real_data)
    n_samples = data.shape[0]

    data, inv_mappings = categorical_frequency_mapping(data, categorical_columns)

    kde = KDECopulaNNPdf(
        use_KDE=False,
        clf=XGBClassifier(random_state=42, max_depth=6, alpha=0.2, subsample=0.5),
    )
    kde = kde.fit(data)

    # sample() returns a (samples, weights) pair; the weights are not used here.
    X_gen, _ = kde.sample(n_samples)

    # Only the categorical columns are rounded before the inverse mapping.
    X_gen[:, categorical_columns] = np.round(X_gen[:, categorical_columns])
    return categorical_frequency_inverse_mapping(X_gen, categorical_columns, inv_mappings)

In [None]:
# Re-run the same benchmark. The three names now refer to the XGBoost-based
# redefinitions from the preceding cell, provided that cell has been executed.
asia_scores_xgboost = benchmark(synthesizers=[KDECopulaNNPdf_RoundCategorical,
                                         KDECopulaNNPdf_woKDE_OneHotEncoded,
                                         KDECopulaNNPdf_woKDE_FreqMapping], datasets=['asia'])
# Bare expression at the end of the cell: the notebook renders the scores.
asia_scores_xgboost

In [None]:
# Tag every row of each result table with the classifier used in that run.
asia_scores_mlp['Classifier'] = 'MLP'
asia_scores_xgboost['Classifier'] = 'XGBoost'
# Relabel the first nine rows as 'N/A' (presumably the rows benchmark()
# reports for synthesizers that do not use these classifiers - TODO confirm).
# A single .iloc[row, col] assignment is used because the chained form
# `df.iloc[0:9]['Classifier'] = ...` may write to a temporary copy and leave
# the frame unchanged.
asia_scores_mlp.iloc[0:9, asia_scores_mlp.columns.get_loc('Classifier')] = 'N/A'

In [None]:
# Stack the full MLP table on top of the last three rows of the XGBoost table
# and renumber the rows. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
asia_scores = pd.concat(
    [asia_scores_mlp.reset_index(), asia_scores_xgboost.reset_index().iloc[-3:]],
    ignore_index=True,
)
# Bare expression at the end of the cell: the notebook renders the table.
asia_scores

### Grid search

In [None]:
# Float64 version of the full asia dataset (df from the EDA cell); this is
# the input passed to the grid search below.
data = np.float64(df)

In [None]:
# Build an unfitted model only to list the parameter names it exposes, i.e.
# the keys that can be used in the grid-search parameter dictionary.
kde = KDECopulaNNPdf(use_KDE=False, clf=MLPClassifier())
kde.get_params().keys()

In [None]:
# Grid search over the classifier's hyper-parameters; every classifier option
# is addressed through the ``clf__`` prefix.
from sklearn.model_selection import GridSearchCV
parameters = {
    # 'clf__alpha' used to be listed twice. Python keeps only the last value
    # for a duplicated key, so the earlier `10.0 ** -np.arange(1, 3)` entry
    # never took effect. The effective grid is kept here.
    'clf__alpha': [0.0001, 0.05],
    'clf__hidden_layer_sizes': [(10,), (20,), (50,), (100,)],
    'clf__activation': ['tanh', 'relu'],
    'clf__solver': ['sgd', 'adam'],
    'clf__learning_rate': ['constant', 'adaptive'],
}

# NOTE(review): no clf is passed here, so the model's default classifier is
# tuned; the parameter names above assume it is an MLPClassifier - confirm.
grid = GridSearchCV(KDECopulaNNPdf(use_KDE=False), parameters, cv=5)
grid.fit(data)
print(grid.best_params_)

In [None]:
# Print the best parameter combination again without re-running the search.
print (grid.best_params_)

In [None]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns):
    """Synthesize data with KDECopulaNNPdf using a tuned MLP, rounding discrete columns.

    This redefines the function of the same name from earlier cells, using an
    MLPClassifier with ``hidden_layer_sizes=(100,)`` and ``alpha=0.05``. All
    columns are passed to the model as floats. After sampling, the categorical
    and ordinal columns are rounded to the nearest integer.

    Args:
        real_data: array of training rows; converted with ``np.float64``.
        categorical_columns: list of column indices of categorical features.
        ordinal_columns: list of column indices of ordinal features.

    Returns:
        The rows drawn by ``kde.sample(n_samples)`` (``n_samples`` being the
        number of input rows), cast to ``np.float32``.
    """
    data = np.float64(real_data)
    n_samples = data.shape[0]

    tuned_mlp = MLPClassifier(hidden_layer_sizes=(100,), alpha=0.05,
                              max_iter=500, early_stopping=True, random_state=1)
    kde = KDECopulaNNPdf(clf=tuned_mlp, use_KDE=False)
    kde = kde.fit(data)

    # sample() returns a (samples, weights) pair; the weights are not used here.
    X_gen, _ = kde.sample(n_samples)

    # Snap the discrete columns back onto integer codes.
    discrete_columns = categorical_columns + ordinal_columns
    X_gen[:, discrete_columns] = np.round(X_gen[:, discrete_columns])
    return np.float32(X_gen)

In [None]:
# Benchmark only the round-categorical synthesizer (the tuned-MLP definition
# when the preceding cell has been executed). This rebinds asia_scores,
# replacing the combined MLP/XGBoost table built earlier.
asia_scores = benchmark(synthesizers=[KDECopulaNNPdf_RoundCategorical], datasets=['asia'])
# Bare expression at the end of the cell: the notebook renders the scores.
asia_scores

In [None]:
# Display the scores ordered by the 'asia/test_likelihood' column (ascending,
# the sort_values default); the sorted frame is shown but not stored.
asia_scores.sort_values('asia/test_likelihood')

* With use_KDE=False, neither changing the classification model nor tuning its hyper-parameters makes a difference.