In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to them take effect immediately.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
from sdgym import benchmark
from sdgym import load_dataset
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from synthsonic.models.kde_copula_nn_pdf import KDECopulaNNPdf

In [None]:
def get_one_hot(df, cols):
    """One-hot encode the selected columns of a 2-D array.

    Parameters
    ----------
    df : numpy.ndarray
        2-D array; the categorical data is taken as ``df[:, cols]``.
    cols : list
        Positional indices of the columns to encode. They are also used as
        the column labels, and therefore as the prefixes of the dummy columns
        produced by ``pd.get_dummies``.

    Returns
    -------
    unique_values : pandas.Series
        Indexed by ``cols``; each element is a ``numpy.ndarray`` holding the
        sorted distinct values of that column.
    one_hot_encoded : pandas.DataFrame
        Output of ``pd.get_dummies`` for the selected columns.
    """
    categorical_data = pd.DataFrame(df[:, cols], columns=cols)

    # Build the Series element by element instead of using
    # ``DataFrame.apply(pd.Series.unique)``: apply() returns a DataFrame (not
    # a Series of arrays) whenever every column has the same number of
    # distinct values, or when there is a single column, which breaks callers
    # that expect one array per column.
    levels = np.empty(len(cols), dtype=object)
    for position, col in enumerate(cols):
        levels[position] = np.array(sorted(categorical_data[col].unique()))
    unique_values = pd.Series(levels, index=cols, dtype=object)

    one_hot_encoded = pd.get_dummies(data=categorical_data, columns=cols)
    return unique_values, one_hot_encoded

def get_inverse_one_hot(encoded, cols):
    """Invert a ``pd.get_dummies`` encoding back to one value per column.

    Parameters
    ----------
    encoded : pandas.DataFrame
        One-hot encoded frame whose column labels look like ``"<col>_<value>"``.
    cols : list
        Original column identifiers (the ``<col>`` part of the labels).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_samples, len(cols))``; entry ``[r, j]`` is
        the ``<value>`` of the dummy column with the largest entry in row
        ``r`` among the dummy columns of ``cols[j]``.

    Raises
    ------
    ValueError
        If ``encoded`` has no dummy column for one of ``cols``.
    """
    n_samples = encoded.shape[0]
    if len(cols) == 0:
        # np.stack refuses an empty list; return an empty result instead.
        return np.empty((n_samples, 0))

    labels = [str(label) for label in encoded.columns]
    decoded = []
    for col in cols:
        # Include the separator in the prefix: matching on "1" alone would
        # also pick up the dummy columns of "13", "14", ...
        prefix = '{}_'.format(col)
        mask = np.array([label.startswith(prefix) for label in labels], dtype=bool)
        if not mask.any():
            raise ValueError('no one-hot columns found for column {!r}'.format(col))
        winners = encoded.loc[:, mask].idxmax(axis=1).astype(str)
        # Strip only the leading prefix; a global replace could also remove
        # the same text from inside the value part of the label.
        decoded.append(winners.str[len(prefix):].astype(float))
    return np.stack(decoded).T
    
def consolidate_categorical(samples, start_idx, end_idx):
    """Return, per row, the position of the largest entry in a column block.

    The block is ``samples[:, start_idx:end_idx]``. It is wrapped in a
    DataFrame with default integer column labels, so the label returned by
    ``idxmax`` is the offset of the winning column inside the block. The
    result is a 1-D float array with one entry per row.
    """
    block = pd.DataFrame(samples[:, start_idx:end_idx])
    winners = block.idxmax(axis=1)
    return np.array(winners.astype(float))
 
def set_min_max(data, nf):
    """Compute the minimum and maximum of the first ``nf`` columns of ``data``.

    Returns a pair of lists ``(x_min, x_max)``, each of length ``nf``, where
    position ``i`` holds ``data[:, i].min()`` and ``data[:, i].max()``
    respectively.
    """
    columns = [data[:, position] for position in range(nf)]
    x_min = [column.min() for column in columns]
    x_max = [column.max() for column in columns]
    return x_min, x_max

def kde_copula_nn_pdf_synthesizer_one_hot(real_data, categorical_columns, ordinal_columns):
    """Fit KDECopulaNNPdf on one-hot encoded data and return a synthetic sample.

    Categorical columns are one-hot encoded and appended after the numerical
    and ordinal columns. The model is fitted on that matrix and sampled, and
    each one-hot block of the sample is collapsed back to a single column by
    taking the level with the largest entry.

    Parameters
    ----------
    real_data : numpy.ndarray
        2-D array of training data, one column per feature.
    categorical_columns : list
        Indices of the columns to one-hot encode.
    ordinal_columns : list
        Indices of the ordinal columns; they are passed to the model as-is,
        after the numerical columns.

    Returns
    -------
    numpy.ndarray
        Synthetic data with as many rows as ``real_data`` and the columns in
        their original positions.
    """
    all_features = list(range(real_data.shape[1]))
    # sorted() gives a deterministic column order; set iteration order is not
    # guaranteed to be ascending.
    numerical_features = sorted(set(all_features) - set(categorical_columns + ordinal_columns))
    dense_columns = numerical_features + ordinal_columns
    n_dense = len(dense_columns)

    ## One hot encode the categorical features
    unique_values, ohe = get_one_hot(real_data, categorical_columns)
    categorical_np = np.array(ohe)
    # One array of sorted levels per categorical column.
    levels = {col: np.asarray(unique_values[col]) for col in categorical_columns}

    ## Append the categorical one hot encoded data to numerical and ordinal
    data = np.hstack((real_data[:, dense_columns], categorical_np)).astype(np.float64)
    assert data.shape[-1] == n_dense + sum(len(values) for values in levels.values())

    n_samples = data.shape[0]
    n_features = data.shape[1]
    x_min, x_max = set_min_max(data, n_features)

    ## Sanity check: decoding the first one-hot block of the training data
    ## must give back the original column.
    if len(categorical_columns) > 0:
        first_col = categorical_columns[0]
        first_end = n_dense + len(levels[first_col])
        indices = consolidate_categorical(data, n_dense, first_end).astype(int)
        assert np.allclose(real_data[:, first_col], levels[first_col][indices])

    kde = KDECopulaNNPdf(x_min=x_min, x_max=x_max, rho=0.5, use_inverse_qt=True,
                         clf=MLPClassifier(random_state=0, max_iter=500, early_stopping=True))
    kde = kde.fit(data)
    X_gen, _ = kde.sample(n_samples)  # sample weights are not used here
    X_gen = np.float32(X_gen)

    X_gen_consolidate = {}
    for idx, col in enumerate(dense_columns):
        X_gen_consolidate[col] = X_gen[:, idx]

    # The one-hot blocks start right after the dense columns. Using the last
    # loop index instead would start one column too early (and would be
    # undefined when there are no dense columns).
    end_idx = n_dense
    for col in categorical_columns:
        start_idx = end_idx
        end_idx = start_idx + len(levels[col])
        indices = consolidate_categorical(X_gen, start_idx, end_idx).astype(int)
        X_gen_consolidate[col] = levels[col][indices]

    # Put every column back at its original position.
    keys = sorted(X_gen_consolidate.keys())
    X_final = np.zeros((X_gen.shape[0], len(keys)))
    for k in keys:
        X_final[:, k] = X_gen_consolidate[k]

    return X_final
    

In [None]:
# Run the SDGym benchmark with the one-hot KDE-copula synthesizer on the
# 'adult' dataset only; the returned results are kept in `scores`.
scores = benchmark(synthesizers=[kde_copula_nn_pdf_synthesizer_one_hot],  datasets=['adult'])

In [None]:
# Display the benchmark results as the cell output.
scores