# Imports

In [1]:
config = {
    'dhdt': {
        'depth': 3,
        'learning_rate': 1e-3,
        
        'loss': 'binary_crossentropy',#'mae',
        'optimizer': 'adam',        
        
        'beta_1': 100,
        'beta_2': 100,
        
        'squeeze_factor': 1,
        
        'batch_size': 512,
        'epochs': 1_000,
        'early_stopping_epochs': 20,
    },
    
    
    
    'make_classification': {
        'number_of_variables': 5,
        'n_samples': 10_000,
    },

    'computation': {
        'random_seed': 42,
        'num_eval': 10,
        'trials': 5,
        'n_jobs': 60,
        'verbosity': 0,
    },
}



In [2]:
import numpy as np

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder

from livelossplot import PlotLosses

import os
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt

from IPython.display import Image
from IPython.display import display, clear_output

import pandas as pd

os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = '' #'true'

import warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import logging

import tensorflow as tf
import tensorflow_addons as tfa

tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(3)

np.seterr(all="ignore")

from keras import backend as K
from keras.utils.generic_utils import get_custom_objects


import seaborn as sns
sns.set_style("darkgrid")

import time
import random

from utilities.utilities import *
from utilities.DHDT import *

from joblib import Parallel, delayed

from itertools import product
from collections.abc import Iterable


# Evaluation

## make_classification

In [3]:
parallel_eval_synthetic = Parallel(n_jobs=config['computation']['n_jobs'], verbose=3, backend='loky') #loky #sequential multiprocessing
evaluation_results_synthetic = parallel_eval_synthetic(delayed(evaluate_synthetic_parallel)(index = index,
                                                                                            random_seed_data = config['computation']['random_seed']+index,
                                                                                            random_seed_model = config['computation']['random_seed'],#+random_seed_model,
                                                                                            config = config,
                                                                                            verbosity = -1) for index in range(config['computation']['num_eval']))

for i, synthetic_result in enumerate(evaluation_results_synthetic):
    if i == 0:
        model_dict_synthetic = synthetic_result[0]
        scores_dict_synthetic = synthetic_result[1]
        dataset_dict_synthetic = synthetic_result[2]
    else: 
        model_dict_synthetic = mergeDict(model_dict_synthetic, synthetic_result[0])
        scores_dict_synthetic = mergeDict(scores_dict_synthetic, synthetic_result[1])
        dataset_dict_synthetic = mergeDict(dataset_dict_synthetic, synthetic_result[2])

[Parallel(n_jobs=60)]: Using backend LokyBackend with 60 concurrent workers.
[Parallel(n_jobs=60)]: Done   3 out of  10 | elapsed:  7.9min remaining: 18.4min
[Parallel(n_jobs=60)]: Done   7 out of  10 | elapsed: 10.9min remaining:  4.7min
[Parallel(n_jobs=60)]: Done  10 out of  10 | elapsed: 12.0min finished


In [4]:
metrics = ['accuracy_test']
index = [i for i in range(config['computation']['num_eval'])]
columns = flatten_list([[[approach + ' ' + metric + '_mean', approach + ' ' + metric + '_max', approach + ' ' + metric + '_std'] for metric in metrics] for approach in ['DHDT', 'sklearn']])

scores_DHDT = [scores_dict_synthetic[i]['DHDT'][metrics[0]] for i in range(config['computation']['num_eval'])]

scores_sklearn = [scores_dict_synthetic[i]['sklearn'][metrics[0]] for i in range(config['computation']['num_eval'])]


scores_DHDT_mean = np.mean(scores_DHDT, axis=1)
scores_sklearn_mean = np.mean(scores_sklearn, axis=1)
                        
scores_DHDT_max = np.max(scores_DHDT, axis=1)
scores_sklearn_max = np.max(scores_sklearn, axis=1)
                        
scores_DHDT_std = np.std(scores_DHDT, axis=1)
scores_sklearn_std = np.std(scores_sklearn, axis=1)

results_DHDT = np.vstack([scores_DHDT_mean, scores_DHDT_max, scores_DHDT_std])
results_sklearn = np.vstack([scores_sklearn_mean, scores_sklearn_max, scores_sklearn_std])


scores_dataframe_synthetic = pd.DataFrame(data=np.vstack([results_DHDT, results_sklearn]).T, index = index, columns = columns)
display(scores_dataframe_synthetic)
display(scores_dataframe_synthetic[scores_dataframe_synthetic.columns[1::3]])
display(scores_dataframe_synthetic.describe())

Unnamed: 0,DHDT accuracy_test_mean,DHDT accuracy_test_max,DHDT accuracy_test_std,sklearn accuracy_test_mean,sklearn accuracy_test_max,sklearn accuracy_test_std
0,0.6009,0.7145,0.076422,0.8298,0.849,0.0096
1,0.8046,0.883,0.094405,0.9204,0.9345,0.0282
2,0.8821,0.916,0.043082,0.9231,0.9345,0.0228
3,0.7159,0.793,0.065374,0.8065,0.9525,0.073
4,0.6875,0.718,0.03447,0.8203,0.8255,0.0026
5,0.7914,0.8345,0.05937,0.8562,0.895,0.0194
6,0.7367,0.8625,0.145777,0.8711,0.8735,0.0012
7,0.7735,0.8595,0.083224,0.8508,0.888,0.0186
8,0.8584,0.921,0.114036,0.9002,0.9395,0.0786
9,0.7151,0.79,0.04616,0.82,0.838,0.009


Unnamed: 0,DHDT accuracy_test_max,sklearn accuracy_test_max
0,0.7145,0.849
1,0.883,0.9345
2,0.916,0.9345
3,0.793,0.9525
4,0.718,0.8255
5,0.8345,0.895
6,0.8625,0.8735
7,0.8595,0.888
8,0.921,0.9395
9,0.79,0.838


Unnamed: 0,DHDT accuracy_test_mean,DHDT accuracy_test_max,DHDT accuracy_test_std,sklearn accuracy_test_mean,sklearn accuracy_test_max,sklearn accuracy_test_std
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.75661,0.8292,0.076232,0.85984,0.893,0.0263
std,0.083471,0.073997,0.034632,0.042711,0.046036,0.027505
min,0.6009,0.7145,0.03447,0.8065,0.8255,0.0012
25%,0.7153,0.79075,0.049463,0.822675,0.855125,0.00915
50%,0.7551,0.847,0.070898,0.8535,0.8915,0.019
75%,0.8013,0.877875,0.09161,0.892925,0.9345,0.02685
max,0.8821,0.921,0.145777,0.9231,0.9525,0.0786


## Real-World Eval

In [5]:
identifier_list = [
                    'Adult',#: 32,
                    'Bank Marketing',#: 32,
                    'Loan Credit',#: 32,

                    'Credit Card',#: 23, 
                    'Car',#: 21,


                    'Absenteeism',#: 15,
                    'Loan House',#: 15,
                    'Cervical Cancer',#: 15,

                    'Heart Disease',#: 13,           

                    'Titanic',#: 10,
                    'Medical Insurance',#: 10,
                    'Wisconsin Breast Cancer Original',#: 10,
                    'Wisconsin Diagnostic Breast Cancer',#: 10,
                    'Wisconsin Prognostic Breast Cancer',#: 10,
                    'Abalone',#: 10,

                    'Habermans Survival',#: 3, 
                  ]

parallel_eval_real_world = Parallel(n_jobs=config['computation']['n_jobs'], verbose=3, backend='loky') #loky #sequential multiprocessing
evaluation_results_real_world = parallel_eval_real_world(delayed(evaluate_real_world_parallel)(identifier_list=identifier_list, 
                                                                                               random_seed_model=config['computation']['random_seed']+i,
                                                                                               config = config,
                                                                                               verbosity = -1) for i in range(config['computation']['trials']))


for i, real_world_result in enumerate(evaluation_results_real_world):
    if i == 0:
        model_dict_real_world = real_world_result[0]
        scores_dict_real_world = real_world_result[1]
        dataset_dict_real_world = real_world_result[2]
    else: 
        model_dict_real_world = mergeDict(model_dict_real_world, real_world_result[0])
        scores_dict_real_world = mergeDict(scores_dict_real_world, real_world_result[1])
        dataset_dict_real_world = mergeDict(dataset_dict_real_world, real_world_result[2])

[Parallel(n_jobs=60)]: Using backend LokyBackend with 60 concurrent workers.
[Parallel(n_jobs=60)]: Done   2 out of   5 | elapsed: 22.7min remaining: 34.0min
[Parallel(n_jobs=60)]: Done   5 out of   5 | elapsed: 32.3min finished


In [8]:
metrics = ['accuracy_test']
index = identifier_list
columns = flatten_list([[[approach + ' ' + metric + '_mean', approach + ' ' + metric + '_max', approach + ' ' + metric + '_std'] for metric in metrics] for approach in ['DHDT', 'sklearn']])

scores_DHDT = [scores_dict_real_world[identifier]['DHDT'][metrics[0]] for identifier in identifier_list]

scores_sklearn = [scores_dict_real_world[identifier]['sklearn'][metrics[0]] for identifier in identifier_list]


scores_DHDT_mean = np.mean(scores_DHDT, axis=1)
scores_sklearn_mean = np.mean(scores_sklearn, axis=1)

scores_DHDT_max = np.max(scores_DHDT, axis=1)
scores_sklearn_max = np.max(scores_sklearn, axis=1)

scores_DHDT_std = np.std(scores_DHDT, axis=1)
scores_sklearn_std = np.std(scores_sklearn, axis=1)

results_DHDT = np.vstack([scores_DHDT_mean, scores_DHDT_max, scores_DHDT_std])
results_sklearn = np.vstack([scores_sklearn_mean, scores_sklearn_max, scores_sklearn_std])


scores_dataframe_real_world = pd.DataFrame(data=np.vstack([results_DHDT, results_sklearn]).T, index = index, columns = columns)
display(scores_dataframe_real_world)
display(scores_dataframe_real_world[scores_dataframe_real_world.columns[1::3]])


Unnamed: 0,DHDT accuracy_test_mean,DHDT accuracy_test_max,DHDT accuracy_test_std,sklearn accuracy_test_mean,sklearn accuracy_test_max,sklearn accuracy_test_std
Adult,0.579238,0.751843,0.191894,0.815571,0.815571,0.0
Bank Marketing,0.837956,0.876576,0.036695,0.784672,0.784672,0.0
Loan Credit,0.570201,0.771779,0.142823,0.712556,0.712556,0.0
Credit Card,0.5128,0.7825,0.252159,0.776167,0.776167,0.0
Car,0.684058,0.695652,0.008198,0.898551,0.898551,1.110223e-16
Absenteeism,0.528378,0.635135,0.128015,0.635135,0.635135,0.0
Loan House,0.616393,0.680328,0.054222,0.762295,0.762295,0.0
Cervical Cancer,0.306433,0.824561,0.278515,0.467836,0.467836,0.0
Heart Disease,0.736667,0.833333,0.063596,0.85,0.85,0.0
Titanic,0.678652,0.780899,0.08648,0.797753,0.797753,0.0


Unnamed: 0,DHDT accuracy_test_max,sklearn accuracy_test_max
Adult,0.751843,0.815571
Bank Marketing,0.876576,0.784672
Loan Credit,0.771779,0.712556
Credit Card,0.7825,0.776167
Car,0.695652,0.898551
Absenteeism,0.635135,0.635135
Loan House,0.680328,0.762295
Cervical Cancer,0.824561,0.467836
Heart Disease,0.833333,0.85
Titanic,0.780899,0.797753


In [7]:
identifier = "Absenteeism"
plt.figure(figsize=(15,8))
image = model_dict[identifier]['DHDT'].plot(normalizer_list=dataset_dict['normalizer_list'][identifier])
display(image)

plt.figure(figsize=(15,8))
plot_tree(model_dict[identifier]['sklearn'], fontsize=10) 
plt.show()

NameError: name 'model_dict' is not defined

<Figure size 1080x576 with 0 Axes>

# Hyperparameter Optimization

In [None]:
z

In [None]:
parameter_dict = {
        'depth': [3],
        'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001],
        
        'loss': ['binary_crossentropy', 'rmse'],#'mae',
        'optimizer': ['adam', 'sgd'],        
        
        'beta_1': [10, 50, 100],
        'beta_2': [10, 50, 100],
        
        'squeeze_factor': [0.2, 0.5, 1, 2, 5],    
}


In [None]:
parameter_grid = ParameterGrid(parameter_dict)

In [None]:
def evaluate_parameter_setting_synthetic(parameter_setting):
    
    config_parameter_setting = deepcopy(config)
    
    
    for key, value in parameter_setting.items():
        config_parameter_setting[key] = value
    
    
    
    
    
    parallel_eval_synthetic = Parallel(n_jobs=1, verbose=3, backend='sequential') #loky #sequential multiprocessing
    evaluation_results_synthetic = parallel_eval_synthetic(delayed(evaluate_synthetic_parallel)(index = index,
                                                                                                random_seed_data = config['computation']['random_seed']+index,
                                                                                                random_seed_model = config['computation']['random_seed'],#+random_seed_model,
                                                                                                config = config_parameter_setting,
                                                                                                verbosity = -1) for index in range(config['computation']['num_eval']))

    
    for i, synthetic_result in enumerate(evaluation_results_synthetic):
        if i == 0:
            model_dict_synthetic = synthetic_result[0]
            scores_dict_synthetic = synthetic_result[1]
            dataset_dict_synthetic = synthetic_result[2]
        else: 
            model_dict_synthetic = mergeDict(model_dict_synthetic, synthetic_result[0])
            scores_dict_synthetic = mergeDict(scores_dict_synthetic, synthetic_result[1])
            dataset_dict_synthetic = mergeDict(dataset_dict_synthetic, synthetic_result[2])    
    
    metrics = ['accuracy_valid']
    index = [i for i in range(config['computation']['num_eval'])]
    columns = flatten_list([[[approach + ' ' + metric + '_mean', approach + ' ' + metric + '_max', approach + ' ' + metric + '_std'] for metric in metrics] for approach in ['DHDT', 'sklearn']])

    scores_DHDT = [scores_dict_synthetic[i]['DHDT'][metrics[0]] for i in range(config['computation']['num_eval'])]

    scores_sklearn = [scores_dict_synthetic[i]['sklearn'][metrics[0]] for i in range(config['computation']['num_eval'])]


    scores_DHDT_mean = np.mean(scores_DHDT, axis=1)
    scores_sklearn_mean = np.mean(scores_sklearn, axis=1)

    scores_DHDT_max = np.max(scores_DHDT, axis=1)
    scores_sklearn_max = np.max(scores_sklearn, axis=1)

    scores_DHDT_std = np.std(scores_DHDT, axis=1)
    scores_sklearn_std = np.std(scores_sklearn, axis=1)

    results_DHDT = np.vstack([scores_DHDT_mean, scores_DHDT_max, scores_DHDT_std])
    results_sklearn = np.vstack([scores_sklearn_mean, scores_sklearn_max, scores_sklearn_std])


    scores_dataframe_synthetic = pd.DataFrame(data=np.vstack([results_DHDT, results_sklearn]).T, index = index, columns = columns)
    #display(scores_dataframe_synthetic)
    #display(scores_dataframe_synthetic[scores_dataframe_synthetic.columns[1::3]])
    #display(scores_dataframe_synthetic.describe())    
    
    return np.mean(scores_DHDT_mean), parameter_setting
    
    
 

In [None]:
def evaluate_parameter_setting_real_world(parameter_setting, identifier):
    
    config_parameter_setting = deepcopy(config)
    
    
    for key, value in parameter_setting.items():
        config_parameter_setting[key] = value
    
    
    
    parallel_eval_real_world = Parallel(n_jobs=1, verbose=3, backend='sequential') #loky #sequential multiprocessing
    evaluation_results_real_world = parallel_eval_real_world(delayed(evaluate_real_world_parallel)(identifier_list=[identifier], 
                                                                                                   random_seed_model=config['computation']['random_seed']+i,
                                                                                                   config = config_parameter_setting,
                                                                                                   verbosity = -1) for i in range(config['computation']['trials']))


    for i, real_world_result in enumerate(evaluation_results_real_world):
        if i == 0:
            model_dict_real_world = real_world_result[0]
            scores_dict_real_world = real_world_result[1]
            dataset_dict_real_world = real_world_result[2]
        else: 
            model_dict_real_world = mergeDict(model_dict_real_world, real_world_result[0])
            scores_dict_real_world = mergeDict(scores_dict_real_world, real_world_result[1])
            dataset_dict_real_world = mergeDict(dataset_dict_real_world, real_world_result[2])    
    
    
    metrics = ['accuracy_valid']
    index = identifier_list
    columns = flatten_list([[[approach + ' ' + metric + '_mean', approach + ' ' + metric + '_std'] for metric in metrics] for approach in ['DHDT', 'sklearn']])

    scores_DHDT = [scores_dict_real_world[identifier]['DHDT'][metrics[0]] for identifier in identifier_list]

    scores_sklearn = [scores_dict_real_world[identifier]['sklearn'][metrics[0]] for identifier in identifier_list]


    scores_DHDT_mean = np.mean(scores_DHDT, axis=1)
    scores_sklearn_mean = np.mean(scores_sklearn, axis=1)

    scores_DHDT_max = np.max(scores_DHDT, axis=1)
    scores_sklearn_max = np.max(scores_sklearn, axis=1)

    scores_DHDT_std = np.std(scores_DHDT, axis=1)
    scores_sklearn_std = np.std(scores_sklearn, axis=1)

    results_DHDT = np.vstack([scores_DHDT_mean, scores_DHDT_max, scores_DHDT_std])
    results_sklearn = np.vstack([scores_sklearn_mean, scores_sklearn_max, scores_sklearn_std])


    scores_dataframe_real_world = pd.DataFrame(data=np.vstack([results_DHDT, results_sklearn]).T, index = index, columns = columns)
    display(scores_dataframe_synthetic)
    display(scores_dataframe_synthetic[scores_dataframe_synthetic.columns[1::3]])    
    
    
    return np.mean(scores_DHDT_mean), parameter_setting
    
    

In [None]:
parallel_hpo_synthetic = Parallel(n_jobs=config['computation']['n_jobs'], verbose=3, backend='loky') #loky #sequential multiprocessing
evaluation_results_hpo_synthetic = parallel_hpo_synthetic(delayed(evaluate_parameter_setting_synthetic)(parameter_setting) for parameter_setting in parameter_grid)

In [None]:
sorted_evaluation_results_hpo_synthetic = sorted(evaluation_results_hpo_synthetic, key=lambda x: x[0], reverse=True)

In [None]:
print(sorted_evaluation_results_hpo_synthetic[:5])

In [None]:
identifier = 'Titanic'

parallel_hpo_real = Parallel(n_jobs=config['computation']['n_jobs'], verbose=3, backend='loky') #loky #sequential multiprocessing
evaluation_results_hpo_real = parallel_hpo_real(delayed(evaluate_parameter_setting_real_world)(parameter_setting, identifier) for parameter_setting in parameter_grid)

In [None]:
sorted_evaluation_results_hpo_real = sorted(evaluation_results_hpo_real, key=lambda x: x[0], reverse=True)

In [None]:
print(sorted_evaluation_results_hpo_real[:5])