# Imports

In [1]:
config = {
    'dhdt': {
        'depth': 3,
        'learning_rate': 1e-3,
        
        'loss': 'binary_crossentropy',#'mae',
        'optimizer': 'adam',        
        
        'beta_1': 100,
        'beta_2': 100,
        
        'squeeze_factor': 1,
    },
    
    
    
    'make_classification': {
        'number_of_variables': 5,
        'n_samples': 10_000,
    },

    'computation': {
        'random_seed': 42,
        'num_eval': 20,
        'trials': 5,
        'n_jobs': 60,
        'verbosity': 0,
    },
}



In [2]:
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder
from livelossplot import PlotLosses

import os
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt

from IPython.display import Image
from IPython.display import display, clear_output

import pandas as pd

os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = '' #'true'

import warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import logging

tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(3)

from keras import backend as K
from keras.utils.generic_utils import get_custom_objects


import seaborn as sns
sns.set_style("darkgrid")

import time
import random

from utilities.utilities import *
from utilities.DHDT import *

from joblib import Parallel, delayed

from itertools import product
from collections.abc import Iterable


# Evaluation

## make_classification

In [3]:
#model_seed_list = [i for i in range(config['computation']['trials'])]
#data_seed_list = [i for i in range(config['computation']['num_eval'])]

#combined_seed_list = list(product(model_seed_list, data_seed_list))

parallel_eval_synthetic = Parallel(n_jobs=config['computation']['n_jobs'], verbose=3, backend='loky') #loky #sequential multiprocessing
evaluation_results_synthetic = parallel_eval_synthetic(delayed(evaluate_synthetic_parallel)(index = index,
                                                                                            random_seed_data = config['computation']['random_seed']+index,
                                                                                            random_seed_model = config['computation']['random_seed'],#+random_seed_model,
                                                                                            trials = config['computation']['trials'],
                                                                                            config = config['make_classification'],
                                                                                            verbosity = -1) for index in range(config['computation']['num_eval']))


[Parallel(n_jobs=60)]: Using backend LokyBackend with 60 concurrent workers.
[Parallel(n_jobs=60)]: Done   6 out of  20 | elapsed: 12.0min remaining: 27.9min
[Parallel(n_jobs=60)]: Done  13 out of  20 | elapsed: 14.6min remaining:  7.9min
[Parallel(n_jobs=60)]: Done  20 out of  20 | elapsed: 17.2min remaining:    0.0s
[Parallel(n_jobs=60)]: Done  20 out of  20 | elapsed: 17.2min finished


In [4]:
for i, synthetic_result in enumerate(evaluation_results_synthetic):
    if i == 0:
        model_dict_synthetic = synthetic_result[0]
        scores_dict_synthetic = synthetic_result[1]
        dataset_dict_synthetic = synthetic_result[2]
    else: 
        model_dict_synthetic = mergeDict(model_dict_synthetic, synthetic_result[0])
        scores_dict_synthetic = mergeDict(scores_dict_synthetic, synthetic_result[1])
        dataset_dict_synthetic = mergeDict(dataset_dict_synthetic, synthetic_result[2])

In [17]:
metrics = ['accuracy']
index = [i for i in range(config['computation']['num_eval'])]
columns = flatten_list([[[approach + ' ' + metric + '_mean', approach + ' ' + metric + '_max', approach + ' ' + metric + '_std'] for metric in metrics] for approach in ['DHDT', 'sklearn']])

scores_DHDT = [scores_dict_synthetic[i]['DHDT']['accuracy'] for i in range(config['computation']['num_eval'])]

scores_sklearn = [scores_dict_synthetic[i]['sklearn']['accuracy'] for i in range(config['computation']['num_eval'])]


scores_DHDT_mean = np.mean(scores_DHDT, axis=1)
scores_sklearn_mean = np.mean(scores_sklearn, axis=1)
                        
scores_DHDT_max = np.max(scores_DHDT, axis=1)
scores_sklearn_max = np.max(scores_sklearn, axis=1)
                        
scores_DHDT_std = np.std(scores_DHDT, axis=1)
scores_sklearn_std = np.std(scores_sklearn, axis=1)

results_DHDT = np.vstack([scores_DHDT_mean, scores_DHDT_max, scores_DHDT_std])
results_sklearn = np.vstack([scores_sklearn_mean, scores_sklearn_max, scores_sklearn_std])


scores_dataframe_synthetic = pd.DataFrame(data=np.vstack([results_DHDT, results_sklearn]).T, index = index, columns = columns)
display(scores_dataframe_synthetic)
display(scores_dataframe_synthetic[scores_dataframe_synthetic.columns[1::3]])
display(scores_dataframe_synthetic.describe())

Unnamed: 0,DHDT accuracy_mean,DHDT accuracy_max,DHDT accuracy_std,sklearn accuracy_mean,sklearn accuracy_max,sklearn accuracy_std
0,0.6131,0.714,0.073941,0.8364,0.882,0.0228
1,0.8465,0.8935,0.078356,0.9353,0.9385,0.0016
2,0.8814,0.9355,0.04997,0.9231,0.9345,0.0228
3,0.6747,0.7485,0.050682,0.7786,0.813,0.0172
4,0.6952,0.7155,0.021695,0.8268,0.858,0.0156
5,0.7606,0.818,0.069899,0.8609,0.9185,0.0288
6,0.728,0.862,0.153225,0.8479,0.8705,0.0452
7,0.7161,0.8575,0.098916,0.8278,0.8415,0.0274
8,0.8593,0.9235,0.114639,0.9002,0.9395,0.0786
9,0.7491,0.81,0.044152,0.8275,0.8755,0.024


Unnamed: 0,DHDT accuracy_max,sklearn accuracy_max
0,0.714,0.882
1,0.8935,0.9385
2,0.9355,0.9345
3,0.7485,0.813
4,0.7155,0.858
5,0.818,0.9185
6,0.862,0.8705
7,0.8575,0.8415
8,0.9235,0.9395
9,0.81,0.8755


Unnamed: 0,DHDT accuracy_mean,DHDT accuracy_max,DHDT accuracy_std,sklearn accuracy_mean,sklearn accuracy_max,sklearn accuracy_std
count,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.717885,0.79615,0.06981,0.847995,0.8741,0.02053
std,0.084852,0.076824,0.036579,0.052886,0.058696,0.023365
min,0.562,0.6655,0.017104,0.7576,0.768,0.0
25%,0.677625,0.74025,0.043845,0.82215,0.8405,0.00205
50%,0.7093,0.80925,0.069846,0.8417,0.873,0.0149
75%,0.7564,0.844375,0.083496,0.889625,0.9225,0.02485
max,0.8814,0.9355,0.153225,0.9353,0.967,0.0794


In [6]:
identifier_list = [
                   'Cervical Cancer',
                   'Credit Card',
                   #'Absenteeism'
                  ]

parallel_eval_real_world = Parallel(n_jobs=config['computation']['n_jobs'], verbose=3, backend='loky') #loky #sequential multiprocessing
evaluation_results_real_world = parallel_eval_real_world(delayed(evaluate_real_world_parallel)(identifier_list=identifier_list, 
                                                                                               random_seed_model=config['computation']['random_seed']+i,
                                                                                               verbosity = -1) for i in range(config['computation']['trials']))


for i, real_world_result in enumerate(evaluation_results_real_world):
    if i == 0:
        model_dict_real_world = real_world_result[0]
        scores_dict_real_world = real_world_result[1]
        dataset_dict_real_world = real_world_result[2]
    else: 
        model_dict_real_world = mergeDict(model_dict_real_world, real_world_result[0])
        scores_dict_real_world = mergeDict(scores_dict_real_world, real_world_result[1])
        dataset_dict_real_world = mergeDict(dataset_dict_real_world, real_world_result[2])

[Parallel(n_jobs=60)]: Using backend LokyBackend with 60 concurrent workers.
[Parallel(n_jobs=60)]: Done   2 out of   5 | elapsed:  2.7min remaining:  4.0min
[Parallel(n_jobs=60)]: Done   5 out of   5 | elapsed: 19.6min finished


In [7]:
metrics = ['accuracy']
index = identifier_list
columns = flatten_list([[[approach + ' ' + metric + '_mean', approach + ' ' + metric + '_std'] for metric in metrics] for approach in ['DHDT', 'sklearn']])

scores_DHDT = [scores_dict_real_world[identifier]['DHDT']['accuracy'] for identifier in identifier_list]

scores_sklearn = [scores_dict_real_world[identifier]['sklearn']['accuracy'] for identifier in identifier_list]


scores_DHDT_mean = np.mean(scores_DHDT, axis=1)
scores_sklearn_mean = np.mean(scores_sklearn, axis=1)

scores_DHDT_std = np.std(scores_DHDT, axis=1)
scores_sklearn_std = np.std(scores_sklearn, axis=1)

results_DHDT = np.vstack([scores_DHDT_mean, scores_DHDT_std])
results_sklearn = np.vstack([scores_sklearn_mean, scores_sklearn_std])


scores_dataframe_real_world = pd.DataFrame(data=np.vstack([results_DHDT, results_sklearn]).T, index = index, columns = columns)
scores_dataframe_real_world

Unnamed: 0,DHDT accuracy_mean,DHDT accuracy_std,sklearn accuracy_mean,sklearn accuracy_std
Cervical Cancer,0.34152,0.247738,0.467836,0.0
Credit Card,0.512833,0.252194,0.776167,0.0


In [8]:
scores_dict_real_world

{'Cervical Cancer': {'sklearn': {'accuracy': [0.4678362573099415,
    0.4678362573099415,
    0.4678362573099415,
    0.4678362573099415,
    0.4678362573099415]},
  'DHDT': {'accuracy': [0.17543859649122806,
    0.06432748538011696,
    0.6374269005847953,
    0.6432748538011696,
    0.1871345029239766]},
  'history': [None, None, None, None, None]},
 'Credit Card': {'sklearn': {'accuracy': [0.7761666666666667,
    0.7761666666666667,
    0.7761666666666667,
    0.7761666666666667,
    0.7761666666666667]},
  'DHDT': {'accuracy': [0.21883333333333332,
    0.21916666666666668,
    0.7766666666666666,
    0.5668333333333333,
    0.7826666666666666]},
  'history': [None, None, None, None, None]}}

In [9]:
scores_DHDT = [scores_dict_real_world[identifier]['DHDT']['accuracy'] for identifier in identifier_list]

scores_sklearn = [scores_dict_real_world[identifier]['sklearn']['accuracy'] for identifier in identifier_list]


In [10]:
np.mean(scores_DHDT, axis=1)

array([0.34152047, 0.51283333])

In [11]:
scores_DHDT = [scores_dict_real_world[identifier]['DHDT']['accuracy'] for identifier in identifier_list]

scores_sklearn = [scores_dict_real_world[identifier]['sklearn']['accuracy'] for identifier in identifier_list]

scores_dataframe = pd.DataFrame(data=np.vstack([scores_DHDT, scores_sklearn]).T, index = identifier_list, columns = ['DHDT', 'sklearn'])
scores_dataframe

ValueError: Shape of passed values is (5, 4), indices imply (2, 2)

In [None]:
scores_DHDT = [scores_dict[identifier]['DHDT']['accuracy'] for identifier in identifier_list]

scores_sklearn = [scores_dict[identifier]['sklearn']['accuracy'] for identifier in identifier_list]

scores_dataframe = pd.DataFrame(data=np.vstack([scores_DHDT, scores_sklearn]).T, index = identifier_list, columns = ['DHDT', 'sklearn'])
scores_dataframe

In [None]:
scores_DHDT = [scores_dict[identifier]['DHDT']['accuracy'] for identifier in identifier_list]

scores_sklearn = [scores_dict[identifier]['sklearn']['accuracy'] for identifier in identifier_list]

scores_dataframe = pd.DataFrame(data=np.vstack([scores_DHDT, scores_sklearn]).T, index = identifier_list, columns = ['DHDT', 'sklearn'])
scores_dataframe

In [None]:
parameter_grid = {
        'depth': 3,
        'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001],
        
        'loss': ['binary_crossentropy', 'rmse'],#'mae',
        'optimizer': ['adam', 'sgd'],        
        
        'beta_1': [10, 50, 100],
        'beta_2': [10, 50, 100],
        
        'squeeze_factor': [0.2, 0.5, 1, 2, 5],    
}


In [None]:
scores_DHDT_make_class = [scores_dict_make_class[identifier]['DHDT']['accuracy'] for identifier in range(num_make_class_eval)]

scores_sklearn_make_class = [scores_dict_make_class[identifier]['sklearn']['accuracy'] for identifier in range(num_make_class_eval)]


## real-world

In [None]:
--> put eval in function 
    --> make parallel execution with different seeds / splits, etcs 
        --> compare (save all values and generate mean+std df, but keep all values)


In [None]:
config['n_samples'] = 10_000
config['number_of_variables'] = 5
random_seed = i

In [None]:
n_jobs = 20

identifier_list = [
                   'Cervical Cancer',
                   'Credit Card',
                   #'Absenteeism'
                  ]

parallel_real_world_eval = Parallel(n_jobs=n_jobs, verbose=3, backend='loky') #loky #sequential multiprocessing
evaluation_results_by_dataset = parallel_real_world_eval(delayed(evaluate_dhdt)(identifier) for identifier in identifier_list)


In [None]:
identifier_list = [
                   'Cervical Cancer',
                   'Credit Card',
                   #'Absenteeism'
                  ]

trials = 20
n_jobs = 20
random_seed = 42

parallel_real_world_eval = Parallel(n_jobs=n_jobs, verbose=3, backend='loky') #loky #sequential multiprocessing
evaluation_results_by_dataset = parallel_real_world_eval(delayed(evaluate_all_parallel)(identifier_list=identifier_list, 
                                                                               random_seed=random_seed+i) for i in range(trials))


In [None]:
identifier_list = [
                   'Cervical Cancer',
                   'Credit Card',
                   #'Absenteeism'
                  ]

evaluate_all_real_world(identifier_list)

In [None]:
def evaluate_all_real_world(identifier_list):
    
    identifier_list = ['Cervical Cancer',
                       'Credit Card',
                       'Absenteeism']

    dataset_dict = {}
    model_dict = {}

    scores_dict = {}

    for identifier in tqdm(identifier_list, desc='dataset loop'):

        print('_________________________________________________________________________________________________________________')   

        dataset_dict[identifier] = {}
        model_dict[identifier] = {}

        scores_dict[identifier] = {'sklearn': {},
                                   'DHDT': {}}

        dataset_dict[identifier] = get_preprocessed_dataset(identifier)    

        model_dict[identifier]['sklearn'] = DecisionTreeClassifier(max_depth=3, 
                                                                   random_state=42)

        model_dict[identifier]['sklearn'].fit(dataset_dict[identifier]['X_train'], 
                                              dataset_dict[identifier]['y_train'])

        scores_dict[identifier]['sklearn']['accuracy'] = model_dict[identifier]['sklearn'].score(dataset_dict[identifier]['X_test'], 
                                                                                                 dataset_dict[identifier]['y_test'])



        model_dict[identifier]['DHDT'] = DHDT(depth=3,
                                                 number_of_variables = dataset_dict[identifier]['X_train'].shape[1],
                                                 learning_rate=1e-3,
                                                 squeeze_factor = 1,
                                                 loss='binary_crossentropy',#'binary_crossentropy',
                                                 optimizer='rmsprop',
                                                 random_seed=40,
                                                 verbosity=0)

        scores_dict[identifier]['history'] = model_dict[identifier]['DHDT'].fit(dataset_dict[identifier]['X_train'], 
                                                                              dataset_dict[identifier]['y_train'], 
                                                                              batch_size=512, 
                                                                              epochs=1_000, 
                                                                              early_stopping_epochs=50, 
                                                                              valid_data=(dataset_dict[identifier]['X_valid'], dataset_dict[identifier]['y_valid']))

        dataset_dict[identifier]['y_test_dhdt'] = model_dict[identifier]['DHDT'].predict(dataset_dict[identifier]['X_test'])
        scores_dict[identifier]['DHDT']['accuracy'] = accuracy_score(dataset_dict[identifier]['y_test'], np.round(dataset_dict[identifier]['y_test_dhdt']))

        print('Test Accuracy Sklearn (' + identifier + ')', scores_dict[identifier]['sklearn']['accuracy'])
        print('Test Accuracy DHDT (' + identifier + ')', scores_dict[identifier]['DHDT']['accuracy'])   
        print('_________________________________________________________________________________________________________________')   

    return   model_dict, scores_dict, dataset_dit


In [None]:
identifier_list = ['Cervical Cancer',
                   'Credit Card',
                   'Absenteeism']

dataset_dict = {}
model_dict = {}

scores_dict = {}

for identifier in tqdm(identifier_list, desc='dataset loop'):
    
    print('_________________________________________________________________________________________________________________')   
    
    dataset_dict[identifier] = {}
    model_dict[identifier] = {}

    scores_dict[identifier] = {'sklearn': {},
                               'DHDT': {}}
    
    dataset_dict[identifier] = get_preprocessed_dataset(identifier)    

    model_dict[identifier]['sklearn'] = DecisionTreeClassifier(max_depth=3, 
                                                               random_state=42)

    model_dict[identifier]['sklearn'].fit(dataset_dict[identifier]['X_train'], 
                                          dataset_dict[identifier]['y_train'])

    scores_dict[identifier]['sklearn']['accuracy'] = model_dict[identifier]['sklearn'].score(dataset_dict[identifier]['X_test'], 
                                                                                             dataset_dict[identifier]['y_test'])



    model_dict[identifier]['DHDT'] = DHDT(depth=3,
                                             number_of_variables = dataset_dict[identifier]['X_train'].shape[1],
                                             learning_rate=1e-3,
                                             squeeze_factor = 1,
                                             loss='binary_crossentropy',#'binary_crossentropy',
                                             optimizer='rmsprop',
                                             random_seed=40,
                                             verbosity=0)

    scores_dict[identifier]['history'] = model_dict[identifier]['DHDT'].fit(dataset_dict[identifier]['X_train'], 
                                                                          dataset_dict[identifier]['y_train'], 
                                                                          batch_size=512, 
                                                                          epochs=1_000, 
                                                                          early_stopping_epochs=50, 
                                                                          valid_data=(dataset_dict[identifier]['X_valid'], dataset_dict[identifier]['y_valid']))

    dataset_dict[identifier]['y_test_dhdt'] = model_dict[identifier]['DHDT'].predict(dataset_dict[identifier]['X_test'])
    scores_dict[identifier]['DHDT']['accuracy'] = accuracy_score(dataset_dict[identifier]['y_test'], np.round(dataset_dict[identifier]['y_test_dhdt']))
    
    print('Test Accuracy Sklearn (' + identifier + ')', scores_dict[identifier]['sklearn']['accuracy'])
    print('Test Accuracy DHDT (' + identifier + ')', scores_dict[identifier]['DHDT']['accuracy'])   
    print('_________________________________________________________________________________________________________________')   

    

In [None]:
scores_DHDT = [scores_dict[identifier]['DHDT']['accuracy'] for identifier in identifier_list]

scores_sklearn = [scores_dict[identifier]['sklearn']['accuracy'] for identifier in identifier_list]

scores_dataframe = pd.DataFrame(data=np.vstack([scores_DHDT, scores_sklearn]).T, index = identifier_list, columns = ['DHDT', 'sklearn'])
scores_dataframe

In [None]:
identifier = "Absenteeism"
plt.figure(figsize=(15,8))
image = model_dict[identifier]['DHDT'].plot(normalizer_list=dataset_dict['normalizer_list'][identifier])
display(image)

plt.figure(figsize=(15,8))
plot_tree(model_dict[identifier]['sklearn'], fontsize=10) 
plt.show()

## Absenteeism