# Imports

In [None]:
# Central experiment configuration, split into three sections.
config = dict(
    # Settings stored under 'dhdt' (presumably the DHDT model hyperparameters - confirm
    # against the code that reads this section; it is not visible in this notebook).
    dhdt=dict(
        depth=3,
        learning_rate=1e-3,
        loss='binary_crossentropy',  # alternative noted by the author: 'mae'
        optimizer='adam',
        beta_1=100,
        beta_2=100,
        squeeze_factor=1,
    ),
    # Synthetic-data settings; this section is handed to evaluate_synthetic_parallel as `config`.
    make_classification=dict(
        number_of_variables=5,
        n_samples=10_000,
    ),
    # Run control: base seed, repetition counts and the joblib worker count.
    computation=dict(
        random_seed=42,
        num_eval=10,
        trials=5,
        n_jobs=60,
        verbosity=0,
    ),
)



In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder
from livelossplot import PlotLosses

import os
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt

from IPython.display import Image
from IPython.display import display, clear_output

import pandas as pd

os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = '' #'true'

import warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import logging

tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(3)

from keras import backend as K
from keras.utils.generic_utils import get_custom_objects


import seaborn as sns
sns.set_style("darkgrid")

import time
import random

from utilities.utilities import *
from utilities.DHDT import *

from joblib import Parallel, delayed

from itertools import product
from collections.abc import Iterable


# Evaluation

## make_classification

In [None]:
#model_seed_list = [i for i in range(config['computation']['trials'])]
#data_seed_list = [i for i in range(config['computation']['num_eval'])]

#combined_seed_list = list(product(model_seed_list, data_seed_list))

# Fan the synthetic benchmark out over joblib workers: one task per generated dataset.
computation_settings = config['computation']

# Every task gets its own data seed (base seed + task index) but shares the same model seed.
synthetic_tasks = (
    delayed(evaluate_synthetic_parallel)(
        index=task_index,
        random_seed_data=computation_settings['random_seed'] + task_index,
        random_seed_model=computation_settings['random_seed'],
        trials=computation_settings['trials'],
        config=config['make_classification'],
        verbosity=-1,
    )
    for task_index in range(computation_settings['num_eval'])
)

# Other backends the author noted for this call: sequential, multiprocessing.
parallel_eval_synthetic = Parallel(n_jobs=computation_settings['n_jobs'], verbose=3, backend='loky')
evaluation_results_synthetic = parallel_eval_synthetic(synthetic_tasks)


In [None]:
# Fold the per-task result triples (index 0: models, 1: scores, 2: datasets) into three
# combined dicts. The first triple seeds the accumulators; later ones go through mergeDict.
for i, synthetic_result in enumerate(evaluation_results_synthetic):
    if i > 0:
        model_dict_synthetic = mergeDict(model_dict_synthetic, synthetic_result[0])
        scores_dict_synthetic = mergeDict(scores_dict_synthetic, synthetic_result[1])
        dataset_dict_synthetic = mergeDict(dataset_dict_synthetic, synthetic_result[2])
        continue

    model_dict_synthetic, scores_dict_synthetic, dataset_dict_synthetic = (
        synthetic_result[0],
        synthetic_result[1],
        synthetic_result[2],
    )

In [None]:
# Summarise the synthetic benchmark: mean / max / std of the accuracies along axis 1
# for each evaluated dataset, for both approaches.
metrics = ['accuracy']
index = list(range(config['computation']['num_eval']))
columns = flatten_list([
    [[f'{approach} {metric}_{statistic}' for statistic in ('mean', 'max', 'std')] for metric in metrics]
    for approach in ['DHDT', 'sklearn']
])

# One list of accuracies per evaluation index, for each approach.
scores_DHDT, scores_sklearn = (
    [scores_dict_synthetic[eval_index][approach]['accuracy'] for eval_index in index]
    for approach in ('DHDT', 'sklearn')
)

scores_DHDT_mean, scores_DHDT_max, scores_DHDT_std = (
    statistic(scores_DHDT, axis=1) for statistic in (np.mean, np.max, np.std)
)
scores_sklearn_mean, scores_sklearn_max, scores_sklearn_std = (
    statistic(scores_sklearn, axis=1) for statistic in (np.mean, np.max, np.std)
)

# Rows are stacked as mean, max, std per approach; the transpose below turns them into columns.
results_DHDT = np.vstack([scores_DHDT_mean, scores_DHDT_max, scores_DHDT_std])
results_sklearn = np.vstack([scores_sklearn_mean, scores_sklearn_max, scores_sklearn_std])

scores_dataframe_synthetic = pd.DataFrame(
    data=np.vstack([results_DHDT, results_sklearn]).T,
    index=index,
    columns=columns,
)

display(scores_dataframe_synthetic)
# Every third column starting at position 1 (the *_max columns, assuming flatten_list keeps
# the mean/max/std order used above).
every_third_column = scores_dataframe_synthetic.columns[1::3]
display(scores_dataframe_synthetic[every_third_column])
display(scores_dataframe_synthetic.describe())

## Real-World Eval

In [None]:
# Real-world benchmark datasets, in evaluation order.
# The trailing numbers are carried over from the original annotations (presumably a
# per-dataset feature count - this cannot be verified from the notebook itself).
identifier_list = [
    'Adult',                               # 32
    'Bank Marketing',                      # 32
    'Loan Credit',                         # 32
    'Credit Card',                         # 23
    'Car',                                 # 21
    'Absenteeism',                         # 15
    'Loan House',                          # 15
    'Cervical Cancer',                     # 15
    'Heart Disease',                       # 13
    'Titanic',                             # 10
    'Medical Insurance',                   # 10
    'Brest Cancer Wisconsin',              # 10
    'Wisconsin Diagnostic Breast Cancer',  # 10
    'Wisconsin Prognostic Breast Cancer',  # 10
    'Abalone',                             # 10
    'Haberman',                            # 3
]

# Run the whole real-world benchmark once per trial; each trial uses a different model seed
# (base seed + trial number) and evaluates every dataset in identifier_list.
trial_count = config['computation']['trials']
base_model_seed = config['computation']['random_seed']

real_world_tasks = (
    delayed(evaluate_real_world_parallel)(
        identifier_list=identifier_list,
        random_seed_model=base_model_seed + trial,
        verbosity=-1,
    )
    for trial in range(trial_count)
)

# Other backends the author noted for this call: sequential, multiprocessing.
parallel_eval_real_world = Parallel(n_jobs=config['computation']['n_jobs'], verbose=3, backend='loky')
evaluation_results_real_world = parallel_eval_real_world(real_world_tasks)


# Combine the per-trial result triples (index 0: models, 1: scores, 2: datasets).
# The first trial initialises the accumulators; every later trial is merged in via mergeDict.
for i, real_world_result in enumerate(evaluation_results_real_world):
    trial_models, trial_scores, trial_datasets = (
        real_world_result[0],
        real_world_result[1],
        real_world_result[2],
    )

    if i == 0:
        model_dict_real_world = trial_models
        scores_dict_real_world = trial_scores
        dataset_dict_real_world = trial_datasets
        continue

    model_dict_real_world = mergeDict(model_dict_real_world, trial_models)
    scores_dict_real_world = mergeDict(scores_dict_real_world, trial_scores)
    dataset_dict_real_world = mergeDict(dataset_dict_real_world, trial_datasets)

In [None]:
# Summarise the real-world benchmark: mean / max / std of the accuracies along axis 1
# for each dataset in identifier_list, for both approaches.
metrics = ['accuracy']
index = identifier_list
# Three names per approach (mean, max, std) so the column count matches the six rows that are
# stacked and transposed below; with only mean/std names pd.DataFrame rejects the shape.
columns = flatten_list([[[approach + ' ' + metric + '_mean', approach + ' ' + metric + '_max', approach + ' ' + metric + '_std'] for metric in metrics] for approach in ['DHDT', 'sklearn']])

scores_DHDT = [scores_dict_real_world[identifier]['DHDT']['accuracy'] for identifier in identifier_list]

scores_sklearn = [scores_dict_real_world[identifier]['sklearn']['accuracy'] for identifier in identifier_list]


scores_DHDT_mean = np.mean(scores_DHDT, axis=1)
scores_sklearn_mean = np.mean(scores_sklearn, axis=1)

scores_DHDT_max = np.max(scores_DHDT, axis=1)
scores_sklearn_max = np.max(scores_sklearn, axis=1)

scores_DHDT_std = np.std(scores_DHDT, axis=1)
scores_sklearn_std = np.std(scores_sklearn, axis=1)

# Row order here (mean, max, std) must stay in sync with the column names built above.
results_DHDT = np.vstack([scores_DHDT_mean, scores_DHDT_max, scores_DHDT_std])
results_sklearn = np.vstack([scores_sklearn_mean, scores_sklearn_max, scores_sklearn_std])


scores_dataframe_real_world = pd.DataFrame(data=np.vstack([results_DHDT, results_sklearn]).T, index = index, columns = columns)
# Show the frame built in this cell (the original displayed the synthetic table again).
display(scores_dataframe_real_world)
# Every third column starting at position 1 (the *_max columns, assuming flatten_list keeps
# the mean/max/std order used above).
display(scores_dataframe_real_world[scores_dataframe_real_world.columns[1::3]])

In [None]:
# Inspect the merged score dictionary of the real-world runs (cell output only).
scores_dict_real_world

In [None]:
# Re-extract the accuracy entries per dataset, once for each approach.
scores_DHDT, scores_sklearn = (
    [scores_dict_real_world[dataset_name][approach]['accuracy'] for dataset_name in identifier_list]
    for approach in ('DHDT', 'sklearn')
)


In [None]:
# Average the DHDT accuracies along axis 1, i.e. one value per entry of scores_DHDT.
np.mean(scores_DHDT, axis=1)

In [None]:
scores_DHDT = [scores_dict_real_world[identifier]['DHDT']['accuracy'] for identifier in identifier_list]

scores_sklearn = [scores_dict_real_world[identifier]['sklearn']['accuracy'] for identifier in identifier_list]


def _mean_over_trials(scores):
    """Collapse scores to one value per dataset.

    2-D input (one row of accuracies per dataset) is averaged along axis 1;
    1-D input (already one accuracy per dataset) is returned unchanged.
    """
    scores_array = np.asarray(scores)
    return scores_array.mean(axis=1) if scores_array.ndim > 1 else scores_array


# The summary cell above averages these same lists with axis=1, so each entry holds several
# accuracies. Stacking them raw cannot match a 2-column frame indexed by dataset, hence the
# reduction to one value per dataset before building the table.
scores_dataframe = pd.DataFrame(data=np.vstack([_mean_over_trials(scores_DHDT), _mean_over_trials(scores_sklearn)]).T, index = identifier_list, columns = ['DHDT', 'sklearn'])
scores_dataframe

In [None]:
# Accuracy table (datasets x approaches) built from `scores_dict`.
# NOTE(review): in the visible part of this notebook `scores_dict` is only created in a cell
# further down, so this cell depends on execution order - confirm before a Restart & Run All.
accuracy_by_approach = {
    approach: [scores_dict[dataset_name][approach]['accuracy'] for dataset_name in identifier_list]
    for approach in ('DHDT', 'sklearn')
}
scores_DHDT = accuracy_by_approach['DHDT']
scores_sklearn = accuracy_by_approach['sklearn']

scores_dataframe = pd.DataFrame(
    data=np.vstack([scores_DHDT, scores_sklearn]).T,
    index=identifier_list,
    columns=['DHDT', 'sklearn'],
)
scores_dataframe

In [None]:
# Same accuracy table as the previous cell (this cell is a verbatim repeat of it).
# NOTE(review): relies on `scores_dict`, which the visible notebook only defines further down.
scores_DHDT, scores_sklearn = (
    [scores_dict[dataset_name][approach]['accuracy'] for dataset_name in identifier_list]
    for approach in ('DHDT', 'sklearn')
)

# Rows are stacked per approach, then transposed so each approach becomes a column.
stacked_scores = np.vstack([scores_DHDT, scores_sklearn])
scores_dataframe = pd.DataFrame(data=stacked_scores.T, index=identifier_list, columns=['DHDT', 'sklearn'])
scores_dataframe

In [None]:
# Candidate hyperparameter values for a search over the DHDT settings.
# NOTE(review): 'depth' is a bare int while every other entry is a list of candidates -
# confirm that whatever consumes this grid accepts a scalar here.
parameter_grid = dict(
    depth=3,
    learning_rate=[0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001],
    loss=['binary_crossentropy', 'rmse'],  # alternative noted by the author: 'mae'
    optimizer=['adam', 'sgd'],
    beta_1=[10, 50, 100],
    beta_2=[10, 50, 100],
    squeeze_factor=[0.2, 0.5, 1, 2, 5],
)


In [None]:
# Collect the accuracy entries of the make_classification runs for both approaches.
# NOTE(review): `scores_dict_make_class` and `num_make_class_eval` are not defined anywhere in
# the visible notebook - this cell looks like a leftover from an earlier version; confirm.
scores_DHDT_make_class, scores_sklearn_make_class = (
    [scores_dict_make_class[eval_index][approach]['accuracy'] for eval_index in range(num_make_class_eval)]
    for approach in ('DHDT', 'sklearn')
)


## real-world

In [None]:
# TODO: move the evaluation into a function
#     --> run it in parallel with different seeds / splits, etc.
#         --> compare the runs (save all values and generate a mean + std DataFrame, but keep every individual value)


In [None]:
# NOTE(review): these two keys are written to the top level of `config`; the entries of the
# same name under config['make_classification'] are not touched - confirm this is intended.
config.update({'n_samples': 10_000, 'number_of_variables': 5})

# NOTE(review): `i` is not set in this cell; it is whatever value an earlier loop left behind.
random_seed = i

In [None]:
# Ad-hoc run: one joblib task per dataset, each calling evaluate_dhdt with the dataset name.
# NOTE(review): evaluate_dhdt is not defined in the visible notebook (presumably provided by
# one of the star imports) - confirm.
n_jobs = 20

identifier_list = [
    'Cervical Cancer',
    'Credit Card',
    # 'Absenteeism'
]

dataset_tasks = (delayed(evaluate_dhdt)(dataset_name) for dataset_name in identifier_list)

# Other backends the author noted for this call: sequential, multiprocessing.
parallel_real_world_eval = Parallel(n_jobs=n_jobs, verbose=3, backend='loky')
evaluation_results_by_dataset = parallel_real_world_eval(dataset_tasks)


In [None]:
# Ad-hoc run: one joblib task per trial; every trial evaluates all listed datasets with its
# own seed (random_seed + trial number).
# NOTE(review): evaluate_all_parallel is not defined in the visible notebook (presumably
# provided by one of the star imports) - confirm.
identifier_list = [
    'Cervical Cancer',
    'Credit Card',
    # 'Absenteeism'
]

trials = 20
n_jobs = 20
random_seed = 42

trial_tasks = (
    delayed(evaluate_all_parallel)(identifier_list=identifier_list, random_seed=random_seed + trial)
    for trial in range(trials)
)

# Other backends the author noted for this call: sequential, multiprocessing.
parallel_real_world_eval = Parallel(n_jobs=n_jobs, verbose=3, backend='loky')
evaluation_results_by_dataset = parallel_real_world_eval(trial_tasks)


In [None]:
# Run the sequential real-world evaluation on two datasets and show its return value.
# NOTE(review): within the visible notebook, evaluate_all_real_world is defined in the cell
# *below* this one, so this call depends on execution order.
identifier_list = ['Cervical Cancer', 'Credit Card']  # 'Absenteeism' is commented out in the original

evaluate_all_real_world(identifier_list)

In [None]:
def evaluate_all_real_world(identifier_list=None):
    """Fit a sklearn decision tree and a DHDT on each dataset and record test accuracy.

    Parameters
    ----------
    identifier_list : iterable of str, optional
        Dataset identifiers passed one by one to ``get_preprocessed_dataset``.
        When omitted (``None``), the three datasets that were previously hard-coded
        in the body are used.

    Returns
    -------
    tuple of dict
        ``(model_dict, scores_dict, dataset_dict)``, each keyed by dataset identifier:

        * ``model_dict[identifier]`` holds the fitted models under ``'sklearn'`` and ``'DHDT'``.
        * ``scores_dict[identifier]`` holds ``['sklearn']['accuracy']``,
          ``['DHDT']['accuracy']`` and ``['history']`` (the return value of ``DHDT.fit``).
        * ``dataset_dict[identifier]`` is the dict returned by ``get_preprocessed_dataset``
          with the DHDT test predictions added under ``'y_test_dhdt'``.
    """
    if identifier_list is None:
        # Fallback: the selection that used to overwrite the argument unconditionally.
        identifier_list = ['Cervical Cancer',
                           'Credit Card',
                           'Absenteeism']

    dataset_dict = {}
    model_dict = {}
    scores_dict = {}

    separator = '_________________________________________________________________________________________________________________'

    for identifier in tqdm(identifier_list, desc='dataset loop'):

        print(separator)

        model_dict[identifier] = {}
        scores_dict[identifier] = {'sklearn': {},
                                   'DHDT': {}}

        dataset = get_preprocessed_dataset(identifier)
        dataset_dict[identifier] = dataset

        # Baseline: depth-3 sklearn tree with a fixed random_state, scored on the test split.
        sklearn_tree = DecisionTreeClassifier(max_depth=3,
                                              random_state=42)
        sklearn_tree.fit(dataset['X_train'],
                         dataset['y_train'])
        model_dict[identifier]['sklearn'] = sklearn_tree

        scores_dict[identifier]['sklearn']['accuracy'] = sklearn_tree.score(dataset['X_test'],
                                                                            dataset['y_test'])

        # DHDT of the same depth; the input width is taken from the training matrix.
        dhdt_model = DHDT(depth=3,
                          number_of_variables=dataset['X_train'].shape[1],
                          learning_rate=1e-3,
                          squeeze_factor=1,
                          loss='binary_crossentropy',
                          optimizer='rmsprop',
                          random_seed=40,
                          verbosity=0)
        model_dict[identifier]['DHDT'] = dhdt_model

        scores_dict[identifier]['history'] = dhdt_model.fit(dataset['X_train'],
                                                            dataset['y_train'],
                                                            batch_size=512,
                                                            epochs=1_000,
                                                            early_stopping_epochs=50,
                                                            valid_data=(dataset['X_valid'], dataset['y_valid']))

        # Predictions are rounded before they are compared with the test labels.
        dataset['y_test_dhdt'] = dhdt_model.predict(dataset['X_test'])
        scores_dict[identifier]['DHDT']['accuracy'] = accuracy_score(dataset['y_test'], np.round(dataset['y_test_dhdt']))

        print('Test Accuracy Sklearn (' + identifier + ')', scores_dict[identifier]['sklearn']['accuracy'])
        print('Test Accuracy DHDT (' + identifier + ')', scores_dict[identifier]['DHDT']['accuracy'])
        print(separator)

    # The original returned the undefined name `dataset_dit`, raising NameError on every call.
    return model_dict, scores_dict, dataset_dict


In [None]:
# Single sequential run: for each dataset, fit a depth-3 sklearn tree and a depth-3 DHDT
# and record both test accuracies.
identifier_list = ['Cervical Cancer', 'Credit Card', 'Absenteeism']

dataset_dict, model_dict, scores_dict = {}, {}, {}

separator_line = '_________________________________________________________________________________________________________________'

for identifier in tqdm(identifier_list, desc='dataset loop'):

    print(separator_line)

    # Per-dataset containers; the aliases point at the very objects stored in the global dicts.
    dataset_dict[identifier] = {}
    current_models = model_dict[identifier] = {}
    current_scores = scores_dict[identifier] = {'sklearn': {}, 'DHDT': {}}
    current_data = dataset_dict[identifier] = get_preprocessed_dataset(identifier)

    # Baseline: sklearn tree with a fixed random_state, scored on the test split.
    sklearn_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
    current_models['sklearn'] = sklearn_tree
    sklearn_tree.fit(current_data['X_train'], current_data['y_train'])
    current_scores['sklearn']['accuracy'] = sklearn_tree.score(current_data['X_test'], current_data['y_test'])

    # DHDT of the same depth; the input width is taken from the training matrix.
    dhdt_model = DHDT(
        depth=3,
        number_of_variables=current_data['X_train'].shape[1],
        learning_rate=1e-3,
        squeeze_factor=1,
        loss='binary_crossentropy',
        optimizer='rmsprop',
        random_seed=40,
        verbosity=0,
    )
    current_models['DHDT'] = dhdt_model

    current_scores['history'] = dhdt_model.fit(
        current_data['X_train'],
        current_data['y_train'],
        batch_size=512,
        epochs=1_000,
        early_stopping_epochs=50,
        valid_data=(current_data['X_valid'], current_data['y_valid']),
    )

    # Predictions are rounded before they are compared with the test labels.
    current_data['y_test_dhdt'] = dhdt_model.predict(current_data['X_test'])
    current_scores['DHDT']['accuracy'] = accuracy_score(current_data['y_test'], np.round(current_data['y_test_dhdt']))

    print(f'Test Accuracy Sklearn ({identifier})', current_scores['sklearn']['accuracy'])
    print(f'Test Accuracy DHDT ({identifier})', current_scores['DHDT']['accuracy'])
    print(separator_line)

    

In [None]:
# Accuracy table for the single run above: one row per dataset, one column per approach.
scores_DHDT = [scores_dict[dataset_name]['DHDT']['accuracy'] for dataset_name in identifier_list]
scores_sklearn = [scores_dict[dataset_name]['sklearn']['accuracy'] for dataset_name in identifier_list]

# Stack one row per approach, then transpose so the approaches become the columns.
accuracy_rows = np.vstack([scores_DHDT, scores_sklearn])
scores_dataframe = pd.DataFrame(
    data=accuracy_rows.T,
    index=identifier_list,
    columns=['DHDT', 'sklearn'],
)
scores_dataframe

In [None]:
# Show both fitted trees for one dataset: the DHDT rendering first, then the sklearn tree.
identifier = "Absenteeism"
plt.figure(figsize=(15,8))
# dataset_dict is filled per dataset identifier, so the identifier has to be the first key;
# indexing it with 'normalizer_list' first raises KeyError.
# NOTE(review): assumes get_preprocessed_dataset stores a 'normalizer_list' entry - confirm.
image = model_dict[identifier]['DHDT'].plot(normalizer_list=dataset_dict[identifier]['normalizer_list'])
display(image)

plt.figure(figsize=(15,8))
plot_tree(model_dict[identifier]['sklearn'], fontsize=10) 
plt.show()

## Absenteeism