# Imports

In [1]:
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder
from livelossplot import PlotLosses

import os
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt

from IPython.display import Image
from IPython.display import display, clear_output

import pandas as pd

# Runtime configuration for TensorFlow / logging.
# NOTE(review): these environment variables are assigned after `import tensorflow`
# at the top of this cell; TF_CPP_MIN_LOG_LEVEL in particular is presumably read
# when TensorFlow is first loaded -- confirm the settings actually take effect,
# or move them above the TensorFlow import.

# Empty device list: presumably intended to hide all GPUs so everything runs on CPU -- confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = ''
# GPU memory growth is left unset (the previous value 'true' is kept as a reminder).
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = '' #'true'

import warnings
# Silence every Python warning for the remainder of the notebook.
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import logging

# Restrict the TensorFlow Python logger to errors and set the AutoGraph verbosity level.
tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(3)

from keras import backend as K
from keras.utils.generic_utils import get_custom_objects


import seaborn as sns
sns.set_style("darkgrid")

import time
import random

from utilities.utilities import *
from utilities.DHDT import *

# Evaluation

## make_classification

In [2]:
# Benchmark a depth-3 sklearn decision tree against a depth-3 DHDT on
# `num_make_class_eval` synthetic binary classification problems.
# Each problem is generated by make_classification with the run index as seed.
num_make_class_eval = 10

identifier_list_make_class = list(range(num_make_class_eval))

# Per-run containers, all keyed by the run identifier.
dataset_dict_make_class, model_dict_make_class, scores_dict_make_class = {}, {}, {}

for identifier in tqdm(identifier_list_make_class, desc='dataset loop'):

    print('_________________________________________________________________________________________________________________')

    # Short aliases for this run's entries (the same dict objects that are stored in the containers).
    mc_data = dataset_dict_make_class[identifier] = {}
    mc_models = model_dict_make_class[identifier] = {}
    mc_scores = scores_dict_make_class[identifier] = {'sklearn': {}, 'DHDT': {}}

    # Synthetic dataset; the run identifier doubles as the generator seed.
    X_data, y_data = make_classification(n_samples=10_000,
                                         n_features=5,
                                         n_informative=3,
                                         n_redundant=1,
                                         random_state=identifier)

    # preprocess_data returns three (X, y) pairs followed by the fitted normalizers.
    mc_train, mc_valid, mc_test, mc_normalizers = preprocess_data(X_data=X_data,
                                                                  y_data=y_data,
                                                                  nominal_features=[],
                                                                  ordinal_features=[],
                                                                  random_seed=42)
    mc_data['X_train'], mc_data['y_train'] = mc_train
    mc_data['X_valid'], mc_data['y_valid'] = mc_valid
    mc_data['X_test'], mc_data['y_test'] = mc_test
    mc_data['normalizer_list'] = mc_normalizers

    # --- Baseline: sklearn decision tree -------------------------------------
    mc_models['sklearn'] = DecisionTreeClassifier(max_depth=3, random_state=42)
    mc_models['sklearn'].fit(mc_data['X_train'], mc_data['y_train'])
    mc_scores['sklearn']['accuracy'] = mc_models['sklearn'].score(mc_data['X_test'], mc_data['y_test'])

    # --- DHDT ----------------------------------------------------------------
    mc_models['DHDT'] = DHDT(depth=3,
                             number_of_variables=mc_data['X_train'].shape[1],
                             learning_rate=1e-3,
                             squeeze_factor=1,
                             loss='binary_crossentropy',
                             optimizer='rmsprop',
                             random_seed=40,
                             verbosity=0)

    # The value returned by fit is stored next to the per-model score dicts.
    mc_scores['history'] = mc_models['DHDT'].fit(mc_data['X_train'],
                                                 mc_data['y_train'],
                                                 batch_size=512,
                                                 epochs=1_000,
                                                 early_stopping_epochs=50,
                                                 valid_data=(mc_data['X_valid'], mc_data['y_valid']))

    # Predictions are rounded to hard labels before computing accuracy.
    mc_data['y_test_dhdt'] = mc_models['DHDT'].predict(mc_data['X_test'])
    mc_scores['DHDT']['accuracy'] = accuracy_score(mc_data['y_test'], np.round(mc_data['y_test_dhdt']))

    print('Test Accuracy Sklearn (' + str(identifier) + ')', mc_scores['sklearn']['accuracy'])
    print('Test Accuracy DHDT (' + str(identifier) + ')', mc_scores['DHDT']['accuracy'])
    print('_________________________________________________________________________________________________________________')
    

dataset loop:   0%|          | 0/10 [00:00<?, ?it/s]

_________________________________________________________________________________________________________________
Original Data Shape (selected):  (10000, 5)
Original Data Shape (encoded):  (10000, 5)
Original Data Class Distribution:  5003  (true) / 4997  (false)
(7000, 5) (7000,)
(1000, 5) (1000,)
(2000, 5) (2000,)
True Ratio:  0.4957142857142857


epochs:   0%|          | 0/1000 [00:00<?, ?it/s]

Test Accuracy Sklearn (0) 0.887
Test Accuracy DHDT (0) 0.5065
_________________________________________________________________________________________________________________
_________________________________________________________________________________________________________________
Original Data Shape (selected):  (10000, 5)
Original Data Shape (encoded):  (10000, 5)
Original Data Class Distribution:  5000  (true) / 5000  (false)
(7000, 5) (7000,)
(1000, 5) (1000,)
(2000, 5) (2000,)
True Ratio:  0.49642857142857144


epochs:   0%|          | 0/1000 [00:00<?, ?it/s]

Test Accuracy Sklearn (1) 0.9235
Test Accuracy DHDT (1) 0.8685
_________________________________________________________________________________________________________________
_________________________________________________________________________________________________________________
Original Data Shape (selected):  (10000, 5)
Original Data Shape (encoded):  (10000, 5)
Original Data Class Distribution:  4992  (true) / 5008  (false)
(7000, 5) (7000,)
(1000, 5) (1000,)
(2000, 5) (2000,)
True Ratio:  0.49914285714285717


epochs:   0%|          | 0/1000 [00:00<?, ?it/s]

Test Accuracy Sklearn (2) 0.861
Test Accuracy DHDT (2) 0.7955
_________________________________________________________________________________________________________________
_________________________________________________________________________________________________________________
Original Data Shape (selected):  (10000, 5)
Original Data Shape (encoded):  (10000, 5)
Original Data Class Distribution:  5001  (true) / 4999  (false)
(7000, 5) (7000,)
(1000, 5) (1000,)
(2000, 5) (2000,)
True Ratio:  0.499


epochs:   0%|          | 0/1000 [00:00<?, ?it/s]

Test Accuracy Sklearn (3) 0.878
Test Accuracy DHDT (3) 0.818
_________________________________________________________________________________________________________________
_________________________________________________________________________________________________________________
Original Data Shape (selected):  (10000, 5)
Original Data Shape (encoded):  (10000, 5)
Original Data Class Distribution:  4999  (true) / 5001  (false)
(7000, 5) (7000,)
(1000, 5) (1000,)
(2000, 5) (2000,)
True Ratio:  0.49742857142857144


epochs:   0%|          | 0/1000 [00:00<?, ?it/s]

Test Accuracy Sklearn (4) 0.746
Test Accuracy DHDT (4) 0.536
_________________________________________________________________________________________________________________
_________________________________________________________________________________________________________________
Original Data Shape (selected):  (10000, 5)
Original Data Shape (encoded):  (10000, 5)
Original Data Class Distribution:  4994  (true) / 5006  (false)
(7000, 5) (7000,)
(1000, 5) (1000,)
(2000, 5) (2000,)
True Ratio:  0.5052857142857143


epochs:   0%|          | 0/1000 [00:00<?, ?it/s]

Test Accuracy Sklearn (5) 0.886
Test Accuracy DHDT (5) 0.727
_________________________________________________________________________________________________________________
_________________________________________________________________________________________________________________
Original Data Shape (selected):  (10000, 5)
Original Data Shape (encoded):  (10000, 5)
Original Data Class Distribution:  4999  (true) / 5001  (false)
(7000, 5) (7000,)
(1000, 5) (1000,)
(2000, 5) (2000,)
True Ratio:  0.5057142857142857


epochs:   0%|          | 0/1000 [00:00<?, ?it/s]

Test Accuracy Sklearn (6) 0.895
Test Accuracy DHDT (6) 0.608
_________________________________________________________________________________________________________________
_________________________________________________________________________________________________________________
Original Data Shape (selected):  (10000, 5)
Original Data Shape (encoded):  (10000, 5)
Original Data Class Distribution:  5001  (true) / 4999  (false)
(7000, 5) (7000,)
(1000, 5) (1000,)
(2000, 5) (2000,)
True Ratio:  0.4972857142857143


epochs:   0%|          | 0/1000 [00:00<?, ?it/s]

Test Accuracy Sklearn (7) 0.909
Test Accuracy DHDT (7) 0.9045
_________________________________________________________________________________________________________________
_________________________________________________________________________________________________________________
Original Data Shape (selected):  (10000, 5)
Original Data Shape (encoded):  (10000, 5)
Original Data Class Distribution:  4992  (true) / 5008  (false)
(7000, 5) (7000,)
(1000, 5) (1000,)
(2000, 5) (2000,)
True Ratio:  0.49785714285714283


epochs:   0%|          | 0/1000 [00:00<?, ?it/s]

Test Accuracy Sklearn (8) 0.8155
Test Accuracy DHDT (8) 0.7575
_________________________________________________________________________________________________________________
_________________________________________________________________________________________________________________
Original Data Shape (selected):  (10000, 5)
Original Data Shape (encoded):  (10000, 5)
Original Data Class Distribution:  5002  (true) / 4998  (false)
(7000, 5) (7000,)
(1000, 5) (1000,)
(2000, 5) (2000,)
True Ratio:  0.5038571428571429


epochs:   0%|          | 0/1000 [00:00<?, ?it/s]

Test Accuracy Sklearn (9) 0.9055
Test Accuracy DHDT (9) 0.541
_________________________________________________________________________________________________________________


In [11]:
# Collect the test accuracies of both models over all synthetic runs,
# ordered by run index (DHDT first, then sklearn).
scores_DHDT_make_class, scores_sklearn_make_class = (
    [scores_dict_make_class[run_index][model_name]['accuracy']
     for run_index in range(num_make_class_eval)]
    for model_name in ('DHDT', 'sklearn')
)


## real-world

In [None]:
# TODO: put the evaluation in a function
#     --> run it in parallel with different seeds / splits, etc.
#         --> compare the results (save all values and generate a mean+std DataFrame, but keep all values)


In [None]:
# Number of parallel jobs; passed as `n_jobs` to the Parallel(...) call in the next cell.
n_jobs = 20

In [None]:
# Evaluate every dataset in `identifier_list` in parallel, one job per dataset.
# NOTE(review): `Parallel`/`delayed`, `backend`, `identifier_list` and
# `distribution_evaluation_single_model_synthetic_data` are not defined in this cell
# or above it in the visible notebook -- they presumably come from the star imports
# or from a cell that must be run first; confirm before relying on Restart & Run All.
parallel_real_world_eval = Parallel(n_jobs=n_jobs, verbose=3, backend=backend) #loky #sequential multiprocessing
# Use the executor created above (the previous code called a differently named,
# undefined `parallel_inet_evaluation`, leaving this executor unused).
evaluation_results_by_dataset = parallel_real_world_eval(delayed(distribution_evaluation_single_model_synthetic_data)(i, identifier) for i, identifier in enumerate(identifier_list))


In [15]:
# Benchmark a depth-3 sklearn decision tree against a depth-3 DHDT on the
# real-world datasets named below; results are keyed by dataset name.
identifier_list = ['Cervical Cancer', 'Credit Card', 'Absenteeism']

dataset_dict, model_dict, scores_dict = {}, {}, {}

for identifier in tqdm(identifier_list, desc='dataset loop'):

    print('_________________________________________________________________________________________________________________')

    # Initialise this dataset's entries; the data entry is replaced right below
    # by the dict returned from get_preprocessed_dataset.
    dataset_dict[identifier] = {}
    rw_models = model_dict[identifier] = {}
    rw_scores = scores_dict[identifier] = {'sklearn': {}, 'DHDT': {}}

    rw_data = dataset_dict[identifier] = get_preprocessed_dataset(identifier)

    # --- Baseline: sklearn decision tree -------------------------------------
    rw_models['sklearn'] = DecisionTreeClassifier(max_depth=3, random_state=42)
    rw_models['sklearn'].fit(rw_data['X_train'], rw_data['y_train'])
    rw_scores['sklearn']['accuracy'] = rw_models['sklearn'].score(rw_data['X_test'], rw_data['y_test'])

    # --- DHDT ----------------------------------------------------------------
    rw_models['DHDT'] = DHDT(depth=3,
                             number_of_variables=rw_data['X_train'].shape[1],
                             learning_rate=1e-3,
                             squeeze_factor=1,
                             loss='binary_crossentropy',
                             optimizer='rmsprop',
                             random_seed=40,
                             verbosity=0)

    # The value returned by fit is stored next to the per-model score dicts.
    rw_scores['history'] = rw_models['DHDT'].fit(rw_data['X_train'],
                                                 rw_data['y_train'],
                                                 batch_size=512,
                                                 epochs=1_000,
                                                 early_stopping_epochs=50,
                                                 valid_data=(rw_data['X_valid'], rw_data['y_valid']))

    # Predictions are rounded to hard labels before computing accuracy.
    rw_data['y_test_dhdt'] = rw_models['DHDT'].predict(rw_data['X_test'])
    rw_scores['DHDT']['accuracy'] = accuracy_score(rw_data['y_test'], np.round(rw_data['y_test_dhdt']))

    print('Test Accuracy Sklearn (' + identifier + ')', rw_scores['sklearn']['accuracy'])
    print('Test Accuracy DHDT (' + identifier + ')', rw_scores['DHDT']['accuracy'])
    print('_________________________________________________________________________________________________________________')

    

dataset loop:   0%|          | 0/3 [00:00<?, ?it/s]

_________________________________________________________________________________________________________________
Original Data Shape (selected):  (858, 15)
Original Data Shape (encoded):  (858, 15)
Original Data Class Distribution:  55  (true) / 803  (false)
(602, 15) (602,)
(85, 15) (85,)
(171, 15) (171,)
True Ratio:  0.061461794019933555
True Ratio:  0.5


epochs:   0%|          | 0/1000 [00:00<?, ?it/s]

Test Accuracy Sklearn (Cervical Cancer) 0.4678362573099415
Test Accuracy DHDT (Cervical Cancer) 0.17543859649122806
_________________________________________________________________________________________________________________
_________________________________________________________________________________________________________________
Original Data Shape (selected):  (30000, 23)
Original Data Shape (encoded):  (30000, 23)
Original Data Class Distribution:  23364  (true) / 6636  (false)
(21000, 23) (21000,)
(3000, 23) (3000,)
(6000, 23) (6000,)
True Ratio:  0.7782857142857142
True Ratio:  0.5


epochs:   0%|          | 0/1000 [00:00<?, ?it/s]

Test Accuracy Sklearn (Credit Card) 0.7761666666666667
Test Accuracy DHDT (Credit Card) 0.782
_________________________________________________________________________________________________________________
_________________________________________________________________________________________________________________
Original Data Shape (selected):  (740, 15)
Original Data Shape (encoded):  (740, 15)
Original Data Class Distribution:  279  (true) / 461  (false)
(518, 15) (518,)
(74, 15) (74,)
(148, 15) (148,)
True Ratio:  0.3861003861003861


epochs:   0%|          | 0/1000 [00:00<?, ?it/s]

Test Accuracy Sklearn (Absenteeism) 0.6351351351351351
Test Accuracy DHDT (Absenteeism) 0.6283783783783784
_________________________________________________________________________________________________________________


In [19]:
# Test accuracies per real-world dataset, in the order of `identifier_list`.
scores_DHDT = [scores_dict[dataset_name]['DHDT']['accuracy'] for dataset_name in identifier_list]
scores_sklearn = [scores_dict[dataset_name]['sklearn']['accuracy'] for dataset_name in identifier_list]

# One row per dataset, one column per model.
accuracy_matrix = np.column_stack([scores_DHDT, scores_sklearn])
scores_dataframe = pd.DataFrame(data=accuracy_matrix,
                                index=identifier_list,
                                columns=['DHDT', 'sklearn'])
scores_dataframe

Unnamed: 0,DHDT,sklearn
Cervical Cancer,0.175439,0.467836
Credit Card,0.782,0.776167
Absenteeism,0.628378,0.635135


In [None]:
# Visualise both trained trees for a single dataset.
identifier = "Absenteeism"

# DHDT: `dataset_dict` is keyed by dataset name first, so the normalizers must be
# looked up as dataset_dict[identifier]['normalizer_list'] (the reversed order raised
# a KeyError because 'normalizer_list' is never a top-level key of dataset_dict).
# NOTE(review): assumes get_preprocessed_dataset stores a 'normalizer_list' entry,
# as the make_classification cell does -- confirm.
plt.figure(figsize=(15,8))
image = model_dict[identifier]['DHDT'].plot(normalizer_list=dataset_dict[identifier]['normalizer_list'])
display(image)

# sklearn baseline tree.
plt.figure(figsize=(15,8))
plot_tree(model_dict[identifier]['sklearn'], fontsize=10)
plt.show()

## Absenteeism