# Imports

In [1]:
config = {
    'dhdt': {
        'depth': 3,
        'learning_rate': 1e-3,
        
        'loss': 'binary_crossentropy',#'mae',
        'optimizer': 'adam',        
        
        'beta_1': 100,
        'beta_2': 100,
        
        'squeeze_factor': 1,
    },
    
    
    
    'make_classification': {
        'number_of_variables': 5,
        'n_samples': 10_000,
    },

    
    'computation': {
        'random_seed': 42,
        'num_eval': 10,
        'trials': 2,
        'n_jobs': 10,
        'verbosity': 0,
    },
}



In [2]:
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder
from livelossplot import PlotLosses

import os
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt

from IPython.display import Image
from IPython.display import display, clear_output

import pandas as pd

os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = '' #'true'

import warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import logging

tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(3)

from keras import backend as K
from keras.utils.generic_utils import get_custom_objects


import seaborn as sns
sns.set_style("darkgrid")

import time
import random

from utilities.utilities import *
from utilities.DHDT import *

from joblib import Parallel, delayed

from itertools import product

# Evaluation

## make_classification

In [3]:
identifier_list = [
                   'make_classification',
                  ]

#model_seed_list = [i for i in range(config['computation']['trials'])]
#data_seed_list = [i for i in range(config['computation']['num_eval'])]

#combined_seed_list = list(product(model_seed_list, data_seed_list))

parallel_eval = Parallel(n_jobs=config['computation']['n_jobs'], verbose=3, backend='loky') #loky #sequential multiprocessing
evaluation_results = parallel_eval(delayed(evaluate_synthetic_parallel)(identifier_list=identifier_list, 
                                                                      random_seed_data=config['computation']['random_seed']+random_seed_data,
                                                                      random_seed_model=config['computation']['random_seed'],#+random_seed_model,
                                                                      trials = config['computation']['trials'],
                                                                      config = config['make_classification'],
                                                                      verbosity = -1) for random_seed_data in range(config['computation']['num_eval']))
    

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:  4.4min remaining: 10.2min
[Parallel(n_jobs=10)]: Done   7 out of  10 | elapsed:  5.9min remaining:  2.5min
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:  6.7min finished


In [5]:
evaluation_results[0]

NameError: name 'evaluation_results' is not defined

In [None]:
identifier_list = [
                   'Cervical Cancer',
                   'Credit Card',
                   #'Absenteeism'
                  ]

parallel_eval_real_world = Parallel(n_jobs=config['computation']['n_jobs'], verbose=3, backend='loky') #loky #sequential multiprocessing
evaluation_results_real_world = parallel_eval_real_world(delayed(evaluate_real_world_parallel)(identifier_list=identifier_list, 
                                                                                    random_seed_model=config['computation']['random_seed']+i,
                                                                                    verbosity = -1) for i in range(config['computation']['trials']))


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.


In [5]:
identifier_list = [
                   'Cervical Cancer',
                   'Credit Card',
                   #'Absenteeism'
                  ]

parallel_eval_real_world = Parallel(n_jobs=config['computation']['n_jobs'], verbose=3, backend='multiprocessing') #loky #sequential multiprocessing
evaluation_results_real_world = parallel_eval(delayed(evaluate_all_parallel)(identifier_list=identifier_list, 
                                                                             random_seed_model=config['computation']['random_seed']+i,
                                                                             verbosity = -1) for i in range(config['computation']['trials']))


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.


epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]


[Parallel(n_jobs=10)]: Done   2 out of   2 | elapsed:  9.6min remaining:    0.0s
[Parallel(n_jobs=10)]: Done   2 out of   2 | elapsed:  9.6min finished


In [8]:
config['computation']['trials']

2

In [None]:
for real_world_result in evaluation_results_real_world:
    

In [13]:
evaluation_results_real_world[0][1]

{'Cervical Cancer': {'sklearn': {'accuracy': 0.4678362573099415},
  'DHDT': {'accuracy': 0.17543859649122806},
  'history': None},
 'Credit Card': {'sklearn': {'accuracy': 0.7761666666666667},
  'DHDT': {'accuracy': 0.21883333333333332},
  'history': None}}

epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]
epochs:   0%|          | 0/1000 [00:00<?, ?it/s]


In [9]:
len(evaluation_results_real_world)

2

In [None]:
scores_DHDT = [scores_dict[identifier]['DHDT']['accuracy'] for identifier in identifier_list]

scores_sklearn = [scores_dict[identifier]['sklearn']['accuracy'] for identifier in identifier_list]

scores_dataframe = pd.DataFrame(data=np.vstack([scores_DHDT, scores_sklearn]).T, index = identifier_list, columns = ['DHDT', 'sklearn'])
scores_dataframe

In [None]:
parameter_grid = {
        'depth': 3,
        'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001],
        
        'loss': ['binary_crossentropy', 'rmse'],#'mae',
        'optimizer': ['adam', 'sgd'],        
        
        'beta_1': [10, 50, 100],
        'beta_2': [10, 50, 100],
        
        'squeeze_factor': [0.2, 0.5, 1, 2, 5],    
}


In [None]:
scores_DHDT_make_class = [scores_dict_make_class[identifier]['DHDT']['accuracy'] for identifier in range(num_make_class_eval)]

scores_sklearn_make_class = [scores_dict_make_class[identifier]['sklearn']['accuracy'] for identifier in range(num_make_class_eval)]


## real-world

In [None]:
--> put eval in function 
    --> make parallel execution with different seeds / splits, etcs 
        --> compare (save all values and generate mean+std df, but keep all values)


In [None]:
config['n_samples'] = 10_000
config['number_of_variables'] = 5
random_seed = i

In [None]:
n_jobs = 20

identifier_list = [
                   'Cervical Cancer',
                   'Credit Card',
                   #'Absenteeism'
                  ]

parallel_real_world_eval = Parallel(n_jobs=n_jobs, verbose=3, backend='loky') #loky #sequential multiprocessing
evaluation_results_by_dataset = parallel_real_world_eval(delayed(evaluate_dhdt)(identifier) for identifier in identifier_list)


In [None]:
identifier_list = [
                   'Cervical Cancer',
                   'Credit Card',
                   #'Absenteeism'
                  ]

trials = 20
n_jobs = 20
random_seed = 42

parallel_real_world_eval = Parallel(n_jobs=n_jobs, verbose=3, backend='loky') #loky #sequential multiprocessing
evaluation_results_by_dataset = parallel_real_world_eval(delayed(evaluate_all_parallel)(identifier_list=identifier_list, 
                                                                               random_seed=random_seed+i) for i in range(trials))


In [None]:
identifier_list = [
                   'Cervical Cancer',
                   'Credit Card',
                   #'Absenteeism'
                  ]

evaluate_all_real_world(identifier_list)

In [None]:
def evaluate_all_real_world(identifier_list):
    
    identifier_list = ['Cervical Cancer',
                       'Credit Card',
                       'Absenteeism']

    dataset_dict = {}
    model_dict = {}

    scores_dict = {}

    for identifier in tqdm(identifier_list, desc='dataset loop'):

        print('_________________________________________________________________________________________________________________')   

        dataset_dict[identifier] = {}
        model_dict[identifier] = {}

        scores_dict[identifier] = {'sklearn': {},
                                   'DHDT': {}}

        dataset_dict[identifier] = get_preprocessed_dataset(identifier)    

        model_dict[identifier]['sklearn'] = DecisionTreeClassifier(max_depth=3, 
                                                                   random_state=42)

        model_dict[identifier]['sklearn'].fit(dataset_dict[identifier]['X_train'], 
                                              dataset_dict[identifier]['y_train'])

        scores_dict[identifier]['sklearn']['accuracy'] = model_dict[identifier]['sklearn'].score(dataset_dict[identifier]['X_test'], 
                                                                                                 dataset_dict[identifier]['y_test'])



        model_dict[identifier]['DHDT'] = DHDT(depth=3,
                                                 number_of_variables = dataset_dict[identifier]['X_train'].shape[1],
                                                 learning_rate=1e-3,
                                                 squeeze_factor = 1,
                                                 loss='binary_crossentropy',#'binary_crossentropy',
                                                 optimizer='rmsprop',
                                                 random_seed=40,
                                                 verbosity=0)

        scores_dict[identifier]['history'] = model_dict[identifier]['DHDT'].fit(dataset_dict[identifier]['X_train'], 
                                                                              dataset_dict[identifier]['y_train'], 
                                                                              batch_size=512, 
                                                                              epochs=1_000, 
                                                                              early_stopping_epochs=50, 
                                                                              valid_data=(dataset_dict[identifier]['X_valid'], dataset_dict[identifier]['y_valid']))

        dataset_dict[identifier]['y_test_dhdt'] = model_dict[identifier]['DHDT'].predict(dataset_dict[identifier]['X_test'])
        scores_dict[identifier]['DHDT']['accuracy'] = accuracy_score(dataset_dict[identifier]['y_test'], np.round(dataset_dict[identifier]['y_test_dhdt']))

        print('Test Accuracy Sklearn (' + identifier + ')', scores_dict[identifier]['sklearn']['accuracy'])
        print('Test Accuracy DHDT (' + identifier + ')', scores_dict[identifier]['DHDT']['accuracy'])   
        print('_________________________________________________________________________________________________________________')   

    return   model_dict, scores_dict, dataset_dit


In [None]:
identifier_list = ['Cervical Cancer',
                   'Credit Card',
                   'Absenteeism']

dataset_dict = {}
model_dict = {}

scores_dict = {}

for identifier in tqdm(identifier_list, desc='dataset loop'):
    
    print('_________________________________________________________________________________________________________________')   
    
    dataset_dict[identifier] = {}
    model_dict[identifier] = {}

    scores_dict[identifier] = {'sklearn': {},
                               'DHDT': {}}
    
    dataset_dict[identifier] = get_preprocessed_dataset(identifier)    

    model_dict[identifier]['sklearn'] = DecisionTreeClassifier(max_depth=3, 
                                                               random_state=42)

    model_dict[identifier]['sklearn'].fit(dataset_dict[identifier]['X_train'], 
                                          dataset_dict[identifier]['y_train'])

    scores_dict[identifier]['sklearn']['accuracy'] = model_dict[identifier]['sklearn'].score(dataset_dict[identifier]['X_test'], 
                                                                                             dataset_dict[identifier]['y_test'])



    model_dict[identifier]['DHDT'] = DHDT(depth=3,
                                             number_of_variables = dataset_dict[identifier]['X_train'].shape[1],
                                             learning_rate=1e-3,
                                             squeeze_factor = 1,
                                             loss='binary_crossentropy',#'binary_crossentropy',
                                             optimizer='rmsprop',
                                             random_seed=40,
                                             verbosity=0)

    scores_dict[identifier]['history'] = model_dict[identifier]['DHDT'].fit(dataset_dict[identifier]['X_train'], 
                                                                          dataset_dict[identifier]['y_train'], 
                                                                          batch_size=512, 
                                                                          epochs=1_000, 
                                                                          early_stopping_epochs=50, 
                                                                          valid_data=(dataset_dict[identifier]['X_valid'], dataset_dict[identifier]['y_valid']))

    dataset_dict[identifier]['y_test_dhdt'] = model_dict[identifier]['DHDT'].predict(dataset_dict[identifier]['X_test'])
    scores_dict[identifier]['DHDT']['accuracy'] = accuracy_score(dataset_dict[identifier]['y_test'], np.round(dataset_dict[identifier]['y_test_dhdt']))
    
    print('Test Accuracy Sklearn (' + identifier + ')', scores_dict[identifier]['sklearn']['accuracy'])
    print('Test Accuracy DHDT (' + identifier + ')', scores_dict[identifier]['DHDT']['accuracy'])   
    print('_________________________________________________________________________________________________________________')   

    

In [None]:
scores_DHDT = [scores_dict[identifier]['DHDT']['accuracy'] for identifier in identifier_list]

scores_sklearn = [scores_dict[identifier]['sklearn']['accuracy'] for identifier in identifier_list]

scores_dataframe = pd.DataFrame(data=np.vstack([scores_DHDT, scores_sklearn]).T, index = identifier_list, columns = ['DHDT', 'sklearn'])
scores_dataframe

In [None]:
identifier = "Absenteeism"
plt.figure(figsize=(15,8))
image = model_dict[identifier]['DHDT'].plot(normalizer_list=dataset_dict['normalizer_list'][identifier])
display(image)

plt.figure(figsize=(15,8))
plot_tree(model_dict[identifier]['sklearn'], fontsize=10) 
plt.show()

## Absenteeism