In [1]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import hyperopt
import time
import sklearn.metrics as mt
import pickle

from xgboost import Booster
from xgboost import XGBClassifier
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

  from pandas import MultiIndex, Int64Index


In [3]:
#################################################################################################
###### The function load_window_dataset() that takes in input window data file, and 
###### label to be assigned and returns numpy arrays with features and labels
#################################################################################################

def load_window_dataset(X, y, filename, label):
#Inputs: - X: current matrix of datapoints where we want to APPEND the datapoints retrieved from filename (only features)
#        - y: current matrix of datapoints where we want to APPEND the datapoints retrieved from filename (only labels)
#        - filename: full name (with path) of the file to be read (it must be a window dataset file created above)
#        - label: integer, label to be assigned to the datapoints retrieved from filename; it may differ from labels already included in current y
#Outputs: - X: updated X (including features for the new data points retrieved from filename)
#         - y: updated y (including labels for the new data points)
#This function to X and y in input the new datapoints retrieved from filename and return updated X and y
#The function handle the case when X and y are empty (initialized as None)

    data = pd.read_csv(filename)

    if X is None:
        X = data.to_numpy()
        # full() function puts in all X.shape[0] elements the value "label"
        y = np.full(X.shape[0], label)
    else:
        X_temp = data.to_numpy()
        y_temp = np.full(X_temp.shape[0], label)
        X = np.append(X, X_temp, axis = 0) #F: axis=0-->stack X and X_temp vertically (increase no of rows)
        y = np.append(y, y_temp)

    return X, y



In [4]:
##########################################################################################
###### Use function load_window_dataset() with datasets of for all scenarios  
###### using window length = 10 and spacing = 1. Finally, perform features scaling 
##########################################################################################

X=None 
y=None
length=10
spacing=1
folderpath='Features_raw'

for filename in os.listdir(folderpath):   
    if filename.endswith('_sp' + str(spacing) + '_w' + str(length) + '.dat'):
        print(filename)
        label = 0
        if int(filename[9]) > 5:
          label = 1
        fullname = folderpath + '/' + filename
#------------------------------------------------------------
        X, y = load_window_dataset(X, y, fullname, label)
#------------------------------------------------------------
        print('current shape of X: ' +str(X.shape))
        print('current shape of y: ' +str(y.shape))

#All scenario correlated ONLY TO "length" and "spacing" variables
print(X)
print(y)
print(X.shape)
print(y.shape)

# Features scaling 
scaler = StandardScaler()
X = scaler.fit_transform(X)

print(X)
print(y)
print(X.shape)
print(y.shape)



Scenario_1_monitor_node_1_preamp_lpth_2_1_sp1_w10.dat
current shape of X: (21591, 6)
current shape of y: (21591,)
Scenario_1_monitor_node_1_preamp_lpth_3-1_1_sp1_w10.dat
current shape of X: (43182, 6)
current shape of y: (43182,)
Scenario_1_monitor_node_1_preamp_lpth_3-2_1_sp1_w10.dat
current shape of X: (64773, 6)
current shape of y: (64773,)
Scenario_2_monitor_node_1_preamp_lpth_2_1_sp1_w10.dat
current shape of X: (86364, 6)
current shape of y: (86364,)
Scenario_2_monitor_node_1_preamp_lpth_3-1_1_sp1_w10.dat
current shape of X: (107955, 6)
current shape of y: (107955,)
Scenario_3_monitor_node_1_preamp_lpth_2_1_sp1_w10.dat
current shape of X: (129546, 6)
current shape of y: (129546,)
Scenario_4_monitor_node_1_preamp_lpth_2_1_sp1_w10.dat
current shape of X: (151137, 6)
current shape of y: (151137,)
Scenario_4_monitor_node_1_preamp_lpth_3-1_1_sp1_w10.dat
current shape of X: (172728, 6)
current shape of y: (172728,)
Scenario_4_monitor_node_1_preamp_lpth_3-2_1_sp1_w10.dat
current shape of

In [5]:
############################################################################################################
###### Perform XGBoost hyperparameters optimization via crossvalidation
###### Print hyperparameters obtained with crossvalidation in resfileXGB
###### Retrain an XGB model with best hyperparameters using the entire training set (X_train, y_train)
###### Print training results (best accuracy and training duration) in resfileXGB
###### Return the trained XGB model
###### for XGB with given hyperparameters space 
###### XGB documentation at: https://xgboost.readthedocs.io/en/stable/python/python_intro.html 
###### XGB hyperparameters: https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html 
############################################################################################################

def train_classifier_XGB(X_train, y_train, resfileXGB): 

    #F: define the search space for your hyperparameters - a space where to search
    # These parameters are needed to balance between underfitting and overfitting
    # We are testing 3 hyperparameters: eta, max_depth and subsample
    space4xgb = { 
     'eta': hp.choice('eta', [0.1, 0.3, 0.5, 0.7, 0.9, 1]),
     # max_depth (maximum depth of the decision trees being trained)
     'max_depth': hp.choice('max_depth', np.arange(1, 20, 2)),
     'subsample': hp.choice('subsample', [0.1, 0.3, 0.5, 0.7, 0.9, 1])
    }

    # hyperopt is used to perform an efficent search in the space of parameters
    def hyperopt_train_test(params):
        model = XGBClassifier(use_label_encoder=False, verbosity = 0, **params)
        #F: see https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst
        
        return cross_val_score(model, X_train, y_train, cv = 5).mean()
        #F: cross_val_score is from scikit learn (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html)
        #F: will use the default score (for XGB it is accuracy)
        #F: this includes also training; cv=5 (5 number of folds) (5-folds crossvalidation)
        #F: .mean() is taken as cross_val_score returns an array of scores (one for each fold)
        # We have to do the mean because "cross_val_score" returns the accuracy of all the folds

    #We pass to this function the "space4xgb" parameter (in fmin() function)
    #F: this function is used below, as a parameter to fmin
    def f(params): 
        # assumes that "hyperopt_train_test" gives use the best cross validation accuracy 
        # given that combination of hyperparameters (params)
        acc = hyperopt_train_test(params)
        #F: loss is returned as opposite (negative) of accuracy because we will use in 
        #f_min (that only minimizes), where we want to minimize the loss (i.e., maximize accuracy)
        # We need to return these parameters because fmin() requires them
        return {'loss': -acc, 'status': STATUS_OK} #F: loss is returned as opposite (negative) of accuracy because we will use in f_min (that only minimizes), where we want to minimize the loss (i.e., maximize accuracy)

    trials = Trials()
    # best_params stores the index of the best parameters values according to the function
    # fmin() returns the indexes based on the minimum value of a passed function "f"
    # space4xgb is the search space
    # algo=tpe.suggest is the used alorithm
    # max_evals=5 is the maximum trials
    best_params = fmin(f, space4xgb, algo=tpe.suggest, max_evals=5, trials=trials)
    #F: see: https://github.com/hyperopt/hyperopt/blob/master/hyperopt/fmin.py
    #F: at this point, best_param is a dictionary where each key is the index of the corresponding best param in space4xgb
    
    #Insert in the paramets the values of the hyperparameters (not the indexes)
    best_params = hyperopt.space_eval(space4xgb, best_params)
    #F: this is used to extract from space4xgb the best values according to the indexes in best_params (and put such values in best_params)
    print(best_params)
    
    best_cv_acc = -round(trials.best_trial['result']['loss'], 2) #F: best across trials
    print('best_cv_acc: ' + str(best_cv_acc))

    xgb = XGBClassifier(eta = best_params['eta'], max_depth= best_params['max_depth'], 
                            subsample = best_params['subsample'], use_label_encoder=False, verbosity = 0) 

    t0 = time.time()
    #F: fit() is a function from scikit learn interface for XGB (https://xgboost.readthedocs.io/en/stable/python/python_intro.html#scikit-learn-interface), 
    # there is also train() that can be used directly with XGB objects (https://xgboost.readthedocs.io/en/stable/python/python_intro.html#training) 
    xgb.fit(X_train, y_train) 
    t1 = time.time()

    with open(resfileXGB, 'w') as result_file:
        result_file.write('Best eta: {}\n'.format(best_params['eta']))
        result_file.write('Best max depth: {}\n'.format(best_params['max_depth']))
        result_file.write('Best subsample: {}\n'.format(best_params['subsample']))
        result_file.write('Crossvalidation accuracy: {}\n'.format(best_cv_acc))
        result_file.write('Training duration for XGB is {} s\n'.format(round(t1 - t0)))

    return xgb


In [6]:
############################################################################################################
###### Perform DNN hyperparameters optimization via crossvalidation
###### Print hyperparameters obtained with crossvalidation in resfileDNN
###### Retrain a DNN model with best hyperparameters using the entire training set (X_train, y_train)
###### Print training results (best accuracy and training duration) in resfileDNN
###### Return the trained DNN model
############################################################################################################

def train_classifier_DNN(X_train, y_train, resfileDNN): 

    #F: define the search space for your hyperparameters
    space4dnn = {
     'activation': hp.choice('activation', ['logistic', 'tanh', 'relu']),
     'neurons': hp.choice('neurons', [10, 50, 100]),
     'layers': hp.choice('layers', np.arange(1, 4, 1))
    }

    def hyperopt_train_test(params):
        size = (params['neurons'],) * params['layers']
        dnn = MLPClassifier(hidden_layer_sizes=size, activation=params['activation'],
                            solver='adam', learning_rate='invscaling', max_iter=1000)
        return cross_val_score(dnn, X_train, y_train, cv = 5).mean()

    def f(params):
        acc = hyperopt_train_test(params)
        return {'loss': -acc, 'status': STATUS_OK}

    trials = Trials()
    best_params = fmin(f, space4dnn, algo=tpe.suggest, max_evals=5, trials=trials)
    
    best_params = hyperopt.space_eval(space4dnn, best_params)
    print(best_params) 
    
    best_cv_acc = -round(trials.best_trial['result']['loss'], 2)
    print('best_cv_acc: ' + str(best_cv_acc))
    
    
    size = (best_params['neurons'],) * best_params['layers']
    dnn = MLPClassifier(hidden_layer_sizes=size, activation=best_params['activation'],
                                solver='adam', learning_rate='invscaling', max_iter=1000)

    t0 = time.time()
    dnn.fit(X_train, y_train)
    t1 = time.time()

    with open(resfileDNN, 'w') as result_file:
        result_file.write('Best number of layers: {}\n'.format(best_params['layers']))
        result_file.write('Best number of neurons: {}\n'.format(best_params['neurons']))
        result_file.write('Best activation function: {}\n'.format(best_params['activation']))
        result_file.write('Crossvalidation accuracy: {}\n'.format(best_cv_acc))
        result_file.write('Training duration for DNN is {} s\n'.format(round(t1 - t0)))

    return dnn


In [7]:
############################################################################################################
###### Perform KNN hyperparameters optimization via crossvalidation
###### Print hyperparameters obtained with crossvalidation in resfileKNN
###### Retrain a KNN model with best hyperparameters using the entire training set (X_train, y_train)
###### Print training results (best accuracy and training duration) in resfileKNN
###### Return the trained KNN model
############################################################################################################

def train_classifier_KNN(X_train, y_train, resfileKNN): 

    #F: define the search space for your hyperparameters
    space4knn = {
     'leaf_size': hp.choice('leaf_size', np.arange(1, 50, 1)),
     'p': hp.choice('p', [1, 2]),
     'n_neighbors': hp.choice('n_neighbors', np.arange(1, 30, 1))
    }

    def hyperopt_train_test(params):
        knn = KNeighborsClassifier(leaf_size=params['leaf_size'], p=params['p'], 
                                   n_neighbors=params['n_neighbors'])
        return cross_val_score(knn, X_train, y_train, cv = 5).mean()

    def f(params):
        acc = hyperopt_train_test(params)
        return {'loss': -acc, 'status': STATUS_OK}

    trials = Trials()
    best_params = fmin(f, space4knn, algo=tpe.suggest, max_evals=5, trials=trials)
    
    best_params = hyperopt.space_eval(space4knn, best_params)
    print(best_params) 
    
    best_cv_acc = -round(trials.best_trial['result']['loss'], 2)
    print('best_cv_acc: ' + str(best_cv_acc))
    
    knn = KNeighborsClassifier(leaf_size=best_params['leaf_size'], p=best_params['p'], 
                                   n_neighbors=best_params['n_neighbors'])

    t0 = time.time()
    knn.fit(X_train, y_train)
    t1 = time.time()

    with open(resfileKNN, 'w') as result_file:
        result_file.write('Best leaf_size: {}\n'.format(best_params['leaf_size']))
        result_file.write('Best number of p: {}\n'.format(best_params['p']))
        result_file.write('Best number of neighbors: {}\n'.format(best_params['n_neighbors']))
        result_file.write('Crossvalidation accuracy: {}\n'.format(best_cv_acc))
        result_file.write('Training duration for kNN is {} s\n'.format(round(t1 - t0)))

    return knn


In [8]:
################################################################################################################
###### Split into into train/test and call train_classifier_XXX() functions
################################################################################################################
res_folder = 'Hyperparameter_optimization'
if not os.path.exists(res_folder):
    os.makedirs(res_folder)

resfile_XGB=res_folder + '/XGB_sp_' + str(spacing) + 'w_' + str(length) + '_results.txt'
resfile_DNN=res_folder + '/DNN_sp_' + str(spacing) + 'w_' + str(length) + '_results.txt'
resfile_KNN=res_folder + '/KNN_sp_' + str(spacing) + 'w_' + str(length) + '_results.txt'

# Stratify garantees the split all the scenarios among train and test
# It's like shuffle and split
# random_state it's a seed to get the same output
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

print('Training XGB...')
xgb = train_classifier_XGB(X_train, y_train, resfile_XGB)

print('Training DNN...')
dnn = train_classifier_DNN(X_train, y_train, resfile_DNN)

print('Training KNN...')
knn = train_classifier_KNN(X_train, y_train, resfile_KNN)

Training XGB...
100%|██████████| 5/5 [02:53<00:00, 34.64s/trial, best loss: -0.9992303916756216]
{'eta': 0.7, 'max_depth': 7, 'subsample': 1}
best_cv_acc: 1.0
Training DNN...
100%|██████████| 5/5 [21:38<00:00, 259.68s/trial, best loss: -0.998431464219388] 
{'activation': 'tanh', 'layers': 2, 'neurons': 50}
best_cv_acc: 1.0
Training KNN...
100%|██████████| 5/5 [00:48<00:00,  9.74s/trial, best loss: -0.997386995237138] 
{'leaf_size': 43, 'n_neighbors': 14, 'p': 1}
best_cv_acc: 1.0
