<a href="https://colab.research.google.com/github/Patatone/Network-failure-cause-identification/blob/main/Failure_cause_identification_with_different_failures_location_IPY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import hyperopt
import time
import sklearn.metrics as mt
import pickle

from xgboost import Booster
from xgboost import XGBClassifier
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

#XAI-related packages: LIME and SHAP
import lime
import lime.lime_tabular
import shap 
import shap.plots

In [5]:
#################################################################################################
###### The function load_window_dataset() that takes in input window data file, and 
###### label to be assigned and returns numpy arrays with features and labels
#################################################################################################

def load_window_dataset(X, y, filename, label):
#Inputs: - X: current matrix of datapoints where we want to APPEND the datapoints retrieved from filename (only features)
#        - y: current matrix of datapoints where we want to APPEND the datapoints retrieved from filename (only labels)
#        - filename: full name (with path) of the file to be read (it must be a window dataset file created above)
#        - label: integer, label to be assigned to the datapoints retrieved from filename; it may differ from labels already included in current y
#Outputs: - X: updated X (including features for the new data points retrieved from filename)
#         - y: updated y (including labels for the new data points)
#This function to X and y in input the new datapoints retrieved from filename and return updated X and y
#The function handle the case when X and y are empty (initialized as None)

    data = pd.read_csv(filename)

    if X is None:
        X = data.to_numpy()
        # full() function puts in all X.shape[0] elements the value "label"
        y = np.full(X.shape[0], label)
    else:
        X_temp = data.to_numpy()
        y_temp = np.full(X_temp.shape[0], label)
        X = np.append(X, X_temp, axis = 0) #F: axis=0-->stack X and X_temp vertically (increase no of rows)
        y = np.append(y, y_temp)

    return X, y



In [7]:
##########################################################################################
###### Use function load_window_dataset() with datasets of for all scenarios  
###### using window length = 10 and spacing = 1. Finally, perform features scaling 
##########################################################################################

X=None 
y=None
length=10
spacing=1
folderpath='../Features'

for filename in os.listdir(folderpath):   
    if filename.endswith('_sp' + str(spacing) + '_w' + str(length) + '.dat'):
        print(filename)
        label = 0
        if int(filename[9]) > 5:
          label = 1
        fullname = folderpath + '/' + filename
#------------------------------------------------------------
        X, y = load_window_dataset(X, y, fullname, label)
#------------------------------------------------------------
        print('current shape of X: ' +str(X.shape))
        print('current shape of y: ' +str(y.shape))

# Features scaling 
scaler = StandardScaler()
X = scaler.fit_transform(X)

all_data = {'mean': [X[:,[1,2,3,4,5]], [n for n in y if n != 0]], 'RMS': [X[:,[0,2,3,4,5]], [n for n in y if n != 1]], 
            'ptp': [X[:,[0,1,3,4,5]], [n for n in y if n != 2]], 'std': [X[:,[0,1,2,4,5]], [n for n in y if n != 3]], 
            'max': [X[:,[0,1,2,3,5]], [n for n in y if n != 4]], 'min': [X[:,[0,1,2,3,4]], [n for n in y if n != 5]]}

print(all_data)

Scenario_1_monitor_node_1_preamp_lpth_2_1_sp1_w10.dat
current shape of X: (21591, 6)
current shape of y: (21591,)
Scenario_1_monitor_node_1_preamp_lpth_3-1_1_sp1_w10.dat
current shape of X: (43182, 6)
current shape of y: (43182,)
Scenario_1_monitor_node_1_preamp_lpth_3-2_1_sp1_w10.dat
current shape of X: (64773, 6)
current shape of y: (64773,)
Scenario_2_monitor_node_1_preamp_lpth_2_1_sp1_w10.dat
current shape of X: (86364, 6)
current shape of y: (86364,)
Scenario_2_monitor_node_1_preamp_lpth_3-1_1_sp1_w10.dat
current shape of X: (107955, 6)
current shape of y: (107955,)
Scenario_3_monitor_node_1_preamp_lpth_2_1_sp1_w10.dat
current shape of X: (129546, 6)
current shape of y: (129546,)
Scenario_4_monitor_node_1_preamp_lpth_2_1_sp1_w10.dat
current shape of X: (151137, 6)
current shape of y: (151137,)
Scenario_4_monitor_node_1_preamp_lpth_3-1_1_sp1_w10.dat
current shape of X: (172728, 6)
current shape of y: (172728,)
Scenario_4_monitor_node_1_preamp_lpth_3-2_1_sp1_w10.dat
current shape of

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [8]:
def train_classifier_XGB(X_train, y_train): 
    xgb = XGBClassifier(eta = 0.7, max_depth= 19, subsample = 0.7, verbosity = 0)
    xgb.fit(X_train, y_train)
    return xgb

In [9]:
def train_classifier_DNN(X_train, y_train): 
    size = (10,) * 3
    dnn = MLPClassifier(hidden_layer_sizes=size, activation='logistic',
                                solver='adam', learning_rate='invscaling', max_iter=1000)
    dnn.fit(X_train, y_train)
    return dnn

In [10]:
def train_classifier_KNN(X_train, y_train): 
    knn = KNeighborsClassifier(leaf_size=21, p=2, n_neighbors=4)
    knn.fit(X_train, y_train)
    return knn

In [11]:
################################################################################################################
###### Split into into train/test and call train_classifier_XXX() functions
################################################################################################################

xgb_models = []
dnn_models = []
knn_models = []

all_models = [xgb_models, dnn_models, knn_models]

for key in all_data:
    # Stratify garantees the split all the scenarios among train and test
    # It's like shuffle and split
    # random_state it's a seed to get the same output
    X_train, X_test, y_train, y_test = train_test_split(all_data[key][0], all_data[key][1], stratify=y, test_size=0.2, random_state=42)

    print('Training XGB without '+ key +'...')
    xgb = train_classifier_XGB(X_train, y_train)
    xgb_models.append(xgb)
    
    print('Training MLP without '+ key +'...')
    dnn = train_classifier_DNN(X_train, y_train)
    dnn_models.append(dnn)
    
    print('Training KNN without '+ key +'...')
    knn = train_classifier_KNN(X_train, y_train)
    knn_models.append(knn)

NameError: name 'a_dict' is not defined

In [None]:
########################################################################################################
###### The function performance_eval() takes in input ground truth and predicted labels, 
###### prints results in a result file passed in input, and returns global metrics
########################################################################################################

def performance_eval(y_true, y_pred, lab, l_names):
    
    #Compute metrics and print them
    accuracy = mt.accuracy_score(y_true, y_pred)
    precision = mt.precision_score(y_true, y_pred, labels=lab, average=None) #F: average=None gives per-class results
    global_precision = mt.precision_score(y_true, y_pred, labels=lab, average='weighted') 
    recall = mt.recall_score(y_true, y_pred, labels=lab, average=None)
    global_recall = mt.recall_score(y_true, y_pred, labels=lab, average='weighted') 
    f1score = mt.f1_score(y_true, y_pred, labels=lab, average=None)
    global_f1score = mt.f1_score(y_true, y_pred, labels=lab, average='weighted')

    return accuracy, global_precision, global_recall, global_f1score 


In [None]:
##############################################################################################################
###### Load models into NEW models, perform prediction and evaluate performance using performance_eval() 
##############################################################################################################

lbl = [0, 1]
label_names=['Attenuation', 'Filtering']

# Added to fix: 'XGBClassifier' object has no attribute '_le'
xgb._le = LabelEncoder().fit(y_test)

y_pred_XGB = xgb.predict(X_test)
y_pred_DNN = dnn.predict(X_test)
y_pred_KNN = knn.predict(X_test)

XGB_metrics = performance_eval(y_test, y_pred_XGB, lbl, label_names)
DNN_metrics = performance_eval(y_test, y_pred_DNN, lbl, label_names)
KNN_metrics = performance_eval(y_test, y_pred_KNN, lbl, label_names)

print('XGB metrics: ' +str(XGB_metrics))
print('****************')
print('DNN metrics: ' +str(DNN_metrics))
print('****************')
print('KNN metrics: ' +str(KNN_metrics))