In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import path
import os

# Classifiers
from sklearn.naive_bayes import GaussianNB # Gaussiana de Naive-Bayes
from sklearn.neural_network import MLPClassifier # Multi-layer Perceptron
from sklearn.neighbors import KNeighborsClassifier # K-Nearest Neighbors
from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.ensemble import RandomForestClassifier # Random Forest
from sklearn.ensemble import AdaBoostClassifier # AdaBoost
from sklearn.svm import SVC # Support Vector Machine (Linear, Polynomial, RBF)

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from imblearn.metrics import sensitivity_score
from imblearn.metrics import specificity_score
from sklearn.metrics import f1_score

from sklearn.metrics import confusion_matrix

## Classification of datasets

In [None]:
# Experiment label; the results folder of this run is named after it
EXPERIMENT_NAME = "All_Classes_Classification"

# Datasets to classify. Supported names:
# 'Raw', 'Normalized', 'FFT', 'Fourier', 'HOS', 'SCM'
datasets_name_list = [
    'Fourier',
    'SCM',
]

# Classifiers to evaluate. Supported names:
# 'Naive_Bayes', 'MLP', 'Nearest_Neighbors', 'Random_Forest',
# 'SVM_Linear', 'SVM_Polynomial', 'SVM_RBF'
classifiers_name_list = [
    'Naive_Bayes',
    'MLP',
    'Nearest_Neighbors',
    'Random_Forest',
    'SVM_Linear',
    'SVM_Polynomial',
    'SVM_RBF',
]

# Metrics recorded for every classifier. Supported names:
# 'accuracy', 'balanced_accuracy', 'precision', 'sensitivity',
# 'specificity', 'f1_score', 'fit_time', 'predict_time'
metrics_name_list = [
    'accuracy',
    'balanced_accuracy',
    'precision',
    'sensitivity',
    'specificity',
    'f1_score',
    'fit_time',
    'predict_time',
]

# Class labels kept in the experiment. Label encoding:
#   0 = 3225, 1 = 3225_REVERSO, 2 = 3225_VAZIO,
#   3 = 3230, 4 = 3230_VAZIO,   5 = 3235
classes = list(range(6))

# Evaluation protocol: 'KFold_Test_Val', 'KFold_Test' or 'HoldOut'
classification_type = 'KFold_Test'

# Whether the original classes are merged into coarser groups
group_classes = False

# Relabelling used only when grouping is enabled, format {old: new, ...}
if group_classes:
    changeClass_dict = {
        0: 1,
        2: 0,
        3: 1,
        4: 0,
    }

In [None]:
# Making dirs to save results
# Layout: ../results/classification/<EXPERIMENT_NAME>/{Train,Validation}/<dataset name>
# Every directory is created with exist_ok=True so this cell can be re-run
# (e.g. after a kernel restart) instead of failing with FileExistsError.
outpath = path.join("..","results","classification",EXPERIMENT_NAME)
if path.isdir(outpath):
    # Result files are opened in write mode later on, so make the overwrite visible
    print("Warning: '" + outpath + "' already exists; results from a previous run may be overwritten")
os.makedirs(outpath, exist_ok=True)

train_path = path.join(outpath, 'Train')
os.makedirs(train_path, exist_ok=True)

# The validation tree is only needed by the 'KFold_Test_Val' protocol
if classification_type == 'KFold_Test_Val':
    validation_path = path.join(outpath, 'Validation')
    os.makedirs(validation_path, exist_ok=True)

# One sub-folder per selected dataset
for ds_name in datasets_name_list:
    os.makedirs(path.join(train_path, ds_name), exist_ok=True)
    if classification_type == 'KFold_Test_Val':
        os.makedirs(path.join(validation_path, ds_name), exist_ok=True)
    
def join_classes(df, dict_groups):
    """Relabel the 'Class' column of ``df`` according to ``dict_groups``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Class' column. It is modified in place.
    dict_groups : dict
        Mapping ``{old_label: new_label, ...}``. Labels that are not keys of
        the mapping are left unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Assign the relabelled column back to the frame: calling
    # ``replace(..., inplace=True)`` on ``df['Class']`` acts on an intermediate
    # object and does not update ``df`` when pandas Copy-on-Write is active.
    df['Class'] = df['Class'].replace(dict_groups)
    return df

In [None]:
# Load datasets
# Each selected dataset is read from its extraction CSV and registered by name.
# datasets_list is then built in the order of datasets_name_list, because the
# classification cells pair the two lists by position.
loaded_datasets = {}

if 'Raw' in datasets_name_list:
    RawDs = pd.read_csv(path.join("..","results","extraction","32FINAL.csv"))
    RawDs = RawDs.drop(columns=['Tmp'+str(i) for i in range(100)]) # Exclude Tmp columns
    loaded_datasets['Raw'] = RawDs
    
if 'Normalized' in datasets_name_list:
    NormDs = pd.read_csv(path.join("..","results","extraction","32FINAL_NORM.csv"))
    NormDs = NormDs.drop(columns=['Tmp'+str(i) for i in range(100)]) # Exclude Tmp columns
    loaded_datasets['Normalized'] = NormDs
    
if 'FFT' in datasets_name_list:  ## NEW
    FFTDs = pd.read_csv(path.join("..","results","extraction","32FINAL_FFT.csv"))
    loaded_datasets['FFT'] = FFTDs
    
if 'Fourier' in datasets_name_list:
    FourierDs = pd.read_csv(path.join("..","results","extraction","32FINAL_Fourier.csv"))
    # Optional feature ablation (disabled): drop the first 4 columns of each listed axis
#     FourierDs = FourierDs.drop(columns=['AcY'+str(i) for i in range(4)])
#     FourierDs = FourierDs.drop(columns=['AcZ'+str(i) for i in range(4)])
#     FourierDs = FourierDs.drop(columns=['GyX'+str(i) for i in range(4)])
#     FourierDs = FourierDs.drop(columns=['GyY'+str(i) for i in range(4)])
#     FourierDs = FourierDs.drop(columns=['GyZ'+str(i) for i in range(4)]) 
    loaded_datasets['Fourier'] = FourierDs
    
if 'HOS' in datasets_name_list:
    HOSDs = pd.read_csv(path.join("..","results","extraction","32FINAL_HOS.csv"))
    loaded_datasets['HOS'] = HOSDs
    
if 'SCM' in datasets_name_list:
    SCMDs = pd.read_csv(path.join("..","results","extraction","32FINAL_SCM.csv"))
    # Optional feature ablation (disabled): drop the first 8 columns of each listed axis
#     SCMDs = SCMDs.drop(columns=['AcY'+str(i) for i in range(8)])
#     SCMDs = SCMDs.drop(columns=['AcZ'+str(i) for i in range(8)])
#     SCMDs = SCMDs.drop(columns=['GyX'+str(i) for i in range(8)])
#     SCMDs = SCMDs.drop(columns=['GyY'+str(i) for i in range(8)])
#     SCMDs = SCMDs.drop(columns=['GyZ'+str(i) for i in range(8)])
    loaded_datasets['SCM'] = SCMDs

# A name without a loader would otherwise be skipped silently and shift every
# later dataset onto the wrong name.
unknown_datasets = [name for name in datasets_name_list if name not in loaded_datasets]
if unknown_datasets:
    raise ValueError("Unknown dataset name(s) in datasets_name_list: " + ", ".join(map(str, unknown_datasets)))

# Same order as datasets_name_list, so datasets_list[i] is datasets_name_list[i]
datasets_list = [loaded_datasets[name] for name in datasets_name_list]

# Remove, in place, the rows whose label is one of the six known classes (0-5)
# but is not listed in `classes`
out_of_scope_labels = [label for label in range(6) if label not in classes]
for ds in datasets_list:
    rows_to_drop = ds.index[ds['Class'].isin(out_of_scope_labels)]
    ds.drop(rows_to_drop, inplace=True)

# Grouping classes
# NOTE: this step rewrites the 'Class' column and rebinds `classes`, so it is
# not idempotent - re-run the loading cell from the start before running it again.
if group_classes:
    for ds in datasets_list:
        # join_classes relabels the frame it receives, so datasets_list is updated
        join_classes(ds, changeClass_dict)
    print("Experiment Classes: ", classes)
    # Map every in-scope label through the grouping. Labels that the mapping
    # does not mention keep their value, so the new list matches the labels
    # that can actually occur in the data (the confusion matrices are sized
    # with len(classes)).
    classes = sorted({changeClass_dict.get(label, label) for label in classes})
    print("New Classes: ", classes)
        
# Number of samples per class for every loaded dataset.
# Iterating over the loaded datasets avoids depending on one specific dataset
# variable, which only exists when that dataset was selected.
for ds_name, ds in zip(datasets_name_list, datasets_list):
    unique, counts = np.unique(ds['Class'], return_counts=True)
    print(ds_name, dict(zip(unique, counts)))

In [None]:
from scipy.stats import randint

# Load classifiers
# Every supported classifier, keyed by the name used in classifiers_name_list.
available_classifiers = {
    'Naive_Bayes': GaussianNB(),
    'MLP': MLPClassifier(max_iter=1000, solver='adam', learning_rate_init=5e-04),
    'Nearest_Neighbors': KNeighborsClassifier(),
    'Random_Forest': RandomForestClassifier(),
    'SVM_Linear': SVC(kernel='linear', probability=True, tol=1e-3),
    'SVM_Polynomial': SVC(kernel='poly', probability=True, tol=1e-3),
    'SVM_RBF': SVC(kernel='rbf', probability=True, tol=1e-3),
}

unknown_classifiers = [name for name in classifiers_name_list if name not in available_classifiers]
if unknown_classifiers:
    raise ValueError("Unknown classifier name(s) in classifiers_name_list: " + ", ".join(map(str, unknown_classifiers)))

# Built from classifiers_name_list so that classifiers_list[i] always is the
# estimator named classifiers_name_list[i]; the classification cells pair the
# two lists by position.
classifiers_list = [available_classifiers[name] for name in classifiers_name_list]

# Search spaces for RandomizedSearchCV, keyed by classifier name. They are read
# by the 'KFold_Test_Val' and 'HoldOut' cells, so the name must be defined.
# Naive_Bayes has an empty space: it is not tuned.
param_dist_dict = {'Naive_Bayes': [],
                   'MLP': {"hidden_layer_sizes": list(np.arange(2,1001))},
                   'Nearest_Neighbors': {"n_neighbors": [1,3,5,7,9,11]},
                   'Random_Forest': {"n_estimators": [3000],
                                     "max_depth": [6, None],
                                     "max_features": randint(1, 11),
                                     "min_samples_split": randint(2, 11),
                                     "min_samples_leaf": randint(1, 11),
                                     "bootstrap": [True, False],
                                     "criterion": ["gini", "entropy"]},
                   'SVM_Linear': {'kernel': ['linear'], 'C': [2**i for i in range(-5,15)]},
                   'SVM_Polynomial': {'kernel': ['poly'], 'degree': [3, 5, 7 ,9], 'C': [2**i for i in range(-5,15)]},
                   'SVM_RBF': {'kernel': ['rbf'], 'gamma': [2**i for i in range(-15,3)],
                               'C': [2**i for i in range(-5,15)]}
                  }

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import warnings
import time

def current_milli_time():
    """Return a clock reading in whole milliseconds, for measuring durations.

    The value comes from ``time.perf_counter()``, a monotonic high-resolution
    clock, so it is only meaningful as the difference between two calls
    (``stop - start``); it is not a wall-clock timestamp.

    Returns
    -------
    int
        Clock reading rounded to the nearest millisecond.
    """
    return int(round(time.perf_counter() * 1000))

# Silences every warning for the rest of the session, including the ones
# scikit-learn emits (e.g. convergence or undefined-metric warnings)
warnings.filterwarnings('ignore')

In [None]:
%%time
if classification_type == 'KFold_Test_Val':
    # Protocol: hold out 20% of each dataset for validation, run a stratified
    # 10-fold cross-validation on the remaining 80%, then score the classifier
    # on the held-out part.
    for ds_name, dataset in zip(datasets_name_list, datasets_list):
        print("processing: " + ds_name + " dataset")
        train_ds_path = path.join(train_path, ds_name)
        validation_ds_path = path.join(validation_path, ds_name)

        # The last column is used as the target, every other column as a feature
        X, y = dataset.iloc[:,:-1], dataset.iloc[:,-1]
        # Fixed seed: the validation split is the same on every run
        X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, stratify=y,
                                                                        shuffle=True, random_state=42)

        list_final_metrics = []
        val_list_final_metrics = []
        evaluated_classifiers = []

        for classifier_name, classifier in zip(classifiers_name_list, classifiers_list):
            # Tune only the classifiers that have a non-empty search space
            # (selected by name, not by position in the list)
            search_space = param_dist_dict.get(classifier_name)
            if search_space:
                random_search = RandomizedSearchCV(classifier, search_space, cv=4,
                                                   n_iter=5, scoring='accuracy')
                random_search.fit(X_train, y_train)
                params = random_search.best_params_
                classifier.set_params(**params)

            print("    processing: " + classifier_name + " classifier")
            with open(path.join(train_ds_path, classifier_name+"_config.txt"), 'w') as clf_txt:
                clf_txt.write(str(classifier))

            # One list of per-fold values for every requested metric
            metrics_dict = dict((k,[]) for k in metrics_name_list)
            fold_fit_times = []

            cmx = np.zeros((len(classes),len(classes)))

            kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
            print("        processing: K-Fold iterations", end='')

            for train_index, test_index in kfold.split(X_train, y_train):
                X_train2, X_test = X_train.iloc[train_index], X_train.iloc[test_index]
                y_train2, y_test = y_train.iloc[train_index], y_train.iloc[test_index]

                fitStart = current_milli_time() # Start fit chronometer
                classifier.fit(X_train2, y_train2)
                fit_time = current_milli_time() - fitStart # Stop fit chronometer and save time
                fold_fit_times.append(fit_time)

                predictStart = current_milli_time() # Start predict chronometer
                y_predict = classifier.predict(X_test)
                predict_time = current_milli_time() - predictStart # Stop predict chronometer and save time

                fold_scores = {
                    "accuracy": accuracy_score(y_test, y_predict),
                    "balanced_accuracy": balanced_accuracy_score(y_test, y_predict),
                    "precision": precision_score(y_test, y_predict, average='macro'),
                    "sensitivity": sensitivity_score(y_test, y_predict, average='macro'),
                    "specificity": specificity_score(y_test, y_predict, average='macro'),
                    "f1_score": f1_score(y_test, y_predict, average='macro'),
                    "fit_time": fit_time,
                    "predict_time": predict_time,
                }
                # Record only the metrics listed in metrics_name_list
                for metric_name in metrics_dict:
                    metrics_dict[metric_name].append(fold_scores[metric_name])

                # labels=classes fixes the matrix shape and row/column order so
                # it always matches `cmx` and the labels written to the CSV
                cmx += confusion_matrix(y_test, y_predict, labels=classes)

                print('.', end='')

            print(' Done!')
            pd.DataFrame(cmx.astype(int), index=classes,
                         columns=classes).to_csv(path.join(train_ds_path, classifier_name+"_cmx.csv"), sep=',')

            # Columns: every <metric>_mean first, then every <metric>_std
            final_metrics_dict = dict((k+'_mean', np.mean(v)) for k, v in metrics_dict.items())
            final_metrics_dict.update((k+'_std', np.std(v)) for k, v in metrics_dict.items())

            list_final_metrics.append(final_metrics_dict)

            # VALIDATION OF THE MODEL
            # NOTE(review): the classifier scored here is the one fitted in the
            # last fold above; it is not refitted on the whole training split.

            val_predictStart = current_milli_time() # Start predict chronometer
            val_y_predict = classifier.predict(X_validation)
            val_predict_time = current_milli_time() - val_predictStart # Stop predict chronometer and save time

            val_scores = {
                "accuracy": accuracy_score(y_validation, val_y_predict),
                "balanced_accuracy": balanced_accuracy_score(y_validation, val_y_predict),
                "precision": precision_score(y_validation, val_y_predict, average='macro'),
                "sensitivity": sensitivity_score(y_validation, val_y_predict, average='macro'),
                "specificity": specificity_score(y_validation, val_y_predict, average='macro'),
                "f1_score": f1_score(y_validation, val_y_predict, average='macro'),
                # No separate fit is done for validation: report the mean fold fit time
                "fit_time": np.mean(fold_fit_times),
                "predict_time": val_predict_time,
            }

            val_cmx = confusion_matrix(y_validation, val_y_predict, labels=classes)

            pd.DataFrame(val_cmx.astype(int), index=classes,
                         columns=classes).to_csv(path.join(validation_ds_path, classifier_name+"_cmx.csv"), sep=',')

            val_mean_metrics_dict = dict((k, float(val_scores[k])) for k in metrics_dict)

            val_list_final_metrics.append(val_mean_metrics_dict)
            evaluated_classifiers.append(classifier_name)

        pd.DataFrame(list_final_metrics,
                     index=evaluated_classifiers).to_csv(path.join(train_ds_path, "evaluation_metrics.csv"), sep=',')

        pd.DataFrame(val_list_final_metrics,
                     index=evaluated_classifiers).to_csv(path.join(validation_ds_path, "evaluation_metrics.csv"), sep=',')

    print('\n\nFinished!')

In [None]:
%%time
if classification_type == 'KFold_Test':
    # Protocol: stratified 10-fold cross-validation of every classifier on
    # every dataset. Hyper-parameter search is disabled in this mode; the
    # classifiers are used as configured.
    for ds_name, dataset in zip(datasets_name_list, datasets_list):
        print("processing: " + ds_name + " dataset")
        train_ds_path = path.join(train_path, ds_name)

        # The last column is used as the target, every other column as a feature
        X, y = dataset.iloc[:,:-1], dataset.iloc[:,-1]

        list_final_metrics = []
        evaluated_classifiers = []

        for classifier_name, classifier in zip(classifiers_name_list, classifiers_list):
            print("    processing: " + classifier_name + " classifier")
            with open(path.join(train_ds_path, classifier_name+"_config.txt"), 'w') as clf_txt:
                clf_txt.write(str(classifier))

            # One list of per-fold values for every requested metric
            metrics_dict = dict((k,[]) for k in metrics_name_list)

            cmx = np.zeros((len(classes),len(classes)))

            kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
            print("        processing: K-Fold iterations", end='')

            for train_index, test_index in kfold.split(X, y):
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                fitStart = current_milli_time() # Start fit chronometer
                classifier.fit(X_train, y_train)
                fit_time = current_milli_time() - fitStart # Stop fit chronometer and save time

                predictStart = current_milli_time() # Start predict chronometer
                y_predict = classifier.predict(X_test)
                predict_time = current_milli_time() - predictStart # Stop predict chronometer and save time

                fold_scores = {
                    "accuracy": accuracy_score(y_test, y_predict),
                    "balanced_accuracy": balanced_accuracy_score(y_test, y_predict),
                    "precision": precision_score(y_test, y_predict, average='macro'),
                    "sensitivity": sensitivity_score(y_test, y_predict, average='macro'),
                    "specificity": specificity_score(y_test, y_predict, average='macro'),
                    "f1_score": f1_score(y_test, y_predict, average='macro'),
                    "fit_time": fit_time,
                    "predict_time": predict_time,
                }
                # Record only the metrics listed in metrics_name_list
                for metric_name in metrics_dict:
                    metrics_dict[metric_name].append(fold_scores[metric_name])

                # labels=classes fixes the matrix shape and row/column order so
                # it always matches `cmx` and the labels written to the CSV
                cmx += confusion_matrix(y_test, y_predict, labels=classes)

                print('.', end='')

            print(' Done!')
            pd.DataFrame(cmx.astype(int), index=classes,
                         columns=classes).to_csv(path.join(train_ds_path, classifier_name+"_cmx.csv"), sep=',')

            # Columns: every <metric>_mean first, then every <metric>_std
            final_metrics_dict = dict((k+'_mean', np.mean(v)) for k, v in metrics_dict.items())
            final_metrics_dict.update((k+'_std', np.std(v)) for k, v in metrics_dict.items())

            list_final_metrics.append(final_metrics_dict)
            evaluated_classifiers.append(classifier_name)

        pd.DataFrame(list_final_metrics,
                     index=evaluated_classifiers).to_csv(path.join(train_ds_path, "evaluation_metrics.csv"), sep=',')

    print('\n\nFinished!')

In [None]:
%%time
if classification_type == "HoldOut":
    # Protocol: 100 repetitions of a stratified 80/20 hold-out split for every
    # classifier on every dataset.
    for ds_name, dataset in zip(datasets_name_list, datasets_list):
        print("processing: " + ds_name + " dataset")
        train_ds_path = path.join(train_path, ds_name)

        # The last column is used as the target, every other column as a feature
        X, y = dataset.iloc[:,:-1], dataset.iloc[:,-1]
        list_final_metrics = []
        evaluated_classifiers = []

        for classifier_name, classifier in zip(classifiers_name_list, classifiers_list):
            # Tune only the classifiers that have a non-empty search space
            # (selected by name, not by position in the list).
            # NOTE(review): the search is fitted on the full dataset, which
            # includes the samples later used as hold-out test sets.
            search_space = param_dist_dict.get(classifier_name)
            if search_space:
                random_search = RandomizedSearchCV(classifier, search_space, cv=4,
                                                   n_iter=5, scoring='accuracy')
                random_search.fit(X, y)
                params = random_search.best_params_
                # Set once here; fit() does not change hyper-parameters, so
                # there is no need to set them again on every repetition
                classifier.set_params(**params)

            print("    processing: " + classifier_name + " classifier")
            with open(path.join(train_ds_path, classifier_name+"_config.txt"), 'w') as clf_txt:
                clf_txt.write(str(classifier))

            # One list of per-repetition values for every requested metric
            metrics_dict = dict((k,[]) for k in metrics_name_list)

            cmx = np.zeros((len(classes),len(classes)))

            print("        processing: Hold-Out iterations", end='')

            for rnd in range(100):
                # Seeding with the repetition number gives 100 different splits
                # that are identical across runs, classifiers and datasets
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,
                                                                    shuffle=True, random_state=rnd)

                fitStart = current_milli_time() # Start fit chronometer
                classifier.fit(X_train, y_train)
                fit_time = current_milli_time() - fitStart # Stop fit chronometer and save time

                predictStart = current_milli_time() # Start predict chronometer
                y_predict = classifier.predict(X_test)
                predict_time = current_milli_time() - predictStart # Stop predict chronometer and save time

                split_scores = {
                    "accuracy": accuracy_score(y_test, y_predict),
                    "balanced_accuracy": balanced_accuracy_score(y_test, y_predict),
                    "precision": precision_score(y_test, y_predict, average='macro'),
                    "sensitivity": sensitivity_score(y_test, y_predict, average='macro'),
                    "specificity": specificity_score(y_test, y_predict, average='macro'),
                    "f1_score": f1_score(y_test, y_predict, average='macro'),
                    "fit_time": fit_time,
                    "predict_time": predict_time,
                }
                # Record only the metrics listed in metrics_name_list
                for metric_name in metrics_dict:
                    metrics_dict[metric_name].append(split_scores[metric_name])

                # labels=classes fixes the matrix shape and row/column order so
                # it always matches `cmx` and the labels written to the CSV
                cmx += confusion_matrix(y_test, y_predict, labels=classes)

                print('.', end='')

            print(' Done!')
            pd.DataFrame(cmx.astype(int), index=classes,
                         columns=classes).to_csv(path.join(train_ds_path, classifier_name+"_cmx.csv"), sep=',')

            # Columns: every <metric>_mean first, then every <metric>_std
            final_metrics_dict = dict((k+'_mean', np.mean(v)) for k, v in metrics_dict.items())
            final_metrics_dict.update((k+'_std', np.std(v)) for k, v in metrics_dict.items())

            list_final_metrics.append(final_metrics_dict)
            evaluated_classifiers.append(classifier_name)

        pd.DataFrame(list_final_metrics,
                     index=evaluated_classifiers).to_csv(path.join(train_ds_path, "evaluation_metrics.csv"), sep=',')

    print('\n\nFinished!')