In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import path

# Classifiers
from sklearn.naive_bayes import GaussianNB # Gaussiana de Naive-Bayes
from sklearn.neural_network import MLPClassifier # Multi-layer Perceptron
from sklearn.neighbors import KNeighborsClassifier # K-Nearest Neighbors
from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.ensemble import RandomForestClassifier # Random Forest
from sklearn.ensemble import AdaBoostClassifier # AdaBoost
from sklearn.svm import SVC # Support Vector Machine (Linear, Polynomial, RBF)

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from imblearn.metrics import sensitivity_score
from imblearn.metrics import specificity_score
from sklearn.metrics import f1_score

from sklearn.metrics import confusion_matrix

In [2]:
outpath = path.join("..","results","classification")

datasets_name_list = ['Raw', 'Normalized', 'Fourier', 'HOS', 'SCM']

classifiers_name_list = ['Naive_Bayes','MLP','Nearest_Neighbors','Random_Forest','SVM_Linear','SVM_Polynomial','SVM_RBF']

metrics_name_list = ['accuracy', 'precision', 'sensitivity', 'specificity', 'f1_score']

In [3]:
# Load datasets
RawDs = pd.read_csv(path.join("..","results","extraction","32FINAL.csv"))
NormDs = pd.read_csv(path.join("..","results","extraction","32FINAL_NORM.csv"))
FourierDs = pd.read_csv(path.join("..","results","extraction","32FINAL_Fourier.csv"))
HOSDs = pd.read_csv(path.join("..","results","extraction","32FINAL_HOS.csv"))
SCMDs = pd.read_csv(path.join("..","results","extraction","32FINAL_SCM.csv"))

# Exclude Tmp columns
RawDs = RawDs.drop(columns=['Tmp'+str(i) for i in range(100)])
NormDs = NormDs.drop(columns=['Tmp'+str(i) for i in range(100)])

# Put it into a list
datasets_list = [RawDs, NormDs, FourierDs, HOSDs, SCMDs]

In [4]:
from scipy.stats import randint

# Load classifiers
classifiers_list = [GaussianNB(),        
                    MLPClassifier(max_iter=1000, solver='adam', learning_rate_init=5e-04),      
                    KNeighborsClassifier(),   
                    RandomForestClassifier(),
                    SVC(kernel='linear', probability=True, tol=1e-3),
                    SVC(kernel='poly', probability=True, tol=1e-3),
                    SVC(kernel='rbf', probability=True, tol=1e-3)
                   ]

param_dist_dict = {'Naive_Bayes': [], 
                   'MLP': {"hidden_layer_sizes": list(np.arange(2,1001))},
                   'Nearest_Neighbors': {"n_neighbors": [1,3,5,7,9,11]}, 
                   'Random_Forest': {"n_estimators": [3000],
                                     "max_depth": [6, None],
                                     "max_features": randint(1, 11),
                                     "min_samples_split": randint(2, 11),
                                     "min_samples_leaf": randint(1, 11),
                                     "bootstrap": [True, False],
                                     "criterion": ["gini", "entropy"]},
                    'SVM_Linear': {'kernel': ['linear'], 'C': [2**i for i in range(-5,15)]},
                    'SVM_Polynomial': {'kernel': ['poly'], 'degree': [3, 5, 7 ,9], 'C': [2**i for i in range(-5,15)]},                    
                    'SVM_RBF': {'kernel': ['rbf'], 'gamma': [2**i for i in range(-15,3)],
                                'C': [2**i for i in range(-5,15)]}
                  }

In [7]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import warnings

warnings.filterwarnings('ignore')

for n_ds, dataset in enumerate(datasets_list):
    print("processing: " + datasets_name_list[n_ds] + " dataset")
    ds_outpath = path.join(outpath, datasets_name_list[n_ds])
    
    X, y = dataset.iloc[:,:-1], dataset.iloc[:,-1]
    X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.3, stratify=y, shuffle=True)
    unique, counts = np.unique(y_validation, return_counts=True)
    print(dict(zip(unique, counts))) ########################################################

    for n_clf, classifier in enumerate(classifiers_list):
        classifier_name = classifiers_name_list[n_clf]
        if n_clf != 0:
            random_search = RandomizedSearchCV(classifier, param_dist_dict[classifier_name], cv=4, 
                                               n_iter=5, scoring='accuracy')
            random_search.fit(X_train, y_train)
            params = random_search.best_params_
            classifier.set_params(**params)
            
        print("    processing: " + classifier_name + " classifier")
        with open(path.join(ds_outpath, classifier_name+"_config.txt"), 'w') as clf_txt:
            clf_txt.write(str(classifier))
        
        metrics_dict = dict((k,[]) for k in metrics_name_list)
        cmx = np.zeros((6,6))
        
        kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        print("        processing: K-Fold iterations", end='')
        
        for train_index, test_index in kfold.split(X_train, y_train):
            X_train2, X_test = X_train.iloc[train_index], X_train.iloc[test_index]
            y_train2, y_test = y_train.iloc[train_index], y_train.iloc[test_index]
            classifier.fit(X_train2, y_train2)
            y_predict = classifier.predict(X_test)
            
            accuracy = accuracy_score(y_test, y_predict)
            precision = precision_score(y_test, y_predict, average='macro')
            sensitivity = sensitivity_score(y_test, y_predict, average='macro')
            specificity = specificity_score(y_test, y_predict, average='macro')
            f1 = f1_score(y_test, y_predict, average='macro')
            
            cmx += confusion_matrix(y_test, y_predict)
            
            metrics_dict["accuracy"].append(accuracy)
            metrics_dict["precision"].append(precision)
            metrics_dict["sensitivity"].append(sensitivity)
            metrics_dict["specificity"].append(specificity)
            metrics_dict["f1_score"].append(f1)
            
            print('.', end='')
            
        print(' Done!')
        print("            train mean accuracy: ", np.mean(metrics_dict["f1_score"]))
        print("            train std accuracy: ", np.std(metrics_dict["f1_score"]))
        print("            confusion matrix: ", cmx)

processing: Raw dataset
{0: 75, 1: 75, 2: 23, 3: 75, 4: 7, 5: 75}
    processing: Naive_Bayes classifier
        processing: K-Fold iterations.......... Done!
            train mean accuracy:  0.776942956523377
            train std accuracy:  0.06847407191838004
            confusion matrix:  [[168.   0.   7.   0.   0.   0.]
 [  0. 175.   0.   0.   0.   0.]
 [ 21.   0.  31.   0.   0.   0.]
 [  0.   0.   0. 146.  29.   0.]
 [  0.   0.   0.  13.   5.   0.]
 [  0.   0.   0.   0.   0. 175.]]
    processing: MLP classifier
        processing: K-Fold iterations.......... Done!
            train mean accuracy:  0.3635979356661614
            train std accuracy:  0.05140949245492059
            confusion matrix:  [[ 80.  50.  22.  22.   0.   1.]
 [ 44.  92.  11.  27.   0.   1.]
 [ 17.  18.   8.   9.   0.   0.]
 [ 22.  29.  13.  79.   7.  25.]
 [  2.   1.   2.  10.   0.   3.]
 [  4.  13.  10.  36.   3. 109.]]


KeyboardInterrupt: 