In [1]:
# Imports
import pandas as pd
import numpy as np
import seaborn

In [2]:
# Load dataset
filename = "../data/feats_rel_harm_entr.csv"
dataset = pd.read_csv(filename)

In [3]:
# Group subjects for cross validation
subj_groups = dataset["id_subject"]
classes = dataset["class"]

In [4]:
dataset = dataset.drop(['class', 'id_subject'], axis=1)

In [5]:
dataset

Unnamed: 0,mean_667,mean_750,mean_850,mean_1000,mean_1200,var_667,var_750,var_850,var_1000,var_1200,entropy
0,0.186822,0.308491,0.117709,0.168620,0.141432,0.000078,0.000004,0.000011,0.000005,2.745694e-05,4.843136
1,0.397575,0.144061,0.135572,0.161908,0.210339,0.012223,0.000218,0.000218,0.000600,1.675651e-04,4.625999
2,0.174790,0.169944,0.162433,0.388763,0.115002,0.000003,0.000003,0.000018,0.000037,3.086156e-07,4.380569
3,0.119349,0.169296,0.127662,0.116165,0.396926,0.000011,0.001589,0.000054,0.002355,1.458470e-02,4.280912
4,0.437979,0.180027,0.125427,0.145099,0.184484,0.000428,0.000077,0.000017,0.000028,1.286333e-05,4.519675
...,...,...,...,...,...,...,...,...,...,...,...
1076,0.773072,0.136314,0.058285,0.081027,0.034296,0.007181,0.000001,0.000239,0.002612,9.951960e-05,2.639393
1077,0.750852,0.138513,0.070561,0.107435,0.035810,0.002061,0.000001,0.000037,0.000719,1.001812e-04,2.788224
1078,0.470570,0.162645,0.133048,0.248016,0.083917,0.000003,0.000198,0.000020,0.000549,3.664338e-05,3.828699
1079,0.621858,0.134438,0.070463,0.048530,0.181848,0.010900,0.000335,0.000736,0.000567,6.375987e-04,3.023690


In [6]:
# Feature statistics
dataset.describe()

Unnamed: 0,mean_667,mean_750,mean_850,mean_1000,mean_1200,var_667,var_750,var_850,var_1000,var_1200,entropy
count,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0
mean,0.314226,0.223678,0.173949,0.152861,0.181843,0.0006608371,0.0003753875,0.0002412382,0.0002758787,0.0007430933,4.265062
std,0.1846,0.08908,0.079534,0.074604,0.112066,0.00367643,0.001050285,0.0007773609,0.001071631,0.002775016,0.629521
min,0.083522,0.07679,0.030011,0.009445,0.016603,9.656039e-10,1.022866e-10,1.278717e-12,1.103632e-10,6.427427e-12,2.191331
25%,0.195524,0.155114,0.123012,0.109999,0.111448,2.823855e-06,2.083088e-06,1.512841e-06,1.971206e-06,1.487042e-06,4.158772
50%,0.253215,0.207641,0.167029,0.145474,0.159443,2.783796e-05,2.109645e-05,1.320961e-05,1.853225e-05,2.075888e-05,4.448209
75%,0.339061,0.267685,0.210161,0.182495,0.224056,0.0002246435,0.0002060883,0.0001228189,0.0001191025,0.0002913098,4.639112
max,0.897179,0.62619,0.534246,0.636412,0.666245,0.0786584,0.01286221,0.01024083,0.01998633,0.06210772,5.058233


In [8]:
# Cross-validation methods import
from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut, GridSearchCV, cross_val_score

In [9]:
# Pipeline import
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [10]:
# Classifier import
from sklearn.svm import SVC, LinearSVC

In [11]:
# Model hyperparameters
p_grid_rbf = {"svc__C": [0.1, 1, 10, 100], 
              "svc__gamma": [0.01, 0.1], 
              "svc__kernel": ["rbf"], 
              "svc__random_state": [0]}
p_grid_lin = {"svc__C": [0.1, 1, 10, 100], 
              "svc__max_iter": [1000], 
              "svc__random_state": [0]}

In [12]:
# Models for training
svm_rbf = SVC(kernel="rbf", random_state = 0)
svm_lin = LinearSVC(random_state = 0)

In [13]:
# Results of interest
inner_res_svm_lin = ['param_svc__C', 'mean_test_score', 'std_test_score', 'rank_test_score']
inner_res_svm_rbf = ['param_svc__C', 'param_svc__gamma', 'mean_test_score', 'std_test_score', 'rank_test_score']

# Validation results lists
res_val_svm_lin = []
res_val_svm_rbf = []
# Test results list
res_test_svm_lin = []
res_test_svm_rbf = []
# Mean test lists
mean_test_svm_lin = []
mean_test_svm_rbf = []
# Standard deviation test accuracy lists
std_test_svm_lin = []
std_test_svm_rbf = []

# Outer cross-validation scheme
outer_cv = LeaveOneGroupOut()
# Inner cross-validation layer - 7 training subjects, 3 validation subjects 
inner_cv = LeavePGroupsOut(n_groups=3)

for train_out_i, test_out_i in outer_cv.split(X=dataset, y=classes, groups=subj_groups):
    train_in_dat = dataset.loc[train_out_i, :]
    test_in_dat = dataset.loc[test_out_i, :]
    train_in_class = classes[train_out_i]
    test_in_class = classes[test_out_i]
    train_in_groups = subj_groups[train_out_i]
    test_in_groups = subj_groups[test_out_i]
    
    # Feature scaling inside the CV scheme (separate for training, separate for validation)
    pipe_rbf = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
    pipe_lin = Pipeline([('scaler', StandardScaler()), ('svc', LinearSVC())])
    
    # Linear SVM
    clf1 = GridSearchCV(estimator=pipe_lin, param_grid=p_grid_lin, cv=inner_cv)
    clf1.fit(X=train_in_dat, y=train_in_class, groups=train_in_groups)
    
    # RBF
    clf2 = GridSearchCV(estimator=pipe_rbf, param_grid=p_grid_rbf, cv=inner_cv)
    clf2.fit(X=train_in_dat, y=train_in_class, groups=train_in_groups)
    
    # Add validation results for the given fold
    res_val_svm_lin.append(pd.DataFrame(clf1.cv_results_)[inner_res_svm_lin])
    res_val_svm_rbf.append(pd.DataFrame(clf2.cv_results_)[inner_res_svm_rbf])
    
    # Test set classification
    pred_lin = clf1.predict(test_in_dat)
    pred_rbf = clf2.predict(test_in_dat)
    
    # Mean test accuracy
    mean_test_svm_lin.append(np.mean(np.array(pred_lin) == np.array(test_in_class)))
    mean_test_svm_rbf.append(np.mean(np.array(pred_rbf) == np.array(test_in_class)))
    
    # Standard deviation test accuracy
    std_test_svm_lin.append(np.std(np.array(pred_lin) == np.array(test_in_class)))
    std_test_svm_rbf.append(np.std(np.array(pred_rbf) == np.array(test_in_class)))
    
















































































































































































In [14]:
# Best results
final_res = pd.DataFrame(data=np.transpose(np.array([mean_test_svm_lin, 
                                                  std_test_svm_lin, 
                                                  mean_test_svm_rbf, 
                                                  std_test_svm_rbf])),
                         columns=["mean_test_svm_lin", "std_test_svm_lin", "mean_test_svm_rbf", "std_test_svm_rbf"])

In [15]:
# Best linear SVM parameters
best_param_lin = pd.DataFrame(columns=["param_svc__C", "mean_test_score", "std_test_score"])
for i in range(len(res_val_svm_lin)):
    max_row_val = res_val_svm_lin[i]["mean_test_score"].argmax()
    max_row = res_val_svm_lin[i].iloc[max_row_val]
    best_param_lin = best_param_lin.append(max_row, ignore_index=True)
best_param_lin = best_param_lin.drop(columns=["rank_test_score"])

In [16]:
# Best RBF kernel SVM parameters
best_param_rbf = pd.DataFrame(columns=["param_svc__C", "param_svc__gamma", "mean_test_score", "std_test_score"])
for i in range(len(res_val_svm_rbf)):
    max_row_val = res_val_svm_rbf[i]["mean_test_score"].argmax()
    max_row = res_val_svm_rbf[i].iloc[max_row_val]
    best_param_rbf = best_param_rbf.append(max_row, ignore_index=True)
best_param_rbf = best_param_rbf.drop(columns=["rank_test_score"])

In [17]:
# Save the results
best_param_rbf.to_csv("../results/best_param_rbfsvm_snr_scale.csv")
best_param_lin.to_csv("../results/best_param_linsvm_snr_scale.csv")
final_res.to_csv("../results/final_res_snr_scale.csv")

In [18]:
best_param_rbf

Unnamed: 0,param_svc__C,param_svc__gamma,mean_test_score,std_test_score
0,10,0.01,0.535928,0.124592
1,10,0.01,0.540777,0.130701
2,10,0.01,0.58963,0.123552
3,10,0.01,0.574768,0.131197
4,10,0.01,0.602977,0.118041
5,10,0.01,0.603742,0.124374
6,10,0.01,0.551668,0.133291
7,10,0.01,0.582171,0.124094
8,10,0.01,0.516027,0.116068
9,10,0.01,0.543832,0.130135


In [19]:
best_param_lin

Unnamed: 0,param_svc__C,mean_test_score,std_test_score
0,0.1,0.549517,0.132391
1,0.1,0.563304,0.142901
2,0.1,0.602958,0.131063
3,0.1,0.591288,0.138176
4,0.1,0.618367,0.128521
5,0.1,0.610402,0.132828
6,0.1,0.566504,0.143145
7,0.1,0.60081,0.132699
8,0.1,0.529423,0.121513
9,0.1,0.565992,0.141778


In [20]:
final_res

Unnamed: 0,mean_test_svm_lin,std_test_svm_lin,mean_test_svm_rbf,std_test_svm_rbf
0,0.927536,0.259254,0.855072,0.352028
1,0.669565,0.47037,0.695652,0.460131
2,0.217391,0.412471,0.202899,0.402158
3,0.423913,0.494177,0.456522,0.498106
4,0.234783,0.423863,0.234783,0.423863
5,0.293478,0.455356,0.228261,0.419712
6,0.626087,0.483841,0.608696,0.488042
7,0.26087,0.439109,0.318841,0.466027
8,0.973913,0.159394,0.947826,0.222378
9,0.652174,0.47628,0.686957,0.463732


# RobustScaler, SVM, Nested CV

In [21]:
from sklearn.preprocessing import RobustScaler

In [22]:
# Model hyperparameters
p_grid_rbf = {"svc__C": [0.1, 1, 10, 100], 
              "svc__gamma": [0.01, 0.1], 
              "svc__kernel": ["rbf"], 
              "svc__random_state": [0]}
p_grid_lin = {"svc__C": [0.1, 1, 10, 100], 
              "svc__max_iter": [1000], 
              "svc__random_state": [0]}

In [23]:
# Results of interest
inner_res_svm_lin = ['param_svc__C', 'mean_test_score', 'std_test_score', 'rank_test_score']
inner_res_svm_rbf = ['param_svc__C', 'param_svc__gamma', 'mean_test_score', 'std_test_score', 'rank_test_score']

# Validation results lists
res_val_svm_lin = []
res_val_svm_rbf = []
# Test results lists
res_test_svm_lin = []
res_test_svm_rbf = []
# Mean test lists
mean_test_svm_lin = []
mean_test_svm_rbf = []
# Standard deviation test accuracy lists
std_test_svm_lin = []
std_test_svm_rbf = []

# Outer cross-validation scheme
outer_cv = LeaveOneGroupOut()
# Inner cross-validation layer - 7 training subjects, 3 validation subjects 
inner_cv = LeavePGroupsOut(n_groups=3)

for train_out_i, test_out_i in outer_cv.split(X=dataset, y=classes, groups=subj_groups):
    train_in_dat = dataset.loc[train_out_i, :]
    test_in_dat = dataset.loc[test_out_i, :]
    train_in_class = classes[train_out_i]
    test_in_class = classes[test_out_i]
    train_in_groups = subj_groups[train_out_i]
    test_in_groups = subj_groups[test_out_i]
    
    # Feature scaling inside the CV scheme (separate for training, separate for validation)
    pipe_rbf = Pipeline([('scaler', RobustScaler()), ('svc', SVC())])
    pipe_lin = Pipeline([('scaler', RobustScaler()), ('svc', LinearSVC())])
    
    # Linear SVM
    clf1 = GridSearchCV(estimator=pipe_lin, param_grid=p_grid_lin, cv=inner_cv)
    clf1.fit(X=train_in_dat, y=train_in_class, groups=train_in_groups)
    
    # RBF
    clf2 = GridSearchCV(estimator=pipe_rbf, param_grid=p_grid_rbf, cv=inner_cv)
    clf2.fit(X=train_in_dat, y=train_in_class, groups=train_in_groups)
    
    # Add validation results for the given fold
    res_val_svm_lin.append(pd.DataFrame(clf1.cv_results_)[inner_res_svm_lin])
    res_val_svm_rbf.append(pd.DataFrame(clf2.cv_results_)[inner_res_svm_rbf])
    
    # Test set classification
    pred_lin = clf1.predict(test_in_dat)
    pred_rbf = clf2.predict(test_in_dat)
    
    # Mean test accuracy
    mean_test_svm_lin.append(np.mean(np.array(pred_lin) == np.array(test_in_class)))
    mean_test_svm_rbf.append(np.mean(np.array(pred_rbf) == np.array(test_in_class)))
    
    # Standard deviation test accuracy
    std_test_svm_lin.append(np.std(np.array(pred_lin) == np.array(test_in_class)))
    std_test_svm_rbf.append(np.std(np.array(pred_rbf) == np.array(test_in_class)))
    








































































































































































































































In [24]:
best_param_rbf

Unnamed: 0,param_svc__C,param_svc__gamma,mean_test_score,std_test_score
0,10,0.01,0.535928,0.124592
1,10,0.01,0.540777,0.130701
2,10,0.01,0.58963,0.123552
3,10,0.01,0.574768,0.131197
4,10,0.01,0.602977,0.118041
5,10,0.01,0.603742,0.124374
6,10,0.01,0.551668,0.133291
7,10,0.01,0.582171,0.124094
8,10,0.01,0.516027,0.116068
9,10,0.01,0.543832,0.130135


In [25]:
best_param_lin

Unnamed: 0,param_svc__C,mean_test_score,std_test_score
0,0.1,0.549517,0.132391
1,0.1,0.563304,0.142901
2,0.1,0.602958,0.131063
3,0.1,0.591288,0.138176
4,0.1,0.618367,0.128521
5,0.1,0.610402,0.132828
6,0.1,0.566504,0.143145
7,0.1,0.60081,0.132699
8,0.1,0.529423,0.121513
9,0.1,0.565992,0.141778


In [26]:
final_res

Unnamed: 0,mean_test_svm_lin,std_test_svm_lin,mean_test_svm_rbf,std_test_svm_rbf
0,0.927536,0.259254,0.855072,0.352028
1,0.669565,0.47037,0.695652,0.460131
2,0.217391,0.412471,0.202899,0.402158
3,0.423913,0.494177,0.456522,0.498106
4,0.234783,0.423863,0.234783,0.423863
5,0.293478,0.455356,0.228261,0.419712
6,0.626087,0.483841,0.608696,0.488042
7,0.26087,0.439109,0.318841,0.466027
8,0.973913,0.159394,0.947826,0.222378
9,0.652174,0.47628,0.686957,0.463732
