In [1]:
# Imports
import pandas as pd
import numpy as np
import seaborn

In [2]:
# Load dataset
# Path of the feature CSV analysed by this notebook. Exactly one assignment
# should be active; the commented-out lines are alternative feature files.
# filename = "../data/feats_rel.csv"
# filename = "../data/feats_snr.csv"
# filename = "../data/feats_rel_harm_entr.csv"
filename = "../data/feats_rel_harmsep_entr.csv"
# filename = "../data/feats_rel_noharm_entr.csv"

In [None]:
# Read the full table from the selected CSV. Besides the feature columns it is
# expected to hold the 'class' and 'id_subject' columns that later cells read.
dataset = pd.read_csv(filename)

In [3]:
# Cross-validation groups across subjects
# Subject id of every row; passed as `groups` to the group-wise CV splitters
subj_groups = dataset["id_subject"]
# Target label of every row
classes = dataset["class"]

In [4]:
# Dataset with variance features
# Feature-only frame: the label and subject-id columns are removed, every other
# column of the loaded table is kept.
dataset_w_var = dataset.drop(['class', 'id_subject'], axis=1)

In [5]:
# Dataset without variance features
# Collect every column whose name contains 'var' and drop them in one call.
# A single drop avoids copying the whole frame once per matching column and
# always yields a frame that is independent of dataset_w_var.
var_columns = [column for column in dataset_w_var.columns if 'var' in column]
dataset_no_var = dataset_w_var.drop(columns=var_columns)

# Model training - SVM, nested CV

In [6]:
# Import CV methods
from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut, GridSearchCV, cross_val_score

In [7]:
# Import pipelines and preprocessing tools
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [8]:
# Import classifiers
from sklearn.svm import SVC, LinearSVC

In [9]:
# Choose training dataset
# Exactly one of the two assignments should be active.
# NOTE: this rebinds `dataset` from the loaded table to a feature-only frame,
# so the cells that read dataset["id_subject"] / dataset["class"] cannot be
# re-run afterwards without reloading the CSV first.
dataset = dataset_no_var
# dataset = dataset_w_var

In [10]:
# Feature statistics
# Summary statistics (count, mean, std, quartiles, min/max) of the selected feature frame
dataset.describe()

Unnamed: 0,mean_667,mean_750,mean_850,mean_1000,mean_1200,mean_harm2_667,mean_harm2_750,mean_harm2_850,mean_harm2_1000,mean_harm2_1200,...,mean_harm3_750,mean_harm3_850,mean_harm3_1000,mean_harm3_1200,mean_harm4_667,mean_harm4_750,mean_harm4_850,mean_harm4_1000,mean_harm4_1200,entropy
count,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,...,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0
mean,0.183259,0.116606,0.098336,0.090259,0.112357,0.073592,0.056832,0.038119,0.036847,0.029898,...,0.035687,0.025377,0.014552,0.012156,0.020528,0.014552,0.012118,0.011203,0.027432,4.265062
std,0.212661,0.060074,0.06011,0.06513,0.095391,0.060344,0.050762,0.029963,0.032431,0.030502,...,0.034498,0.021031,0.011353,0.01079,0.01476,0.011353,0.009925,0.010388,0.057506,0.629521
min,0.007377,0.012167,0.010408,0.006854,0.005546,0.004011,0.002742,0.001864,0.001464,0.001217,...,0.001276,0.001036,0.000655,0.000507,0.000933,0.000655,0.000506,0.000454,0.000254,2.191331
25%,0.063585,0.072215,0.052614,0.04724,0.05338,0.039826,0.028014,0.019939,0.018309,0.014541,...,0.017386,0.012302,0.006417,0.00494,0.010543,0.006417,0.005065,0.003815,0.005571,4.158772
50%,0.105617,0.11075,0.086156,0.078027,0.086729,0.061566,0.044734,0.030867,0.029247,0.024078,...,0.02805,0.020792,0.011655,0.00888,0.017468,0.011655,0.009515,0.00791,0.008891,4.448209
75%,0.17498,0.148045,0.130096,0.110682,0.137761,0.089247,0.070156,0.047215,0.04394,0.035373,...,0.041864,0.03271,0.019777,0.016421,0.02799,0.019777,0.016919,0.015865,0.01678,4.639112
max,0.890707,0.401267,0.426491,0.547098,0.619839,0.438913,0.454936,0.261485,0.289446,0.389776,...,0.349476,0.223093,0.071562,0.107723,0.102059,0.071562,0.071923,0.088424,0.453224,5.058233


## Fundamental frequencies and harmonics

### Non-scaled features
* Features without variance
* Features with variance

In [11]:
# Hyperparameter grids explored by the inner grid search
# RBF kernel SVM: values for C and gamma
p_grid_rbf = dict(
    C=[0.1, 1, 10, 100],
    gamma=[0.01, 0.1],
)
# Linear SVM: values for C; max_iter is held at a single value
p_grid_lin = dict(
    C=[0.1, 1, 10, 100],
    max_iter=[10000],
)

In [12]:
# Training models
# Base estimators handed to GridSearchCV in the nested CV loop; both use a
# fixed random_state of 0.
svm_rbf = SVC(kernel="rbf", random_state = 0)
svm_lin = LinearSVC(random_state = 0)

In [None]:
# Nested cross-validation on the unscaled features.
# Outer loop: leave one subject out for testing.
# Inner loop: grid search over the hyperparameter grids on the remaining subjects.

# Results of interest
# Columns of GridSearchCV.cv_results_ that are kept for every outer fold
inner_res_svm_lin = ['param_C', 'mean_test_score', 'std_test_score', 'rank_test_score']
inner_res_svm_rbf = ['param_C', 'param_gamma', 'mean_test_score', 'std_test_score', 'rank_test_score']

# Validation results lists (one cv_results_ frame per outer fold)
res_val_svm_lin = []
res_val_svm_rbf = []
# Test results list
res_test_svm_lin = []
res_test_svm_rbf = []
# Mean test lists
mean_test_svm_lin = []
mean_test_svm_rbf = []
# Standard deviation test accuracy lists
std_test_svm_lin = []
std_test_svm_rbf = []

# Outer cross-validation scheme - every subject is the test set exactly once
outer_cv = LeaveOneGroupOut()
# Inner cross-validation layer - 3 of the outer-training subjects are held out
# for validation in every split, the remaining ones are used for fitting.
# NOTE(review): the original comment said "7 training subjects"; with one subject
# already held out by the outer loop that implies 11 subjects in total - confirm.
inner_cv = LeavePGroupsOut(n_groups=3)

for train_out_i, test_out_i in outer_cv.split(X=dataset, y=classes, groups=subj_groups):
    # split() yields positional row indices, so select with .iloc. The rows are
    # taken from `dataset` (the frame picked in the "Choose training dataset"
    # cell) instead of always from dataset_no_var, so that switching the
    # selection actually changes what the models are trained on.
    train_in_dat = dataset.iloc[train_out_i, :]
    test_in_dat = dataset.iloc[test_out_i, :]
    train_in_class = classes.iloc[train_out_i]
    test_in_class = classes.iloc[test_out_i]
    train_in_groups = subj_groups.iloc[train_out_i]
    test_in_groups = subj_groups.iloc[test_out_i]

    # Linear SVM
    clf1 = GridSearchCV(estimator=svm_lin, param_grid=p_grid_lin, cv=inner_cv)
    clf1.fit(X=train_in_dat, y=train_in_class, groups=train_in_groups)

    # RBF
    clf2 = GridSearchCV(estimator=svm_rbf, param_grid=p_grid_rbf, cv=inner_cv)
    clf2.fit(X=train_in_dat, y=train_in_class, groups=train_in_groups)

    # Add validation results for the given fold
    res_val_svm_lin.append(pd.DataFrame(clf1.cv_results_)[inner_res_svm_lin])
    res_val_svm_rbf.append(pd.DataFrame(clf2.cv_results_)[inner_res_svm_rbf])

    # Test set classification with the refitted best model of each search
    pred_lin = clf1.predict(test_in_dat)
    pred_rbf = clf2.predict(test_in_dat)

    # Per-sample hit indicator: True where the prediction equals the label
    hits_lin = np.array(pred_lin) == np.array(test_in_class)
    hits_rbf = np.array(pred_rbf) == np.array(test_in_class)

    # Mean test accuracy
    mean_test_svm_lin.append(np.mean(hits_lin))
    mean_test_svm_rbf.append(np.mean(hits_rbf))

    # Standard deviation of the per-sample hit indicator on the test subject
    std_test_svm_lin.append(np.std(hits_lin))
    std_test_svm_rbf.append(np.std(hits_rbf))
    

In [None]:
# Per-fold test results: one row per outer fold, one column per collected list
final_res_columns = ["mean_test_svm_lin", "std_test_svm_lin", "mean_test_svm_rbf", "std_test_svm_rbf"]
final_res_values = np.column_stack((mean_test_svm_lin,
                                    std_test_svm_lin,
                                    mean_test_svm_rbf,
                                    std_test_svm_rbf))
final_res = pd.DataFrame(data=final_res_values, columns=final_res_columns)

In [None]:
# Best hyperparameters - linear SVM
# For every outer fold keep the inner-CV grid row with the highest mean
# validation score (first one on ties). The rows are gathered in a list and
# converted to a frame once, because DataFrame.append was removed in pandas 2.0.
best_rows_lin = [
    fold_res.iloc[fold_res["mean_test_score"].argmax()]
    for fold_res in res_val_svm_lin
]
# reindex() drops 'rank_test_score' and keeps the column layout even when
# there are no folds; reset_index() numbers the rows by outer fold.
best_param_lin = (
    pd.DataFrame(best_rows_lin)
    .reindex(columns=["param_C", "mean_test_score", "std_test_score"])
    .reset_index(drop=True)
)

In [None]:
# Best hyperparameters - SVM with an RBF kernel
# For every outer fold keep the inner-CV grid row with the highest mean
# validation score (first one on ties). The rows are gathered in a list and
# converted to a frame once, because DataFrame.append was removed in pandas 2.0.
best_rows_rbf = [
    fold_res.iloc[fold_res["mean_test_score"].argmax()]
    for fold_res in res_val_svm_rbf
]
# reindex() drops 'rank_test_score' and keeps the column layout even when
# there are no folds; reset_index() numbers the rows by outer fold.
best_param_rbf = (
    pd.DataFrame(best_rows_rbf)
    .reindex(columns=["param_C", "param_gamma", "mean_test_score", "std_test_score"])
    .reset_index(drop=True)
)

In [None]:
# Show results
# Displays the per-fold test results; uncomment one of the lines below (and
# comment out final_res) to inspect the per-fold best inner-CV parameters instead.
# best_param_lin
# best_param_rbf
final_res

In [None]:
# Filenames for best parameters and final results
# Each filename used to be assigned twice in a row, so the "_var_" name always
# won and variance-free results were written to a file labelled "var".
# The tag is now derived from the feature frame that was actually trained on.
feat_tag = "var" if any('var' in column for column in dataset.columns) else "novar"
# Best hyperparameters RBF
bp_rbf_filename = "../results/best_param_rbf_{}_noscale.csv".format(feat_tag)
# Best hyperparameters linear SVM
bp_lin_filename = "../results/best_param_lin_{}_noscale.csv".format(feat_tag)
# Final results
final_res_filename = "../results/final_res_{}_noscale.csv".format(feat_tag)

# Save results
best_param_rbf.to_csv(bp_rbf_filename)
best_param_lin.to_csv(bp_lin_filename)
final_res.to_csv(final_res_filename)

### Scaled features

* Without variance features 
    * StandardScaler
    * MinMaxScaler

In [13]:
# Hyperparameter grids for the scaler + SVM pipelines.
# The "svc__" prefix routes each entry to the pipeline step named 'svc'.
# RBF kernel SVM
p_grid_rbf = dict(
    svc__C=[0.01, 0.1, 1, 10, 100],
    svc__gamma=[0.01, 0.1],
    svc__kernel=["rbf"],
    svc__random_state=[0],
)
# Linear SVM
p_grid_lin = dict(
    svc__C=[0.01, 0.1, 1, 10, 100],
    svc__max_iter=[1000],
    svc__random_state=[0],
)

In [14]:
# Select feature scaling method
# Exactly one assignment should be active; the chosen scaler becomes the
# 'scaler' step of both pipelines built in the nested CV cell.
feat_scaler = MinMaxScaler()
# feat_scaler = StandardScaler()
# feat_scaler = RobustScaler()

In [57]:
# Nested cross-validation on scaled features.
# Outer loop: leave one subject out for testing.
# Inner loop: grid search over scaler + SVM pipelines on the remaining subjects.

# Results of interest
# Columns of GridSearchCV.cv_results_ that are kept for every outer fold
inner_res_svm_lin = ['param_svc__C', 'mean_test_score', 'std_test_score', 'rank_test_score']
inner_res_svm_rbf = ['param_svc__C', 'param_svc__gamma', 'mean_test_score', 'std_test_score', 'rank_test_score']

# Validation results lists (one cv_results_ frame per outer fold)
res_val_svm_lin = []
res_val_svm_rbf = []
# Test results list
res_test_svm_lin = []
res_test_svm_rbf = []
# Mean test lists
mean_test_svm_lin = []
mean_test_svm_rbf = []
# Standard deviation test accuracy lists
std_test_svm_lin = []
std_test_svm_rbf = []

# Outer cross-validation scheme - every subject is the test set exactly once
outer_cv = LeaveOneGroupOut()
# Inner cross-validation layer - 3 of the outer-training subjects are held out
# for validation in every split, the remaining ones are used for fitting.
# NOTE(review): the original comment said "7 training subjects"; with one subject
# already held out by the outer loop that implies 11 subjects in total - confirm.
inner_cv = LeavePGroupsOut(n_groups=3)

for train_out_i, test_out_i in outer_cv.split(X=dataset, y=classes, groups=subj_groups):
    # split() yields positional row indices, so select with .iloc. The rows are
    # taken from `dataset` (the frame picked in the "Choose training dataset"
    # cell) instead of always from dataset_no_var, so that switching the
    # selection actually changes what the models are trained on.
    train_in_dat = dataset.iloc[train_out_i, :]
    test_in_dat = dataset.iloc[test_out_i, :]
    train_in_class = classes.iloc[train_out_i]
    test_in_class = classes.iloc[test_out_i]
    train_in_groups = subj_groups.iloc[train_out_i]
    test_in_groups = subj_groups.iloc[test_out_i]

    # Feature scaling inside the CV scheme: the scaler is a pipeline step, so it
    # is fitted on the training part of every split only and then applied to the
    # held-out part. GridSearchCV clones the pipeline for each fit, so sharing
    # the feat_scaler object between the two pipelines is safe.
    pipe_rbf = Pipeline([('scaler', feat_scaler), ('svc', SVC())])
    pipe_lin = Pipeline([('scaler', feat_scaler), ('svc', LinearSVC())])

    # Linear SVM
    clf1 = GridSearchCV(estimator=pipe_lin, param_grid=p_grid_lin, cv=inner_cv)
    clf1.fit(X=train_in_dat, y=train_in_class, groups=train_in_groups)

    # RBF
    clf2 = GridSearchCV(estimator=pipe_rbf, param_grid=p_grid_rbf, cv=inner_cv)
    clf2.fit(X=train_in_dat, y=train_in_class, groups=train_in_groups)

    # Add validation results for the given fold
    res_val_svm_lin.append(pd.DataFrame(clf1.cv_results_)[inner_res_svm_lin])
    res_val_svm_rbf.append(pd.DataFrame(clf2.cv_results_)[inner_res_svm_rbf])

    # Test set classification with the refitted best pipeline of each search
    pred_lin = clf1.predict(test_in_dat)
    pred_rbf = clf2.predict(test_in_dat)

    # Per-sample hit indicator: True where the prediction equals the label
    hits_lin = np.array(pred_lin) == np.array(test_in_class)
    hits_rbf = np.array(pred_rbf) == np.array(test_in_class)

    # Mean test accuracy
    mean_test_svm_lin.append(np.mean(hits_lin))
    mean_test_svm_rbf.append(np.mean(hits_rbf))

    # Standard deviation of the per-sample hit indicator on the test subject
    std_test_svm_lin.append(np.std(hits_lin))
    std_test_svm_rbf.append(np.std(hits_rbf))
    




















































































In [58]:
# Per-fold test results: one row per outer fold, one column per collected list
final_res_columns = ["mean_test_svm_lin", "std_test_svm_lin", "mean_test_svm_rbf", "std_test_svm_rbf"]
final_res_values = np.column_stack((mean_test_svm_lin,
                                    std_test_svm_lin,
                                    mean_test_svm_rbf,
                                    std_test_svm_rbf))
final_res = pd.DataFrame(data=final_res_values, columns=final_res_columns)

In [59]:
# Best linear SVM parameters
# For every outer fold keep the inner-CV grid row with the highest mean
# validation score (first one on ties). The rows are gathered in a list and
# converted to a frame once, because DataFrame.append was removed in pandas 2.0.
best_rows_lin = [
    fold_res.iloc[fold_res["mean_test_score"].argmax()]
    for fold_res in res_val_svm_lin
]
# reindex() drops 'rank_test_score' and keeps the column layout even when
# there are no folds; reset_index() numbers the rows by outer fold.
best_param_lin = (
    pd.DataFrame(best_rows_lin)
    .reindex(columns=["param_svc__C", "mean_test_score", "std_test_score"])
    .reset_index(drop=True)
)

In [60]:
# Best RBF kernel SVM parameters
# For every outer fold keep the inner-CV grid row with the highest mean
# validation score (first one on ties). The rows are gathered in a list and
# converted to a frame once, because DataFrame.append was removed in pandas 2.0.
best_rows_rbf = [
    fold_res.iloc[fold_res["mean_test_score"].argmax()]
    for fold_res in res_val_svm_rbf
]
# reindex() drops 'rank_test_score' and keeps the column layout even when
# there are no folds; reset_index() numbers the rows by outer fold.
best_param_rbf = (
    pd.DataFrame(best_rows_rbf)
    .reindex(columns=["param_svc__C", "param_svc__gamma", "mean_test_score", "std_test_score"])
    .reset_index(drop=True)
)

In [61]:
# Display results
# Displays the per-fold test results; uncomment one of the lines below (and
# comment out final_res) to inspect the per-fold best inner-CV parameters instead.
# best_param_lin
# best_param_rbf
final_res

Unnamed: 0,mean_test_svm_lin,std_test_svm_lin,mean_test_svm_rbf,std_test_svm_rbf
0,0.855072,0.352028,0.942029,0.233689
1,0.46087,0.498466,0.46087,0.498466
2,0.202899,0.402158,0.173913,0.379035
3,0.413043,0.492381,0.423913,0.494177
4,0.234783,0.423863,0.243478,0.429181
5,0.282609,0.450268,0.315217,0.464602
6,0.530435,0.499073,0.530435,0.499073
7,0.231884,0.422035,0.231884,0.422035
8,0.530435,0.499073,0.634783,0.481491
9,0.513043,0.49983,0.547826,0.497707


In [62]:
# Output paths for the best inner-CV parameters and the per-fold test results
bp_rbf_filename = "../results/best_param_rbf_novar_minmax_entr.csv"
bp_lin_filename = "../results/best_param_lin_novar_minmax_entr.csv"
final_res_filename = "../results/final_res_novar_minmax_entr.csv"

# Write every result table to its CSV file, in the same order as before
for result_frame, target_path in (
    (best_param_rbf, bp_rbf_filename),
    (best_param_lin, bp_lin_filename),
    (final_res, final_res_filename),
):
    result_frame.to_csv(target_path)

## Fundamental frequencies and harmonics as separate features

* With entropy
* Features without variance
    * StandardScaler
    * MinMaxScaler
* Features with variance
    * StandardScaler
    * MinMaxScaler


In [11]:
# Hyperparameter grids for the scaler + SVM pipelines.
# The "svc__" prefix routes each entry to the pipeline step named 'svc'.
# RBF kernel SVM
p_grid_rbf = dict(
    svc__C=[0.01, 0.1, 1, 10, 100],
    svc__gamma=[0.01, 0.1],
    svc__kernel=["rbf"],
    svc__random_state=[0],
)
# Linear SVM
p_grid_lin = dict(
    svc__C=[0.01, 0.1, 1, 10, 100],
    svc__max_iter=[1000],
    svc__random_state=[0],
)

In [12]:
# Choose feature scaling method
# Exactly one assignment should be active; the chosen scaler becomes the
# 'scaler' step of the pipelines built in the nested CV cell.
feat_scaler = MinMaxScaler()
# feat_scaler = StandardScaler()
# feat_scaler = RobustScaler()

In [13]:
# Nested cross-validation on scaled features.
# Outer loop: leave one subject out for testing.
# Inner loop: grid search over scaler + SVM pipelines on the remaining subjects.

# Results of interest
# Columns of GridSearchCV.cv_results_ that are kept for every outer fold
inner_res_svm_lin = ['param_svc__C', 'mean_test_score', 'std_test_score', 'rank_test_score']
inner_res_svm_rbf = ['param_svc__C', 'param_svc__gamma', 'mean_test_score', 'std_test_score', 'rank_test_score']

# Validation results lists (one cv_results_ frame per outer fold)
res_val_svm_lin = []
res_val_svm_rbf = []
# Test results list
res_test_svm_lin = []
res_test_svm_rbf = []
# Mean test lists
mean_test_svm_lin = []
mean_test_svm_rbf = []
# Standard deviation test accuracy lists
std_test_svm_lin = []
std_test_svm_rbf = []

# Outer cross-validation scheme - every subject is the test set exactly once
outer_cv = LeaveOneGroupOut()
# Inner cross-validation layer - 3 of the outer-training subjects are held out
# for validation in every split, the remaining ones are used for fitting.
# NOTE(review): the original comment said "7 training subjects"; with one subject
# already held out by the outer loop that implies 11 subjects in total - confirm.
inner_cv = LeavePGroupsOut(n_groups=3)

for train_out_i, test_out_i in outer_cv.split(X=dataset, y=classes, groups=subj_groups):
    # split() yields positional row indices, so select with .iloc. The rows are
    # taken from `dataset` (the frame picked in the "Choose training dataset"
    # cell) instead of always from dataset_no_var, so that switching the
    # selection actually changes what the models are trained on.
    train_in_dat = dataset.iloc[train_out_i, :]
    test_in_dat = dataset.iloc[test_out_i, :]
    train_in_class = classes.iloc[train_out_i]
    test_in_class = classes.iloc[test_out_i]
    train_in_groups = subj_groups.iloc[train_out_i]
    test_in_groups = subj_groups.iloc[test_out_i]

    # Feature scaling inside the CV scheme: the scaler is a pipeline step, so it
    # is fitted on the training part of every split only and then applied to the
    # held-out part. GridSearchCV clones the pipeline for each fit, so sharing
    # the feat_scaler object between the two pipelines is safe.
    # pipe_rbf had been swallowed by the preceding comment line, which left it
    # undefined on a fresh kernel; it is defined here again.
    pipe_rbf = Pipeline([('scaler', feat_scaler), ('svc', SVC())])
    pipe_lin = Pipeline([('scaler', feat_scaler), ('svc', LinearSVC())])

    # Linear SVM
    clf1 = GridSearchCV(estimator=pipe_lin, param_grid=p_grid_lin, cv=inner_cv)
    clf1.fit(X=train_in_dat, y=train_in_class, groups=train_in_groups)

    # RBF
    clf2 = GridSearchCV(estimator=pipe_rbf, param_grid=p_grid_rbf, cv=inner_cv)
    clf2.fit(X=train_in_dat, y=train_in_class, groups=train_in_groups)

    # Add validation results for the given fold
    res_val_svm_lin.append(pd.DataFrame(clf1.cv_results_)[inner_res_svm_lin])
    res_val_svm_rbf.append(pd.DataFrame(clf2.cv_results_)[inner_res_svm_rbf])

    # Test set classification with the refitted best pipeline of each search
    pred_lin = clf1.predict(test_in_dat)
    pred_rbf = clf2.predict(test_in_dat)

    # Per-sample hit indicator: True where the prediction equals the label
    hits_lin = np.array(pred_lin) == np.array(test_in_class)
    hits_rbf = np.array(pred_rbf) == np.array(test_in_class)

    # Mean test accuracy
    mean_test_svm_lin.append(np.mean(hits_lin))
    mean_test_svm_rbf.append(np.mean(hits_rbf))

    # Standard deviation of the per-sample hit indicator on the test subject
    std_test_svm_lin.append(np.std(hits_lin))
    std_test_svm_rbf.append(np.std(hits_rbf))
    














































































































































In [14]:
# Per-fold test results: one row per outer fold, one column per collected list
final_res_columns = ["mean_test_svm_lin", "std_test_svm_lin", "mean_test_svm_rbf", "std_test_svm_rbf"]
final_res_values = np.column_stack((mean_test_svm_lin,
                                    std_test_svm_lin,
                                    mean_test_svm_rbf,
                                    std_test_svm_rbf))
final_res = pd.DataFrame(data=final_res_values, columns=final_res_columns)

In [15]:
# Best linear SVM parameters
# For every outer fold keep the inner-CV grid row with the highest mean
# validation score (first one on ties). The rows are gathered in a list and
# converted to a frame once, because DataFrame.append was removed in pandas 2.0.
best_rows_lin = [
    fold_res.iloc[fold_res["mean_test_score"].argmax()]
    for fold_res in res_val_svm_lin
]
# reindex() drops 'rank_test_score' and keeps the column layout even when
# there are no folds; reset_index() numbers the rows by outer fold.
best_param_lin = (
    pd.DataFrame(best_rows_lin)
    .reindex(columns=["param_svc__C", "mean_test_score", "std_test_score"])
    .reset_index(drop=True)
)

In [23]:
# Export the inner-CV grid results of the first outer fold (linear SVM) as an example table
res_val_svm_lin[0].to_csv("../results/inner_cv_svc_ex.csv")

In [16]:
# Best RBF kernel SVM parameters
# For every outer fold keep the inner-CV grid row with the highest mean
# validation score (first one on ties). The rows are gathered in a list and
# converted to a frame once, because DataFrame.append was removed in pandas 2.0.
best_rows_rbf = [
    fold_res.iloc[fold_res["mean_test_score"].argmax()]
    for fold_res in res_val_svm_rbf
]
# reindex() drops 'rank_test_score' and keeps the column layout even when
# there are no folds; reset_index() numbers the rows by outer fold.
best_param_rbf = (
    pd.DataFrame(best_rows_rbf)
    .reindex(columns=["param_svc__C", "param_svc__gamma", "mean_test_score", "std_test_score"])
    .reset_index(drop=True)
)

In [None]:
# Display results of the inner CV layer
# Shows the best RBF parameters per outer fold; swap the comment marker to
# show the linear SVM table instead.
# best_param_lin
best_param_rbf

In [None]:
# Display final results
# Per-fold test accuracy (mean and standard deviation) of both classifiers
final_res

In [17]:
# Output paths for the best inner-CV parameters and the per-fold test results
bp_rbf_filename = "../results/bp_rbf_novar_minmax_harmsep_entr.csv"
bp_lin_filename = "../results/bp_lin_novar_minmax_harmsep_entr.csv"
final_res_filename = "../results/final_res_novar_minmax_harmsep_entr.csv"

# Write every result table to its CSV file, in the same order as before
for result_frame, target_path in (
    (best_param_rbf, bp_rbf_filename),
    (best_param_lin, bp_lin_filename),
    (final_res, final_res_filename),
):
    result_frame.to_csv(target_path)

Unnamed: 0,param_svc__C,param_svc__gamma,mean_test_score,std_test_score
0,100,0.01,0.562098,0.134262
1,100,0.1,0.573692,0.138457
2,100,0.01,0.603157,0.132361
3,100,0.01,0.591752,0.136477
4,100,0.01,0.607442,0.125488
5,100,0.01,0.61155,0.129602
6,100,0.01,0.578182,0.140233
7,100,0.01,0.601676,0.130606
8,100,0.01,0.532315,0.122034
9,100,0.1,0.552567,0.136009
