In [1]:
# Imports
import pandas as pd
import numpy as np
import seaborn

In [3]:
# Load dataset
# Path of the input CSV that the next cell reads with pd.read_csv.
# Exactly one `filename` assignment should be active; the commented-out lines
# are alternative input files.
# filename = "../data/feats_rel.csv"
# filename = "../data/feats_snr.csv"
filename = "../data/feats_rel_harmsep_entr.csv"
# filename = "../data/feats_rel_noharm_entr.csv"

In [4]:
# Read the complete table from `filename`. Later cells take the "id_subject"
# and "class" columns from it and treat every remaining column as a feature.
dataset = pd.read_csv(filename)

In [5]:
# Cross-validation groups across subjects
# NOTE: this cell must run while `dataset` is still the table read from the CSV.
# The "Choose training dataset" cell further down rebinds `dataset` to a
# feature-only frame in which "id_subject" and "class" have been dropped.
subj_groups = dataset["id_subject"]  # subject id of every row, used as the CV group label
classes = dataset["class"]  # target label of every row

In [6]:
# Feature frame that still contains the variance features:
# every column of the loaded table except the label and the subject id.
dataset_w_var = dataset.drop(columns=['class', 'id_subject'])

In [7]:
# Dataset without variance features
# Keep every column whose name does not contain the substring 'var'.
# The columns are selected in a single pass with a boolean mask rather than
# dropped one at a time, so the frame is copied once instead of once per
# variance column, and `dataset_no_var` is always a separate object from
# `dataset_w_var` (even when no column matches).
keep_column = ['var' not in column_name for column_name in dataset_w_var.columns]
dataset_no_var = dataset_w_var.loc[:, keep_column]

# Model training - Neural Networks, nested CV

In [8]:
# Import CV methods
from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut, GridSearchCV, cross_val_score

In [9]:
# Import pipelines and preprocessing tools
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [10]:
# Import classifiers
from sklearn.neural_network import MLPClassifier

In [11]:
# Choose training dataset
# Exactly one of the two assignments should be active.
# NOTE: this rebinds `dataset` from the table read from the CSV to a
# feature-only frame (no "class" / "id_subject" columns).
dataset = dataset_no_var
# dataset = dataset_w_var

In [12]:
# Feature statistics
# Summary statistics for every column of the selected feature frame.
dataset.describe()

Unnamed: 0,mean_667,mean_750,mean_850,mean_1000,mean_1200,mean_harm2_667,mean_harm2_750,mean_harm2_850,mean_harm2_1000,mean_harm2_1200,...,mean_harm3_750,mean_harm3_850,mean_harm3_1000,mean_harm3_1200,mean_harm4_667,mean_harm4_750,mean_harm4_850,mean_harm4_1000,mean_harm4_1200,entropy
count,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,...,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0,1081.0
mean,0.109392,0.109049,0.100243,0.099852,0.112305,0.081792,0.062466,0.044798,0.043348,0.034596,...,0.04324,0.030063,0.016913,0.014274,0.023318,0.016913,0.013996,0.012847,0.018847,-894.342406
std,0.099751,0.058941,0.055266,0.066142,0.077184,0.056992,0.048784,0.033235,0.034684,0.027205,...,0.037389,0.021527,0.010965,0.010658,0.01388,0.010965,0.009628,0.00972,0.036298,153.938829
min,0.007185,0.007831,0.010049,0.009737,0.007627,0.005848,0.003691,0.004406,0.00204,0.002112,...,0.002528,0.002155,0.000962,0.000787,0.001344,0.000962,0.000982,0.000622,0.00023,-1965.53408
25%,0.050952,0.062644,0.05935,0.053794,0.060203,0.045555,0.033089,0.025116,0.023158,0.018721,...,0.021106,0.016562,0.009114,0.006712,0.013675,0.009114,0.006882,0.005858,0.004006,-958.790537
50%,0.084552,0.101156,0.090355,0.085397,0.093206,0.066173,0.049497,0.036114,0.034398,0.027434,...,0.03196,0.024609,0.014367,0.011296,0.019868,0.014367,0.011609,0.010249,0.007761,-877.773039
75%,0.133012,0.141517,0.130077,0.125029,0.136603,0.098709,0.073139,0.052461,0.04957,0.041196,...,0.049119,0.037509,0.021857,0.019186,0.030164,0.021857,0.018986,0.0172,0.015633,-803.613066
max,0.846664,0.467301,0.366278,0.527608,0.598257,0.465069,0.438886,0.349356,0.276234,0.261667,...,0.319056,0.232362,0.075818,0.083953,0.095434,0.075818,0.064765,0.054048,0.362373,-614.814856


In [13]:
# Model hyperparameters
# Activation function is not searched (ReLU by default).
# Regularization strengths: five log-spaced values between 1e-1 and 1e1.
alphas = np.logspace(start=-1, stop=1, num=5)

# Search grid. The 'nn__' prefix addresses the pipeline step named 'nn'.
# Further hidden-layer layouts that can be added to the first entry:
#   (10, 10), (20, 20), (40, 40), (80, 80),
#   (5, 5, 5), (10, 10, 10), (20, 20, 20), (40, 40, 40), (80, 80, 80)
p_grid_nn = dict(
    nn__hidden_layer_sizes=[(5, 5)],
    nn__alpha=list(alphas),
    nn__max_iter=[1000],
)

In [14]:
# Choose feature scaling method
# Exactly one assignment should be active. The chosen scaler becomes the
# 'scaler' step of the pipeline built in the training cell.
# feat_scaler = MinMaxScaler()
feat_scaler = StandardScaler()
# feat_scaler = RobustScaler()

In [None]:
# Results of interest
# Columns of GridSearchCV.cv_results_ that are kept for every outer fold.
inner_res_nn = ['param_nn__hidden_layer_sizes', 'param_nn__alpha', 'mean_test_score', 'std_test_score', 'rank_test_score']

# Fixed seed for the MLP (weight initialisation / shuffling) so that repeated
# runs of this cell give the same scores.
NN_RANDOM_STATE = 0

# Validation results list: one DataFrame of inner-CV scores per outer fold
res_val_nn = []
# Test set results list
res_test_nn = []
# Mean test list: accuracy on the held-out subject of every outer fold
mean_test_nn = []
# Standard deviation test accuracy: std of the per-sample 0/1 correctness
std_test_nn = []

# Outer CV scheme: every subject is held out once as the test set
outer_cv = LeaveOneGroupOut()
# Inner cross-validation layer: every combination of 3 of the remaining
# subjects is used once as the validation set (original note: 7 training
# subjects, 3 validation subjects - this assumes 11 subjects in total; confirm).
inner_cv = LeavePGroupsOut(n_groups=3)

for train_out_i, test_out_i in outer_cv.split(X=dataset, y=classes, groups=subj_groups):
    # split() yields positional row indices, so rows are selected by position
    # (.iloc). The features come from `dataset`, i.e. the frame chosen in the
    # "Choose training dataset" cell, so that choice is actually honoured.
    train_in_dat = dataset.iloc[train_out_i, :]
    test_in_dat = dataset.iloc[test_out_i, :]
    train_in_class = classes.iloc[train_out_i]
    test_in_class = classes.iloc[test_out_i]
    train_in_groups = subj_groups.iloc[train_out_i]
    test_in_groups = subj_groups.iloc[test_out_i]

    # Feature scaling is a pipeline step, so inside the grid search the scaler
    # is fitted on the training part of each inner split only.
    pipe_nn = Pipeline([('scaler', feat_scaler),
                        ('nn', MLPClassifier(random_state=NN_RANDOM_STATE))])

    # Training: hyperparameter search over the inner CV, grouped by subject
    clf = GridSearchCV(estimator=pipe_nn, param_grid=p_grid_nn, cv=inner_cv)
    clf.fit(X=train_in_dat, y=train_in_class, groups=train_in_groups)

    # Add validation results for the given fold
    res_val_nn.append(pd.DataFrame(clf.cv_results_)[inner_res_nn])

    # Test set classification with the model selected by the grid search
    pred_nn = clf.predict(test_in_dat)

    # Per-sample correctness on the held-out subject, computed once
    correct_nn = np.asarray(pred_nn) == np.asarray(test_in_class)

    # Mean test accuracy
    mean_test_nn.append(np.mean(correct_nn))

    # Standard deviation test accuracy
    std_test_nn.append(np.std(correct_nn))












In [None]:
# Best results
# One row per outer fold: accuracy on the held-out subject and the standard
# deviation of the per-sample correctness.
# Exactly one array is supplied per column name; passing more arrays than
# column names makes the DataFrame constructor raise a shape-mismatch ValueError.
final_res = pd.DataFrame({"mean_test_nn": mean_test_nn,
                          "std_test_nn": std_test_nn})

In [None]:
# Best parameters
# For every outer fold keep the grid-search row with the highest mean
# validation score. Rows are collected in a list and the frame is built once:
# DataFrame.append no longer exists in pandas >= 2.0, and growing a frame row
# by row copies it on every iteration.
best_param_cols = ["param_nn__hidden_layer_sizes", "param_nn__alpha", "mean_test_score", "std_test_score"]
best_rows_nn = []
for fold_res in res_val_nn:
    # argmax() returns the position of the best row, hence .iloc
    best_pos = fold_res["mean_test_score"].argmax()
    best_rows_nn.append(fold_res.iloc[best_pos].to_dict())
# Selecting `best_param_cols` leaves out "rank_test_score" and also yields a
# correctly shaped empty frame when there are no folds.
best_param_nn = pd.DataFrame(best_rows_nn, columns=best_param_cols)

In [None]:
# Display results
# The last expression of the cell is what gets rendered; uncomment
# `best_param_nn` (and comment out `final_res`) to inspect the selected
# hyperparameters instead.
# best_param_nn
final_res

In [None]:
# Output CSV paths for the selected hyperparameters and the per-fold test scores.
# NOTE(review): the file names spell out "novar", "stdscale" and "harmsep_entr";
# presumably they have to be edited by hand when the dataset, scaler or input
# file chosen above changes - confirm.
bp_nn_filename = "../results/bp_nn_novar_stdscale_harmsep_entr.csv"
final_res_filename = "../results/final_res_nn_novar_stdscale_harmsep_entr.csv"

# Write both result tables, best parameters first.
for out_frame, out_path in ((best_param_nn, bp_nn_filename),
                            (final_res, final_res_filename)):
    out_frame.to_csv(out_path)