# Model Random Forests and Extra-Trees

In [2]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

In [3]:
# load basic libraries
import pandas as pd
import numpy as np
import mne
from pathlib import Path
import pickle
import time

from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score

%matplotlib widget
import matplotlib
import matplotlib.pyplot as plt

# set directories
# %cd D:/Programy/Anaconda3/Projects/EEG ML project # working directory
%cd D:
pkls = './Pickles/' # objects & variables

D:\Programy\Anaconda3\Projects\EEG ML project


In [4]:
# load split sets, eeg only
with open(pkls +'xy_train.pkl', 'rb') as handle:
    x_train = pickle.load(handle)
    y_train = pickle.load(handle)
y_train.shape
x_train.shape

(6040, 94)

### Extra-Trees: DOC-Forest model
Replicating settings from Endemann et al., 2018 

In [28]:
# original Endemann et al. model
n_estimators = 200 # default: 2000

doc = ExtraTreesClassifier(n_estimators=n_estimators, max_features=1, criterion='entropy',
        max_depth=4, random_state=42, class_weight='balanced')

doc.fit(x_train, y_train.values.ravel())

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
                     criterion='entropy', max_depth=4, max_features=1,
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=200,
                     n_jobs=None, oob_score=False, random_state=42, verbose=0,
                     warm_start=False)

In [29]:
# evaluate
cv = 5
scores = cross_val_score(doc, x_train, y_train.values.ravel(), cv=cv)
scoresf1 = cross_val_score(doc, x_train, y_train.values.ravel(), cv=cv, scoring='f1_weighted')

doc_res = pd.DataFrame(data = [round(scores.mean(),3), round(scores.std() * 2, 3), 
                               round(scoresf1.mean(),3), round(scoresf1.std() * 2,3)]).T
doc_res.columns = ['acc mean', 'acc 2sd', 'f1 mean','f1 2sd']
doc_res.index = ['doc']
doc_res

Unnamed: 0,acc mean,acc 2sd,f1 mean,f1 2sd
doc,0.495,0.103,0.473,0.099


In [30]:
with open('./Pickles/doc_forest.pkl', 'wb') as handle:
    pickle.dump(doc, handle)
    pickle.dump(doc_res, handle)

In [None]:
with open(pkls + 'doc_forest.pkl', 'rb') as handle:
    doc = pickle.load(handle)
    doc_res = pickle.load(handle)

## Grid search  

### Extra-Trees

In [14]:
# 1st run
cv = 10
n_estimators = 1000 # go for ~2000
param_grid = {'max_depth': [2, 4, 8, 16], 
              'max_features': ['log2', 'auto'],
              'max_leaf_nodes': [8, 16, 24],
              'min_samples_leaf': [1,2,3]}

scorers = {'accuracy': make_scorer(accuracy_score),
           'f1'      : make_scorer(f1_score, average = 'weighted')}
grid = GridSearchCV(ExtraTreesClassifier(n_estimators = n_estimators), param_grid, refit='accuracy', verbose=1, 
                    scoring = scorers, return_train_score=True, cv = cv, n_jobs = -1)

grid.fit(x_train, y_train.values.ravel())
best_extr = grid.best_estimator_

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 143 out of 150 | elapsed:    3.9s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    4.3s finished


In [19]:
# evaluate and save
best_extr_res = round(pd.DataFrame(index = ['extr'], data  = 
          {'acc mean':[grid.cv_results_['mean_test_accuracy'][grid.best_index_]],
           'acc 2sd': [2*grid.cv_results_['std_test_accuracy'][grid.best_index_]],
           'f1 mean': [grid.cv_results_['mean_test_f1'][grid.best_index_]],
           'f1 2sd' : [2*grid.cv_results_['std_test_f1'][grid.best_index_]]}),4)
best_extr_res

Unnamed: 0,acc mean,acc 2sd,f1 mean,f1 2sd
extr,0.4911,0.0581,0.4851,0.065


In [18]:
# train set score for comparison
best_extr.fit(x_train, y_train.values.ravel())
print("training acc mean: {:.4f}".format(
    best_extr.score(x_train, y_train)))
print("training f1  mean: {:.4f}".format(
    f1_score(y_train, best_extr.predict(x_train), average = 'weighted')))

training acc mean: 0.9714
training f1  mean: 0.9714


In [20]:
# save 
with open('./Pickles/best_extr.pkl', 'wb') as handle:
    pickle.dump(best_extr, handle)
    pickle.dump(best_extr_res, handle)

In [None]:
with open(pkls + 'best_extr.pkl', 'rb') as handle:
    best_extr = pickle.load(handle)
    best_extr_res = pickle.load(handle)

### Random-Forest 

In [21]:
# RandomForest 1st run
cv = 10
n_estimators = 1000 # go for ~2000
param_grid = {'max_depth': [2, 4, 8, 16], 
              'max_features': ['log2', 'auto'],
              'max_leaf_nodes': [8, 16, 24],
              'min_samples_leaf': [1,2,3]}

scorers = {'accuracy': make_scorer(accuracy_score),
           'f1'      : make_scorer(f1_score, average = 'weighted')}
grid = GridSearchCV(RandomForestClassifier(n_estimators = n_estimators), param_grid, refit='accuracy', verbose=1, 
                    scoring = scorers, return_train_score=True, cv = cv, n_jobs = -1)

grid.fit(x_train, y_train.values.ravel())
best_forest = grid.best_estimator_

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 344 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:   30.7s finished


In [12]:
best_forest

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=16, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [8]:
# evaluate and save
best_forest_res = round(pd.DataFrame(index = ['forest'], data  = 
          {'acc mean':[grid.cv_results_['mean_test_accuracy'][grid.best_index_]],
           'acc 2sd': [2*grid.cv_results_['std_test_accuracy'][grid.best_index_]],
           'f1 mean': [grid.cv_results_['mean_test_f1'][grid.best_index_]],
           'f1 2sd' : [2*grid.cv_results_['std_test_f1'][grid.best_index_]]}),4)
best_forest_res

Unnamed: 0,acc mean,acc 2sd,f1 mean,f1 2sd
forest,0.5361,0.0627,0.5276,0.0711


In [25]:
# train set score for comparison
best_forest.fit(x_train, y_train.values.ravel())
print("training acc mean: {:.4f}".format(
    best_forest.score(x_train, y_train)))
print("training f1  mean: {:.4f}".format(
    f1_score(y_train, best_forest.predict(x_train), average = 'weighted')))

training acc mean: 0.6250
training f1  mean: 0.6228


In [26]:
# save
with open('./Pickles/best_forest.pkl', 'wb') as handle:
    pickle.dump(best_forest, handle)
    pickle.dump(best_forest_res, handle)

In [11]:
with open(pkls + 'best_forest.pkl', 'rb') as handle:
    best_forest = pickle.load(handle)
    best_forest_res = pickle.load(handle)
    
# best_forest_res
best_forest

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=16, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [27]:
# features importances
for score, name in sorted(zip(doc.feature_importances_, x_train[1:]), reverse = True):
    print(name, '-', round(score, 5))

a_mm - 0.0556
a_norm_mm - 0.03164
d_ms - 0.02905
d_norm_ms - 0.0271
summ_msf_ms - 0.02469
FT7 - 0.02388
FC5 - 0.02206
summ_sef90_ms - 0.02188
b_ms - 0.02161
C5 - 0.02126
summ_sef95_ms - 0.02076
b_norm_ms - 0.02029
summ_sef95_mm - 0.01973
F6 - 0.01967
Fpz - 0.01912
K_mm - 0.01816
FT8 - 0.01803
F7 - 0.01758
a_norm_ms - 0.01723
b_mm - 0.01663
summ_msf_mm - 0.01648
CP3 - 0.01608
F8 - 0.01561
d_mm - 0.01484
t_ms - 0.01435
a_ms - 0.01398
summ_sef90_mm - 0.01353
CP4 - 0.01194
F4 - 0.01174
FC6 - 0.01129
summ_se_ms - 0.01107
d_norm_mm - 0.01063
b_norm_mm - 0.00983
t_mm - 0.00927
FC1 - 0.00907
P3 - 0.00888
C4 - 0.00887
FC3 - 0.00853
POz - 0.00844
summ_se_mm - 0.00836
T8 - 0.00812
Fz - 0.00809
T7 - 0.00789
F2 - 0.00773
Fp2 - 0.00771
K_ms - 0.00758
O1 - 0.00749
PO8 - 0.00746
FC2 - 0.00745
F5 - 0.00735
Fp1 - 0.0072
CP5 - 0.00712
F1 - 0.0071
C6 - 0.00707
C3 - 0.00685
wSMI_ms - 0.00659
TP8 - 0.00654
P5 - 0.00642
AF3 - 0.00638
C2 - 0.00636
P6 - 0.00635
AF4 - 0.00624
FC4 - 0.00596
AF7 - 0.00573
wSMI_mm