# Random Forest Classification (with feature importance)

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/mudo121/thesis_eeg/blob/master/machine_learning_notebooks/random_forest.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table>

In [7]:
# Imports
import os, sys
import numpy as np
import matplotlib.pyplot as plt

# to enable local imports
module_path = os.path.abspath('../code')
print(module_path)
if module_path not in sys.path:
    sys.path.append(module_path)

from machine_learning_load_data import loadOnlineEEGdata

/home/nvidia/masterthesis/thesis_eeg/code


In [8]:
# Load some online EEG Data
eegData, freqData, entropyData = loadOnlineEEGdata(dirPath='../../EEG_Data/eeg_data_online', splitData=True)
eegX_train, eegy_train, eegX_test, eegy_test = eegData
freqX_train, freqy_train, freqX_test, freqy_test = freqData
X_train_entropy, y_train_entropy, X_test_entropy, y_test_entropy = entropyData

# reshape
freqX_train = freqX_train.reshape(freqX_train.shape[0], freqX_train.shape[2])
freqX_test = freqX_test.reshape(freqX_test.shape[0], freqX_test.shape[2])

X_train_entropy = X_train_entropy.reshape(X_train_entropy.shape[0], X_train_entropy.shape[2])
X_test_entropy = X_test_entropy.reshape(X_test_entropy.shape[0], X_test_entropy.shape[2])

Loading Online EEG Data from ../../EEG_Data/eeg_data_online ...
EEG Data Shape:
(5024, 512, 40) (5024,) (2154, 512, 40) (2154,)
Freq Data Shape:
(1008, 1, 1200) (1008,) (432, 1, 1200) (432,)
Entropy Data Shape:
(5024, 1, 200) (5024,) (2154, 1, 200) (2154,)


In [9]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_predict, cross_val_score

from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
f1_scorer = make_scorer(f1_score) 

# Create a model
def evaluateModel(model,X_train, y_train, kfoldTimes=8, n_jobs=None):
    
    print("Model: {}".format(model))

    # generate cross val score
    kfoldTimes = kfoldTimes
    print("Calculating cross val scores...")
    accuaries = cross_val_score(model, X_train, y_train, cv=kfoldTimes, scoring=f1_scorer, n_jobs=n_jobs)
    print("Cross val scores (Accuracies):")
    for i in range(0, len(accuaries)):
        print(" Fold {fold}: {acc}".format(fold=i+1, acc=accuaries[i]))

    # make predictions with the model
    print("\nCaclulating cross val predictions...")
    y_train_pred = cross_val_predict(model, X_train, y_train, cv=kfoldTimes, n_jobs=n_jobs)


    svm_confusionMatrix = confusion_matrix(y_train, y_train_pred)
    print("""\nConfusion Matrix\n------------------------
    True Negative:   {tn} - False Positive: {fp}
    False Negatives: {fn} - True positive:  {tp}""".format(tn=svm_confusionMatrix[0][0],
                                                           fp=svm_confusionMatrix[0][1],
                                                           fn=svm_confusionMatrix[1][0],
                                                           tp=svm_confusionMatrix[1][1]))

    print("----------------------")
    print("Precision: {} ".format(precision_score(y_train, y_train_pred)))
    print("Recall:    {}".format(recall_score(y_train, y_train_pred)))
    print("F1 Score:  {}".format(f1_score(y_train, y_train_pred)))

In [5]:
from sklearn.ensemble import RandomForestClassifier

evaluateModel(model=RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1),
              X_train=X_train_entropy,
              y_train=y_train_entropy)

Model: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=16, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
Calculating cross val scores...
Cross val scores (Accuracies):
 Fold 1: 0.10207939508506617
 Fold 2: 0.5725915875169607
 Fold 3: 0.4761904761904763
 Fold 4: 0.71731843575419
 Fold 5: 0.6969026548672566
 Fold 6: 0.8240109140518417
 Fold 7: 0.4659400544959128
 Fold 8: 0.5524625267665952

Caclulating cross val predictions...

Confusion Matrix
------------------------
    True Negative:   1189 - False Positive: 1204
    False Negatives: 1028 - True p

In [8]:
y_train_entropy

array([1, 1, 1, ..., 1, 1, 1])

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Create a model to test
model = RandomForestClassifier()

# Create a parameter grid - here you specifiy which combinations you want to test
param_grid = [
    {'n_estimators': [500, 1000, 2000],
     'min_samples_split' : [2, 4, 8],
     'criterion' : ['gini', 'entropy'],
     'max_features' : ['auto', 'log2'],
    }
]

kFoldTimes = 8

# create a grid search
grid_search = GridSearchCV(model, param_grid, cv=kFoldTimes,
                            scoring=f1_scorer,
                            return_train_score=True,
                            n_jobs=-1)

# fit it with the data
result = grid_search.fit(X_train_entropy, y_train_entropy)

In [11]:
print("Best Params: {}".format(grid_search.best_params_))
print("Best Estimator: {}".format(grid_search.best_estimator_))

cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

Best Params: {'max_features': 'log2', 'min_samples_split': 4, 'n_estimators': 2000, 'criterion': 'entropy'}
Best Estimator: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=2000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.6154663116812853 {'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 500, 'criterion': 'gini'}
0.6198912904171416 {'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 1000, 'criterion': 'gini'}
0.6190303375797785 {'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 2000, 'criterion': '

In [12]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=2000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
evaluateModel(grid_search.best_estimator_, X_train_entropy, y_train_entropy)