# Notebook to test feature selection methods
- PCA
- RFE
- Feature importance
- Univariate Selection

In [2]:
# Imports
import os, sys
import numpy as np
import matplotlib.pyplot as plt

from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

# Make the project's local `code` directory importable from this notebook.
# The directory is resolved against the current working directory and is
# appended to sys.path only once, so re-running the cell adds no duplicates.
module_path = os.path.abspath('code')
print(module_path)
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

from machine_learning_data_generation import loadOnlineEEGdata

D:\Masterthesis\thesis_eeg\code


Using TensorFlow backend.


In [3]:
# Load some online EEG Data
def _drop_middle_axis(samples):
    """Reshape a 3-D array to 2-D, keeping the lengths of its first and last axis."""
    n_rows, n_cols = samples.shape[0], samples.shape[2]
    return samples.reshape(n_rows, n_cols)


# Each returned bundle is unpacked as (X_train, y_train, X_test, y_test).
eegData, freqData, entropyData = loadOnlineEEGdata(splitData=True)
(eegX_train, eegy_train, eegX_test, eegy_test) = eegData
(freqX_train, freqy_train, freqX_test, freqy_test) = freqData
(X_train_entropy, y_train_enoptry, X_test_entropy, y_test_entropy) = entropyData

# reshape
# The frequency and entropy feature arrays are reported with a middle axis of
# length 1 (see the shapes printed by the loader); collapse them to 2-D
# (samples, features) matrices.
freqX_train, freqX_test = (
    _drop_middle_axis(block) for block in (freqX_train, freqX_test)
)

X_train_entropy, X_test_entropy = (
    _drop_middle_axis(block) for block in (X_train_entropy, X_test_entropy)
)

Loading Online EEG Data from D:/Masterthesis/EEG_Data/eeg_data_online ...
EEG Data Shape:
(5024, 512, 40) (5024,) (2154, 512, 40) (2154,)
Freq Data Shape:
(1008, 1, 1200) (1008,) (432, 1, 1200) (432,)
Entropy Data Shape:
(5024, 1, 200) (5024,) (2154, 1, 200) (2154,)


## Principal Component Analysis (PCA)

## Recursive Feature Elimination (RFE)

In [4]:
# Use the entropy training features and their labels as the working data set
# for the feature-selection experiments below.
X, y = X_train_entropy, y_train_enoptry

In [9]:
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
# Recursive feature elimination with a decision tree as the scoring estimator,
# keeping 25 features.
# The tree is seeded: an unseeded DecisionTreeClassifier breaks ties between
# candidate splits randomly, so the selected columns and their ranks would
# otherwise change from run to run.
rfe = RFE(estimator=DecisionTreeClassifier(random_state=1), n_features_to_select=25)
# fit RFE
rfe.fit(X, y)
# summarize all features
# support_ is the boolean selection mask and ranking_ the integer rank of each
# column (selected columns have rank 1), hence the integer format for the rank.
for i, (selected, rank) in enumerate(zip(rfe.support_, rfe.ranking_)):
    print('Column: %d, Selected %s, Rank: %d' % (i, selected, rank))

Column: 0, Selected True, Rank: 1.000
Column: 1, Selected False, Rank: 77.000
Column: 2, Selected False, Rank: 82.000
Column: 3, Selected False, Rank: 47.000
Column: 4, Selected False, Rank: 72.000
Column: 5, Selected False, Rank: 73.000
Column: 6, Selected False, Rank: 66.000
Column: 7, Selected False, Rank: 64.000
Column: 8, Selected False, Rank: 84.000
Column: 9, Selected False, Rank: 98.000
Column: 10, Selected False, Rank: 7.000
Column: 11, Selected False, Rank: 88.000
Column: 12, Selected False, Rank: 49.000
Column: 13, Selected False, Rank: 75.000
Column: 14, Selected False, Rank: 55.000
Column: 15, Selected True, Rank: 1.000
Column: 16, Selected False, Rank: 74.000
Column: 17, Selected False, Rank: 96.000
Column: 18, Selected False, Rank: 35.000
Column: 19, Selected False, Rank: 91.000
Column: 20, Selected False, Rank: 5.000
Column: 21, Selected False, Rank: 59.000
Column: 22, Selected False, Rank: 60.000
Column: 23, Selected False, Rank: 18.000
Column: 24, Selected False, Rank

In [8]:
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
# Scorer wrapping f1_score with its default settings.
f1_scorer = make_scorer(f1_score)

# create pipeline
# Step 's' selects 50 features via RFE, step 'm' classifies on the selected
# features. Both decision trees are seeded so that, together with the seeded
# CV splitter below, re-running the cell reproduces the same scores.
rfe = RFE(estimator=DecisionTreeClassifier(random_state=1), n_features_to_select=50)
model = DecisionTreeClassifier(random_state=1)
pipeline = Pipeline(steps=[('s',rfe),('m',model)])
# evaluate model
# 10-fold stratified CV repeated 3 times -> 30 scores; feature selection is
# re-fitted inside every fold because it is part of the pipeline.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(pipeline, X, y, scoring=f1_scorer, cv=cv, n_jobs=-1, error_score='raise')
# report performance
# Only F1 is computed here, so the label no longer claims to report accuracy.
print('F1 Score: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy / F1 Score: 0.883 (0.013)


<sklearn.model_selection._split.RepeatedStratifiedKFold at 0x253ead6d608>