### Recursive Feature Elimination
Using `EstimatorCV` objects (here, `RFECV`) for an efficient, cross-validated parameter search.

In [3]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# load dataset
from sklearn.datasets import load_iris
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20
from sklearn.model_selection import train_test_split

iris = load_iris()
# random_state is fixed so the train/test split is reproducible
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)

# recursive feature elimination
# (drops the feature with the smallest absolute coefficient)
# (then trains the model on the new feature set)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# reducing to 2 features
feature_elim_lr = RFE(LogisticRegression(C=100), n_features_to_select=2)
feature_elim_lr.fit(X_train, y_train)
# score on the held-out split; kept in a variable because a later cell
# compares it against the 4-feature score
score_two_f = feature_elim_lr.score(X_test, y_test)
print('='*8)
print('Logistic Regression model reduced to 2 features.')
print('Score: {}'.format(score_two_f))
print('='*8)

Logistic Regression model reduced to 2 features.
Score: 0.9473684210526315


In [4]:
# using grid search to find optimal number of features
# (GridSearchCV lives in sklearn.model_selection; the old
#  sklearn.grid_search module was removed in scikit-learn 0.20)

from sklearn.model_selection import GridSearchCV

# candidate values for how many features RFE should keep: 1, 2, 3, 4
param_grid = {'n_features_to_select': list(range(1, 5))}
grid_search = GridSearchCV(feature_elim_lr, param_grid, cv=5)
grid_search.fit(X_train, y_train)
# keep the held-out score in a variable: a bare expression in the middle of
# a cell is evaluated but never displayed, so the value was previously lost
grid_test_score = grid_search.score(X_test, y_test)

# optimal features
print('='*8)
print(grid_search.best_params_)
print('Out of [{}] possible features.'.format(X_train.shape[1]))
print('='*8)

{'n_features_to_select': 4}
Out of [4] possible features.


In [7]:
# RFECV - recursive feature elimination where the number of features
# to keep is selected automatically using cross validation
from sklearn.feature_selection import RFECV

rfecv = RFECV(estimator=LogisticRegression(C=100))
rfecv.fit(X_train, y_train)
# evaluated on the test split; not displayed because only the last
# expression of a cell is echoed
rfecv.score(X_test, y_test)

# number of features RFECV decided to keep
rfecv.n_features_

4

In [12]:
# LR score with 4 features
# NOTE: this rebinds feature_elim_lr (previously the 2-feature model);
# score_two_f must already exist from the earlier 2-feature cell.
feature_elim_lr = RFE(
    LogisticRegression(C=100),
    n_features_to_select=4,
).fit(X_train, y_train)
score_four_f = feature_elim_lr.score(X_test, y_test)

# one print call, one line of output per argument
print(
    '='*8,
    'Comparing LR feature dimensions',
    '4 Feature Score: {}'.format(score_four_f),
    '2 Feature Score: {}'.format(score_two_f),
    '='*8,
    sep='\n',
)

Comparing LR feature dimensions
4 Feature Score: 0.9736842105263158
2 Feature Score: 0.9473684210526315
