In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

In [4]:
from sklearn.datasets import load_breast_cancer

In [5]:
# Load scikit-learn's bundled breast-cancer dataset and list the fields it exposes.
data = load_breast_cancer()
data.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [6]:
# Wrap the raw feature matrix in a DataFrame so every column carries its name.
X = pd.DataFrame(data["data"], columns=data["feature_names"])
X.head(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [7]:
# Target labels taken from the same dataset object that X was built from.
y = data.target

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
(X_train.shape, X_test.shape)

((455, 30), (114, 30))

In [9]:
# Rank features with a random forest and keep those whose importance clears
# SelectFromModel's threshold (left at its default here).
sel = SelectFromModel(
    estimator=RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
).fit(X_train, y_train)
# Boolean mask over the input columns: True marks a kept feature.
sel.get_support()

array([ True, False,  True,  True, False, False,  True,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True, False,  True,  True, False, False, False,
        True, False, False])

In [10]:
# Column names in positional order, for reading the boolean support mask against.
X_train.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [11]:
# Use the selector's boolean mask to recover the names of the kept columns.
features = X_train.columns[sel.get_support()]
features

Index(['mean radius', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points', 'area error', 'worst radius', 'worst perimeter',
       'worst area', 'worst concave points'],
      dtype='object')

In [12]:
# Number of features that survived selection.
len(features)

10

In [13]:
# Average importance across all input features. No explicit threshold was passed
# to SelectFromModel; per the scikit-learn docs its default for this kind of
# estimator is the mean importance, i.e. the value shown here.
sel.estimator_.feature_importances_.mean()

0.03333333333333334

In [14]:
# Per-feature importances from the forest fitted inside the selector,
# in the same order as X_train.columns.
sel.estimator_.feature_importances_

array([0.03699612, 0.01561296, 0.06016409, 0.0371452 , 0.0063401 ,
       0.00965994, 0.0798662 , 0.08669071, 0.00474992, 0.00417092,
       0.02407355, 0.00548033, 0.01254423, 0.03880038, 0.00379521,
       0.00435162, 0.00452503, 0.00556905, 0.00610635, 0.00528878,
       0.09556258, 0.01859305, 0.17205401, 0.05065305, 0.00943096,
       0.01565491, 0.02443166, 0.14202709, 0.00964898, 0.01001304])

In [15]:
# Project both splits onto the columns kept by the fitted selector.
X_train_rfc, X_test_rfc = map(sel.transform, (X_train, X_test))

In [16]:
def run_randomForest(X_train, X_test, y_train, y_test, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and report test accuracy.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and evaluation splits; both must
        have the same columns.
    y_train, y_test : array-like
        Labels matching the rows of ``X_train`` and ``X_test``.
    n_estimators : int, default=100
        Number of trees in the forest.
    random_state : int, default=0
        Seed passed to the forest so repeated runs give the same score.

    Returns
    -------
    float
        Accuracy on ``(X_test, y_test)``. The value is also printed, so
        existing cells that rely on the printed line keep working.
    """
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state, n_jobs=-1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: ', accuracy)
    # Returning the score lets callers collect and compare runs programmatically.
    return accuracy

In [17]:
%%time
# Score a forest trained only on the SelectFromModel-selected columns.
run_randomForest(X_train_rfc, X_test_rfc, y_train, y_test)

Accuracy:  0.9473684210526315
CPU times: user 309 ms, sys: 28.8 ms, total: 337 ms
Wall time: 355 ms


In [18]:
%%time
# Baseline for comparison: the same forest trained on every original column.
run_randomForest(X_train, X_test, y_train, y_test)

Accuracy:  0.9649122807017544
CPU times: user 338 ms, sys: 33.4 ms, total: 371 ms
Wall time: 459 ms


# Recursive Feature Elimination (RFE)

In [19]:
from sklearn.feature_selection import RFE
# Recursive feature elimination driven by random-forest importances,
# stopping once 15 columns remain.
sel = RFE(
    estimator=RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
    n_features_to_select=15,
).fit(X_train, y_train)
sel

RFE(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                     class_weight=None, criterion='gini',
                                     max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=100, n_jobs=-1,
                                     oob_score=False, random_state=0, verbose=0,
                                     warm_start=False),
    n_features_to_select=15, step=1, verbose=0)

In [20]:
# Boolean mask of the columns RFE kept (True = selected).
sel.get_support()

array([ True,  True,  True,  True, False, False,  True,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True,  True,  True,  True,  True, False,  True,
        True,  True, False])

In [21]:
# Names of the columns kept by the random-forest RFE selector.
features = X_train.columns[sel.get_support()]
features

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean concavity', 'mean concave points', 'area error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst smoothness',
       'worst concavity', 'worst concave points', 'worst symmetry'],
      dtype='object')

In [22]:
# Sanity check: should equal the n_features_to_select passed to RFE.
len(features)

15

In [23]:
# Reduce both splits to the columns chosen by the fitted RFE selector.
X_train_rfe, X_test_rfe = map(sel.transform, (X_train, X_test))

In [24]:
%%time
# Score a forest trained only on the RFE-selected columns.
run_randomForest(X_train_rfe, X_test_rfe, y_train, y_test)

Accuracy:  0.9736842105263158
CPU times: user 290 ms, sys: 34.9 ms, total: 325 ms
Wall time: 361 ms


In [25]:
%%time
# Baseline for comparison: the same forest trained on every original column.
run_randomForest(X_train, X_test, y_train, y_test)

Accuracy:  0.9649122807017544
CPU times: user 329 ms, sys: 44.2 ms, total: 374 ms
Wall time: 462 ms


# Recursive Feature Elimination (RFE) with Gradient Boosting Tree Importance


In [26]:
from sklearn.ensemble import GradientBoostingClassifier

In [27]:
# Recursive feature elimination driven by gradient-boosting importances,
# stopping once 12 columns remain.
sel = RFE(
    estimator=GradientBoostingClassifier(n_estimators=100, random_state=0),
    n_features_to_select=12,
).fit(X_train, y_train)
sel

RFE(estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                         criterion='friedman_mse', init=None,
                                         learning_rate=0.1, loss='deviance',
                                         max_depth=3, max_features=None,
                                         max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         n_estimators=100,
                                         n_iter_no_change=None,
                                         presort='deprecated', random_state=0,
                                         subsample=1.0, tol=0.0001,
                                         validation_frac

In [28]:
# Boolean mask of the columns RFE kept (True = selected).
sel.get_support()

array([False,  True, False, False,  True, False, False,  True,  True,
       False, False, False, False,  True, False, False,  True, False,
       False, False,  True,  True,  True,  True, False, False,  True,
        True, False, False])

In [29]:
# Names of the columns kept by the gradient-boosting RFE selector.
features = X_train.columns[sel.get_support()]
features

Index(['mean texture', 'mean smoothness', 'mean concave points',
       'mean symmetry', 'area error', 'concavity error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst concavity',
       'worst concave points'],
      dtype='object')

In [30]:
# Sanity check: should equal the n_features_to_select passed to RFE.
len(features)

12

In [33]:
# Reduce both splits to the columns chosen by the fitted RFE selector.
X_train_rfe, X_test_rfe = map(sel.transform, (X_train, X_test))

In [34]:
%%time
# Score a forest trained only on the columns chosen by gradient-boosting RFE.
run_randomForest(X_train_rfe, X_test_rfe, y_train, y_test)

Accuracy:  0.9736842105263158
CPU times: user 295 ms, sys: 38 ms, total: 333 ms
Wall time: 367 ms


In [35]:
%%time
# Baseline for comparison: the same forest trained on every original column.
run_randomForest(X_train, X_test, y_train, y_test)

Accuracy:  0.9649122807017544
CPU times: user 344 ms, sys: 30.4 ms, total: 375 ms
Wall time: 456 ms


In [36]:
# Sweep the number of kept features and score a random forest on each subset.
#
# RFE is fitted ONCE, all the way down to a single feature. With step=1 and a
# seeded (deterministic) estimator the elimination order does not depend on the
# stopping point, so `ranking_ <= k` is exactly the subset that
# RFE(..., n_features_to_select=k) would return -- refitting per k is redundant.
sel = RFE(GradientBoostingClassifier(n_estimators=100, random_state=0), n_features_to_select = 1)
sel.fit(X_train, y_train)

# NOTE: the score below is measured on the test split, so choosing the feature
# count from it is optimistically biased; prefer cross-validation (e.g. RFECV)
# when this sweep is used for model selection.
for index in range(1, X_train.shape[1] + 1):
    keep = sel.ranking_ <= index  # boolean column mask for the top-`index` features
    X_train_rfe = X_train.values[:, keep]
    X_test_rfe = X_test.values[:, keep]
    print('Selected Feature: ', index)
    run_randomForest(X_train_rfe, X_test_rfe, y_train, y_test)
    print()

Selected Feature:  1
Accuracy:  0.8771929824561403

Selected Feature:  2
Accuracy:  0.9035087719298246

Selected Feature:  3
Accuracy:  0.9649122807017544

Selected Feature:  4
Accuracy:  0.9736842105263158

Selected Feature:  5
Accuracy:  0.9649122807017544

Selected Feature:  6
Accuracy:  0.9912280701754386

Selected Feature:  7
Accuracy:  0.9736842105263158

Selected Feature:  8
Accuracy:  0.9649122807017544

Selected Feature:  9
Accuracy:  0.9736842105263158

Selected Feature:  10
Accuracy:  0.956140350877193

Selected Feature:  11
Accuracy:  0.956140350877193

Selected Feature:  12
Accuracy:  0.9736842105263158

Selected Feature:  13
Accuracy:  0.956140350877193

Selected Feature:  14
Accuracy:  0.956140350877193

Selected Feature:  15
Accuracy:  0.9649122807017544

Selected Feature:  16
Accuracy:  0.956140350877193

Selected Feature:  17
Accuracy:  0.9649122807017544

Selected Feature:  18
Accuracy:  0.9473684210526315

Selected Feature:  19
Accuracy:  0.9649122807017544

Selecte

In [37]:
# Refit the gradient-boosting RFE at the feature count that scored best in the
# sweep, then evaluate a random forest on the reduced splits.
sel = RFE(
    estimator=GradientBoostingClassifier(n_estimators=100, random_state=0),
    n_features_to_select=6,
).fit(X_train, y_train)
X_train_rfe, X_test_rfe = map(sel.transform, (X_train, X_test))
print('Selected Feature: ', sel.n_features_to_select)
run_randomForest(X_train_rfe, X_test_rfe, y_train, y_test)
print()

Selected Feature:  6
Accuracy:  0.9912280701754386



In [38]:
# Names of the columns kept by the refitted selector.
features = X_train.columns[sel.get_support()]
features

Index(['mean concave points', 'area error', 'worst texture', 'worst perimeter',
       'worst area', 'worst concave points'],
      dtype='object')

In [None]:
# Sweep the number of kept features and score a random forest on each subset.
#
# RFE is fitted ONCE, all the way down to a single feature. With step=1 and a
# seeded (deterministic) estimator the elimination order does not depend on the
# stopping point, so `ranking_ <= k` is exactly the subset that
# RFE(..., n_features_to_select=k) would return -- refitting per k is redundant.
sel = RFE(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1), n_features_to_select = 1)
sel.fit(X_train, y_train)

# NOTE: the score below is measured on the test split, so choosing the feature
# count from it is optimistically biased; prefer cross-validation (e.g. RFECV)
# when this sweep is used for model selection.
for index in range(1, X_train.shape[1] + 1):
    keep = sel.ranking_ <= index  # boolean column mask for the top-`index` features
    X_train_rfe = X_train.values[:, keep]
    X_test_rfe = X_test.values[:, keep]
    print('Selected Feature: ', index)
    run_randomForest(X_train_rfe, X_test_rfe, y_train, y_test)
    print()

Selected Feature:  1
Accuracy:  0.8947368421052632

Selected Feature:  2
Accuracy:  0.9298245614035088

Selected Feature:  3
Accuracy:  0.9473684210526315

Selected Feature:  4
Accuracy:  0.9649122807017544

Selected Feature:  5
Accuracy:  0.9649122807017544

Selected Feature:  6
Accuracy:  0.956140350877193

Selected Feature:  7
Accuracy:  0.956140350877193

Selected Feature:  8
Accuracy:  0.9649122807017544

Selected Feature:  9
Accuracy:  0.9736842105263158

Selected Feature:  10
Accuracy:  0.9736842105263158

Selected Feature:  11
Accuracy:  0.9649122807017544

Selected Feature:  12
Accuracy:  0.9736842105263158

Selected Feature:  13
Accuracy:  0.9649122807017544

