# Feature Selection

In [1]:
from sklearn.datasets import make_regression, make_classification
import numpy as np

# Seed NumPy's global random number generator so reruns start from the same state.
np.random.seed(37)

def get_regression_data():
    """Build the synthetic regression dataset used by the regression examples.

    Calls ``make_regression`` for 1000 samples with 50 features (10 of them
    informative), a single target, a bias of 5.3 and ``random_state=37``,
    and returns its result unchanged.
    """
    return make_regression(
        n_samples=1000,
        n_features=50,
        n_informative=10,
        n_targets=1,
        bias=5.3,
        random_state=37,
    )

def get_classification_data():
    """Build the synthetic binary classification dataset used by the classification examples.

    Calls ``make_classification`` for 2000 samples with 20 features
    (2 informative, 2 redundant, 0 repeated), 2 classes with 2 clusters per
    class and ``random_state=37``, and returns its result unchanged.
    """
    return make_classification(
        n_samples=2000,
        n_features=20,
        n_informative=2,
        n_redundant=2,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=2,
        random_state=37,
    )

# A / b: regression data, passed below as the X and y arguments of get_regression_performance.
A, b = get_regression_data()
# C / d: classification data, passed below as the X and y arguments of get_classification_performance.
C, d = get_classification_data()

## Univariate 

Univariate feature selection determines the importance of each feature individually. In scikit-learn, this approach is implemented by `GenericUnivariateSelect`. In a classification problem, use `chi2` or `mutual_info_classif` for the score function. Note that `chi2` requires your feature matrix to be non-negative. A variety of selection modes is available; here we use the `percentile` and `k_best` modes.

In [2]:
from sklearn.feature_selection import GenericUnivariateSelect
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

def get_best_indexes(scores, max_index, reverse=True):
    """Return the positions of the best-ranked scores.

    Parameters
    ----------
    scores : iterable
        Score values; a value's position in the iterable is its index.
    max_index : int
        Number of leading entries of the ranking to keep (applied as a slice).
    reverse : bool, default True
        When True the ranking is from highest to lowest score, otherwise
        from lowest to highest.

    Returns
    -------
    list of int
        Indexes of the kept scores, in ranking order. Equal scores keep
        their original relative order.
    """
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=reverse)
    return [position for position, _ in ranked[:max_index]]

def get_classification_performance(tr_index, te_index, X, y, selector):
    """Score a feature selector on one train/test split of a classification task.

    A pipeline of ``selector`` followed by a random forest classifier is
    fitted on the training rows and evaluated on the test rows.

    Parameters
    ----------
    tr_index, te_index
        Indexers used to pick the training and test rows out of ``X`` and ``y``.
    X, y
        Feature matrix and class labels.
    selector
        Feature-selection step placed in front of the classifier; it is
        fitted as part of the pipeline.

    Returns
    -------
    The ROC AUC computed from the test labels and the predicted
    probabilities in column 1 of ``predict_proba``.
    """
    pipeline = Pipeline([
        ('selector', selector),
        ('rf', RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1)),
    ])
    pipeline.fit(X[tr_index], y[tr_index])

    # Column 1 of predict_proba is used as the score fed to the ROC AUC.
    test_scores = pipeline.predict_proba(X[te_index])[:, 1]
    return roc_auc_score(y[te_index], test_scores)

# Percentile mode: keep the top 15 percent of features ranked by mutual information.
p_selector = GenericUnivariateSelect(
    score_func=mutual_info_classif,
    mode='percentile',
    param=15,
)

# k-best mode: keep the 2 highest-scoring features.
k_selector = GenericUnivariateSelect(
    score_func=mutual_info_classif,
    mode='k_best',
    param=2,
)

# Evaluate on the first fold only of a shuffled, stratified 10-fold split.
tr_index, te_index = next(
    StratifiedKFold(n_splits=10, shuffle=True, random_state=37).split(C, d)
)

print(get_classification_performance(tr_index, te_index, C, d, p_selector))
print(get_classification_performance(tr_index, te_index, C, d, k_selector))

0.9765500000000001
0.977


When your output variable is continuous, use `f_regression` or `mutual_info_regression` for the score function.

In [3]:
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

def get_regression_performance(tr_index, te_index, X, y, selector):
    """Score a feature selector on one train/test split of a regression task.

    A pipeline of ``selector`` followed by a random forest regressor is
    fitted on the training rows and evaluated on the test rows.

    Parameters
    ----------
    tr_index, te_index
        Indexers used to pick the training and test rows out of ``X`` and ``y``.
    X, y
        Feature matrix and continuous target.
    selector
        Feature-selection step placed in front of the regressor; it is
        fitted as part of the pipeline.

    Returns
    -------
    The mean absolute error between the test targets and the pipeline's
    predictions (lower is better).
    """
    pipeline = Pipeline([
        ('selector', selector),
        ('rf', RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1)),
    ])
    pipeline.fit(X[tr_index], y[tr_index])

    test_predictions = pipeline.predict(X[te_index])
    return mean_absolute_error(y[te_index], test_predictions)

# Naming: f* = f_regression scores, m* = mutual_info_regression scores;
# *p = percentile mode (top 15 percent), *k = k_best mode (top 2 features).
fp_selector = GenericUnivariateSelect(
    score_func=f_regression,
    mode='percentile',
    param=15,
)

mp_selector = GenericUnivariateSelect(
    score_func=mutual_info_regression,
    mode='percentile',
    param=15,
)

fk_selector = GenericUnivariateSelect(
    score_func=f_regression,
    mode='k_best',
    param=2,
)

mk_selector = GenericUnivariateSelect(
    score_func=mutual_info_regression,
    mode='k_best',
    param=2,
)

# Evaluate on the first fold only of a shuffled 10-fold split.
tr_index, te_index = next(
    KFold(n_splits=10, shuffle=True, random_state=37).split(A, b)
)

# Printed values are mean absolute errors, in the order: fp, fk, mp, mk.
print(get_regression_performance(tr_index, te_index, A, b, fp_selector))
print(get_regression_performance(tr_index, te_index, A, b, fk_selector))
print(get_regression_performance(tr_index, te_index, A, b, mp_selector))
print(get_regression_performance(tr_index, te_index, A, b, mk_selector))

64.62026528579264
134.81849592816624
80.6040460679759
138.10921666118406


## Models

Models may also be used to select features, through `SelectFromModel`. The coefficients or feature importances of a fitted model are used to decide which features are useful.

In [5]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# NOTE(review): threshold is left at its default, so max_features presumably acts
# only as an upper bound on the number of kept features; scikit-learn documents
# threshold=-np.inf for selecting purely by max_features - confirm the intent.

# Select features using the coefficients of a logistic regression.
lr_selector = SelectFromModel(
    estimator=LogisticRegression(n_jobs=-1),
    max_features=5,
)

# Select features using the importances of a random forest classifier.
rf_selector = SelectFromModel(
    estimator=RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1),
    max_features=5,
)

# Evaluate on the first fold only of a shuffled, stratified 10-fold split.
tr_index, te_index = next(
    StratifiedKFold(n_splits=10, shuffle=True, random_state=37).split(C, d)
)

print(get_classification_performance(tr_index, te_index, C, d, lr_selector))
print(get_classification_performance(tr_index, te_index, C, d, rf_selector))

0.9742500000000001
0.9742500000000001


In [6]:
from sklearn.linear_model import LinearRegression

# NOTE(review): threshold is left at its default, so max_features presumably acts
# only as an upper bound on the number of kept features; scikit-learn documents
# threshold=-np.inf for selecting purely by max_features - confirm the intent.

# Select features using the coefficients of a linear regression.
lr_selector = SelectFromModel(
    estimator=LinearRegression(n_jobs=-1),
    max_features=5,
)

# Select features using the importances of a random forest regressor.
rf_selector = SelectFromModel(
    estimator=RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1),
    max_features=5,
)

# Evaluate on the first fold only of a shuffled 10-fold split.
tr_index, te_index = next(
    KFold(n_splits=10, shuffle=True, random_state=37).split(A, b)
)

print(get_regression_performance(tr_index, te_index, A, b, lr_selector))
print(get_regression_performance(tr_index, te_index, A, b, rf_selector))

91.86886242490726
76.19174124554559


## Sequential

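Sequential feature selection greedily adds (forward selection) or removes (backward selection) one feature at a time, keeping the change that gives the best cross-validated score, until the requested number of features is reached. `SequentialFeatureSelector` requires scikit-learn 0.24 or later; on older versions the import below fails with an `ImportError`.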

In [7]:
from sklearn.feature_selection import SequentialFeatureSelector

# SequentialFeatureSelector was added in scikit-learn 0.24; on older versions the
# import in this cell raises ImportError and the environment must be upgraded.
# The wrapped estimator is a classifier, so candidate feature subsets are scored
# with ROC AUC (the classification metric used elsewhere in this notebook)
# instead of the regression metric 'neg_mean_absolute_error'.
SequentialFeatureSelector(
    LogisticRegression(n_jobs=-1),
    n_features_to_select=5,
    scoring='roc_auc',
    n_jobs=-1,
)

ImportError: cannot import name 'SequentialFeatureSelector' from 'sklearn.feature_selection' (C:\Users\jeev\Anaconda3\lib\site-packages\sklearn\feature_selection\__init__.py)