In [None]:
# Excellent
https://www.analyticsvidhya.com/blog/2016/12/introduction-to-feature-selection-methods-with-an-example-or-how-to-select-the-right-variables/
https://hub.packtpub.com/4-ways-implement-feature-selection-python-machine-learning/
http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
https://stats.stackexchange.com/questions/204141/difference-between-selecting-features-based-on-f-regression-and-based-on-r2

https://machinelearningmastery.com/feature-selection-machine-learning-python/


In [None]:
'''
Links:

https://github.com/Yorko/mlcourse_open/blob/master/jupyter_english/topic06_features/topic6_feature_engineering_feature_selection.ipynb
'''


# Feature selection
    
    - Reducing overfitting (Variance)
    - Interpretability
    - Inference speed

# Types of Feature Selection
http://scikit-learn.org/stable/modules/feature_selection.html
    - Filter
        - Statistical
        - Model based
    - Wrapper
        - Forward Selection
        - Backward Elimination
    - Embedded
        - Lasso
        - Random Forest
        - Gradient Boosted Trees

## Filter Feature Selection
    - Filters out input features that carry little or no information about the target (output) variable
    - Statistical approaches
    - ML algorithm approaches

### Statistical Feature Selection

In [None]:
from sklearn.datasets import make_classification

# Synthetic classification dataset built with the library defaults.
# The generator is random, so it is seeded here: without a fixed
# random_state every kernel restart produces different data and none of
# the numbers shown further down can be reproduced.
x_data_generated, y_data_generated = make_classification(random_state=42)
x_data_generated.shape

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Raise the minimum-variance cut-off step by step and report how many
# columns survive each time (features whose variance is below the
# cut-off are dropped).
for min_variance in (.2, .7, .9):
    kept_columns = VarianceThreshold(min_variance).fit_transform(x_data_generated)
    print(kept_columns.shape)

### Select from Model

In [96]:
# Model-based selection: fit a model, then keep only the features whose
# importance (for a linear model, the absolute coefficient) reaches a threshold.
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

selection=SelectFromModel(LogisticRegression(), threshold=1.25) 
# A numeric threshold is an absolute cut-off on the importance values: features
# with importance >= 1.25 are kept. It is NOT a distance from the mean; to scale
# relative to the mean or median pass a string such as "1.25*mean" or "median".

fitted_selection=selection.fit(x_data_generated, y_data_generated)

fitted_selection.transform(x_data_generated).shape

(100, 2)

In [63]:
from sklearn.ensemble import RandomForestClassifier

# Pipelines
# http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

from sklearn.pipeline import make_pipeline

# Fresh synthetic dataset for the model comparison. Seeded so that the
# cross-validation scores computed from it are repeatable across runs.
x_data_generated, y_data_generated = make_classification(random_state=42)

# Baseline logistic-regression model
lr = LogisticRegression()

# Baseline random-forest model (seeded: tree construction is randomised)
rf = RandomForestClassifier(random_state=42)

# Pipeline: a random forest's feature importances pick the features, then a
# logistic regression is trained on the reduced feature set. The pipeline gets
# its own estimator instances rather than reusing `lr` / `rf`.
# make_pipeline names the steps automatically; the explicit
# Pipeline([...]) constructor is the other way to build one.
pipe = make_pipeline(
    SelectFromModel(estimator=RandomForestClassifier(random_state=42)),
    LogisticRegression(),
)


-0.551842364751
-0.699983459749
-0.311079205877


In [116]:
'''
Evaluation of RF, LR and Pipe (combination of RF and LR)
'''

from sklearn.model_selection import cross_val_score

# Mean cross-validated negative log-loss for each estimator, printed in the
# order LR, RF, pipeline. Values closer to zero are better.
for estimator in (lr, rf, pipe):
    fold_scores = cross_val_score(estimator, x_data_generated, y_data_generated,
                                  scoring='neg_log_loss')
    print(fold_scores.mean())

-0.551842364751
-0.724338133196
-0.874929124628


In [128]:
# Gradient boosting and Logistic regression may also be used for feature selection

# NOTE(review): GradientBoostingClassifier and LinearSVC are imported but not
# used in this cell.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# Roles swapped relative to the make_pipeline example: logistic regression
# selects the features, a random forest does the classification. Building the
# Pipeline explicitly lets each step carry a readable name.
lr_selector = SelectFromModel(estimator=LogisticRegression())
pipe1 = Pipeline(steps=[
    ('feature_selection', lr_selector),
    ('classification', RandomForestClassifier()),
])

# No explicit pipe1.fit(...) is needed: cross_val_score fits a clone of the
# pipeline on every training fold.
pipe1_scores = cross_val_score(pipe1, x_data_generated, y_data_generated,
                               scoring='neg_log_loss')
print(pipe1_scores.mean())

-0.618607502552


### Select K Best

In [None]:
# Select K best gives control on number of features

# Selects K best features based on provided function (statistical or model)

# Classification - f_classif (default), mutual_info_classif, chi2
# Regression - f_regression, mutual_info_regression
# (SelectPercentile is a sibling selector that keeps a top percentage of features; it is not a score function)


In [None]:
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
from sklearn.feature_selection import SelectKBest

In [71]:
# Keep the two best-scoring features (default score function) and show the
# shape of the reduced matrix.
SelectKBest(k=2).fit_transform(x_data_generated, y_data_generated).shape

(100, 2)

In [130]:
# Boolean mask over the input columns: True marks the k=2 features that were kept.
SelectKBest(k=2).fit(x_data_generated, y_data_generated).get_support() # returns the mask rather than the transformed data

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
        True, False], dtype=bool)

### Select K Best with an arbitrary function

In [64]:
import numpy as np
def coefs(X,y):
    """Score features by the magnitude of their logistic-regression weights.

    Intended as a ``score_func`` for ``SelectKBest``: fits a
    ``LogisticRegression`` on ``(X, y)`` and returns one non-negative score
    per feature (larger means more influential).

    Parameters
    ----------
    X : feature matrix passed straight to ``LogisticRegression.fit``.
    y : target labels passed straight to ``LogisticRegression.fit``.

    Returns
    -------
    1-D array with one score per column of ``X``.
    """
    model = LogisticRegression().fit(X, y)
    # coef_ holds one row of weights per class (a single row for a binary
    # problem). Summing |weight| over the rows gives the same result as taking
    # row 0 in the binary case, and in the multiclass case it no longer
    # ignores every class except the first.
    return np.abs(model.coef_).sum(axis=0)

In [68]:
# Rank features with the custom `coefs` scorer and keep the 4 highest-scoring ones.
test = SelectKBest(score_func=coefs, k=4)
# fit() returns the selector itself, so `fit` and `test` name the same fitted object.
fit=test.fit(x_data_generated, y_data_generated)
# Shape of the data after keeping only the selected columns.
fit.transform(x_data_generated).shape

(100, 4)

## Embedded Feature Selection

### Lasso (L1) Regularization

In [None]:
# Many algorithms enable feature selection during their optimization process: LR, SVM, RF
# Usually these algorithms are not used in combination with other Feature Selection techniques

In [131]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV

In [132]:
# regularized_cost = cost + regularization_penalty (an L1 penalty scaled by alpha)
# larger alpha => stronger penalty => more coefficients pushed to exactly zero => fewer features kept
# NOTE(review): Lasso is a regressor; here it is fitted on the class labels as if they were numeric targets.
lasso=Lasso(alpha=0.1).fit(x_data_generated, y_data_generated)

In [133]:
# Let cross-validation pick the best alpha from the three candidates instead of fixing it by hand.
lasso_cv=LassoCV(alphas=[0.1,0.2,0.3]).fit(x_data_generated, y_data_generated)

In [136]:
# Fitted coefficients, one per feature; exact zeros are features the L1 penalty removed.
lasso_cv.coef_

array([-0.        ,  0.        ,  0.        ,  0.        , -0.        ,
        0.        , -0.        ,  0.        ,  0.        , -0.        ,
       -0.        ,  0.        ,  0.        , -0.        , -0.04395427,
       -0.        ,  0.        ,  0.        ,  0.17662538,  0.        ])

In [143]:
# The alpha that cross-validation selected from the candidate list.
lasso_cv.alpha_

0.10000000000000001

In [146]:
# Values of the surviving (non-zero) coefficients.
lasso_cv.coef_[np.abs(lasso_cv.coef_)>0]

array([-0.04395427,  0.17662538])

In [147]:
# Column indices of the non-zero coefficients, i.e. the selected features.
np.nonzero(lasso_cv.coef_)

(array([14, 18]),)

In [149]:
# Equivalent way to get the same indices, via a boolean mask on the magnitudes.
np.where(np.abs(lasso_cv.coef_)>0)

(array([14, 18]),)

##  Wrapper Feature Selection (Grid Search)

In [None]:
# Computationally the most intensive approach - a model is built and evaluated for each candidate subset of features
# Usually more accurate than filter methods, but not necessarily more accurate than embedded methods
# Heuristics that avoid trying every subset - Backward Elimination (Recursive Feature Elimination), Forward Selection

In [150]:
from sklearn.feature_selection import RFE
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE

In [160]:
# Recursive feature elimination: refit the estimator repeatedly, dropping the
# weakest feature each round (step=1) until 5 features remain.
selector=RFE(estimator=LogisticRegression(), n_features_to_select=5, step=1)

In [161]:
# Run the elimination on the generated dataset.
selector.fit(x_data_generated, y_data_generated)

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
  n_features_to_select=5, step=1, verbose=0)

In [162]:
# Boolean mask of the features that were kept.
selector.support_

array([False, False, False, False, False, False,  True, False, False,
        True, False, False, False,  True,  True, False, False, False,
        True, False], dtype=bool)

In [163]:
# Rank per feature: 1 = selected; larger numbers were eliminated earlier.
selector.ranking_

array([ 8, 12,  7,  4, 13,  9,  1,  3, 10,  1, 11, 16,  5,  1,  1, 14,  2,
        6,  1, 15])

In [165]:
# Shape of the data after keeping only the selected columns.
selector.transform(x_data_generated).shape

(100, 5)

In [172]:
# RFE with Cross Validation
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# Same elimination loop as RFE, but the number of features to keep is chosen
# by cross-validated accuracy over 5 stratified folds rather than fixed up front.
selectorCV = RFECV(
    estimator=LogisticRegression(),
    step=1,
    cv=StratifiedKFold(5),
    scoring='accuracy',
)


selectorCV.fit(x_data_generated, y_data_generated)

RFECV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
   estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
   n_jobs=1, scoring='accuracy', step=1, verbose=0)

In [173]:
# Rank per feature: 1 = part of the subset chosen by cross-validation.
selectorCV.ranking_

array([11, 15, 10,  7, 16, 12,  3,  6, 13,  4, 14, 19,  8,  2,  1, 17,  5,
        9,  1, 18])

In [7]:
## ML Extend Library

# Requires the third-party `mlxtend` package (note the spelling - not "mlextend"):
# %pip install mlxtend
from mlxtend.feature_selection import SequentialFeatureSelector

# Forward sequential selection: start from an empty feature set and greedily
# add features, scored by negative log-loss, until 3 have been chosen.
# NOTE: this rebinds `selector`, which previously held the fitted RFE object.
selector = SequentialFeatureSelector(LogisticRegression(), scoring='neg_log_loss',
                                     verbose=2, k_features=3, forward=True, n_jobs=-1)

# Fit on the dataset generated earlier in the notebook. The previous version
# passed `x_data` / `y_data`, names that no visible cell defines.
selector.fit(x_data_generated, y_data_generated)

/bin/sh: pip: command not found


In [5]:
from sklearn.linear_model import LogisticRegression

In [6]:
# Unfitted logistic-regression estimator with default settings.
# NOTE(review): `LR` is not referenced by any other cell visible here.
LR=LogisticRegression()

In [10]:
# Signed coefficients of a fitted model, one per feature.
# NOTE(review): `model` is not defined in any cell visible here - presumably
# left over from an earlier kernel session; confirm before relying on a re-run.
model.coef_

array([-0.69117985,  1.06785652,  2.05245624,  0.11008769, -0.54483436,
       -1.04298774, -0.00544058, -0.12356116,  0.36532097, -0.37298204,
       -0.13874104, -0.18786809, -0.03069759, -0.08491215,  0.1852439 ,
       -0.17491275,  0.12791625,  0.04778611, -0.03101822, -0.2430916 ])

In [56]:
# Per-feature scores from the custom scorer (absolute LR coefficients).
# `coefs` already returns a 1-D array, so no extra [0] indexing is applied:
# indexing it would yield only the first feature's score, not the full array.
coefs(x_data_generated, y_data_generated)

array([ 0.69117985,  1.06785652,  2.05245624,  0.11008769,  0.54483436,
        1.04298774,  0.00544058,  0.12356116,  0.36532097,  0.37298204,
        0.13874104,  0.18786809,  0.03069759,  0.08491215,  0.1852439 ,
        0.17491275,  0.12791625,  0.04778611,  0.03101822,  0.2430916 ])