1. Variance Threshold

In [0]:
#Returns List of Selected Columns
def applyVarianceSelection(X,threshold):
  """Return the labels of the columns of X that pass a variance filter.

  Args:
    X: Feature table exposing ``.columns`` (the driver passes a pandas
      DataFrame).
    threshold: Value ``p`` used to build the actual variance cutoff
      ``p * (1 - p)`` (the variance of a Bernoulli variable with
      probability ``p``). It is NOT used as the raw variance cutoff.

  Returns:
    list: Column labels, in their original order, whose entry in the fitted
    selector's support mask is true.
  """
  from sklearn.feature_selection import VarianceThreshold
  variance_cutoff = threshold * (1 - threshold)
  # Variance filtering is unsupervised, so fit on X only. The earlier code
  # passed `y`, which is not a parameter of this function: it silently relied
  # on a module-level `y` and raised NameError whenever none existed.
  selector = VarianceThreshold(threshold=variance_cutoff).fit(X)
  # Pair labels with the mask directly (no intermediate dict) so duplicate
  # column labels are reported per column instead of being collapsed.
  return [column for column, keep in zip(X.columns, selector.get_support()) if keep]

2. Univariate Tests

In [0]:
#Returns List of K best Selected Columns
def applyKBestSelection(X,y,kbest):
  """Return the labels of the `kbest` columns chosen by SelectKBest.

  Columns are scored against `y` with ``mutual_info_regression``.

  Args:
    X: Feature table exposing ``.columns``.
    y: Target values passed to ``fit`` together with X.
    kbest: Forwarded as ``k`` to ``SelectKBest``.

  Returns:
    list: Column labels whose entry in the selector's support mask is true.
  """
  column_names = X.columns
  from sklearn.feature_selection import SelectKBest
  # f_regression is imported but not used below; it is the alternative
  # score function for regression targets.
  from sklearn.feature_selection import f_regression
  from sklearn.feature_selection import mutual_info_regression
  selector = SelectKBest(mutual_info_regression, k=kbest)
  selector.fit(X, y)
  keep_by_column = dict(zip(column_names, selector.get_support()))
  return [name for name, keep in keep_by_column.items() if keep]

**f_classif**: ANOVA F-value between label and feature, for classification tasks.

**mutual_info_classif**: Mutual information for a discrete target.

**chi2**: Chi-squared statistics of non-negative features, for classification tasks.

**f_regression**: F-value between label and feature, for regression tasks.

**mutual_info_regression**: Mutual information for a continuous target.

**SelectPercentile**: Select features based on a percentile of the highest scores.

**SelectFpr**: Select features based on a false positive rate test.

**SelectFdr**: Select features based on an estimated false discovery rate.

**SelectFwe**: Select features based on the family-wise error rate.

**GenericUnivariateSelect**: Univariate feature selector with a configurable mode.

3. Recursive Feature Elimination

In [0]:
#Returns List of Selected Columns
def applyRFE(X,y,n_features,steps):
  """Return the column labels kept by recursive feature elimination.

  The eliminator wraps a linear-kernel ``SVR`` with ``C=1``.

  Args:
    X: Feature table exposing ``.columns``.
    y: Target values passed to ``fit`` together with X.
    n_features: Forwarded as ``n_features_to_select`` to ``RFE``.
    steps: Forwarded as ``step`` to ``RFE``.

  Returns:
    list: Column labels whose entry in ``support_`` is true.
  """
  column_names = X.columns
  from sklearn.feature_selection import RFE
  from sklearn.svm import SVR
  eliminator = RFE(
    estimator=SVR(kernel="linear", C=1),
    n_features_to_select=n_features,
    step=steps,
  )
  eliminator.fit(X, y)
  keep_by_column = dict(zip(column_names, eliminator.support_))
  return [name for name, keep in keep_by_column.items() if keep]

4. Recursive Feature Elimination with Cross Validation

In [0]:
#Returns List of Selected Columns
def applyRFECV(X,y,steps):
  """Return the column labels kept by cross-validated recursive elimination.

  The eliminator wraps a linear-kernel ``SVR`` with ``C=1`` and uses ``cv=5``.

  Args:
    X: Feature table exposing ``.columns``.
    y: Target values passed to ``fit`` together with X.
    steps: Forwarded as ``step`` to ``RFECV``.

  Returns:
    list: Column labels whose entry in ``support_`` is true.
  """
  column_names = X.columns
  # StratifiedKFold is imported but not used below; cv=StratifiedKFold(5) is
  # the variant for classification only.
  from sklearn.model_selection import StratifiedKFold
  from sklearn.feature_selection import RFECV
  from sklearn.svm import SVR
  eliminator = RFECV(estimator=SVR(kernel="linear", C=1), step=steps, cv=5)
  eliminator.fit(X, y)
  keep_by_column = dict(zip(column_names, eliminator.support_))
  return [name for name, keep in keep_by_column.items() if keep]

5. Feature selection using SelectFromModel Meta-Transformer

In [0]:
#Model-Based
def applySelectFromModel(X,y):
  """Return the column labels selected by SelectFromModel over a LinearSVR.

  A ``LinearSVR(C=0.001)`` is fitted on (X, y) and handed to
  ``SelectFromModel`` as an already-fitted estimator (``prefit=True``).

  Args:
    X: Feature table exposing ``.columns``.
    y: Target values passed to ``fit`` together with X.

  Returns:
    list: Column labels whose entry in the selector's support mask is true.
  """
  column_names = X.columns
  from sklearn.svm import LinearSVR
  from sklearn.feature_selection import SelectFromModel
  # For classification the original note suggested
  # (C=0.01, penalty='l1', dual=False); those arguments presumably belong to
  # LinearSVC rather than LinearSVR -- confirm before switching.
  fitted_svr = LinearSVR(C=0.001).fit(X, y)
  selector = SelectFromModel(fitted_svr, prefit=True)
  keep_by_column = dict(zip(column_names, selector.get_support()))
  return [name for name, keep in keep_by_column.items() if keep]
  
#Tree Based
def applySelectFromModelEnsembleTrees(X,y):
  """Return the column labels selected by SelectFromModel over extra-trees.

  An ``ExtraTreesRegressor`` with 50 estimators is fitted on (X, y) and
  handed to ``SelectFromModel`` as an already-fitted estimator
  (``prefit=True``).

  Args:
    X: Feature table exposing ``.columns``.
    y: Target values passed to ``fit`` together with X.

  Returns:
    list: Column labels whose entry in the selector's support mask is true.
  """
  column_names = X.columns
  from sklearn.feature_selection import SelectFromModel
  # ExtraTreesClassifier is imported but not used below; it is the
  # counterpart for classification.
  from sklearn.ensemble import ExtraTreesClassifier,ExtraTreesRegressor
  forest = ExtraTreesRegressor(n_estimators=50)
  forest = forest.fit(X, y)
  selector = SelectFromModel(forest, prefit=True)
  keep_by_column = dict(zip(column_names, selector.get_support()))
  return [name for name, keep in keep_by_column.items() if keep]

Driver Code

In [83]:
from sklearn.datasets import load_iris
import pandas as pd

# Build a regression problem from the iris measurements: the 'pet_wid'
# column is the target and the remaining three columns are the features.
data = load_iris()
df = pd.DataFrame(data.data, columns=['sep_len','sep_wid','pet_len','pet_wid'])
target = 'pet_wid'
X = df.drop(columns=[target])
y = df[target].to_numpy()

# Run every selector in turn and print the columns it keeps. The calls are
# wrapped in lambdas so each one runs immediately before its own print.
selections = (
  ("Variance ", lambda: applyVarianceSelection(X,0.8)),
  ("Kbest ", lambda: applyKBestSelection(X,y,3)),
  ("RFE", lambda: applyRFE(X,y,3,10)),
  ("RFE with Cross Validation", lambda: applyRFECV(X,y,10)),
  ("Model Based", lambda: applySelectFromModel(X,y)),
  ("Ensemble Tree Bases", lambda: applySelectFromModelEnsembleTrees(X,y)),
)
for label, run_selection in selections:
  print(label, run_selection())

Variance  ['sep_len', 'sep_wid', 'pet_len']
Kbest  ['sep_len', 'sep_wid', 'pet_len']
RFE ['sep_len', 'sep_wid', 'pet_len']
RFE with Cross Validation ['sep_len', 'sep_wid', 'pet_len']
Model Based ['pet_len']
Ensemble Tree Bases ['pet_len']
