In [1]:
import numpy as np
import pandas as pd
from scipy.stats import *
import seaborn as sns

In [2]:
df = sns.load_dataset('iris')

In [3]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
df.shape

(150, 5)

In [6]:
x = df.iloc[:,0:-1]
y = df['species']

# Wrapper Method:

## Exhaustive Feature Selection:
### Very efficient with small number of columns
#### Try every combination of feature and train the model with each combination. Measure performance for each combination using cross validation. Feature with best performance score are selected.

In [7]:
df1 = x.copy()

In [8]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [9]:
lr = LogisticRegression(max_iter=1000)
efs = EFS(lr, max_features=4, scoring='accuracy', cv=5)

<IPython.core.display.Javascript object>

In [10]:
efs.fit(df1,y)

Features: 15/15

In [11]:
efs.best_score_

0.9733333333333334

In [12]:
efs.best_feature_names_

('sepal_length', 'sepal_width', 'petal_length', 'petal_width')

In [13]:
efs.subsets_

{0: {'feature_idx': (0,),
  'cv_scores': array([0.66666667, 0.73333333, 0.76666667, 0.76666667, 0.83333333]),
  'avg_score': 0.7533333333333333,
  'feature_names': ('sepal_length',)},
 1: {'feature_idx': (1,),
  'cv_scores': array([0.53333333, 0.56666667, 0.53333333, 0.56666667, 0.63333333]),
  'avg_score': 0.5666666666666667,
  'feature_names': ('sepal_width',)},
 2: {'feature_idx': (2,),
  'cv_scores': array([0.93333333, 1.        , 0.9       , 0.93333333, 1.        ]),
  'avg_score': 0.9533333333333334,
  'feature_names': ('petal_length',)},
 3: {'feature_idx': (3,),
  'cv_scores': array([1.        , 0.96666667, 0.9       , 0.93333333, 1.        ]),
  'avg_score': 0.96,
  'feature_names': ('petal_width',)},
 4: {'feature_idx': (0, 1),
  'cv_scores': array([0.73333333, 0.83333333, 0.76666667, 0.86666667, 0.86666667]),
  'avg_score': 0.8133333333333332,
  'feature_names': ('sepal_length', 'sepal_width')},
 5: {'feature_idx': (0, 2),
  'cv_scores': array([0.93333333, 1.        , 0.9   

In [14]:
pd.DataFrame(efs.subsets_).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names
0,"(0,)","[0.6666666666666666, 0.7333333333333333, 0.766...",0.753333,"(sepal_length,)"
1,"(1,)","[0.5333333333333333, 0.5666666666666667, 0.533...",0.566667,"(sepal_width,)"
2,"(2,)","[0.9333333333333333, 1.0, 0.9, 0.9333333333333...",0.953333,"(petal_length,)"
3,"(3,)","[1.0, 0.9666666666666667, 0.9, 0.9333333333333...",0.96,"(petal_width,)"
4,"(0, 1)","[0.7333333333333333, 0.8333333333333334, 0.766...",0.813333,"(sepal_length, sepal_width)"
5,"(0, 2)","[0.9333333333333333, 1.0, 0.9, 0.9333333333333...",0.953333,"(sepal_length, petal_length)"
6,"(0, 3)","[0.9333333333333333, 0.9666666666666667, 0.933...",0.953333,"(sepal_length, petal_width)"
7,"(1, 2)","[0.9333333333333333, 1.0, 0.9, 0.9333333333333...",0.953333,"(sepal_width, petal_length)"
8,"(1, 3)","[0.9333333333333333, 0.9666666666666667, 0.9, ...",0.94,"(sepal_width, petal_width)"
9,"(2, 3)","[0.9666666666666667, 0.9666666666666667, 0.933...",0.96,"(petal_length, petal_width)"


In [15]:
x_train = efs.transform(x)

## Sequential Backward Selection/Elimination:

In [16]:
df1 = pd.read_csv('https://raw.githubusercontent.com/mohitgabani1/Feature-Engineering/main/mobile_dataset.csv')

In [17]:
df1.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [18]:
x = df1.iloc[:,0:-1]
y = df1['price_range']

In [19]:
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

In [20]:
standard_scaler = StandardScaler()
x_scaled = standard_scaler.fit_transform(x)

In [21]:
dtc = LogisticRegression()

<IPython.core.display.Javascript object>

In [22]:
backward_sfs = SequentialFeatureSelector(estimator=dtc,
                         cv = 5,
                         forward=False,
                         k_features='best')

In [23]:
backward_sfs.fit(x_scaled,y)

In [24]:
backward_sfs.k_feature_names_

('0', '6', '8', '11', '12', '13', '14', '19')

In [25]:
backward_sfs.k_score_

0.9754999999999999

In [26]:
pd.DataFrame(backward_sfs.subsets_).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names
20,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.96, 0.955, 0.9675, 0.9625, 0.9675]",0.9625,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
19,"(0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[0.97, 0.955, 0.9775, 0.96, 0.97]",0.9665,"(0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
18,"(0, 1, 2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15...","[0.975, 0.955, 0.9825, 0.965, 0.97]",0.9695,"(0, 1, 2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15..."
17,"(0, 1, 2, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 1...","[0.975, 0.96, 0.98, 0.97, 0.97]",0.971,"(0, 1, 2, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 1..."
16,"(0, 1, 2, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 1...","[0.975, 0.96, 0.9825, 0.965, 0.9725]",0.971,"(0, 1, 2, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 1..."
15,"(0, 1, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, ...","[0.975, 0.9625, 0.9825, 0.97, 0.9725]",0.9725,"(0, 1, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, ..."
14,"(0, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18,...","[0.975, 0.96, 0.9825, 0.9725, 0.97]",0.972,"(0, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18,..."
13,"(0, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 19)","[0.9725, 0.9625, 0.975, 0.9825, 0.9725]",0.973,"(0, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 19)"
12,"(0, 5, 6, 8, 9, 11, 12, 13, 14, 15, 17, 19)","[0.9725, 0.9625, 0.975, 0.98, 0.9825]",0.9745,"(0, 5, 6, 8, 9, 11, 12, 13, 14, 15, 17, 19)"
11,"(0, 5, 6, 8, 9, 11, 12, 13, 14, 15, 19)","[0.975, 0.9625, 0.9725, 0.9775, 0.9825]",0.974,"(0, 5, 6, 8, 9, 11, 12, 13, 14, 15, 19)"


## Sequential Forward Selection:

In [27]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [28]:
lr = LogisticRegression()

<IPython.core.display.Javascript object>

In [29]:
forward_sfs = SequentialFeatureSelector(estimator=lr,
                         cv = 5,
                         forward=True,
                         k_features='best')

In [30]:
forward_sfs.fit(x_scaled,y)

In [31]:
forward_sfs.k_feature_names_

('0', '4', '6', '8', '10', '11', '12', '13', '14', '19')

In [32]:
forward_sfs.k_score_

0.977

In [33]:
pd.DataFrame(forward_sfs.subsets_).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names
1,"(13,)","[0.7725, 0.7525, 0.7675, 0.7475, 0.7375]",0.7555,"(13,)"
2,"(0, 13)","[0.815, 0.825, 0.8325, 0.825, 0.82]",0.8235,"(0, 13)"
3,"(0, 11, 13)","[0.9175, 0.8975, 0.9075, 0.935, 0.8825]",0.908,"(0, 11, 13)"
4,"(0, 11, 12, 13)","[0.9675, 0.9475, 0.9625, 0.96, 0.95]",0.9575,"(0, 11, 12, 13)"
5,"(0, 8, 11, 12, 13)","[0.975, 0.9625, 0.9675, 0.975, 0.9675]",0.9695,"(0, 8, 11, 12, 13)"
6,"(0, 8, 11, 12, 13, 14)","[0.9825, 0.9725, 0.9675, 0.9775, 0.9675]",0.9735,"(0, 8, 11, 12, 13, 14)"
7,"(0, 8, 11, 12, 13, 14, 19)","[0.9875, 0.9675, 0.975, 0.975, 0.9675]",0.9745,"(0, 8, 11, 12, 13, 14, 19)"
8,"(0, 6, 8, 11, 12, 13, 14, 19)","[0.98, 0.97, 0.98, 0.9775, 0.97]",0.9755,"(0, 6, 8, 11, 12, 13, 14, 19)"
9,"(0, 4, 6, 8, 11, 12, 13, 14, 19)","[0.98, 0.9725, 0.9825, 0.9725, 0.9725]",0.976,"(0, 4, 6, 8, 11, 12, 13, 14, 19)"
10,"(0, 4, 6, 8, 10, 11, 12, 13, 14, 19)","[0.9825, 0.9725, 0.9775, 0.9775, 0.975]",0.977,"(0, 4, 6, 8, 10, 11, 12, 13, 14, 19)"


## Recursive Feature Elimination:

In [34]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV

In [35]:
from sklearn.svm import SVC

In [36]:
svc = SVC(kernel='linear')

In [37]:
rfe = RFE(estimator=svc,
    n_features_to_select=10)

In [38]:
rfe.fit(x,y)

In [39]:
rfe.n_features_

10

In [40]:
rfe.n_features_in_

20

In [41]:
rfe.feature_names_in_

array(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi'], dtype=object)

In [42]:
rfe.ranking_

array([ 9,  1,  1,  1,  5,  1,  6,  1,  7,  1,  4, 10, 11,  8,  1,  3,  2,
        1,  1,  1])

In [43]:
x.columns[rfe.support_]

Index(['blue', 'clock_speed', 'dual_sim', 'four_g', 'm_dep', 'n_cores', 'sc_h',
       'three_g', 'touch_screen', 'wifi'],
      dtype='object')

In [44]:
rfe.classes_

array([0, 1, 2, 3], dtype=int64)