In [1]:
import pandas as pd
import numpy as np

<h4 class="text-center"> Feature with low Variance </h4>

In [2]:
from sklearn import datasets

In [3]:
# Load the iris dataset through the sklearn `datasets` module imported above.
iris = datasets.load_iris()

In [4]:
# Pull the feature matrix and the class labels out of the loaded dataset.
feature_iris, target_iris = iris.data, iris.target

<span class="badge"> Variance threshold </span>

In [5]:
from sklearn.feature_selection import VarianceThreshold

In [6]:
# Starting point: number of samples and features before any selection.
feature_iris.shape

(150, 4)

In [7]:
# Selector that discards features whose variance is lower than 0.5.
variance_threshold = VarianceThreshold(threshold=0.5)

In [8]:
# Fit on the iris features and keep only the columns that pass the variance threshold.
feature_variance = variance_threshold.fit_transform(feature_iris)

In [9]:
# One column fewer than the input: a single feature was removed.
feature_variance.shape

(150, 3)

In [10]:
# Per-feature variances learned during fit. Only the second one (about 0.19)
# is under the 0.5 threshold, which is why three of the four columns survive.
variance_threshold.variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

<span class="badge"> If features are standardized (mean 0, variance 1), VarianceThreshold will not work, because every feature then has the same variance </span>

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Scaler used to standardize the features before repeating the variance check.
scaler = StandardScaler()

In [13]:
# Learn the scaling from the iris features and apply it in one step.
feature_scale = scaler.fit_transform(feature_iris)

In [14]:
# No threshold is passed, so the estimator's default threshold applies.
variance_scale_threshold = VarianceThreshold()

In [15]:
# Fit only (no transform) so the learned variances_ can be inspected next.
variance_scale_threshold.fit(feature_scale)

VarianceThreshold()

In [16]:
# After standardization every feature has variance 1, so a variance threshold
# can no longer distinguish informative features from uninformative ones.
variance_scale_threshold.variances_

array([1., 1., 1., 1.])

<h4 class="text-center"> Binary Feature </h4>

In [17]:
# Four samples with three 0/1 features; each column is 1 in exactly half the rows.
binary_feature = [
    [0, 1, 0],
    [1, 1, 0],
    [0, 0, 1],
    [1, 0, 1],
]

In [18]:
# For a 0/1 feature with P(1) = p the variance is p * (1 - p); a cut-off of 0.18
# only removes columns that are heavily dominated by a single value.
binary_threshold = VarianceThreshold(threshold=0.18)

In [19]:
# All three columns come back unchanged: none of them falls below the threshold.
binary_threshold.fit_transform(binary_feature)

array([[0, 1, 0],
       [1, 1, 0],
       [0, 0, 1],
       [1, 0, 1]])

In [20]:
# Each balanced 0/1 column has variance 0.5 * 0.5 = 0.25, which is above 0.18.
binary_threshold.variances_

array([0.25, 0.25, 0.25])

<h4 class="text-center"> Highly Correlated Features </h4>

In [21]:
# Toy data written column by column to make the relationships obvious:
# A rises steadily, B follows A except in the last two rows, C alternates 1/0.
col_a = [1, 2, 3, 4, 5, 6, 7, 8, 9]
col_b = [1, 2, 3, 4, 5, 6, 7, 7, 7]
col_c = [1, 0, 1, 0, 1, 0, 1, 0, 1]

# zip() turns the three columns back into one (A, B, C) row per sample.
co_features = np.array(list(zip(col_a, col_b, col_c)))

In [22]:
# Wrap the array in a DataFrame so the columns can be referred to by name.
co_data = pd.DataFrame(co_features, columns=list("ABC"))


In [23]:
# Preview the first five rows.
co_data.head()

Unnamed: 0,A,B,C
0,1,1,1
1,2,2,0
2,3,3,1
3,4,4,0
4,5,5,1


<span class="badge"> Correlation Matrix </span>

In [31]:
# Pairwise correlations between columns, made absolute because a strong
# negative correlation signals redundancy just as much as a positive one.
corr_matrix = abs(co_data.corr())
corr_matrix

Unnamed: 0,A,B,C
A,1.0,0.976103,0.0
B,0.976103,1.0,0.034503
C,0.0,0.034503,1.0


In [32]:
# Select the strict upper triangle of the correlation matrix (k=1 skips the
# diagonal) so each feature pair is examined exactly once; every other entry
# becomes NaN.
# The builtin `bool` is used for the mask dtype: the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

In [33]:
# Inspect the masked matrix: entries on and below the diagonal show as NaN.
upper

Unnamed: 0,A,B,C
A,,0.976103,0.0
B,,,0.034503
C,,,


In [35]:
# Flag every column whose correlation with some earlier column exceeds 0.95.
# NaN entries compare as False, so masked cells never trigger a drop.
to_drop = [name for name in upper.columns if (upper[name] > 0.95).any()]
to_drop

['B']

In [40]:
# Show the frame without the redundant columns; drop() returns a new frame,
# so co_data itself is left unchanged.
co_data.drop(columns=to_drop)

Unnamed: 0,A,C
0,1,1
1,2,0
2,3,1
3,4,0
4,5,1
5,6,0
6,7,1
7,8,0
8,9,1


<h4 class="text-center"> Irrelevant Feature </h4>

In [41]:
#categorical target

In [44]:
# Feature matrix dimensions, for comparison with the selector outputs further down.
feature_iris.shape

(150, 4)

In [45]:
# One class label per sample.
target_iris.shape

(150,)

In [46]:
## Convert the continuous measurements to integers so they can be treated as categorical values for chi2

In [47]:
# astype(int) discards the fractional part of every measurement; the original
# feature_iris array is not modified.
feature_int = feature_iris.astype(int)

<span class="badge"> SelectKBest </span>

In [48]:
from sklearn.feature_selection import SelectKBest

<span class="badge"> chi2 </span>

In [49]:
from sklearn.feature_selection import chi2

In [50]:
# Keep the two features with the highest chi-squared score against the target.
chi2selector = SelectKBest(score_func=chi2, k=2)

In [51]:
# Score each integer feature against the class labels and keep the best two columns.
feature_kbest = chi2selector.fit_transform(feature_int,target_iris)

In [52]:
# Only the two best-scoring columns remain.
feature_kbest.shape

(150, 2)

In [53]:
# The selector's input is left untouched (still four columns).
feature_int.shape

(150, 4)

##### For Quantitative
<span class="badge"> f_classif </span>

In [54]:
from sklearn.feature_selection import f_classif

In [55]:
# Same top-2 selection, but ranked by the ANOVA F-value between each feature
# and the class labels.
f_selectBest = SelectKBest(score_func=f_classif, k=2)


In [56]:
# Fit the F-test selector and keep the two best columns.
# NOTE(review): this uses feature_int (the integer-truncated copy) although the
# section is headed "For Quantitative"; f_classif would also accept the
# continuous feature_iris values. Confirm which input is intended.
feature_f_classif = f_selectBest.fit_transform(feature_int,target_iris)

In [57]:
# Two columns kept, as requested with k=2.
feature_f_classif.shape

(150, 2)

In [58]:
# Use a percentile to choose the number of features automatically

<span class="badge"> SelectPercentile </span>

In [59]:
from sklearn.feature_selection import SelectPercentile

In [60]:
# Keep the top 80% of features, ranked by F-statistic

In [61]:
# Percentile-based selection: retain the best-scoring 80% of the features
# instead of a fixed count.
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=80)
fvalue_selector.fit(feature_int, target_iris)

SelectPercentile(percentile=80)

In [64]:
# Apply the fitted selector; three of the four columns are kept.
fvalue_selector.transform(feature_int).shape

(150, 3)

<h4 class="text-center"> Keeping Important Features </h4>

#### Recursive Feature Elimination with cross-validation

In [65]:
# Synthetic linear-regression data: 10,000 samples and 100 features, of which
# only 2 actually drive the target.
# random_state is fixed so that Restart & Run All regenerates the same dataset
# and RFECV selects the same features every time. The outputs recorded further
# down came from an unseeded run, so the exact indices and values may differ.
feature, target = datasets.make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

In [66]:
# Confirm the generated shapes: (samples, features) and (samples,).
feature.shape, target.shape

((10000, 100), (10000,))

In [67]:
from sklearn.linear_model import LinearRegression

In [68]:
# Base estimator handed to RFECV below.
model = LinearRegression()

<span class="badge"> RFECV </span>

In [69]:
from sklearn.feature_selection import RFECV

In [70]:
# Recursive feature elimination with cross-validation: remove one feature per
# round (step=1) and score each candidate subset by negative mean squared error.
rfecv = RFECV(
    estimator=model,
    step=1,
    scoring="neg_mean_squared_error",
)

In [71]:
# Run the cross-validated elimination on the synthetic data.
rfecv.fit(feature,target)

RFECV(estimator=LinearRegression(), scoring='neg_mean_squared_error')

In [72]:
# Reduce the feature matrix to the columns RFECV kept (two in the recorded output).
rfecv.transform(feature)

array([[ 0.0394817 , -0.05438283],
       [ 0.43656807,  0.18844971],
       [-0.72714936,  0.25499332],
       ...,
       [-0.01216269,  1.03222103],
       [ 1.18972884, -0.49680536],
       [-0.31280153,  1.0373736 ]])

In [74]:
rfecv.n_features_  # number of features selected by cross-validation

2

In [77]:
rfecv.support_  # boolean mask over the input columns: True marks a selected feature

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False])

In [78]:
rfecv.ranking_  # rank per feature; the selected features are the ones ranked 1

array([42, 77, 99, 83, 98, 51, 47, 23, 97, 61, 32,  4, 22, 56,  1, 24, 34,
       39, 94, 18, 88, 15, 46, 27, 93, 64,  2, 90, 20, 68, 57, 84, 28, 16,
       52, 87, 45, 31, 37, 11, 73, 26, 54, 41, 92, 66, 44, 29, 53, 10, 63,
        3, 91, 40, 30, 74, 86, 60, 13, 62, 12,  6, 19, 25, 71, 76, 21, 49,
        9, 70, 50, 69, 55, 59, 81, 35,  7, 72, 17, 85, 79, 80, 89, 75,  8,
       14, 67, 95, 38, 58, 36, 43, 96, 48, 78, 65, 33, 82,  1,  5])