In [1]:
import warnings

import pandas as pd
import numpy as np

from sklearn.datasets import make_regression,load_iris
from sklearn.feature_selection import RFECV,SelectPercentile,SelectKBest,VarianceThreshold,chi2,f_classif
from sklearn import datasets,linear_model
from sklearn.preprocessing import StandardScaler

10.1 : Thresholding Numerical Feature Variance

In [2]:
# Load the iris dataset and separate the feature matrix from the class labels.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Drop low-variance features using a variance threshold of 0.5.
thresholder = VarianceThreshold(threshold=0.5)
features_high_variance = thresholder.fit_transform(features)

# Preview the first three rows of the reduced feature matrix.
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

Discussion

In [3]:
# Refit the thresholder on the features, then display the variance it
# recorded for each feature column.
thresholder.fit(features)
thresholder.variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [4]:
# Standardize the features first, then look at the variances a default
# VarianceThreshold records for them. The recorded output shows every
# variance equal to 1, so thresholding on variance after standardizing
# cannot tell the features apart.
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

selector = VarianceThreshold()
selector.fit(features_std)
selector.variances_

array([1., 1., 1., 1.])

10.2 : Thresholding Binary Feature Variance

In [5]:
# Small binary feature matrix: five observations, three 0/1 features.
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# The threshold is written as p * (1 - p) with p = 0.75.
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

10.3 : Handling Highly Correlated Features

In [6]:
# Feature matrix whose first two columns are highly correlated.
features = np.array([[1, 1, 1],
                     [2, 2, 0],
                     [3, 3, 1],
                     [4, 4, 0],
                     [5, 5, 1],
                     [6, 6, 0],
                     [7, 7, 1],
                     [8, 7, 0],
                     [9, 7, 1]])

# Absolute correlation above which the later column of a pair is dropped.
CORRELATION_THRESHOLD = 0.95

dataframe = pd.DataFrame(features)
corr_matrix = dataframe.corr().abs()

# Keep only the strict upper triangle (k=1) so each column pair is looked at
# once and the diagonal of self-correlations is ignored; the rest becomes NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Labels of the columns that are too strongly correlated with an earlier
# column. Comparisons against NaN are False, so masked cells never match.
to_drop = [
    column
    for column in upper.columns
    if (upper[column] > CORRELATION_THRESHOLD).any()
]

# Drop by label: `to_drop` holds column labels, so using them to index
# `dataframe.columns` positionally is only correct when the labels happen to
# be the default 0..n-1 integers.
features_uncorrelated = dataframe.drop(columns=to_drop)
features_uncorrelated.head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


Discussion

In [7]:
# Display the signed pairwise correlation matrix of the dataframe's columns.
dataframe.corr()

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,-0.034503
2,0.0,-0.034503,1.0


In [8]:
# Display the upper-triangle view of the absolute correlations; entries on
# and below the diagonal were masked out and show as NaN.
upper

Unnamed: 0,0,1,2
0,,0.976103,0.0
1,,,0.034503
2,,,


10.4 : Removing Irrelevant Features for Classification

In [9]:
# Reload iris and split it into features and class labels.
iris = load_iris()
target = iris.target

# Cast the measurements to integers before scoring them with chi2.
features = iris.data.astype(int)

# Keep the two features with the best chi-squared scores.
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit_transform(features, target)

print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [10]:
# Keep the two features with the best f_classif scores.
fvalue_selector = SelectKBest(score_func=f_classif, k=2)
features_kbest = fvalue_selector.fit_transform(features, target)

print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [11]:
# Select features by percentile of their f_classif scores (percentile=75)
# instead of asking for a fixed number of features.
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(features, target)

print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 3


10.5 : Recursively Eliminating Features

In [13]:
# Silence the scipy warning whose message starts with "internal gelsd".
warnings.filterwarnings("ignore", message="^internal gelsd", module="scipy")

# Synthetic regression problem: 10,000 samples, 100 features, of which only
# two are informative; random_state fixes the generated data.
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

# Recursive feature elimination with cross-validation around a linear
# regression, using step=1 and negative mean squared error as the score.
ols = linear_model.LinearRegression()
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)

# Reduce the feature matrix to the columns the fitted selector kept.
rfecv.transform(features)

array([[ 0.00850799,  0.7031277 , -0.1595391 ,  2.14404686],
       [-1.07500204,  2.56148527, -1.68400704, -0.99941336],
       [ 1.37940721, -1.77039484,  0.58847495, -1.54338276],
       ...,
       [-0.80331656, -1.60648007,  0.56935412,  0.96189783],
       [ 0.39508844, -1.34564911,  0.68749393, -1.38924719],
       [-0.55383035,  0.82880112,  0.8233291 , -0.02461965]],
      shape=(10000, 4))

In [14]:
# Number of features the fitted RFECV selector kept.
rfecv.n_features_

np.int64(4)

In [15]:
# Boolean mask over the input features; True marks a feature that was kept.
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [16]:
# Per-feature ranking from the fitted selector; in the recorded output the
# kept features are the ones with rank 1.
rfecv.ranking_

array([86, 29, 41, 27, 46,  1, 87, 64, 39, 47, 68, 96, 24, 79, 88, 92, 54,
       82, 81, 69, 77, 83,  4, 50, 78, 58, 94, 16, 70, 89, 31, 35, 93,  2,
        3, 43, 33, 59, 95,  1, 85, 21, 76,  8, 66, 60, 26, 23, 17, 52, 61,
       25, 97, 22,  5, 10, 30, 71, 49, 19, 20, 51, 18, 84, 91, 40, 67, 90,
       53, 42, 34, 75, 57, 44,  1, 62, 73, 32, 28, 12, 14,  7,  1, 38, 45,
       74, 56, 55,  6, 80, 48, 36, 65, 63, 13, 37, 72, 15,  9, 11])