<h1>Chap10 - Dimensionality Reduction Using Feature Selection</h1>

Imports

In [33]:
from sklearn import datasets, linear_model
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, f_classif, SelectPercentile, RFECV
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import warnings

## 10.1 Thresholding Numerical Feature Variance

In [4]:
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

In [5]:
# Pull the feature matrix and the target vector out of the dataset bundle.
features, target = iris.data, iris.target

In [9]:
# Keep only the features whose variance is above 0.5.
thresholder = VarianceThreshold(threshold=0.5)
thresholder.fit(features)
features_high_variance = thresholder.transform(features)

# Preview the first three rows of the reduced matrix.
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [10]:
# Refit on the raw features and display the variance computed for each column.
thresholder.fit(features).variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [12]:
# Caution: variance depends on a feature's units, so thresholding is unreliable
# when features are measured on different scales. Do not standardize first
# either: after StandardScaler every feature has variance 1 (shown below),
# which makes a variance threshold meaningless.
scaler = StandardScaler()
scaler.fit(features)
features_std = scaler.transform(features)

selector = VarianceThreshold()
selector.fit(features_std)
selector.variances_

array([1., 1., 1., 1.])

## 10.2 Thresholding Binary Feature Variance 

In [14]:
# Three binary features observed over five samples.
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# Threshold set to p * (1 - p) with p = 0.75, the variance of a binary
# feature that takes one value in 75% of the observations.
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

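Variance of a binary (Bernoulli) feature, where $p$ is the proportion of observations equal to 1:

$$\operatorname{Var}(x) = p(1-p)$$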

## 10.3 Handling Highly Correlated Features 

In [16]:
# Toy matrix: column 1 follows column 0 until the last two rows, where it
# stays at 7; column 2 alternates between 1 and 0.
features = np.array(
    [
        [1, 1, 1],
        [2, 2, 0],
        [3, 3, 1],
        [4, 4, 0],
        [5, 5, 1],
        [6, 6, 0],
        [7, 7, 1],
        [8, 7, 0],
        [9, 7, 1],
    ]
)

In [18]:
# Absolute pairwise correlation between every pair of columns.
dataframe = pd.DataFrame(features)
corr_matrix = dataframe.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair
# is inspected once and no column is compared with itself. The builtin `bool`
# is used because the `np.bool` alias was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# Flag every column correlated above 0.95 with a column to its left.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# `to_drop` holds column labels, so drop by label rather than by position.
dataframe.drop(columns=to_drop).head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


## 10.4 Removing Irrelevant Features for Classification

In [22]:
iris = datasets.load_iris()
target = iris.target

# Truncate the measurements to integers before the selection steps below
# (presumably so they can be treated as categorical/count data for chi2).
features = iris.data.astype(int)

In [23]:
# Select the two features with the highest chi2 scores.
chi2_selector = SelectKBest(score_func=chi2, k=2)
chi2_selector.fit(features, target)
features_kbest = chi2_selector.transform(features)

# Compare the column counts before and after selection.
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [25]:
# Select the two features with the highest F-values.
fvalue_selector = SelectKBest(score_func=f_classif, k=2)
fvalue_selector.fit(features, target)
features_kbest = fvalue_selector.transform(features)

# Compare the column counts before and after selection.
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [30]:
# Keep the top 75 percent of features, scored by F-value.
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=75)
fvalue_selector.fit(features, target)
features_kbest = fvalue_selector.transform(features)

In [31]:
# Report the column counts before and after the percentile-based selection.
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 3


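Chi-squared statistic, where $O_i$ is the observed count and $E_i$ the expected count for class $i$:
$$\chi^2 = \sum_{i=1}^{n}{\frac{(O_i-E_i)^2}{E_i}}$$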

## 10.5 Recursively Eliminating Features

In [35]:
# Silence warnings raised from scipy modules whose message starts with
# "internal gelsd", so they do not clutter the cell outputs below.
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [42]:
# Synthetic regression problem: 10,000 samples and 100 features, of which only
# 2 are informative; the fixed random_state keeps the data reproducible.
features, target = datasets.make_regression(
    n_samples=10_000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

In [43]:
# Linear regression is the estimator refit at each elimination step.
ols = linear_model.LinearRegression()

# Recursively drop one feature per step (step=1), scoring each candidate
# feature set by cross-validated negative mean squared error.
rfecv = RFECV(
    estimator=ols,
    step=1,
    scoring="neg_mean_squared_error",
).fit(features, target)

# Reduce the feature matrix to the selected columns.
rfecv.transform(features)

array([[-0.76165969,  0.00850799, -0.1743876 , ..., -0.26903836,
        -0.00334006, -0.43443631],
       [-0.46550514, -1.07500204, -0.13792708, ..., -0.71990239,
         0.83025386,  0.49521824],
       [ 1.36836958,  1.37940721,  0.04954512, ...,  0.49558942,
         1.57437341, -0.86151394],
       ...,
       [ 0.39023196, -0.80331656,  1.19182077, ...,  0.22900133,
         0.95453857, -0.32710863],
       [ 0.44825266,  0.39508844,  0.1869861 , ...,  0.02723818,
         0.3561363 , -0.18744426],
       [ 1.15616404, -0.55383035, -2.20666674, ..., -1.08865284,
        -0.47269736, -0.36458525]])

In [44]:
# Number of features kept by the fitted selector.
rfecv.n_features_

9

In [45]:
# Boolean mask over the original columns; True marks a kept feature.
rfecv.support_

array([False,  True, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False,  True, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True, False, False,
        True, False, False, False, False, False, False, False, False,
       False])

In [46]:
# Rank assigned to each original feature; the kept features have rank 1.
rfecv.ranking_

array([43,  1, 52, 40, 25,  1, 30, 12, 72,  5, 75, 24, 55, 45, 39, 73, 47,
       36,  1, 38, 74, 87, 21, 17, 13, 56, 11,  4,  3, 33,  9, 59, 22, 29,
        1, 63, 34, 41, 84,  1, 10, 26, 28, 71, 78, 42, 91,  1, 88, 92, 85,
       54, 80, 81, 31, 86, 48,  7, 20, 62, 83, 50,  6, 37, 60, 65, 57, 76,
       46, 49, 44,  2, 15, 66, 16, 35, 90, 82, 77, 69, 64, 32, 18, 51, 23,
       67,  1,  1,  8, 53,  1, 89, 68, 58, 79, 61, 27, 70, 14, 19])