# Feature Selection

## Variance Thresholding For Numerical Features

In [1]:
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

In [2]:
# Load the bundled iris dataset; the returned object exposes `.data` and `.target`.
iris = datasets.load_iris()

In [4]:
# Split the loaded dataset into its feature matrix and its class labels.
features, target = iris.data, iris.target

In [5]:
# Preview the first ten rows of the feature matrix.
features[:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [6]:
# Dimensions (rows, columns) of the feature matrix before any selection.
features.shape

(150, 4)

In [7]:
# Variance-based selector with a cutoff of 0.5: features whose variance is
# lower than the cutoff are removed when the selector is applied.
thresholder = VarianceThreshold(threshold=0.5)

In [8]:
# Learn the per-feature variances, then keep only the columns that pass the cutoff.
feature_high_variance = thresholder.fit(features).transform(features)

In [9]:
# Preview the first ten rows of the reduced feature matrix.
feature_high_variance[:10]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2],
       [4.6, 1.5, 0.2],
       [5. , 1.4, 0.2],
       [5.4, 1.7, 0.4],
       [4.6, 1.4, 0.3],
       [5. , 1.5, 0.2],
       [4.4, 1.4, 0.2],
       [4.9, 1.5, 0.1]])

In [10]:
# Dimensions (rows, columns) after variance thresholding; compare with the
# shape of the original feature matrix to see how many columns were removed.
feature_high_variance.shape

(150, 3)

## Binary Data Variance Thresholding

In [11]:
# Toy binary dataset: five observations (rows) of three 0/1 features (columns).
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

In [12]:
# For a binary (Bernoulli) feature the variance is p * (1 - p); this cutoff is
# the variance of a feature whose two values split 60% / 40%.
# NOTE(review): the third column of `features` has p = 0.4, i.e. a variance of
# exactly 0.4 * 0.6 = 0.24 -- the same value as this cutoff -- so whether that
# column is kept may hinge on floating-point rounding; confirm the intended
# threshold.
thresholder = VarianceThreshold(threshold=0.6*(1-0.6))

In [13]:
# Fit the selector on the binary data and display the columns that survive.
thresholder.fit(features).transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

## Handling Highly Correlated Features

In [15]:
import pandas as pd
import numpy as np

In [16]:
# Toy matrix: the first two columns are nearly identical (they only diverge in
# the last two rows), while the third column alternates between 1 and 0.
features = np.array(
    [
        [1, 1, 1],
        [2, 2, 0],
        [3, 3, 1],
        [4, 4, 0],
        [5, 5, 1],
        [6, 6, 0],
        [7, 7, 1],
        [8, 7, 0],
        [9, 7, 1],
    ]
)

In [17]:
# Display the full toy matrix (small enough to show in its entirety).
features

array([[1, 1, 1],
       [2, 2, 0],
       [3, 3, 1],
       [4, 4, 0],
       [5, 5, 1],
       [6, 6, 0],
       [7, 7, 1],
       [8, 7, 0],
       [9, 7, 1]])

In [18]:
# Wrap the array in a DataFrame so pandas' correlation tools can be used;
# no column names are given, so the columns get default integer labels.
dataframe = pd.DataFrame(features)

In [20]:
# Display the DataFrame built from the toy matrix.
dataframe

Unnamed: 0,0,1,2
0,1,1,1
1,2,2,0
2,3,3,1
3,4,4,0
4,5,5,1
5,6,6,0
6,7,7,1
7,8,7,0
8,9,7,1


In [21]:
# Pairwise column correlations, taken in absolute value so that strong
# negative correlations count as "highly correlated" too.
corr_matrix = dataframe.corr().abs()

In [22]:
# Display the absolute correlation matrix.
corr_matrix

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,0.034503
2,0.0,0.034503,1.0


In [23]:
# Keep only the strictly-upper triangle of the correlation matrix so that each
# feature pair is considered once and the diagonal (self-correlation) is
# ignored; every other cell becomes NaN.
# The mask uses the builtin `bool` dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

In [24]:
# Display the masked correlation matrix (cells outside the upper triangle are NaN).
upper

Unnamed: 0,0,1,2
0,,0.976103,0.0
1,,,0.034503
2,,,


In [26]:
# Collect the label of every column that has at least one correlation above
# 0.95 in `upper`; NaN cells compare as False and are therefore ignored.
to_drop = [label for label in upper.columns if (upper[label] > 0.95).any()]

In [27]:
# Display the column labels selected for removal.
to_drop

[1]

In [28]:
# `to_drop` holds column *labels* (taken from `upper.columns`), so pass them to
# `drop` as labels instead of using them as positions into `dataframe.columns`;
# labels and positions only coincide while the frame keeps its default
# 0..n-1 column labels. Show the first three rows of the reduced frame.
dataframe.drop(columns=to_drop).head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


## Removing Irrelevant Features From Categorical Data

In [31]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,f_classif

In [32]:
# Reload iris and rebind `features` / `target`, which earlier sections reused
# for other toy datasets.
iris = load_iris()
features, target = iris.data, iris.target

In [33]:
# Cast the measurements to integers (the fractional part is discarded) so the
# columns can be treated as categorical values by the chi2 selection that follows.
features = features.astype(int)

In [37]:
# Chi-squared scoring is used when the features are categorical: keep the two
# features with the highest chi2 score against the target.
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit(features, target).transform(features)

In [38]:
# Report how many feature columns exist before and after selection.
n_features_before = features.shape[1]
n_features_after = features_kbest.shape[1]
print("Original Number Of Features : ", n_features_before)
print("Reduced NUmber of Features : ", n_features_after)

Original Number Of Features :  4
Reduced NUmber of Features :  2
