In [1]:
# thresholding numerical variance
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

# load the iris dataset and unpack its feature matrix and label vector
iris = datasets.load_iris()
features, target = iris.data, iris.target

# selector configured with a variance threshold of 0.5
thresholder = VarianceThreshold(threshold=0.5)

# learn the per-column variances, then keep only the high-variance columns
features_high_variance = thresholder.fit(features).transform(features)

# preview the first three rows of the reduced matrix
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [2]:
# refit on the current feature matrix, then show the variance learned for
# each column (compare these against the 0.5 threshold used above)
fitted_thresholder = thresholder.fit(features)
fitted_thresholder.variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [3]:
import pandas as pd
import numpy as np

# toy 9x3 feature matrix, written out one column at a time:
#   column 0 counts 1..9
#   column 1 follows column 0 but stays at 7 for the last three rows
#   column 2 alternates between 1 and 0
features = np.column_stack((
    [1, 2, 3, 4, 5, 6, 7, 8, 9],
    [1, 2, 3, 4, 5, 6, 7, 7, 7],
    [1, 0, 1, 0, 1, 0, 1, 0, 1],
))

# wrap the matrix in a DataFrame (default integer column labels) and show it
df = pd.DataFrame(data=features)
df

Unnamed: 0,0,1,2
0,1,1,1
1,2,2,0
2,3,3,1
3,4,4,0
4,5,5,1
5,6,6,0
6,7,7,1
7,8,7,0
8,9,7,1


In [4]:
# pairwise correlation between every pair of columns in df ...
pairwise_corr = df.corr()

# ... with the sign discarded, so only the strength of each relationship is left
corr_matrix = pairwise_corr.abs()

corr_matrix

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,0.034503
2,0.0,0.034503,1.0


In [6]:
# keep only the strict upper triangle (k=1 excludes the diagonal) of the
# correlation matrix, so every feature pair is inspected exactly once and a
# column's correlation with itself is ignored; everything else becomes NaN.
# the mask is built with the builtin `bool` dtype: the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# labels of the columns whose correlation with some earlier column exceeds 0.95
# (NaN entries compare as False, so masked cells never trigger a drop)
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# drop by label rather than by position, so this also works when the column
# labels are not the integers 0..n-1
df.drop(columns=to_drop).head()

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1
3,4,0
4,5,1


In [8]:
# removing irrelevant features

from sklearn.feature_selection import SelectKBest, chi2, f_classif

# start again from the iris data, casting the measurements to integers
# (the cast discards the fractional part of every value)
features = iris.data.astype(int)
target = iris.target

# chi2 for categorical features, f_classif for quantitative
chi2_selector = SelectKBest(score_func=chi2, k=2)

# score every feature against the target, then keep the two best-scoring ones
features_kbest = chi2_selector.fit(features, target).transform(features)

# report the column count before and after selection
n_original = features.shape[1]
n_reduced = features_kbest.shape[1]
print('original number of features', n_original)
print('reduced number of features', n_reduced)

original number of features 4
reduced number of features 2


In [14]:
# recursively eliminating features
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

# silence warnings raised from scipy whose message starts with "internal gelsd"
warnings.filterwarnings('ignore', message='^internal gelsd', module='scipy')

# synthetic regression problem: 10000 samples and 100 features, of which only
# 2 are informative; random_state pins the generated data
regression_spec = dict(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)
features, target = make_regression(**regression_spec)

In [15]:
# plain linear regression serves as the estimator inside the eliminator
ols = linear_model.LinearRegression()

# recursively eliminate features, one per iteration (step=1), scoring the
# candidate feature subsets with negative mean squared error
rfecv = RFECV(
    scoring='neg_mean_squared_error',
    step=1,
    estimator=ols,
).fit(features, target)

# reduce the feature matrix to the columns the fitted selector retained
features_selected = rfecv.transform(features)
features_selected

array([[ 0.00850799,  0.7031277 ],
       [-1.07500204,  2.56148527],
       [ 1.37940721, -1.77039484],
       ...,
       [-0.80331656, -1.60648007],
       [ 0.39508844, -1.34564911],
       [-0.55383035,  0.82880112]])

In [16]:
# number of features the fitted selector kept
rfecv.n_features_

2

In [17]:
# boolean mask over the input columns; True marks a column that was kept
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [18]:
# rank assigned to each input column; the kept columns are the ones ranked 1
rfecv.ranking_

array([59, 78, 54, 26, 91,  1, 56, 33, 38, 20, 12,  5, 19, 47, 66, 73, 49,
       31,  3, 24, 77, 69, 39, 83, 25,  6,  4, 92, 36, 82,  7, 21, 18, 62,
       84, 15, 88,  9, 22,  1, 87, 37, 96, 79, 52, 32, 11, 97, 48, 94, 95,
       76, 67, 72, 34, 68, 85, 13, 23, 75, 43, 71, 86, 90, 51, 63, 89, 55,
       44, 65, 80, 30, 81, 93, 27, 61, 46, 16, 70, 64, 35, 40,  2, 28, 57,
       98,  8, 29, 53, 41, 45, 10, 14, 42, 58, 50, 17, 60, 99, 74])