In [5]:
##Thresholding Numerical Feature Variance

# Load libraries
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

# Load the iris dataset that ships with scikit-learn
iris = datasets.load_iris()

# Split it into the feature matrix and the target vector
features, target = iris.data, iris.target

# Build a selector that keeps only features whose variance exceeds 0.5
thresholder = VarianceThreshold(threshold=0.5)

# Fit the selector and drop the low-variance columns in one step
features_high_variance = thresholder.fit_transform(features)

# Preview the first three rows of the reduced feature matrix
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [4]:
# Refit the selector on the feature matrix, then show the variance it
# measured for each column
thresholder.fit(features)
thresholder.variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [2]:
# If the features have been standardized (to mean zero and unit variance),
# every feature has a variance of 1, so variance thresholding cannot
# distinguish between them:

# Load library
from sklearn.preprocessing import StandardScaler

# Rescale every feature to zero mean and unit variance
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Calculate the variance of each standardized feature
selector = VarianceThreshold()
selector.fit(features_std)
selector.variances_

array([1., 1., 1., 1.])

In [6]:
### Thresholding Binary Feature Variance

# Load library
from sklearn.feature_selection import VarianceThreshold
# Binary feature matrix (five observations, three features):
#   Feature 0: 80% class 0
#   Feature 1: 80% class 1
#   Feature 2: 60% class 0, 40% class 1
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

In [7]:
# For a binary (Bernoulli) feature the variance is p * (1 - p), so this
# threshold is the variance of a feature whose majority class covers 75%
# of the observations
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

In [8]:
### Handling Highly Correlated Features

# Load libraries
import pandas as pd
import numpy as np

# Feature matrix whose first two columns are highly correlated with each
# other, while the third column simply alternates between 1 and 0
features = np.array(
    [
        [1, 1, 1],
        [2, 2, 0],
        [3, 3, 1],
        [4, 4, 0],
        [5, 5, 1],
        [6, 6, 0],
        [7, 7, 1],
        [8, 7, 0],
        [9, 7, 1],
    ]
)

In [10]:
# Wrap the feature matrix in a DataFrame so pandas can compute correlations
dataframe = pd.DataFrame(data=features)

# Pairwise correlations between columns, as absolute values so that strong
# negative correlations count as well
corr_matrix = abs(dataframe.corr())
corr_matrix

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,0.034503
2,0.0,0.034503,1.0


In [11]:
# Keep only the entries strictly above the diagonal so that each pair of
# features appears once; every other entry becomes NaN.
# The mask is built with the builtin `bool`: the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24, where it raises
# AttributeError.
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
)
upper

Unnamed: 0,0,1,2
0,,0.976103,0.0
1,,,0.034503
2,,,


In [12]:
# Collect the labels of columns that correlate above 0.95 with an earlier
# column (NaN entries compare as False, so the masked cells are ignored)
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Drop those columns by label. `to_drop` holds column labels, so indexing
# `dataframe.columns` with it positionally would only work while the labels
# happen to equal the positions (the default integer column index).
dataframe.drop(columns=to_drop).head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


In [13]:
## Removing Irrelevant Features for Classification

## You have a categorical target vector and want to remove uninformative features.

# Load libraries
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

In [16]:
# Load the iris dataset and keep its target vector
iris = load_iris()
target = iris.target

# Truncate the measurements to integers so they can be treated as
# categorical data
features = iris.data.astype(int)

In [17]:
# Keep the two features with the highest chi-squared statistic with
# respect to the target
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit_transform(features, target)

In [18]:
# Show results: compare the column count of the feature matrix before and
# after selection
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [19]:
# For quantitative features, score each feature by its ANOVA F-value
# against the target vector instead of the chi-squared statistic.
# NOTE(review): `features` was truncated to integers in an earlier cell for
# the chi-squared example; confirm that is intended here as well.

# Keep the two features with the highest F-values
fvalue_selector = SelectKBest(score_func=f_classif, k=2)
features_kbest = fvalue_selector.fit_transform(features, target)

# Compare the column count before and after selection
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [20]:
## SelectPercentile to select the top n percent of features

# Load library
from sklearn.feature_selection import SelectPercentile

# Keep the top 75% of features ranked by ANOVA F-value
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(features, target)

# Compare the column count before and after selection
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 3


In [21]:
### Recursively Eliminating Features

# Load libraries
import warnings
from sklearn.datasets import make_regression

from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

# Suppress an annoying but harmless warning
warnings.filterwarnings(action="ignore", module="scipy",
message="^internal gelsd")

In [22]:
# Simulate a regression problem: 10,000 samples and 100 features, of which
# only 2 are informative (fixed seed for reproducibility)
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

# Ordinary least squares serves as the estimator that ranks features
ols = linear_model.LinearRegression()

# Recursive feature elimination with cross-validation: remove one feature
# per step and score each subset by negative mean squared error
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)

# Reduce the feature matrix to the selected columns
rfecv.transform(features)

array([[ 0.00850799,  0.7031277 , -0.34606121],
       [-1.07500204,  2.56148527, -1.8392567 ],
       [ 1.37940721, -1.77039484, -0.90016708],
       ...,
       [-0.80331656, -1.60648007, -1.28329706],
       [ 0.39508844, -1.34564911,  0.85012142],
       [-0.55383035,  0.82880112,  0.27741159]])

In [23]:
# Number of best features, i.e. how many features the fitted RFECV kept
rfecv.n_features_

3

In [25]:
# Selection mask of the fitted RFECV, with one entry per input feature;
# this cell shows only the mask's length, not which features were kept
bol = rfecv.support_
len(bol)

100

In [27]:
# We can even view the rankings of the features:

# Rank features from best (1) to worst; every selected feature has rank 1
rfecv.ranking_

array([50, 47, 66, 74, 28,  1, 33, 36, 39, 35, 41, 45,  5, 32, 30, 51, 42,
       88, 10, 27, 82, 84, 54, 95, 24, 98, 73, 20, 13, 59, 40, 76, 26,  9,
       17, 21, 65, 75, 57,  1, 14, 38, 64, 96, 19, 78,  2, 37, 90, 89, 93,
       12, 25, 58, 29, 97, 22,  7, 83, 48,  4, 15, 77,  6, 46, 67, 87, 80,
       53, 49, 86, 60, 68, 62, 79, 34,  3, 31,  1, 69, 44, 72,  8, 71, 85,
       81, 11, 55, 94, 56, 52, 18, 43, 61, 16, 70, 91, 23, 92, 63])