In [2]:
#10.1 Thresholding Numerical Feature Variance

# Load libraries
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

# Import some data to play with
iris = datasets.load_iris()

# Create features and target
features = iris.data
target = iris.target

# Create thresholder
# Any feature whose variance is <= 0.5 will be removed
thresholder = VarianceThreshold(threshold=.5)

# Create high variance feature matrix
features_high_variance = thresholder.fit_transform(features)

# View high variance feature matrix (first three rows only, to keep the output short)
features_high_variance[0:3]



In [4]:
# Fit the thresholder on the raw features and inspect the variance it
# computed for each column (these are the values compared to the threshold)
fitted_thresholder = thresholder.fit(features)
fitted_thresholder.variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [5]:
# Load library
from sklearn.preprocessing import StandardScaler

# Standardize the feature matrix: learn the scaling on `features`, then apply it
scaler = StandardScaler().fit(features)
features_std = scaler.transform(features)

# Calculate the variance of each standardized feature.
# NOTE: after standardization every feature has variance 1 (see the output),
# so variance thresholding cannot tell the features apart any more.
selector = VarianceThreshold().fit(features_std)
selector.variances_

array([1., 1., 1., 1.])

In [None]:
#10.2 Thresholding Binary Feature Variance

# Load library
from sklearn.feature_selection import VarianceThreshold

# Binary feature matrix (rows are observations, columns are features):
#   Feature 0: 80% class 0, 20% class 1
#   Feature 1: 20% class 0, 80% class 1
#   Feature 2: 60% class 0, 40% class 1
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# For a binary feature, Var(x) = p(1 - p), where p is the share of class 1.
# The cutoff used here is .75 * (1 - .75) = 0.1875, so:
#   Feature 0 -> p = 0.2, var = 0.16 -> removed
#   Feature 1 -> p = 0.8, var = 0.16 -> removed
#   Feature 2 -> p = 0.4, var = 0.24 -> kept
majority_share = .75
variance_cutoff = majority_share * (1 - majority_share)

# Run threshold by variance
thresholder = VarianceThreshold(threshold=variance_cutoff)
thresholder.fit_transform(features)


array([[0],
       [1],
       [0],
       [1],
       [0]])

In [None]:
#10.3 Handling Highly Correlated Features

# Load libraries
import pandas as pd
import numpy as np

# Create feature matrix with two highly correlated features (columns 0 and 1)
features = np.array([[1, 1, 1],
                     [2, 2, 0],
                     [3, 3, 1],
                     [4, 4, 0],
                     [5, 5, 1],
                     [6, 6, 0],
                     [7, 7, 1],
                     [8, 7, 0],
                     [9, 7, 1]])

# Convert feature matrix into DataFrame
df = pd.DataFrame(features)

# Create correlation matrix; the absolute value is taken so that strong
# negative correlations are treated the same as strong positive ones
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix.
# np.triu(..., k=1) keeps only the entries strictly above the main diagonal,
# so each pair of features appears once and the self-correlations are masked
# out (they become NaN in `upper`).
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),
                                  k=1).astype(bool))

# Collect the labels of columns whose correlation with at least one
# earlier column is greater than 0.95
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Drop features.
# `to_drop` holds column *labels*, so drop by label. The previous form,
# df.columns[to_drop], treated the labels as positions and only worked
# because the columns happen to be named 0..n-1.
df_reduced = df.drop(columns=to_drop)
df_reduced.head(3)


Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


In [14]:
# Correlation matrix
# Signed pairwise correlation between the columns of df (the selection step
# in the 10.3 cell works on the absolute values of this matrix)
df.corr()

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,-0.034503
2,0.0,-0.034503,1.0


In [15]:
# Upper triangle of correlation matrix
# Only entries strictly above the main diagonal are kept; the diagonal and
# everything below it are NaN, so each feature pair is listed exactly once
upper

Unnamed: 0,0,1,2
0,,0.976103,0.0
1,,,0.034503
2,,,


In [26]:
#10.4 Removing Irrelevant Features for Classification

# Load libraries
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

# Load data and cast the measurements to integers so they can be
# treated as categorical data
iris = load_iris()
features = iris.data.astype(int)
target = iris.target

# Keep the two features with the highest chi-squared statistics
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit(features, target).transform(features)

# Show results
n_features_before = features.shape[1]
n_features_after = features_kbest.shape[1]
print("Original number of features:", n_features_before)
print("Reduced number of features:", n_features_after)


Original number of features: 4
Reduced number of features: 2


In [29]:
# Keep the two features with the highest F-values
fvalue_selector = SelectKBest(score_func=f_classif, k=2)
features_kbest = fvalue_selector.fit(features, target).transform(features)

# Show results
n_features_before = features.shape[1]
n_features_after = features_kbest.shape[1]
print("Original number of features:", n_features_before)
print("Reduced number of features:", n_features_after)

Original number of features: 4
Reduced number of features: 2


In [34]:
# Load library
from sklearn.feature_selection import SelectPercentile

# Keep the top 75% of features ranked by F-value
# (with the 4 features used here, that leaves 3 -- see the output)
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=75)
features_kbest = fvalue_selector.fit(features, target).transform(features)

# Show results
n_features_before = features.shape[1]
n_features_after = features_kbest.shape[1]
print("Original number of features:", n_features_before)
print("Reduced number of features:", n_features_after)

Original number of features: 4
Reduced number of features: 3


In [None]:
#10.5 Recursively Eliminating Features

# Load libraries
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

# Suppress an annoying but harmless warning.
# `module` and `message` are regular expressions matched against the start of
# the issuing module's name and of the warning text, respectively.
warnings.filterwarnings('ignore',
                        message='^internal gelsd',
                        module='scipy')

# Generate the features matrix and target vector
# (coef=True is not passed, so the true coefficients are not returned).
# n_informative=2 -> only two features actually contribute to predicting the target
synthetic_data = make_regression(n_samples=10000,
                                 n_features=100,
                                 n_informative=2,
                                 random_state=1)
features, target = synthetic_data

# Create a linear regression
ols = linear_model.LinearRegression()

# Recursively eliminate features:
#   estimator -> base model used to evaluate feature importance (here OLS)
#   step      -> number of features to remove at each iteration
#   scoring   -> model performance metric used for feature selection
rfecv = RFECV(estimator=ols,
              step=1,
              scoring='neg_mean_squared_error').fit(features, target)

# Reduce the feature matrix to the selected columns
rfecv.transform(features)

array([[ 0.00850799,  0.7031277 ],
       [-1.07500204,  2.56148527],
       [ 1.37940721, -1.77039484],
       ...,
       [-0.80331656, -1.60648007],
       [ 0.39508844, -1.34564911],
       [-0.55383035,  0.82880112]], shape=(10000, 2))

In [36]:
# Number of best features
# i.e. how many features the fitted RFECV selector decided to keep
rfecv.n_features_

np.int64(2)

In [37]:
# Which features are best
# Boolean mask with one entry per input feature; True marks a selected feature
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [38]:
# Rank features best (1) to worst
# One rank per input feature; the selected features all share rank 1
rfecv.ranking_

array([38, 24, 80, 58, 79,  1,  3, 63, 54, 66, 85, 64, 61, 81, 98, 69, 15,
       34, 12, 36, 78, 62, 42, 60, 96, 92, 70, 55, 46, 95, 84, 37, 65, 56,
       59, 74, 87, 10, 23,  1, 14, 16, 40,  6, 50, 67, 33, 11, 75, 41, 83,
       26, 86, 49, 52, 32, 94,  4, 77, 44,  2, 18, 28, 90, 88,  8, 45, 19,
       25, 73,  7, 30, 48, 17, 47, 89, 51, 53, 13, 82,  9,  5, 21, 22, 57,
       99, 20, 91, 71, 76, 97, 93, 39, 27, 68, 43, 31, 35, 72, 29])