In [1]:
import numpy as np
import pandas as pd 

In [2]:
from sklearn.feature_extraction import DictVectorizer 
data = [
    {'age':4, 'height':96.0},
    {'age':1, 'height':73.9},
    {'age':3, 'height':88.9},
    {'age':2, 'height':81.6}
]

dv = DictVectorizer(sparse=False)
data_transformed = dv.fit_transform(data)
np.var(data_transformed, axis=0)

array([ 1.25 , 67.735])

In [3]:
from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=9)
data_new = vt.fit_transform(data_transformed)
print(data_new)

[[96. ]
 [73.9]
 [88.9]
 [81.6]]


In [4]:
# Variance = 1/n (summation of [(xi - mean)**2] over i=1 to n )

In [5]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_regression

X_cali, y_cali = fetch_california_housing(return_X_y=True)

# X_cali_df = pd.DataFrame(X_cali, columns=fetch_california_housing().feature_names)
# y_cali_df = pd.DataFrame(y_cali, columns=['target'])

X, y = X_cali[:2000, :], y_cali[:2000]
print(f"Shape of feature matrix before feature selection: {X.shape}")


Shape of feature matrix before feature selection: (2000, 8)


In [6]:
skb = SelectKBest(mutual_info_regression, k=3)
X_new = skb.fit_transform(X,y)
print(f"Shape of feature matrix after feature selection : {X_new.shape}")

Shape of feature matrix after feature selection : (2000, 3)


In [7]:
skb.get_feature_names_out()

array(['x0', 'x6', 'x7'], dtype=object)

In [8]:
pd.DataFrame(X_new).head()

Unnamed: 0,0,1,2
0,8.3252,37.88,-122.23
1,8.3014,37.86,-122.22
2,7.2574,37.85,-122.24
3,5.6431,37.85,-122.25
4,3.8462,37.85,-122.25


In [9]:
fetch_california_housing().feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [12]:
from sklearn.feature_selection import SelectPercentile
sp = SelectPercentile(mutual_info_regression, percentile=30) #select the top 30 percentile
X_newsp = sp.fit_transform(X,y)
print(f"Shape of feature matrix after feature selection {X_newsp.shape}")

Shape of feature matrix after feature selection (2000, 3)


Genric Univariate Select - It applies univariate feature selection with a certain strategy, which is passed to the API via mode paremeter.
The mode can take any one of the following values : percentile, k_best, fpr, fdr, fwe

In [13]:
from sklearn.feature_selection import GenericUnivariateSelect
gus = GenericUnivariateSelect(mutual_info_regression, mode='k_best', param=3)
X_gus = gus.fit_transform(X,y)
print(f"Shape of a feature matrix before feature selection : {X.shape}")
print(f"Shape of a feature matrix after selection : {X_gus.shape}")

Shape of a feature matrix before feature selection : (2000, 8)
Shape of a feature matrix after selection : (2000, 3)


Wrapper Based Method :
RFE(Recursive Feature Elimination) - Step1) Fits a model and Step2) Rank the features, afterwards it removes one or more features(depending upon step parameter), these 2 steps are repeated untill desired number of features are selected

In [15]:
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
selector = RFE(lr, n_features_to_select=3, step=1)
selector = selector.fit(X,y)

print(selector.support_)
print(f"Rank of each feature is : {selector.ranking_}")

[ True False False False False False  True  True]
Rank of each feature is : [1 5 4 3 6 2 1 1]


In [17]:
X_rfe = selector.transform(X)
print(f"Shape of feature matrix after feature selection : {X_rfe.shape}")

Shape of feature matrix after feature selection : (2000, 3)


In [18]:
from sklearn.feature_selection import SelectFromModel
lr = LinearRegression()
lr.fit(X,y)

print(f"Coefficients of features : {lr.coef_}")
print(f"Indices of top {3} features : {np.argsort(lr.coef_)[-3:]}")

t = np.argsort(np.abs(lr.coef_))[-3:]
model = SelectFromModel(lr, max_features=3, prefit=True)

X_sfm = model.transform(X)
print(f"Shape of feature matrix after feature selection : {X_sfm.shape}")

Coefficients of features : [ 3.64048292e-01  5.56221906e-03  5.13591243e-02 -1.64474348e-01
  5.90411479e-05 -1.64573915e-01 -2.17724525e-01 -1.85343265e-01]
Indices of top 3 features : [1 2 0]
Shape of feature matrix after feature selection : (2000, 3)


In [19]:
from sklearn.feature_selection import SequentialFeatureSelector
lr = LinearRegression()
sfs = SequentialFeatureSelector(lr, n_features_to_select=3)
sfs.fit(X,y)
print(sfs.get_support())

[ True False False False False  True  True False]


In [20]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X)