# Feature Selection

In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
from itertools import compress

In [2]:
# Synthetic classification data set with a known feature layout, so the
# selectors further down can be judged against ground truth.
N_SAMPLES = 1000
N_INFORMATIVE = 5  # features that carry the class signal
N_REDUNDANT = 5    # linear combinations of the informative features
N_REPEATED = 5     # duplicates drawn from the informative/redundant features
N_RANDOM = 5       # remaining columns, filled with noise

# Group order must match the column order make_classification produces when
# shuffle=False: informative, redundant, repeated, then noise.
FEATURE_GROUPS = [
    ('informative', N_INFORMATIVE),
    ('redundant', N_REDUNDANT),
    ('repeated', N_REPEATED),
    ('random', N_RANDOM),
]

# make_classification already returns NumPy arrays, so no extra conversion is
# needed. shuffle=False is what makes the positional names below valid.
data, target = datasets.make_classification(
    n_samples=N_SAMPLES,
    n_features=sum(count for _, count in FEATURE_GROUPS),
    n_informative=N_INFORMATIVE,
    n_redundant=N_REDUNDANT,
    n_repeated=N_REPEATED,
    random_state=0,
    shuffle=False,
)

# One name per column, e.g. informative1..informative5, redundant1.., etc.
feature_names = [
    f'{group}{i}'
    for group, count in FEATURE_GROUPS
    for i in range(1, count + 1)
]
# Guard against the names and the generated columns drifting apart.
assert len(feature_names) == data.shape[1], "feature_names must label every column"

pd.DataFrame(data, columns=feature_names)

Unnamed: 0,informative1,informative2,informative3,informative4,informative5,redundant1,redundant2,redundant3,redundant4,redundant5,repeated1,repeated2,repeated3,repeated4,repeated5,random1,random2,random3,random4,random5
0,-0.046248,1.144745,-0.987279,3.333160,4.051760,-4.706648,6.771132,-4.819717,3.825925,-1.933259,4.051760,-4.706648,3.825925,3.333160,3.333160,-0.576410,-1.107288,0.135283,-1.052778,2.052293
1,-1.413096,1.182676,-0.708346,1.768376,-0.272121,-0.570264,2.793967,-0.403252,1.700657,-0.415913,-0.272121,-0.570264,1.700657,1.768376,1.768376,0.985309,-0.372840,-1.486236,-0.111661,-1.959615
2,-0.150822,1.693635,-2.436068,-1.472918,1.246785,-0.638807,1.091351,2.081265,0.195681,-0.715404,1.246785,-0.638807,0.195681,-1.472918,-1.472918,1.060701,0.791298,1.448645,-0.556655,0.805564
3,-3.185603,0.738761,0.847584,2.593292,-0.771326,-0.258864,3.276316,-1.678765,1.954428,-0.936575,-0.771326,-0.258864,1.954428,2.593292,2.593292,1.384041,-0.435665,-0.733842,1.348113,0.145476
4,-1.078696,1.169550,-0.482905,3.814971,1.504417,-2.842212,5.703426,-3.496012,3.465157,-1.085219,1.504417,-2.842212,3.465157,3.814971,3.814971,0.425298,-0.088040,-1.248473,1.104597,-2.029386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-2.676634,0.067559,-2.623262,0.300857,-1.986317,3.731116,2.513367,2.114415,3.438996,-1.171576,-1.986317,3.731116,3.438996,0.300857,0.300857,0.907339,0.197391,1.373848,-0.989330,-0.989102
996,-0.294485,-1.785435,-1.101075,2.994150,-0.565161,1.755694,3.826372,-2.566137,4.900834,-0.846099,-0.565161,1.755694,4.900834,2.994150,2.994150,-2.476985,0.770254,-0.086127,-0.857668,0.639174
997,-3.564600,1.162421,-1.035621,-1.439683,-1.215158,2.191775,0.556896,2.836753,0.281108,-1.241245,-1.215158,2.191775,0.281108,-1.439683,-1.439683,-0.247200,0.233380,0.870814,0.673855,1.627267
998,-1.474395,-2.439949,-1.794685,5.192866,0.118911,1.891707,7.842495,-5.068235,8.632696,-2.301303,0.118911,1.891707,8.632696,5.192866,5.192866,0.751697,-0.788341,-0.808221,-1.373604,-0.404599


## Scaling

In [3]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column (zero mean, unit variance) so that the
# selectors below compare features on a common scale. StandardScaler is
# unsupervised: it ignores any labels passed to it, so none are supplied.
scaler = StandardScaler()
data = scaler.fit(data).transform(data)

# Show the scaled matrix with the same column labels as before.
pd.DataFrame(data, columns=feature_names)

Unnamed: 0,informative1,informative2,informative3,informative4,informative5,redundant1,redundant2,redundant3,redundant4,redundant5,repeated1,repeated2,repeated3,repeated4,repeated5,random1,random2,random3,random4,random5
0,0.293510,0.806724,-0.307361,1.635257,2.694419,-2.929566,2.254338,-1.977202,1.173644,-1.292973,2.694419,-2.929566,1.173644,1.635257,1.635257,-0.564683,-1.048601,0.177917,-1.119199,2.025407
1,-0.617033,0.832875,-0.110304,0.535004,0.142642,-0.625004,0.600202,-0.064122,0.129430,-0.178516,0.142642,-0.625004,0.129430,0.535004,0.535004,1.017104,-0.316257,-1.496956,-0.142845,-1.931718
2,0.223847,1.185154,-1.330878,-1.744061,1.039037,-0.663193,-0.107931,1.012095,-0.610015,-0.398486,1.039037,-0.663193,-0.610015,-1.744061,-1.744061,1.093465,0.844546,1.534494,-0.604500,0.795703
3,-1.797811,0.526820,0.988905,1.115030,-0.151969,-0.451510,0.800815,-0.616636,0.254116,-0.560931,-0.151969,-0.451510,0.254116,1.115030,1.115030,1.420960,-0.378901,-0.719805,1.371586,0.144628
4,-0.394268,0.823826,0.048962,1.974035,1.191081,-1.890806,1.810270,-1.403813,0.996387,-0.670107,1.191081,-1.890806,0.996387,1.974035,1.974035,0.449897,-0.032272,-1.251370,1.118953,-2.000536
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-1.458755,0.064062,-1.463124,-0.496859,-0.869007,1.771484,0.483498,1.026455,0.983533,-0.733534,-0.869007,1.771484,0.983533,-0.496859,-0.496859,0.938132,0.252341,1.457237,-1.053376,-0.974458
996,0.128144,-1.213477,-0.387753,1.396887,-0.030299,0.670889,1.029588,-1.001019,1.701782,-0.494478,-0.030299,0.670889,1.701782,1.396887,1.396887,-2.489681,0.823563,-0.050777,-0.916784,0.631584
997,-2.050285,0.818911,-0.341512,-1.720692,-0.413900,0.913849,-0.330215,1.339350,-0.568042,-0.784704,-0.413900,0.913849,-0.568042,-1.720692,-1.720692,-0.231243,0.288226,0.937651,0.672083,1.606185
998,-0.657868,-1.664729,-0.877764,2.942880,0.373413,0.746668,2.699927,-2.084852,3.535370,-1.563293,0.373413,0.746668,3.535370,2.942880,2.942880,0.780490,-0.730567,-0.796631,-1.452038,-0.397936


## Linear Regression (univariate F-test selection)

In [4]:
from sklearn.feature_selection import SelectKBest, f_regression

# Univariate filter: each feature is scored on its own with the f_regression
# F-statistic against the target, and the 5 best-scoring columns are kept.
# NOTE(review): `target` holds class labels from make_classification;
# f_classif may be the more appropriate score function - confirm intent.
linear_regression = SelectKBest(f_regression, k=5)
data_new = linear_regression.fit_transform(data, target)

# get_support() is a boolean mask over the input columns. Materialize the
# selected names as a list: a bare compress() iterator is exhausted after its
# first use, which would leave `names` empty for any later cell.
names = list(compress(feature_names, linear_regression.get_support()))
pd.DataFrame(data_new, columns=names)

Unnamed: 0,informative2,informative3,redundant1,redundant2,repeated2
0,0.806724,-0.307361,-2.929566,2.254338,-2.929566
1,0.832875,-0.110304,-0.625004,0.600202,-0.625004
2,1.185154,-1.330878,-0.663193,-0.107931,-0.663193
3,0.526820,0.988905,-0.451510,0.800815,-0.451510
4,0.823826,0.048962,-1.890806,1.810270,-1.890806
...,...,...,...,...,...
995,0.064062,-1.463124,1.771484,0.483498,1.771484
996,-1.213477,-0.387753,0.670889,1.029588,0.670889
997,0.818911,-0.341512,0.913849,-0.330215,0.913849
998,-1.664729,-0.877764,0.746668,2.699927,0.746668


## Linear Regression and regularization (Lasso / L1)

In [5]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# Embedded selection: fit a cross-validated Lasso and keep the features with
# the largest coefficient magnitudes, capped at 5 by max_features.
# NOTE(review): `target` holds class labels from make_classification, while
# LassoCV is a regression model - confirm this is intended.
clf = LassoCV()
sfm = SelectFromModel(clf, max_features=5)
data_new = sfm.fit_transform(data, target)

# get_support() is a boolean mask over the input columns. Materialize the
# selected names as a list: a bare compress() iterator is exhausted after its
# first use, which would leave `names` empty for any later cell.
names = list(compress(feature_names, sfm.get_support()))
pd.DataFrame(data_new, columns=names)

Unnamed: 0,informative2,informative3,informative5,redundant2,repeated1
0,0.806724,-0.307361,2.694419,2.254338,2.694419
1,0.832875,-0.110304,0.142642,0.600202,0.142642
2,1.185154,-1.330878,1.039037,-0.107931,1.039037
3,0.526820,0.988905,-0.151969,0.800815,-0.151969
4,0.823826,0.048962,1.191081,1.810270,1.191081
...,...,...,...,...,...
995,0.064062,-1.463124,-0.869007,0.483498,-0.869007
996,-1.213477,-0.387753,-0.030299,1.029588,-0.030299
997,0.818911,-0.341512,-0.413900,-0.330215,-0.413900
998,-1.664729,-0.877764,0.373413,2.699927,0.373413


## Recursive feature selection (recursive feature elimination, RFE)

In [6]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor

# Model that RFE refits on each round; its feature importances decide which
# feature is dropped next. random_state is pinned because tree fitting is
# randomized, and an unseeded tree can select different features on each
# Restart & Run All.
estimator = DecisionTreeRegressor(random_state=0)

# Recursive feature elimination: drop one feature per round (step=1) until
# only 5 remain.
selector = RFE(estimator, n_features_to_select=5, step=1)

data_new = selector.fit_transform(data, target)

# get_support() is a boolean mask over the input columns. Materialize the
# selected names as a list: a bare compress() iterator is exhausted after its
# first use, which would leave `names` empty for any later cell.
names = list(compress(feature_names, selector.get_support()))

pd.DataFrame(data_new, columns=names)

Unnamed: 0,informative2,informative3,redundant1,redundant2,repeated3
0,0.806724,-0.307361,-2.929566,2.254338,1.173644
1,0.832875,-0.110304,-0.625004,0.600202,0.129430
2,1.185154,-1.330878,-0.663193,-0.107931,-0.610015
3,0.526820,0.988905,-0.451510,0.800815,0.254116
4,0.823826,0.048962,-1.890806,1.810270,0.996387
...,...,...,...,...,...
995,0.064062,-1.463124,1.771484,0.483498,0.983533
996,-1.213477,-0.387753,0.670889,1.029588,1.701782
997,0.818911,-0.341512,0.913849,-0.330215,-0.568042
998,-1.664729,-0.877764,0.746668,2.699927,3.535370
