In [2]:
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import feature_selection

In [8]:
# Toy frame for the variance-threshold demo: one string column (A) and two
# numeric columns (B, C) that hold exactly the same values.
numeric_values = [1, 2, 3, 1, 2, 1, 1, 1]
df = pd.DataFrame({
    'A': ['m', 'f'] + ['m'] * 6,
    'B': numeric_values,
    'C': list(numeric_values),
})

df

Unnamed: 0,A,B,C
0,m,1,1
1,f,2,2
2,m,3,3
3,m,1,1
4,m,2,2
5,m,1,1
6,m,1,1
7,m,1,1


In [9]:
# Encoder used by the next cell to turn the string column df['A'] into integer codes.
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [10]:
# Replace the string labels in column A with the integer codes produced by
# the LabelEncoder, then display the updated frame.
encoded_a = le.fit_transform(df['A'])
df['A'] = encoded_a
df

Unnamed: 0,A,B,C
0,1,1,1
1,0,2,2
2,1,3,3
3,1,1,1
4,1,2,2
5,1,1,1
6,1,1,1
7,1,1,1


In [12]:
# Variance threshold (0.2): fit on the encoded frame, show the columns that
# survive, then show the per-column variances the selector computed.
vt = feature_selection.VarianceThreshold(threshold=0.2)
kept_columns = vt.fit_transform(df)
print(kept_columns)
print(vt.variances_)

[[1 1]
 [2 2]
 [3 3]
 [1 1]
 [2 2]
 [1 1]
 [1 1]
 [1 1]]
[0.109375 0.5      0.5     ]


In [16]:
# Chi-square test (requires non-negative features): score columns A and B
# against column C, which is used here as the class label.
chi2_features = df.drop(columns='C')
chi2_target = df['C']
chi2, pval = feature_selection.chi2(chi2_features, chi2_target)
chi2

array([0.42857143, 2.66666667])

In [17]:
# Data for the ANOVA (f_classif) demo: the breast-cancer dataset bundled
# with scikit-learn, split into a feature matrix X and a target vector Y.
from sklearn.datasets import load_breast_cancer

cancer_data = load_breast_cancer()
X, Y = cancer_data.data, cancer_data.target

In [20]:
# ANOVA F-test of every feature in X against the class labels Y.
# f_classif returns F-statistics (not chi-square values), so the result is
# named F, matching the naming used in the f_regression cell.
F, pval = feature_selection.f_classif(X, Y)
np.round(F)

array([647., 118., 697., 573.,  84., 313., 534., 862.,  70.,   0., 269.,
         0., 254., 244.,   3.,  53.,  39., 113.,   0.,   3., 861., 150.,
       898., 662., 122., 304., 437., 964., 119.,  66.])

In [22]:
# F-regression test: univariate linear-regression F-statistic of every
# feature against the continuous house-value target.
# Import the public loader directly; the private
# `sklearn.datasets.california_housing` module was deprecated and is no
# longer importable in current scikit-learn releases.
from sklearn.datasets import fetch_california_housing

house_data = fetch_california_housing()
X, Y = house_data.data, house_data.target
F, pval = feature_selection.f_regression(X, Y)
F

array([1.85565716e+04, 2.32841479e+02, 4.87757462e+02, 4.51085756e+01,
       1.25474103e+01, 1.16353421e+01, 4.38005453e+02, 4.36989761e+01])

In [29]:
# Load the adult (census income) data; the column names are supplied
# explicitly through `names=` rather than taken from the file.
ADULT_DATA_URL = 'https://raw.githubusercontent.com/zekelabs/data-science-complete-tutorial/master/Data/adult.data.txt'
cols = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'Salary',
]
adult_data = pd.read_csv(ADULT_DATA_URL, names=cols)
adult_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [37]:
# Preparation for SelectKBest: integer-encode every object-dtype column of
# adult_data except the 'Salary' label, using a fresh encoder per column.
from sklearn.preprocessing import LabelEncoder

cat_cols = adult_data.select_dtypes('object').columns.tolist()
cat_cols.remove('Salary')
for col in cat_cols:
    le = LabelEncoder()
    encoded_column = le.fit_transform(adult_data[col])
    adult_data[col] = encoded_column

In [40]:

# SelectKBest with the ANOVA F-test: keep the 7 highest-scoring features
# with respect to the Salary label, then show the score of every feature.
adult_features = adult_data.drop(columns='Salary')
adult_target = adult_data['Salary']
selector = feature_selection.SelectKBest(score_func=feature_selection.f_classif, k=7)
data = selector.fit_transform(adult_features, adult_target)
selector.scores_

array([1.88670731e+03, 8.69361605e+01, 2.91559359e+00, 2.06129509e+02,
       4.12009578e+03, 1.34685178e+03, 1.86500322e+02, 2.18764583e+03,
       1.68934788e+02, 1.59310791e+03, 1.70915006e+03, 7.54830452e+02,
       1.81338628e+03, 8.17155711e+00])

In [41]:
# First row of `data`; assuming cells ran in order, this is the 7-feature
# matrix returned by the f_classif SelectKBest cell.
data[0]

array([  39,   13,    4,    1,    1, 2174,   40])

In [42]:

# SelectKBest with mutual information: keep the 7 features that share the
# most information with the Salary label, then show every feature's score.
# mutual_info_classif is a randomized estimator, so random_state is fixed
# through functools.partial to make the scores (and therefore the selected
# columns) reproducible across runs.
seeded_mutual_info = partial(feature_selection.mutual_info_classif, random_state=0)
selector = feature_selection.SelectKBest(k=7, score_func=seeded_mutual_info)
data = selector.fit_transform(adult_data.drop('Salary', axis=1), adult_data.Salary)
selector.scores_

array([0.06524136, 0.01472585, 0.03392783, 0.06440705, 0.06628356,
       0.11476061, 0.06378236, 0.11897485, 0.00711522, 0.02712156,
       0.08318369, 0.03911336, 0.04324735, 0.00792524])

In [43]:
# First row of `data`; assuming cells ran in order, this is the 7-feature
# matrix returned by the mutual_info_classif SelectKBest cell.
data[0]

array([  39,    9,   13,    4,    1,    1, 2174])

In [44]:
# SelectPercentile: keep the top 20% of features ranked by mutual
# information with the Salary label, then show the first selected row.
# mutual_info_classif is a randomized estimator, so random_state is fixed
# through functools.partial to make the selection reproducible across runs.
seeded_mutual_info = partial(feature_selection.mutual_info_classif, random_state=0)
selector = feature_selection.SelectPercentile(percentile=20, score_func=seeded_mutual_info)
data = selector.fit_transform(adult_data.drop('Salary', axis=1), adult_data.Salary)
data[0]

array([   4,    1, 2174])

In [46]:
# Select from model: fit a linear regression on the Boston housing data and
# keep the features selected by SelectFromModel with threshold=0.25.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2; on newer versions this cell needs a replacement dataset.
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression

boston = load_boston()
clf = LinearRegression()
sfm = feature_selection.SelectFromModel(clf, threshold=0.25)
boston_selected = sfm.fit_transform(boston.data, boston.target)
boston_selected.shape

(506, 7)

In [47]:
# Recursive feature elimination: repeatedly fit a linear SVR on a synthetic
# regression problem (50 samples, 10 features) and drop one feature per
# step until 5 remain.
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

X, y = make_regression(n_samples=50, n_features=10, random_state=0)
estimator = SVR(kernel="linear")
# n_features_to_select is passed by keyword: recent scikit-learn releases
# reject it as a positional argument, and the keyword form also works on
# older versions.
selector = RFE(estimator, n_features_to_select=5, step=1)
data = selector.fit_transform(X, y)

In [48]:
# Shape of X; assuming cells ran in order, this is the make_regression
# feature matrix before RFE.
X.shape

(50, 10)

In [49]:
# Shape of `data`; assuming cells ran in order, this is the output of
# the RFE selector's fit_transform.
data.shape

(50, 5)

In [50]:
# Per-feature ranking from the fitted RFE selector; per the scikit-learn RFE
# documentation, rank 1 marks the selected features.
selector.ranking_

array([1, 1, 4, 3, 1, 6, 1, 2, 5, 1])