# Feature Selection

In [5]:
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, f_classif, chi2


# Load the training and test splits (semicolon-separated CSV files).
training = pd.read_csv("training.csv", sep=";")
test = pd.read_csv("test.csv", sep=";")

# Column labels are written with the leading space used throughout this notebook.
categorical_columns = [
    ' workclass', ' education', ' marital-status', ' occupation',
    ' relationship', ' race', ' sex', ' native-country',
    ' salary-classification',
]
feature_columns = [
    'age', ' workclass', ' fnlwgt', ' education', ' education-num',
    ' marital-status', ' occupation', ' relationship', ' race', ' sex',
    ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
]
target_column = ' salary-classification'

# Encode every categorical column as integers.
# One encoder per column is fitted on the TRAINING data only and then reused
# for the test data, so a given category maps to the same integer in both
# frames. (Fitting a fresh encoder on each frame, as was done before, can give
# the same category different codes whenever the two frames do not contain
# exactly the same set of values.)
label_encoders = {}
for column in categorical_columns:
    encoder = preprocessing.LabelEncoder()
    training[column] = encoder.fit_transform(training[column])
    # transform() raises ValueError if the test split contains a category that
    # never occurs in the training split; that must be handled explicitly
    # rather than silently receiving an unrelated code.
    test[column] = encoder.transform(test[column])
    label_encoders[column] = encoder

# Kept so that code referring to `label_encoder` still finds a fitted encoder
# for the target column (e.g. to call inverse_transform on predictions).
label_encoder = label_encoders[target_column]

# Feature matrix / target vector for the training split.
data = training[feature_columns]
target = training[target_column]

# Feature matrix / target vector for the test split.
data_test = test[feature_columns]
target_test = test[target_column]


In [14]:
# Number of missing values per feature column, shown as a one-column table.
data.isnull().sum().to_frame()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education-num,0
marital-status,0
occupation,0
relationship,0
race,0
sex,0


# Filter methods

### SelectKBest

In [7]:
# Univariate filter: score every feature against the target with the
# chi-squared statistic and keep the 10 highest-scoring ones.
# TODO: try other values of k (the original note said 1 to 19, but `data`
# has only 14 columns).
# NOTE(review): chi2 is meant for non-negative, count-like features; scores for
# label-encoded or large-scale columns should be read with care.
selector = SelectKBest(score_func=chi2, k=10)
fit = selector.fit(data, target)

# Positions and names of the retained features.
cols = selector.get_support(indices=True)
cols_names = data.columns[cols].tolist()

# Chi-squared score of every column, in the column order of `data`.
fit.scores_

array([8.60061182e+03, 4.75081192e+01, 1.71147683e+05, 2.97942270e+02,
       2.40142178e+03, 1.12346982e+03, 5.04558854e+02, 3.65914312e+03,
       3.30313051e+01, 5.02439419e+02, 8.21924671e+07, 1.37214589e+06,
       6.47640900e+03, 1.36192560e+01])

### VarianceThreshold

In [8]:
from sklearn.feature_selection import VarianceThreshold

# Fit a variance filter with the default threshold (0.0): only constant
# columns would be dropped. The reduced matrix is not needed here, only the
# per-column statistic.
# NOTE(review): with a threshold of 0 the displayed values look like ranges
# (max - min) rather than variances for several columns -- confirm how the
# installed scikit-learn version computes `variances_` before interpreting them.
selector = VarianceThreshold(threshold=0.0)
selector.fit(data)
selector.variances_

array([7.30000000e+01, 2.11975372e+00, 1.47242000e+06, 1.49784830e+01,
       6.61868663e+00, 2.26863420e+00, 1.40000000e+01, 2.58163360e+00,
       7.20448827e-01, 2.21369502e-01, 9.99990000e+04, 4.35600000e+03,
       9.80000000e+01, 4.10000000e+01])

# Wrapper methods

### Recursive Feature Elimination

In [10]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive Feature Elimination: repeatedly fit a logistic regression and
# drop the weakest feature until 10 features remain.
# max_iter is raised well above the default so lbfgs has room to converge.
model = LogisticRegression(solver='lbfgs', max_iter=5000)
# n_features_to_select is passed by keyword: newer scikit-learn releases
# reject it as a positional argument, while the keyword form works everywhere.
rfe = RFE(model, n_features_to_select=10)
fit = rfe.fit(data, target)

# Boolean mask of selected features, then the elimination ranking
# (1 = selected, larger numbers were eliminated earlier).
print(fit.support_)
print(fit.ranking_)

# Positions and names of the retained features.
cols = rfe.get_support(indices=True)
cols_names = list(data.columns[cols])
cols



[ True  True False  True  True  True  True  True  True  True False False
  True False]
[1 1 5 1 1 1 1 1 1 1 4 3 1 2]


array([ 0,  1,  3,  4,  5,  6,  7,  8,  9, 12], dtype=int32)

# Embedded methods

### Principal Component Analysis (unsupervised feature extraction, not an embedded selection method)

In [12]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps all components and report the share of total variance
# carried by each one.
# NOTE(review): `data` is passed in unscaled, so the ratio is presumably
# dominated by the columns with the largest numeric scale -- consider
# standardising the features first and confirm the intended setup.
pca = PCA()
fit = pca.fit(X=data)
variance_ratios = fit.explained_variance_ratio_
print(variance_ratios)

[9.95113633e-01 4.87183945e-03 1.44878129e-05 1.66472783e-08
 1.32821762e-08 5.46332990e-09 1.60533706e-09 1.42986911e-09
 4.49647372e-10 2.17725494e-10 1.75033388e-10 1.68427510e-10
 6.16270754e-11 1.23071689e-11]


### Feature Importance

In [13]:
from sklearn.ensemble import ExtraTreesClassifier


# Tree-ensemble feature importance: fit an extremely-randomised-trees
# classifier and read the impurity-based importance of each column.
# random_state is fixed because the ensemble is randomised; without a seed the
# importances (and any ranking derived from them) change on every run.
model = ExtraTreesClassifier(n_estimators=19, random_state=42)
model.fit(data, target)

# Importances are in the column order of `data`.
model.feature_importances_

array([0.16652223, 0.04498153, 0.16339744, 0.04158615, 0.08359418,
       0.06766071, 0.07706281, 0.08304814, 0.01443687, 0.02676704,
       0.09135605, 0.02883226, 0.09281332, 0.01794127])