# Seleção de atributos

In [1]:
import pandas as pd
import numpy as np
import seaborn as srn
srn.set()
import matplotlib.pyplot as plt

In [16]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold

## Preparação da base de dados

In [2]:
# Load the census dataset from the project-relative "dados" folder.
# A forward slash is used as separator: the previous 'dados\census.csv' relied
# on '\c' being an unrecognised escape sequence (a SyntaxWarning on recent
# Python versions) and only resolved to a directory separator on Windows.
base_census = pd.read_csv('dados/census.csv')
print(base_census.shape)
base_census.head()

(32561, 15)


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Labels of every column except the last one.
colunas = base_census.columns[:-1]
colunas

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country'],
      dtype='object')

In [5]:
# Split the frame by position: columns 0..13 become the feature matrix and
# column 14 becomes the target vector.
X_census, y_census = (
    base_census.iloc[:, :14].values,
    base_census.iloc[:, 14].values,
)
X_census.shape, y_census.shape

((32561, 14), (32561,))

In [8]:
# Label-encode the categorical attributes.
# One encoder is kept per column so each fitted mapping stays available
# under its own name after this cell runs.
label_encoder_workclass = LabelEncoder()
label_encoder_education = LabelEncoder()
label_encoder_marital = LabelEncoder()
label_encoder_occupation = LabelEncoder()
label_encoder_relationship = LabelEncoder()
label_encoder_race = LabelEncoder()
label_encoder_sex = LabelEncoder()
label_encoder_country = LabelEncoder()

# (column position in X_census, encoder fitted on that column)
_encoders_por_coluna = (
    (1, label_encoder_workclass),
    (3, label_encoder_education),
    (5, label_encoder_marital),
    (6, label_encoder_occupation),
    (7, label_encoder_relationship),
    (8, label_encoder_race),
    (9, label_encoder_sex),
    (13, label_encoder_country),
)

# Each column of X_census is overwritten in place with its integer codes.
for _posicao, _encoder in _encoders_por_coluna:
    X_census[:, _posicao] = _encoder.fit_transform(X_census[:, _posicao])

In [11]:
# Normalization: fit a min-max scaler on the encoded features, then apply it.
scaler = MinMaxScaler().fit(X_census)
X_census_scaler = scaler.transform(X_census)
X_census_scaler.shape

(32561, 14)

## Low variance

In [19]:
# Shape of the scaled feature matrix (rows, attributes) before any selection.
X_census_scaler.shape

(32561, 14)

In [17]:
# Variance of each attribute, one value per line.
# Rows of the transposed matrix are the columns of the scaled matrix.
for _atributo in X_census_scaler.T:
    print(_atributo.var())

0.034913808595952486
0.03312115190663569
0.005138537590667898
0.06657103564450892
0.029416385024073417
0.06301761677301636
0.09123816653931152
0.10326534394406342
0.04502805169292987
0.22136950173699113
0.00545419549240862
0.008557270623428908
0.015874043397822807
0.03641266114220053


In [18]:
# Fit the variance filter on the scaled features, then apply it.
selecao = VarianceThreshold(threshold=0.05).fit(X_census_scaler)
X_census_variancia = selecao.transform(X_census_scaler)
X_census_variancia.shape  # only 5 attributes were selected

(32561, 5)

In [20]:
# Per-attribute variances stored on the fitted selector.
selecao.variances_

array([0.03491381, 0.03312115, 0.00513854, 0.06657104, 0.02941639,
       0.06301762, 0.09123817, 0.10326534, 0.04502805, 0.2213695 ,
       0.0054542 , 0.00855727, 0.01587404, 0.03641266])

In [22]:
# Positions of the attributes kept by the fitted selector.
# The mask is taken from the selector itself rather than from a second
# hard-coded 0.05, so the indices cannot drift from the threshold that was
# actually used to fit `selecao`. np.where keeps the result as a tuple holding
# one index array.
indices = np.where(selecao.get_support())
indices

(array([3, 5, 6, 7, 9], dtype=int64),)

In [50]:
# Names of the selected attributes, followed by the 'income' column.
colunas_base = [*colunas[indices].values, 'income']
colunas_base

['education', 'marital-status', 'occupation', 'relationship', 'sex', 'income']

In [51]:
# Keep only the selected attribute columns plus 'income' from the original frame.
base_census_variancia = base_census.loc[:, colunas_base]
base_census_variancia.head()

Unnamed: 0,education,marital-status,occupation,relationship,sex,income
0,Bachelors,Never-married,Adm-clerical,Not-in-family,Male,<=50K
1,Bachelors,Married-civ-spouse,Exec-managerial,Husband,Male,<=50K
2,HS-grad,Divorced,Handlers-cleaners,Not-in-family,Male,<=50K
3,11th,Married-civ-spouse,Handlers-cleaners,Husband,Male,<=50K
4,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Female,<=50K


In [63]:
# Split the reduced frame by position: the first five columns are the
# features and the column at position 5 is the target.
# The 'variacia' spelling is kept because the train/test split cell reads
# y_census_variacia under this exact name.
X_census_variacia, y_census_variacia = (
    base_census_variancia.iloc[:, :5].values,
    base_census_variancia.iloc[:, 5].values,
)
X_census_variacia.shape, y_census_variacia.shape

((32561, 5), (32561,))

In [58]:
# One-hot encode the five selected categorical columns.
# The encoder is fitted on X_census_variacia (the categorical matrix taken
# from base_census_variancia) instead of on X_census_variancia itself.
# Previously this cell consumed and overwrote its own input, so it depended
# on the array left behind by the VarianceThreshold cell and produced a
# different result when executed a second time.
onehotencoder = ColumnTransformer(
    transformers=[('OneHot', OneHotEncoder(), [0, 1, 2, 3, 4])],
    remainder='passthrough',
)
X_census_variancia = onehotencoder.fit_transform(X_census_variacia).toarray()
X_census_variancia.shape

(32561, 46)

In [62]:
# Normalization of the one-hot matrix with a fresh min-max scaler.
# NOTE(review): this rebinds `scaler`, which an earlier cell fitted on the
# full feature matrix; presumably a no-op on 0/1 indicator columns - confirm.
scaler = MinMaxScaler().fit(X_census_variancia)
X_census_variancia = scaler.transform(X_census_variancia)
X_census_variancia.shape

(32561, 46)

In [64]:
# Train / test split: 15% of the rows are held out, with a fixed seed.
(
    X_census_train_var,
    X_census_test_var,
    y_census_train_var,
    y_census_test_var,
) = train_test_split(
    X_census_variancia,
    y_census_variacia,
    test_size=0.15,
    random_state=0,
)
print('train:', X_census_train_var.shape, y_census_train_var.shape)
print('test:', X_census_test_var.shape, y_census_test_var.shape)

train: (27676, 46) (27676,)
test: (4885, 46) (4885,)


In [67]:
# Random forest trained on the variance-selected, one-hot encoded features.
random_forest_var = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=0,
)
random_forest_var.fit(X_census_train_var, y_census_train_var)

RandomForestClassifier(criterion='entropy', min_samples_split=5, random_state=0)

In [68]:
# Predict on the held-out rows and report the accuracy against the true labels.
previsoes = random_forest_var.predict(X_census_test_var)
accuracy_score(y_true=y_census_test_var, y_pred=previsoes)

0.8176049129989764

## Extra tree