In [125]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,LabelEncoder,OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import ExtraTreesClassifier

In [126]:
# Read the census CSV from a path relative to the working directory and
# display the resulting frame.
census_csv_path = '../Bases de dados/census.csv'
df_cencus = pd.read_csv(census_csv_path)
df_cencus

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [127]:
# Every column label except the last one, kept as a pandas Index so later
# cells can index it by position.
feature_count = len(df_cencus.columns) - 1
columns = df_cencus.columns[:feature_count]
columns

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country'],
      dtype='object')

In [128]:
# Split the frame by position: the first 14 columns are the feature matrix,
# the column at position 14 is the label vector.
features_frame = df_cencus.iloc[:, :14]
target_series = df_cencus.iloc[:, 14]
X_census = features_frame.values
Y_census = target_series.values

In [129]:
# One LabelEncoder per categorical feature, so every fitted encoder keeps the
# classes_ of its own column and can later map codes back to labels.
label_encoder_worclass = LabelEncoder()
label_encoder_education = LabelEncoder()
label_encoder_marital_status = LabelEncoder()
label_encoder_occupation = LabelEncoder()
label_encoder_relationship = LabelEncoder()
label_encoder_race = LabelEncoder()
label_encoder_sex = LabelEncoder()
label_encoder_country = LabelEncoder()

# (position of the column in X_census, encoder dedicated to that column).
# Previously the marital-status encoder was refit for positions 6, 7, 8, 9
# and 13, leaving the other encoders unfitted and the marital-status encoder
# holding the classes of the last column it saw.
categorical_encoders = [
    (1, label_encoder_worclass),
    (3, label_encoder_education),
    (5, label_encoder_marital_status),
    (6, label_encoder_occupation),
    (7, label_encoder_relationship),
    (8, label_encoder_race),
    (9, label_encoder_sex),
    (13, label_encoder_country),
]

# Replace each categorical column in place with its integer codes.
for column_position, column_encoder in categorical_encoders:
    X_census[:, column_position] = column_encoder.fit_transform(X_census[:, column_position])

In [130]:
# Fit a min-max scaler on the encoded features, then apply it to the same
# matrix; the last expression shows the shape of the scaled result.
scaler = MinMaxScaler()
scaler.fit(X_census)
X_census_scaler = scaler.transform(X_census)
X_census_scaler.shape

(32561, 14)

In [131]:
# Print the variance of each scaled feature, one per line. Iterating over the
# transpose walks the matrix column by column.
for feature_values in X_census_scaler.T:
    print(feature_values.var())

0.034913808595952486
0.03312115190663569
0.005138537590667898
0.06657103564450892
0.029416385024073417
0.06301761677301636
0.09123816653931152
0.10326534394406342
0.04502805169292987
0.22136950173699113
0.00545419549240862
0.008557270623428908
0.015874043397822807
0.03641266114220053


In [132]:
# Keep only the scaled features whose variance is above 0.05, then show the
# shape of the reduced matrix.
selection = VarianceThreshold(threshold=0.05)
selection.fit(X_census_scaler)
X_census_selection = selection.transform(X_census_scaler)
X_census_selection.shape

(32561, 5)

In [133]:
# Preview the feature matrix returned by the variance selector.
X_census_selection

array([[0.6       , 0.66666667, 0.07142857, 0.2       , 1.        ],
       [0.6       , 0.33333333, 0.28571429, 0.        , 1.        ],
       [0.73333333, 0.        , 0.42857143, 0.2       , 1.        ],
       ...,
       [0.73333333, 1.        , 0.07142857, 0.8       , 0.        ],
       [0.73333333, 0.66666667, 0.07142857, 0.6       , 1.        ],
       [0.73333333, 0.33333333, 0.28571429, 1.        , 0.        ]])

In [134]:
# Per-feature variances stored on the fitted VarianceThreshold selector.
selection.variances_

array([0.03491381, 0.03312115, 0.00513854, 0.06657104, 0.02941639,
       0.06301762, 0.09123817, 0.10326534, 0.04502805, 0.2213695 ,
       0.0054542 , 0.00855727, 0.01587404, 0.03641266])

In [135]:
# Positions of the features whose variance exceeds the selector's threshold.
# Reading selection.threshold (instead of repeating the literal) keeps this
# cell in sync with however the selector was configured.
# np.where with a single argument returns a tuple holding one index array.
index = np.where(selection.variances_ > selection.threshold)
index

(array([3, 5, 6, 7, 9]),)

In [136]:
# Names of the features found at the positions stored in index.
columns[index]

Index(['education', 'marital-status', 'occupation', 'relationship', 'sex'], dtype='object')

In [137]:
# Drop every feature the fitted variance selector rejected. The list of
# columns comes from the selector's support mask rather than being typed by
# hand, so it cannot drift from the selection result. The label column is not
# part of `columns`, so it is always kept.
rejected_features = columns[~selection.get_support()]
df_cencus_variance = df_cencus.drop(columns=rejected_features)

In [138]:
# Display the reduced frame.
df_cencus_variance

Unnamed: 0,education,marital-status,occupation,relationship,sex,income
0,Bachelors,Never-married,Adm-clerical,Not-in-family,Male,<=50K
1,Bachelors,Married-civ-spouse,Exec-managerial,Husband,Male,<=50K
2,HS-grad,Divorced,Handlers-cleaners,Not-in-family,Male,<=50K
3,11th,Married-civ-spouse,Handlers-cleaners,Husband,Male,<=50K
4,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Female,<=50K
...,...,...,...,...,...,...
32556,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,Female,<=50K
32557,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,Male,>50K
32558,HS-grad,Widowed,Adm-clerical,Unmarried,Female,<=50K
32559,HS-grad,Never-married,Adm-clerical,Own-child,Male,<=50K


In [139]:
# Positional split of the reduced frame: the first five columns are the
# features, the column at position 5 is the label.
variance_features_frame = df_cencus_variance.iloc[:, :5]
variance_target_series = df_cencus_variance.iloc[:, 5]
X_census_variance = variance_features_frame.values
y_census_variance = variance_target_series.values

In [140]:

# Encoders listed in the same order as the columns of X_census_variance.
variance_encoders = (
    label_encoder_education,
    label_encoder_marital_status,
    label_encoder_occupation,
    label_encoder_relationship,
    label_encoder_sex,
)

# Refit each encoder on its column and overwrite the column with the codes.
for variance_position, variance_encoder in enumerate(variance_encoders):
    X_census_variance[:, variance_position] = variance_encoder.fit_transform(
        X_census_variance[:, variance_position]
    )

In [141]:
# Inspect the label-encoded feature array.
X_census_variance

array([[9, 4, 1, 1, 1],
       [9, 2, 4, 0, 1],
       [11, 0, 6, 1, 1],
       ...,
       [11, 6, 1, 4, 0],
       [11, 4, 1, 3, 1],
       [11, 2, 4, 5, 0]], dtype=object)

In [142]:
# One-hot encode all five label-encoded columns. The transformer output is
# converted to a dense array before its shape is shown.
one_hot_step = ("OneHot", OneHotEncoder(), [0, 1, 2, 3, 4])
onehotEncoder = ColumnTransformer(
    transformers=[one_hot_step],
    remainder='passthrough',
)
encoded_variance_features = onehotEncoder.fit_transform(X_census_variance)
X_census_variance_transform = encoded_variance_features.toarray()
X_census_variance_transform.shape

(32561, 46)

In [143]:
# Scale the one-hot matrix with its own scaler. Refitting the shared `scaler`
# here would overwrite the state it learned from the 14 original features,
# making it unusable for transforming or inverse-transforming that matrix.
scaler_variance = MinMaxScaler()
X_census_scaler_2 = scaler_variance.fit_transform(X_census_variance_transform)
X_census_scaler_2.shape

(32561, 46)

In [144]:
# Hold out 15% of the rows for testing; random_state fixes the shuffle.
# The labels are taken from y_census_variance, i.e. from the same reduced
# frame the features were built from, so features and labels share one source.
(
    X_census_train_variance,
    X_census_test_variance,
    y_census_train_variance,
    y_census_test_variance,
) = train_test_split(
    X_census_scaler_2,
    y_census_variance,
    test_size=0.15,
    random_state=0,
)
X_census_train_variance.shape, X_census_test_variance.shape

((27676, 46), (4885, 46))

In [145]:
# Random forest trained on the variance-selected features.
# random_state pins the forest's internal randomness so the accuracy computed
# from this model is the same on every run.
random_forest_census = RandomForestClassifier(
    criterion='entropy',
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=100,
    random_state=0,
)
random_forest_census.fit(X_census_train_variance, y_census_train_variance)

In [146]:
# Predict the held-out rows and report the fraction classified correctly.
predict = random_forest_census.predict(X_census_test_variance)
accuracy_score(y_true=y_census_test_variance, y_pred=predict)

0.8184237461617195

In [147]:
# Extra-trees model fitted on all scaled features; only its feature
# importances are used afterwards.
# random_state makes the importances, and therefore the threshold-based
# feature selection derived from them, identical on every run.
extraTree = ExtraTreesClassifier(random_state=0)
extraTree.fit(X_census_scaler, Y_census)

In [148]:
# Feature importances exposed by the fitted ExtraTreesClassifier, one value
# per column of the matrix it was trained on.
importances = extraTree.feature_importances_
importances

array([0.15870041, 0.04515574, 0.16435674, 0.03728087, 0.08699774,
       0.07104982, 0.07530984, 0.08826862, 0.01413825, 0.02901048,
       0.09033129, 0.02827433, 0.09361393, 0.01751193])

In [149]:
# Total of all feature importances.
importances.sum()

np.float64(0.9999999999999997)

In [150]:
# Positions of the features whose importance reaches the 0.029 cut-off.
index = [
    position
    for position, importance in enumerate(importances)
    if importance >= 0.029
]

index

[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 12]

In [151]:
# Names of the features whose positions are stored in index.
columns[index]

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'sex', 'capital-gain',
       'hour-per-week'],
      dtype='object')

In [152]:
# Pick the columns listed in index out of the label-encoded feature matrix.
x_census_extra = np.take(X_census, index, axis=1)
x_census_extra

array([[39, 7, 77516, ..., 1, 2174, 40],
       [50, 6, 83311, ..., 1, 0, 13],
       [38, 4, 215646, ..., 1, 0, 40],
       ...,
       [58, 4, 151910, ..., 0, 0, 40],
       [22, 4, 201490, ..., 1, 0, 20],
       [52, 5, 287927, ..., 0, 15024, 40]], dtype=object)

In [153]:
# Features that get one-hot encoded, identified by name. Their positions are
# looked up in the currently selected feature set instead of being hard-coded,
# so a different importance-based selection cannot silently one-hot encode
# the wrong columns.
one_hot_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship']
one_hot_positions = [
    position
    for position, feature in enumerate(columns[index])
    if feature in one_hot_features
]

onehotEncoder = ColumnTransformer(
    transformers=[("OneHot", OneHotEncoder(), one_hot_positions)],
    remainder='passthrough',
)
# Encode straight from X_census[:, index] rather than from x_census_extra:
# this cell reassigns x_census_extra, so reading it as input would make a
# second run of the cell encode already-encoded data.
x_census_extra = onehotEncoder.fit_transform(X_census[:, index]).toarray()
x_census_extra.shape

(32561, 59)

In [154]:
# Hold out 15% of the rows for testing; random_state fixes the shuffle.
extra_split = train_test_split(
    x_census_extra,
    Y_census,
    test_size=0.15,
    random_state=0,
)
X_census_train_extra, X_census_test_extra, y_census_train_extra, y_census_test_extra = extra_split
X_census_train_extra.shape, X_census_test_extra.shape

((27676, 59), (4885, 59))

In [155]:
# Random forest trained on the importance-selected features.
# random_state pins the forest's internal randomness so the accuracy computed
# from this model is the same on every run.
random_forest_extra = RandomForestClassifier(
    criterion='entropy',
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=100,
    random_state=0,
)
random_forest_extra.fit(X_census_train_extra, y_census_train_extra)

In [156]:
# Predict the held-out rows and report the fraction classified correctly.
predict = random_forest_extra.predict(X_census_test_extra)
accuracy_score(y_true=y_census_test_extra, y_pred=predict)

0.8466734902763562