# Projeto: Predição de renda
<hr>

#### Estudo e predição de dados de renda utilizando o algoritmo de Machine Learning MLPClassifier <br>

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the census dataset from the CSV file next to the notebook.
df = pd.read_csv('census.csv')

# Scaler instance reused later to standardise the input features.
standard_scaler = StandardScaler()

> #### Iniciando exploração do dataset

In [3]:
# Preview the first five rows to inspect column names and value formats.
df.head(n=5)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(32561, 15)

#### Definidas variáveis de entrada e variável alvo

In [5]:
# Model inputs: the three numeric columns used as features.
# Column names are spelled exactly as they appear in the CSV header.
x_census = df.loc[:, ['capital-gain', 'capital-loos', 'hour-per-week']]

# Target: the income bracket label.
y_census = df.loc[:, 'income']

#### Utilizado StandardScaler para padronização (normalização z-score) dos dados de entrada

In [6]:
# Standardise each feature column (zero mean, unit variance); the result is a
# NumPy array that replaces the DataFrame previously bound to x_census.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaling. Consider fitting on the
# training split only.
x_census = standard_scaler.fit(x_census).transform(x_census)

In [7]:
# Display the scaled feature array.
x_census

array([[ 0.1484529 , -0.21665953, -0.03542945],
       [-0.14592048, -0.21665953, -2.22215312],
       [-0.14592048, -0.21665953, -0.03542945],
       ...,
       [-0.14592048, -0.21665953, -0.03542945],
       [-0.14592048, -0.21665953, -1.65522476],
       [ 1.88842434, -0.21665953, -0.03542945]])

#### Efetuada separação dos dados de treinamento dos dados de teste

In [8]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# identical across runs.
x_census_train, x_census_test, y_census_train, y_census_test = train_test_split(
    x_census,
    y_census,
    test_size=0.25,
    random_state=0,
)

In [9]:
# Sanity-check the split: shapes of train/test inputs and train/test targets.
tuple(
    part.shape
    for part in (x_census_train, x_census_test, y_census_train, y_census_test)
)

((24420, 3), (8141, 3), (24420,), (8141,))

#### Iniciando treinamento do modelo

In [10]:
# Multi-layer perceptron with two hidden layers of 100 units each, capped at
# 500 training iterations with a convergence tolerance of 1e-6.
# random_state pins the weight initialisation and batch shuffling so that
# Restart & Run All reproduces the same model and accuracy, consistent with
# the seeded train/test split and random forest in this notebook.
# verbose=True prints the loss per iteration; clear that output before sharing.
rede_neural = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    max_iter=500,
    tol=1e-6,
    verbose=True,
    random_state=0,
)
rede_neural.fit(x_census_train, y_census_train)

Iteration 1, loss = 0.48973508
Iteration 2, loss = 0.45649206
Iteration 3, loss = 0.45387751
Iteration 4, loss = 0.45281176
Iteration 5, loss = 0.45154903
Iteration 6, loss = 0.45120837
Iteration 7, loss = 0.45086760
Iteration 8, loss = 0.45049512
Iteration 9, loss = 0.44976038
Iteration 10, loss = 0.44903634
Iteration 11, loss = 0.44822728
Iteration 12, loss = 0.44782079
Iteration 13, loss = 0.44800665
Iteration 14, loss = 0.44688648
Iteration 15, loss = 0.44670325
Iteration 16, loss = 0.44650865
Iteration 17, loss = 0.44594202
Iteration 18, loss = 0.44572669
Iteration 19, loss = 0.44599040
Iteration 20, loss = 0.44522930
Iteration 21, loss = 0.44499271
Iteration 22, loss = 0.44459138
Iteration 23, loss = 0.44469571
Iteration 24, loss = 0.44480041
Iteration 25, loss = 0.44490688
Iteration 26, loss = 0.44414983
Iteration 27, loss = 0.44405912
Iteration 28, loss = 0.44429054
Iteration 29, loss = 0.44389404
Iteration 30, loss = 0.44320332
Iteration 31, loss = 0.44425402
Iteration 32, los

In [11]:
# Predict the income label for every held-out test row with the trained MLP,
# then display the predictions.
previsoes = rede_neural.predict(x_census_test)
previsoes

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' <=50K'],
      dtype='<U6')

In [12]:
# Display the true test labels for visual comparison with the predictions.
y_census_test

22278     <=50K
8950      <=50K
7838      <=50K
16505     <=50K
19140      >50K
          ...  
4149      <=50K
17168     <=50K
21748     <=50K
18155      >50K
150       <=50K
Name: income, Length: 8141, dtype: object

#### Avaliando a acurácia do modelo

In [13]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_census_test, y_pred=previsoes)

0.8156246161405233

#### Avaliando a acurácia de um modelo Random Forest (floresta aleatória) nos dados de teste, para comparação

In [14]:
# Baseline for comparison: a forest of 100 trees using the entropy split
# criterion, seeded so the result is repeatable.
random_forest_census = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
)
random_forest_census.fit(x_census_train, y_census_train)

In [15]:
# Predict the test labels with the random forest, then display them.
# NOTE: this rebinds `previsoes`, replacing the MLP predictions stored earlier.
previsoes = random_forest_census.predict(x_census_test)
previsoes

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' <=50K'],
      dtype=object)

In [16]:
# Test-set accuracy for whichever model most recently assigned `previsoes`
# (the random forest when the notebook is run top to bottom).
accuracy_score(y_true=y_census_test, y_pred=previsoes)

0.8253285837120747