In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('bank-full.csv',sep=';')
df.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [96]:
df['y'].value_counts()


no     39922
yes     5289
Name: y, dtype: int64

# Pré-processamento

In [97]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler

# Renomeando coluna alvo 
df.rename(columns={'y': 'subscribed_term_deposit'}, inplace=True)
# Renomeando coluna default  
df.rename(columns={'default': 'credit_in_default'}, inplace=True)


# Transformando coluna default em uma única coluna binária
df['credit_in_default_bin'] = 0
df.loc[df['credit_in_default']=='yes','credit_in_default_bin'] = 1

# Transformando noite_dia em uma única coluna binária
df['housing_bin'] = 0
df.loc[df['housing']=='yes','housing_bin'] = 1

# Transformando noite_dia em uma única coluna binária
df['loan_bin'] = 0
df.loc[df['loan']=='yes','loan'] = 1


atributos_numericos = ['age','balance','day','duration','campaign','pdays','previous']
atributos_binarios  = ['credit_in_default_bin','housing_bin','loan_bin']
atributos_categoricos = ['job', 'marital','education', 'contact', 'month', 'poutcome']
coluna_alvo = ['subscribed_term_deposit']


# Pré-processamento dos atributos numérios
scaler = StandardScaler()
X = df[atributos_numericos]
X = scaler.fit_transform(X)
feature_names = atributos_numericos

# Acrescentando as colunas binárias
X = np.concatenate((X, df[atributos_binarios]),axis=1)
feature_names = np.append(feature_names,atributos_binarios)


# Pré-processamento dos atributos categóricos
for atrb_name in atributos_categoricos:
    # binarizando a coluna de nome atrb_name
    lb = LabelBinarizer()
    dados = lb.fit_transform(df[atrb_name].values)
    
    # acrescentando a matriz com as novas colunas na matriz X
    X = np.concatenate((X, dados),axis=1)
    
    # criando nomes para as novas colunas no seguinte formato: nome_do_atributo=valor_do_atributo
    nomes_novos_atributos = []
    for class_name in lb.classes_:
        nomes_novos_atributos = np.append(nomes_novos_atributos,atrb_name+'='+str(class_name).strip())
        
    # acrescentando os nomes das novas colunas na lista completa do dataset
    feature_names = np.append(feature_names,nomes_novos_atributos)

# Pré-processamento da coluna alvo
y = df[coluna_alvo] == "yes"

In [98]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   age                      45211 non-null  int64 
 1   job                      45211 non-null  object
 2   marital                  45211 non-null  object
 3   education                45211 non-null  object
 4   credit_in_default        45211 non-null  object
 5   balance                  45211 non-null  int64 
 6   housing                  45211 non-null  object
 7   loan                     45211 non-null  object
 8   contact                  45211 non-null  object
 9   day                      45211 non-null  int64 
 10  month                    45211 non-null  object
 11  duration                 45211 non-null  int64 
 12  campaign                 45211 non-null  int64 
 13  pdays                    45211 non-null  int64 
 14  previous                 45211 non-nul

In [99]:
y

Unnamed: 0,subscribed_term_deposit
0,False
1,False
2,False
3,False
4,False
...,...
45206,True
45207,True
45208,True
45209,False


### Separando os conjuntos de treino e teste

In [106]:
from sklearn.model_selection import train_test_split

# separando os conjuntos de dados de treino e teste
df_treino, df_teste = train_test_split(df, test_size=0.2, random_state=42)

# separando a coluna alvo do conjunto de treino
df_treino_labels = df_treino['subscribed_term_deposit'].copy()
df_treino        = df_treino.drop(columns='subscribed_term_deposit')

# separando a coluna alvo do conjunto de teste
df_teste_labels = df_teste['subscribed_term_deposit'].copy()
df_teste        = df_teste.drop(columns='subscribed_term_deposit')

In [107]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

pipeline_atr_numericos = Pipeline([
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

preproc_completo = ColumnTransformer([
    ('numericos',   pipeline_atr_numericos, atributos_numericos),
    ('categoricos', OneHotEncoder(),        atributos_categoricos),
    ], 
    sparse_threshold=0)

# pre-processamento do conjunto de treino
X_treino = preproc_completo.fit_transform(df_treino)

# pre-processamento do conjunto de teste
X_teste = preproc_completo.transform(df_teste)

# pre-processamento da coluna alvo para treno
y_treino = df_treino_labels.values == 'yes'

# pre-processamento da coluna alvo para teste
y_teste = df_teste_labels.values == 'yes'

In [108]:
X_treino.shape

(36168, 45)

In [109]:
y_treino.shape

(36168,)

In [110]:
X_teste.shape

(9043, 45)

In [111]:
y_teste.shape

(9043,)

## Treinamento e avaliação de desempenho

In [114]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [117]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_treino, y_treino)

y_previsto = log_reg.predict(X_teste)

print("Acurácia: ", accuracy_score(y_teste,y_previsto))
print("Precisão: ", precision_score(y_teste,y_previsto))
print("Recall:   ", recall_score(y_teste,y_previsto))
print("F1 Score: ", f1_score(y_teste,y_previsto))

Acurácia:  0.8987061815769103
Precisão:  0.6532399299474606
Recall:    0.34188817598533455
F1 Score:  0.44885679903730447


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [119]:
from sklearn.neural_network import MLPClassifier

mlp_clf = MLPClassifier(hidden_layer_sizes=(20))
mlp_clf.fit(X_treino, y_treino)

y_previsto = mlp_clf.predict(X_teste)
print("MLP:")
print("Acurácia: ", accuracy_score(y_teste,y_previsto))
print("Precisão: ", precision_score(y_teste,y_previsto))
print("Recall:   ", recall_score(y_teste,y_previsto))
print("F1 Score: ", f1_score(y_teste,y_previsto))

MLP:
Acurácia:  0.9061152272475949
Precisão:  0.6437054631828979
Recall:    0.49679193400549954
F1 Score:  0.5607863424728402


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
 {'hidden_layer_sizes': [(10), (20), (30), (10,10), (20,20), (20,10), (30,20,10)], 
  'activation': ['logistic', 'tanh', 'relu']},
 ]

mlp_clf = MLPClassifier()

grid_search = GridSearchCV(mlp_clf, param_grid)
grid_search.fit(X_treino, y_treino)





In [None]:
grid_search.best_params_