In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [46]:
df = pd.read_csv('./data/Churn_treino.csv', sep=';')

In [47]:
df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0,1,1,1,10134888,1
1,608,Spain,Female,41,1,8380786,1,0,1,11254258,0
2,502,France,Female,42,8,1596608,3,1,0,11393157,1
3,699,France,Female,39,1,0,2,0,0,9382663,0
4,850,Spain,Female,43,2,12551082,1,1,1,790841,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0,2,1,0,9627064,0
9996,516,France,Male,35,10,5736961,1,1,1,10169977,0
9997,709,France,Female,36,7,0,1,0,1,4208558,1
9998,772,Germany,Male,42,3,7507531,2,1,0,9288852,1


In [48]:
X = df.drop("Exited", axis=1) #variáveis independentes
y = df["Exited"] #variável dependente que queremos prever

In [49]:
### Padronizando os dados com standardscaler

In [50]:
standardscaler = StandardScaler()
numerical = X.select_dtypes(include=['int64','float64']).columns
X[numerical] = standardscaler.fit_transform(X[numerical])

In [51]:
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,-0.326221,France,Female,0.293517,-1.041760,-1.110553,-0.911583,0.646092,0.970243,0.170614
1,-0.440036,Spain,Female,0.198164,-1.387538,0.222782,-0.911583,-1.547768,0.970243,0.353281
2,-1.536794,France,Female,0.293517,1.032908,-0.856542,2.527057,0.646092,-1.030670,0.375948
3,0.501521,France,Female,0.007457,-1.387538,-1.110553,0.807737,-1.547768,-1.030670,0.047859
4,2.063884,Spain,Female,0.388871,-1.041760,0.886252,-0.911583,0.646092,0.970243,-1.354223
...,...,...,...,...,...,...,...,...,...,...
9995,1.246488,France,Male,0.007457,-0.004426,-1.110553,0.807737,0.646092,-1.030670,0.087743
9996,-1.391939,France,Male,-0.373958,1.724464,-0.197835,-0.911583,0.646092,0.970243,0.176340
9997,0.604988,France,Female,-0.278604,0.687130,-1.110553,-0.911583,-1.547768,0.970243,-0.796492
9998,1.256835,Germany,Male,0.293517,-0.695982,0.083852,0.807737,0.646092,-1.030670,0.032551


In [52]:
labelencoder = LabelEncoder()

In [53]:
categorical = X.select_dtypes(include='object').columns
for col in categorical:
  X[col] = labelencoder.fit_transform(X[col])

In [54]:
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,-0.326221,0,0,0.293517,-1.041760,-1.110553,-0.911583,0.646092,0.970243,0.170614
1,-0.440036,2,0,0.198164,-1.387538,0.222782,-0.911583,-1.547768,0.970243,0.353281
2,-1.536794,0,0,0.293517,1.032908,-0.856542,2.527057,0.646092,-1.030670,0.375948
3,0.501521,0,0,0.007457,-1.387538,-1.110553,0.807737,-1.547768,-1.030670,0.047859
4,2.063884,2,0,0.388871,-1.041760,0.886252,-0.911583,0.646092,0.970243,-1.354223
...,...,...,...,...,...,...,...,...,...,...
9995,1.246488,0,1,0.007457,-0.004426,-1.110553,0.807737,0.646092,-1.030670,0.087743
9996,-1.391939,0,1,-0.373958,1.724464,-0.197835,-0.911583,0.646092,0.970243,0.176340
9997,0.604988,0,0,-0.278604,0.687130,-1.110553,-0.911583,-1.547768,0.970243,-0.796492
9998,1.256835,1,1,0.293517,-0.695982,0.083852,0.807737,0.646092,-1.030670,0.032551


In [55]:
y

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=0)

### Topologia da rede

In [59]:
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1])) #neurônios, ativação, 
model.add(Dropout(0.4))
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(units=1, activation='sigmoid'))

In [60]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#epochs - n vezes os dados vão passar pela rede
#batch_size - quantas instâncias vai haver atualização dos pesos na rede
model.fit(X_train, y_train, epochs=50, batch_size=32)

Epoch 1/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7252 - loss: 0.5843 
Epoch 2/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8058 - loss: 0.4607 
Epoch 3/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8049 - loss: 0.4524 
Epoch 4/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8083 - loss: 0.4414 
Epoch 5/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8119 - loss: 0.4209 
Epoch 6/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8197 - loss: 0.4212 
Epoch 7/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8172 - loss: 0.4194 
Epoch 8/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8317 - loss: 0.3958 
Epoch 9/50
[1m219/219[0m [32m

<keras.src.callbacks.history.History at 0x76cefd144bd0>

In [61]:
previsions = model.predict(X_test)
previsions

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  


array([[0.22916375],
       [0.29421052],
       [0.15763374],
       ...,
       [0.05922202],
       [0.11631647],
       [0.568229  ]], dtype=float32)

In [63]:
#transformando em 0 e 1
y_pred = (previsions > 0.5).astype('int32')
y_pred

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [1]], dtype=int32)

In [65]:
print('Acurácia: ', accuracy_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('Matriz de Confusão: \n', confusion_matrix(y_test, y_pred))

Acurácia:  0.855
F1:  0.5907808090310442
Recall:  0.5056360708534622
Matriz de Confusão: 
 [[2251  128]
 [ 307  314]]
