## Importação das bibliotecas

In [81]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px


In [82]:
data = pd.read_csv('Heart Failure Clinical Records.csv')
data.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


## Tratamento de dados

In [83]:
data = data.drop(data[data['platelets']>420000].index)
data = data.drop(data[data['serum_creatinine']>2.5].index)
data = data.drop(data[data['creatinine_phosphokinase']>1500].index)

## Análise exploratória

In [84]:
print('Tamanho: ', end="")
print(data.shape)
print('-----------------------------------------------------------------------------------------------------------')
display(data.describe())
print('-----------------------------------------------------------------------------------------------------------')
print(data.info())

Tamanho: (239, 13)
-----------------------------------------------------------------------------------------------------------


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0
mean,60.913531,0.451883,345.313808,0.422594,38.221757,0.372385,249320.580042,1.184435,136.661088,0.635983,0.313808,131.242678,0.301255
std,11.814824,0.498724,296.169846,0.495009,11.997417,0.484455,70024.15133,0.401031,4.355788,0.482163,0.465013,75.884751,0.459766
min,40.0,0.0,30.0,0.0,14.0,0.0,25100.0,0.6,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,115.0,0.0,30.0,0.0,206000.0,0.9,134.0,0.0,0.0,74.5,0.0
50%,60.0,0.0,213.0,0.0,38.0,0.0,257000.0,1.1,137.0,1.0,0.0,118.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,289500.0,1.3,140.0,1.0,1.0,201.0,1.0
max,95.0,1.0,1419.0,1.0,80.0,1.0,418000.0,2.5,148.0,1.0,1.0,285.0,1.0


-----------------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 239 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       239 non-null    float64
 1   anaemia                   239 non-null    int64  
 2   creatinine_phosphokinase  239 non-null    int64  
 3   diabetes                  239 non-null    int64  
 4   ejection_fraction         239 non-null    int64  
 5   high_blood_pressure       239 non-null    int64  
 6   platelets                 239 non-null    float64
 7   serum_creatinine          239 non-null    float64
 8   serum_sodium              239 non-null    int64  
 9   sex                       239 non-null    int64  
 10  smoking                   239 non-null    int64  
 11  time                      239 non-null    int64  
 12  DEATH_EVENT 

Aqui observamos que temos mais mulheres do que homens na nossa base de dados.

In [85]:
tmp = data.copy()
tmp['sex'] = tmp['sex'].apply(lambda x: 'Mulher' if x == 0 else 'Homem')
tmp['DEATH_EVENT'] = tmp['DEATH_EVENT'].apply(lambda x: 'Não faleceu' if x == 0 else 'Faleceu')

fig = px.pie(tmp, names='sex', width=500, color_discrete_sequence=['#19647E', '#A31621'])
fig.update_xaxes(title_text='Sexo')
fig.update_yaxes(title_text='')
fig.update_layout(bargap=0.2)
# fig.show()
display(data)


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
5,90.0,1,47,0,40,1,204000.0,2.1,132,1,1,8,1
6,75.0,1,246,0,15,0,127000.0,1.2,137,1,0,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,60.0,0,320,0,35,0,133000.0,1.4,139,1,0,258,0
292,52.0,0,190,1,38,0,382000.0,1.0,140,1,1,258,0
293,63.0,1,103,1,35,0,179000.0,0.9,136,1,1,270,0
294,62.0,0,61,1,38,1,155000.0,1.1,143,1,1,270,0


Aqui podemos ver que apenas 32% dos homens e 32,4% das mulheres da base de dados faleceram durante o acompanhamento

In [86]:
fig = px.histogram(tmp, x='DEATH_EVENT', width=500, 
             color_discrete_sequence=['#19647E', '#A31621'], color='sex', labels={"sex": "Sexo"})
fig.update_xaxes(title_text='Evento')
fig.update_yaxes(title_text='')
fig.update_layout(bargap=0.2)
fig.show()

Aqui podemos observar que, dos pacientes falecidos, os homens são os que mais acabaram morrendo durante o acompanhamento

In [87]:
graph_df = tmp.loc[tmp['DEATH_EVENT'] == 'Faleceu']
fig = px.pie(graph_df, names='sex', width=500, 
             color_discrete_sequence=['#19647E', '#A31621'])
fig.show()

## Procurando features

In [88]:
corr = data.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
age,1.0,0.0561,0.018336,-0.057426,0.072848,0.122123,-0.08791,0.207104,0.020792,0.050015,-0.004218,-0.197546,0.248983
anaemia,0.0561,1.0,-0.201025,0.006124,0.002844,-0.003784,0.026644,-0.057331,0.086269,-0.099355,-0.124851,-0.091727,0.063483
creatinine_phosphokinase,0.018336,-0.201025,1.0,-0.006755,-0.095706,-0.027232,-0.002856,0.023912,-0.180406,-0.012525,-0.01347,0.086162,0.023926
diabetes,-0.057426,0.006124,-0.006755,1.0,-0.006649,0.04186,-0.007973,0.005123,-0.155447,-0.144958,-0.103945,0.059897,-0.026341
ejection_fraction,0.072848,0.002844,-0.095706,-0.006649,1.0,0.046457,0.082162,-0.184522,0.171013,-0.141424,-0.081814,0.020898,-0.301618
high_blood_pressure,0.122123,-0.003784,-0.027232,0.04186,0.046457,1.0,0.101976,-0.00832,0.032183,-0.100776,-0.073278,-0.231509,0.097872
platelets,-0.08791,0.026644,-0.002856,-0.007973,0.082162,0.101976,1.0,-0.097105,0.071511,-0.093954,0.018899,0.036182,-0.11096
serum_creatinine,0.207104,-0.057331,0.023912,0.005123,-0.184522,-0.00832,-0.097105,1.0,-0.303413,0.022074,-0.077341,-0.113498,0.392654
serum_sodium,0.020792,0.086269,-0.180406,-0.155447,0.171013,0.032183,0.071511,-0.303413,1.0,-0.002972,0.029909,0.048249,-0.181689
sex,0.050015,-0.099355,-0.012525,-0.144958,-0.141424,-0.100776,-0.093954,0.022074,-0.002972,1.0,0.4554,0.051344,-0.014988


## Iniciando modelagem

In [89]:
from sklearn.model_selection import train_test_split

X = data[['time', 'serum_creatinine', 'ejection_fraction', 'age']]
y = data['DEATH_EVENT']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [90]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn import tree

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
import tensorflow as tf

import lightgbm as lgb



# Criando modelos

Antes de mais nada, vamos esclarescer algumas decisões tomadas para decidirmos o melhor modelo:

    • Iremos prestar atenção principalmente para o recall do modelo, pois ele tenta analisar aqueles que o modelo realmente acertou (ideal para doenças pois analisa principalmente VERDADEIROS POSITIVOS)

LogisticRegression

In [91]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
prev = clf.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, prev)}')
print(f'Recall: {recall_score(y_test, prev)}')
print(f'F1: {f1_score(y_test, prev)}')
print(confusion_matrix(y_test, prev))


Accuracy: 0.8860759493670886
Recall: 0.7619047619047619
F1: 0.7804878048780488
[[54  4]
 [ 5 16]]


DecisionTree

In [92]:
clf = tree.DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
prev = clf.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, prev)}')
print(f'Recall: {recall_score(y_test, prev)}')
print(f'F1: {f1_score(y_test, prev)}')
print(confusion_matrix(y_test, prev))


Accuracy: 0.8860759493670886
Recall: 0.8095238095238095
F1: 0.7906976744186046
[[53  5]
 [ 4 17]]


KNN

In [93]:
neigh = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
prev = neigh.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, prev)}')
print(f'Recall: {recall_score(y_test, prev)}')
print(f'F1: {f1_score(y_test, prev)}')
print(confusion_matrix(y_test, prev))

Accuracy: 0.8354430379746836
Recall: 0.47619047619047616
F1: 0.6060606060606061
[[56  2]
 [11 10]]


SVM

In [94]:
model = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(X_train, y_train)
prev = model.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, prev)}')
print(f'Recall: {recall_score(y_test, prev)}')
print(f'F1: {f1_score(y_test, prev)}')
print(confusion_matrix(y_test, prev))

Accuracy: 0.9240506329113924
Recall: 0.9047619047619048
F1: 0.8636363636363636
[[54  4]
 [ 2 19]]


RandomForest

In [95]:
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
prev = clf.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, prev)}')
print(f'Recall: {recall_score(y_test, prev)}')
print(f'F1: {f1_score(y_test, prev)}')
print(confusion_matrix(y_test, prev))

Accuracy: 0.8734177215189873
Recall: 0.6190476190476191
F1: 0.7222222222222222
[[56  2]
 [ 8 13]]


GradientBoosting

In [96]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, y_train)
prev = clf.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, prev)}')
print(f'Recall: {recall_score(y_test, prev)}')
print(f'F1: {f1_score(y_test, prev)}')
print(confusion_matrix(y_test, prev))

Accuracy: 0.810126582278481
Recall: 0.6190476190476191
F1: 0.6341463414634146
[[51  7]
 [ 8 13]]


Keras

In [97]:
classificador = Sequential()
classificador.add(Dense(units=3, activation='relu', input_dim=4))
classificador.add(Dense(units=3, activation='relu'))
classificador.add(Dense(units=1, activation='sigmoid'))
classificador.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.Recall()])

classificador.fit(X_train, y_train, batch_size=10, epochs=400, verbose=0)

<keras.callbacks.History at 0x221b8049460>

In [98]:
prev = classificador.predict(X_test)
prev = (prev > 0.6)
print(f'Accuracy: {accuracy_score(y_test, prev)}')
print(f'Recall: {recall_score(y_test, prev)}')
print(f'F1: {f1_score(y_test, prev)}')
print(confusion_matrix(y_test, prev))

Accuracy: 0.8734177215189873
Recall: 0.6190476190476191
F1: 0.7222222222222222
[[56  2]
 [ 8 13]]


LGBMClassifier

In [99]:
clf = lgb.LGBMClassifier().fit(X_train, y_train)
prev = clf.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, prev)}')
print(f'Recall: {recall_score(y_test, prev)}')
print(f'F1: {f1_score(y_test, prev)}')
print(confusion_matrix(y_test, prev))

Accuracy: 0.8354430379746836
Recall: 0.6666666666666666
F1: 0.6829268292682926
[[52  6]
 [ 7 14]]


# Melhorando o melhor modelo

### O melhor modelo encontrado foi o SVC:

In [100]:
model = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(X_train, y_train)
prev = model.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, prev)}')
print(f'Recall: {recall_score(y_test, prev)}')
print(f'F1: {f1_score(y_test, prev)}')
print(confusion_matrix(y_test, prev))

Accuracy: 0.9240506329113924
Recall: 0.9047619047619048
F1: 0.8636363636363636
[[54  4]
 [ 2 19]]


Testando hyperparametros

In [101]:
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear', 'rbf', 'sigmoid'],
              'class_weight': ['balanced'],
              'random_state': [1]
            } 

grid = GridSearchCV(SVC(), param_grid, refit = True, scoring='recall')
grid.fit(X_train, y_train)

grid_predictions = grid.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, grid_predictions)}')
print(f'Recall: {recall_score(y_test, grid_predictions)}')
print(f'F1: {f1_score(y_test, grid_predictions)}')

print('--------------------------------------------')
print(grid.best_params_)
print(grid.best_estimator_)
print(confusion_matrix(y_test, grid_predictions))
print(grid.best_score_)

Accuracy: 0.7341772151898734
Recall: 0.0
F1: 0.0
--------------------------------------------
{'C': 0.1, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'rbf', 'random_state': 1}
SVC(C=0.1, class_weight='balanced', gamma=1, random_state=1)
[[58  0]
 [21  0]]
1.0


Adicionando HyperParametros em nosso modelo

In [102]:
model = make_pipeline(StandardScaler(), SVC(C=0.1, class_weight='balanced', gamma=1, kernel='linear', random_state=1)).fit(X_train, y_train)
prev = model.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, prev)}')
print(f'Recall: {recall_score(y_test, prev)}')
print(f'F1: {f1_score(y_test, prev)}')
print(confusion_matrix(y_test, prev))

Accuracy: 0.8481012658227848
Recall: 0.8571428571428571
F1: 0.75
[[49  9]
 [ 3 18]]
