# Autor: Natan Nobre Chaves
## Bacharelando em Engenharia de Computação

## Title: Stroke Prediction Dataset

### Context:<br>
According to the World Health Organization (WHO) stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
This dataset is used to predict whether a patient is likely to get stroke based on the input parameters like gender, age, various diseases, and smoking status. Each row in the data provides relavant information about the patient.

### Attribute Information:<br>
1) id: unique identifier<br>
2) gender: "Male", "Female" or "Other"<br>
3) age: age of the patient<br>
4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension<br>
5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease<br>
6) ever_married: "No" or "Yes"<br>
7) work_type: "children", "Govt_jov", "Never_worked", "Private" or "Self-employed"<br>
8) Residence_type: "Rural" or "Urban"<br>
9) avg_glucose_level: average glucose level in blood<br>
10) bmi: body mass index<br>
11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*<br>
12) stroke: 1 if the patient had a stroke or 0 if not<br>
*Note: "Unknown" in smoking_status means that the information is unavailable for this patient<br>

Source: https://www.kaggle.com/fedesoriano/stroke-prediction-dataset

# Bibliotecas

In [1357]:
from matplotlib import pyplot
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
# Generate and plot a synthetic imbalanced classification dataset
from numpy import where
from collections import Counter
import imblearn
from imblearn.over_sampling import SMOTE

# 1. Importação dos Dados

In [1358]:
cols = ["id", "sex", "age", "hypertension", "heart_disease", "ever_married", "work_type", "residence_type", "avg_glucose_level", "bmi", "smoking_status", "stroke"]
stroke = pd.read_csv("dataset/healthcare-dataset-stroke-data.csv")
stroke.columns = cols
stroke.head()

Unnamed: 0,id,sex,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# 2. Análise dos Dados

In [1359]:
stroke.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [1360]:
print(stroke.shape)
stroke.isnull().values.sum() # Quantidade de amostras faltando algum parâmetro

(5110, 12)


201

# 3. Pré-processamento dos dados

In [1361]:
stroke = stroke.dropna() # Retira as amostras que faltam algum parâmetro
print(stroke.shape)
stroke.isnull().values.sum()

(4909, 12)


0

In [1362]:
stroke = stroke.drop(stroke[stroke['sex'] == 'Other'].index[0])
stroke.groupby('stroke').count()

Unnamed: 0_level_0,id,sex,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,4699,4699,4699,4699,4699,4699,4699,4699,4699,4699,4699
1,209,209,209,209,209,209,209,209,209,209,209


## 3.1 Codificando as features: Strings -> Inteiros

In [1363]:
# create a mapping dict
sex_mapping = {label: idx for idx, label in enumerate(np.unique(stroke['sex']))}
ever_married_mapping = {label: idx for idx, label in enumerate(np.unique(stroke['ever_married']))}
work_type_mapping = {label: idx for idx, label in enumerate(np.unique(stroke['work_type']))}
residence_type_mapping = {label: idx for idx, label in enumerate(np.unique(stroke['residence_type']))}
smoking_status_mapping = {label: idx for idx, label in enumerate(np.unique(stroke['smoking_status']))}
print(sex_mapping)
print(ever_married_mapping)
print(work_type_mapping)
print(residence_type_mapping)
print(smoking_status_mapping)
# convert class labels from strings to integers
stroke['sex'] = stroke['sex'].map(sex_mapping)
stroke['ever_married'] = stroke['ever_married'].map(ever_married_mapping)
stroke['work_type'] = stroke['work_type'].map(work_type_mapping)
stroke['residence_type'] = stroke['residence_type'].map(residence_type_mapping)
stroke['smoking_status'] = stroke['smoking_status'].map(smoking_status_mapping)
print(stroke.dtypes)
stroke.head()

{'Female': 0, 'Male': 1}
{'No': 0, 'Yes': 1}
{'Govt_job': 0, 'Never_worked': 1, 'Private': 2, 'Self-employed': 3, 'children': 4}
{'Rural': 0, 'Urban': 1}
{'Unknown': 0, 'formerly smoked': 1, 'never smoked': 2, 'smokes': 3}
id                     int64
sex                    int64
age                  float64
hypertension           int64
heart_disease          int64
ever_married           int64
work_type              int64
residence_type         int64
avg_glucose_level    float64
bmi                  float64
smoking_status         int64
stroke                 int64
dtype: object


Unnamed: 0,id,sex,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,56669,1,81.0,0,0,1,2,1,186.21,29.0,1,1


In [1364]:
y = stroke['stroke'].values
X = stroke
del stroke['stroke']
del stroke['id']
X = stroke.values
X.shape

(4908, 10)

In [1365]:
# summarize class distribution
counter = Counter(y)
print(counter)

Counter({0: 4699, 1: 209})


## 3.2. SMOTE - Synthetic Minority Oversampling Technique

In [1366]:
# transform the dataset
oversample = SMOTE(sampling_strategy='minority', random_state=42, k_neighbors=1) # Default: k_neighbors=5
X_smote, y_smote = oversample.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y_smote)
print(counter)

Counter({1: 4699, 0: 4699})


# 4. Modelos preditivos de classificação

In [1367]:
# separando os dados aleatoriamente em 70%/30%
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42, stratify=y)
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote, y_smote, train_size=0.7, random_state=42, stratify=y_smote)

In [1368]:
# Normalizando os dados
mms = MinMaxScaler()

X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)

X_train_smote_norm = mms.fit_transform(X_train_smote)
X_test_smote_norm = mms.transform(X_test_smote)

In [1369]:
# Standardizando os dados
stdsc = StandardScaler()

X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

X_train_smote_std = stdsc.fit_transform(X_train_smote)
X_test_smote_std = stdsc.transform(X_test_smote)

In [1370]:
quantidade_de_modelos = 1

# criando os modelos kNN
modelokNN = []
for idx in range(quantidade_de_modelos) :
    modelokNN.append(KNeighborsClassifier(n_neighbors=(idx+1)))

# treinando o modelo
for idx in range(quantidade_de_modelos) :
    modelokNN[idx].fit(X_train, y_train)

# predizendo as amostras de teste
y_pred = []
for idx in range(quantidade_de_modelos) :
    y_pred.append(np.array(modelokNN[idx].predict(X_test)))

In [1371]:
# criando os modelos kNN com os dados normalizados
modelokNN_norm = []
for idx in range(quantidade_de_modelos) :
    modelokNN_norm.append(KNeighborsClassifier(n_neighbors=(idx+1)))

# treinando o modelo
for idx in range(quantidade_de_modelos) :
    modelokNN_norm[idx].fit(X_train_norm, y_train)

# predizendo as amostras de teste
y_pred_norm = []
for idx in range(quantidade_de_modelos) :
    y_pred_norm.append(np.array(modelokNN_norm[idx].predict(X_test_norm)))

In [1372]:
# criando os modelos kNN com os dados standardizados
modelokNN_std = []
for idx in range(quantidade_de_modelos) :
    modelokNN_std.append(KNeighborsClassifier(n_neighbors=(idx+1)))

# treinando o modelo
for idx in range(quantidade_de_modelos) :
    modelokNN_std[idx].fit(X_train_std, y_train)

# predizendo as amostras de teste
y_pred_std = []
for idx in range(quantidade_de_modelos) :
    y_pred_std.append(np.array(modelokNN_std[idx].predict(X_test_std)))

In [1373]:
# criando os modelos kNN usando SMOTE
modelokNN_smote = []
for idx in range(quantidade_de_modelos) :
    modelokNN_smote.append(KNeighborsClassifier(n_neighbors=(idx+1)))

# treinando o modelo
for idx in range(quantidade_de_modelos) :
    modelokNN_smote[idx].fit(X_train_smote, y_train_smote)

# predizendo as amostras de teste
y_pred_smote = []
for idx in range(quantidade_de_modelos) :
    y_pred_smote.append(np.array(modelokNN_smote[idx].predict(X_test_smote)))

In [1374]:
# criando os modelos kNN usando SMOTE e dados normalizados
modelokNN_smote_norm = []
for idx in range(quantidade_de_modelos) :
    modelokNN_smote_norm.append(KNeighborsClassifier(n_neighbors=(idx+1)))

# treinando o modelo
for idx in range(quantidade_de_modelos) :
    modelokNN_smote_norm[idx].fit(X_train_smote_norm, y_train_smote)

# predizendo as amostras de teste
y_pred_smote_norm = []
for idx in range(quantidade_de_modelos) :
    y_pred_smote_norm.append(np.array(modelokNN_smote_norm[idx].predict(X_test_smote_norm)))

In [1375]:
# criando os modelos kNN usando SMOTE e dados standardizados
modelokNN_smote_std = []
for idx in range(quantidade_de_modelos) :
    modelokNN_smote_std.append(KNeighborsClassifier(n_neighbors=(idx+1)))

# treinando o modelo
for idx in range(quantidade_de_modelos) :
    modelokNN_smote_std[idx].fit(X_train_smote_std, y_train_smote)

# predizendo as amostras de teste
y_pred_smote_std = []
for idx in range(quantidade_de_modelos) :
    y_pred_smote_std.append(np.array(modelokNN_smote_std[idx].predict(X_test_smote_std)))

# 5. Métricas de avaliação dos resultados

## 5.1. Acurácia

In [1376]:
for idx in range(quantidade_de_modelos) :
    print(metrics.accuracy_score(y_test, y_pred[idx])) # DADOS CRUS

0.923285811269518


In [1377]:
for idx in range(quantidade_de_modelos) :
    print(metrics.accuracy_score(y_test, y_pred_norm[idx])) # DADOS CRUS NORMALIZADOS

0.9226069246435845


In [1378]:
for idx in range(quantidade_de_modelos) :
    print(metrics.accuracy_score(y_test, y_pred_std[idx])) # DADOS CRUS STANDARDIZADOS

0.9198913781398507


In [1379]:
for idx in range(quantidade_de_modelos) :
    print(metrics.accuracy_score(y_test_smote, y_pred_smote[idx])) # DADOS UTILIZANDO OVERSAMPLING(SMOTE)

0.9631205673758865


In [1380]:
for idx in range(quantidade_de_modelos) :
    print(metrics.accuracy_score(y_test_smote, y_pred_smote_norm[idx])) # DADOS UTILIZANDO OVERSAMPLING(SMOTE) E NORMALIZADO

0.9687943262411347


In [1381]:
for idx in range(quantidade_de_modelos) :
    print(metrics.accuracy_score(y_test_smote, y_pred_smote_std[idx])) # DADOS UTILIZANDO OVERSAMPLING(SMOTE) E STANDARDIZADO

0.9695035460992908


## 5.2. Precisão

In [1382]:
for idx in range(quantidade_de_modelos) :
    print(metrics.precision_score(y_test, y_pred[idx], average='weighted', zero_division=0)) # DADOS CRUS

0.9249959897131373


In [1383]:
for idx in range(quantidade_de_modelos) :
    print(metrics.precision_score(y_test, y_pred_norm[idx], average='weighted', zero_division=0)) # DADOS CRUS NORMALIZADOS

0.9270677626134621


In [1384]:
for idx in range(quantidade_de_modelos) :
    print(metrics.precision_score(y_test, y_pred_std[idx], average='weighted', zero_division=0)) # DADOS CRUS STANDARDIZADOS

0.9222536516514689


In [1385]:
for idx in range(quantidade_de_modelos) :
    print(metrics.precision_score(y_test_smote, y_pred_smote[idx], average='weighted', zero_division=0)) # DADOS UTILIZANDO OVERSAMPLING(SMOTE)

0.9655568904422116


In [1386]:
for idx in range(quantidade_de_modelos) :
    print(metrics.precision_score(y_test_smote, y_pred_smote_norm[idx], average='weighted', zero_division=0)) # DADOS UTILIZANDO OVERSAMPLING(SMOTE) E NORMALIZADO

0.9698872081134331


In [1387]:
for idx in range(quantidade_de_modelos) :
    print(metrics.precision_score(y_test_smote, y_pred_smote_std[idx], average='weighted', zero_division=0)) # DADOS UTILIZANDO OVERSAMPLING(SMOTE) E STANDARDIZADO

0.9710198314578391


## 5.3. Recall

In [1388]:
for idx in range(quantidade_de_modelos) :
    print(metrics.recall_score(y_test, y_pred[idx], average='weighted', zero_division=0)) # DADOS CRUS

0.923285811269518


In [1389]:
for idx in range(quantidade_de_modelos) :
    print(metrics.recall_score(y_test, y_pred_norm[idx], average='weighted', zero_division=0)) # DADOS CRUS NORMALIZADOS

0.9226069246435845


In [1396]:
for idx in range(quantidade_de_modelos) :
    print(metrics.recall_score(y_test, y_pred_std[idx], average='weighted', zero_division=0)) # DADOS CRUS STANDARDIZADOS

0.9198913781398507


In [1390]:
for idx in range(quantidade_de_modelos) :
    print(metrics.recall_score(y_test_smote, y_pred_smote[idx], average='weighted', zero_division=0)) # DADOS UTILIZANDO OVERSAMPLING(SMOTE)

0.9631205673758865


In [1391]:
for idx in range(quantidade_de_modelos) :
    print(metrics.recall_score(y_test_smote, y_pred_smote_norm[idx], average='weighted', zero_division=0)) # DADOS UTILIZANDO OVERSAMPLING(SMOTE) E NORMALIZADO

0.9687943262411347


In [1397]:
for idx in range(quantidade_de_modelos) :
    print(metrics.recall_score(y_test_smote, y_pred_smote_std[idx], average='weighted', zero_division=0)) # DADOS UTILIZANDO OVERSAMPLING(SMOTE) E STANDARDIZADOS

0.9695035460992908


## 5.4. F1-Measure

In [1392]:
for idx in range(quantidade_de_modelos) :
    print(metrics.f1_score(y_test, y_pred[idx], average='weighted', zero_division=0)) # DADOS CRUS

0.9241369878440843


In [1393]:
for idx in range(quantidade_de_modelos) :
    print(metrics.f1_score(y_test, y_pred_norm[idx], average='weighted', zero_division=0)) # DADOS CRUS NORMALIZADOS

0.9248070759758755


In [1398]:
for idx in range(quantidade_de_modelos) :
    print(metrics.f1_score(y_test, y_pred_std[idx], average='weighted', zero_division=0)) # DADOS CRUS STANDARDIZADOS

0.9210669231481349


In [1394]:
for idx in range(quantidade_de_modelos) :
    print(metrics.f1_score(y_test_smote, y_pred_smote[idx], average='weighted', zero_division=0)) # DADOS UTILIZANDO OVERSAMPLING(SMOTE)

0.9630722553876885


In [1395]:
for idx in range(quantidade_de_modelos) :
    print(metrics.f1_score(y_test_smote, y_pred_smote_norm[idx], average='weighted', zero_division=0)) # DADOS UTILIZANDO OVERSAMPLING(SMOTE) E NORMALIZADO

0.9687761708432648


In [1399]:
for idx in range(quantidade_de_modelos) :
    print(metrics.f1_score(y_test_smote, y_pred_smote_std[idx], average='weighted', zero_division=0)) # DADOS UTILIZANDO OVERSAMPLING(SMOTE) E STANDARDIZADO

0.9694789831361691


# 6. Apresentação dos resultados