### Stroke Prediction Dataset - Acidente Vascular Cerebral
* https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset

*  Stroke is a medical emergency. A stroke occurs when blood flow to a part of your brain is interrupted or reduced, preventing brain tissue from getting oxygen and nutrients. Brain cells begin to die within minutes

* Age: People aged 55 years and over

* Hypertension: if the systolic pressure is 140 mm Hg or more, or the diastolic pressure is 90 mm Hg or more

* Hypercholesterolemia: If the cholesterol level in the blood is 200 milligrams per deciliter

* Smoking

* Diabetes

* Obesity: if the body mass index (BMI) is 30 or more



In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Carrega a base de dados
df=pd.read_csv('healthcare-dataset-stroke-data.csv')
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


### Limpeza e Normalização

In [3]:
# formato dos dados
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


Limpeza do atributo BMI

In [4]:
media = df['bmi'].mean()
media

28.893236911794666

In [5]:
df['bmi'].fillna(media, inplace=True)

In [6]:
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                5110 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


Normalização com StandardScaler()

In [8]:
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [9]:
columns = ['age','avg_glucose_level','bmi']

In [10]:
scaler = StandardScaler()

In [11]:
ajuste = scaler.fit(df[columns])
df[columns] = ajuste.transform(df[columns])

In [12]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,1.051434,0,1,Yes,Private,Urban,2.706375,1.001234,formerly smoked,1
1,51676,Female,0.78607,0,0,Yes,Self-employed,Rural,2.121559,4.615554e-16,never smoked,1
2,31112,Male,1.62639,0,1,Yes,Private,Rural,-0.005028,0.4685773,never smoked,1
3,60182,Female,0.255342,0,0,Yes,Private,Urban,1.437358,0.7154182,smokes,1
4,1665,Female,1.582163,1,0,Yes,Self-employed,Rural,1.501184,-0.6357112,never smoked,1


In [13]:
# classe
df['stroke'].unique()

array([1, 0])

In [14]:
df['stroke'].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

#### Transformação dos atributos categóricos
* OneHotEncoder aplicado em todos os atributos

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [21]:
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

In [22]:
categorical_columns

['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [25]:
# Gerar o objeto OneHotEncoder
encoder = OneHotEncoder()

# Transforma todos os atributos categóricos
one_hot_encoded = encoder.fit_transform(df[categorical_columns])

In [27]:
one_hot_encoded.toarray()

array([[0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [28]:
# Transformar os dados em um formato dataframe
one_hot_df = pd.DataFrame(one_hot_encoded.toarray(), columns = encoder.get_feature_names_out(categorical_columns) )

In [29]:
one_hot_df

Unnamed: 0,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5106,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5107,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5108,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [30]:
# Adicionar as novas colunas no Dataframe Original
df = pd.concat([df, one_hot_df], axis = 1)

In [31]:
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke', 'gender_Female', 'gender_Male',
       'gender_Other', 'ever_married_No', 'ever_married_Yes',
       'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children', 'Residence_type_Rural',
       'Residence_type_Urban', 'smoking_status_Unknown',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes'],
      dtype='object')

In [32]:
# Remove as colunas categóricas originais
df = df.drop(categorical_columns, axis = 1)

In [33]:
df.columns

Index(['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level',
       'bmi', 'stroke', 'gender_Female', 'gender_Male', 'gender_Other',
       'ever_married_No', 'ever_married_Yes', 'work_type_Govt_job',
       'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children', 'Residence_type_Rural',
       'Residence_type_Urban', 'smoking_status_Unknown',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes'],
      dtype='object')

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              5110 non-null   int64  
 1   age                             5110 non-null   float64
 2   hypertension                    5110 non-null   int64  
 3   heart_disease                   5110 non-null   int64  
 4   avg_glucose_level               5110 non-null   float64
 5   bmi                             5110 non-null   float64
 6   stroke                          5110 non-null   int64  
 7   gender_Female                   5110 non-null   float64
 8   gender_Male                     5110 non-null   float64
 9   gender_Other                    5110 non-null   float64
 10  ever_married_No                 5110 non-null   float64
 11  ever_married_Yes                5110 non-null   float64
 12  work_type_Govt_job              51

In [35]:
df.shape

(5110, 23)

### Amostragem

Houldout e Balanceamento das classes
* Separação entre treino e teste (70% e 30%)

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [37]:
# Separar os atributos e a classe
X = df.drop(['stroke'], axis=1)
y = df['stroke']

In [38]:
X.shape, df.shape

((5110, 22), (5110, 23))

In [39]:
# Amostragem por houldout
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [40]:
X_train.shape, X_test.shape

((3577, 22), (1533, 22))

In [41]:
DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)


In [42]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95      1463
           1       0.12      0.14      0.13        70

    accuracy                           0.91      1533
   macro avg       0.54      0.55      0.54      1533
weighted avg       0.92      0.91      0.92      1533



In [43]:
confusion_matrix(y_test,y_pred)

array([[1387,   76],
       [  60,   10]])

In [44]:
y_test.value_counts()

stroke
0    1463
1      70
Name: count, dtype: int64

Balanceamento das classes
* https://medium.com/analytics-vidhya/undersampling-and-oversampling-an-old-and-a-new-approach-4f984a0e8392
* Abordagem SMOTE

In [45]:
df['stroke'].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

In [46]:
from imblearn.over_sampling import SMOTE

In [47]:
sm = SMOTE()
X_balanced, y_balanced = sm.fit_resample(X, y)

In [48]:
X_balanced.shape

(9722, 22)

In [49]:
y_balanced.value_counts()

stroke
1    4861
0    4861
Name: count, dtype: int64

In [50]:
X_train,X_test,y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.3)

In [51]:
X_train.shape, X_test.shape

((6805, 22), (2917, 22))

In [52]:
y_train.value_counts(), y_test.value_counts()

(stroke
 0    3458
 1    3347
 Name: count, dtype: int64,
 stroke
 1    1514
 0    1403
 Name: count, dtype: int64)

In [53]:
DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)


In [54]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      1403
           1       0.94      0.95      0.95      1514

    accuracy                           0.94      2917
   macro avg       0.94      0.94      0.94      2917
weighted avg       0.94      0.94      0.94      2917



In [55]:
confusion_matrix(y_test,y_pred)

array([[1314,   89],
       [  77, 1437]])

## Validação Cruzada

In [56]:
from sklearn.metrics import accuracy_score

from sklearn.model_selection import (
    KFold,
    LeaveOneOut,
    StratifiedKFold,
    cross_validate
)

### K-fold Cross-validation

In [57]:
DT = DecisionTreeClassifier()
# K-Fold Cross-Validation
kf = KFold(n_splits=10, shuffle=True)

# estimate generalization error
clf =  cross_validate(
    DT,
    X,
    y,
    scoring='balanced_accuracy',
    return_train_score=True,
    cv=kf, # k-fold
)

print(f"{clf['test_score']}\nMédia: {np.mean(clf['test_score'])}")

[0.57644593 0.54016653 0.53390269 0.51817876 0.54267641 0.56134177
 0.57942387 0.51560739 0.56316416 0.59325103]
Média: 0.552415853716797


Balanceado

In [58]:
DT = DecisionTreeClassifier()
# K-Fold Cross-Validation
kf = KFold(n_splits=10, shuffle=True)

# estimate generalization error
clf =  cross_validate(
    DT,
    X_balanced,
    y_balanced,
    scoring='balanced_accuracy',
    return_train_score=True,
    cv=kf, # k-fold
)

print(f"{clf['test_score']}\nMédia: {np.mean(clf['test_score'])}")

[0.9342282  0.95581624 0.93276862 0.94636935 0.95162124 0.93607184
 0.94371156 0.94857724 0.94236707 0.94229825]
Média: 0.9433829604163135


### Stratified K-fold Cross-validation

In [59]:
DT = DecisionTreeClassifier()
# Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True)

# estimate generalization error
clf =  cross_validate(
    DT,
    X,
    y,
    scoring='balanced_accuracy',
    return_train_score=True,
    cv=skf, # k-fold
)

print(f"{clf['test_score']}\nMédia: {np.mean(clf['test_score'])}")

[0.4845679  0.51530864 0.57427984 0.59736626 0.56604938 0.59427984
 0.53222222 0.55325103 0.6091358  0.54637235]
Média: 0.5572833253056844


Balanceado

In [60]:
DT = DecisionTreeClassifier()
# Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True)

# estimate generalization error
clf =  cross_validate(
    DT,
    X_balanced,
    y_balanced,
    scoring='balanced_accuracy',
    return_train_score=True,
    cv=skf, # k-fold
)

print(f"{clf['test_score']}\nMédia: {np.mean(clf['test_score'])}")

[0.94243119 0.94553451 0.94547325 0.93415638 0.94958848 0.95576132
 0.94855967 0.94547325 0.93518519 0.95884774]
Média: 0.946101097675362
