In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Dataset preprocessing

## Read dataset and first analysis

In [69]:
df = pd.read_csv('../resources/dataset/healthcare-dataset-stroke-data.csv')
df = df.drop(columns=['id'])
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.245,23.5,0.0
50%,45.0,0.0,0.0,91.885,28.1,0.0
75%,61.0,0.0,0.0,114.09,33.1,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [7]:
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

## Data preprocessing

In [12]:
df.query("stroke == 0").count()

gender               4861
age                  4861
hypertension         4861
heart_disease        4861
ever_married         4861
work_type            4861
Residence_type       4861
avg_glucose_level    4861
bmi                  4700
smoking_status       4861
stroke               4861
dtype: int64

### One-Hot encoding

In [70]:
columns_to_onehot = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
df_one_hot = pd.get_dummies(df[columns_to_onehot])
df = df.drop(columns_to_onehot, axis=1)
df = df.join(df_one_hot)

In [25]:
df

Unnamed: 0,age,avg_glucose_level,bmi,stroke,hypertension,heart_disease,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,228.69,36.6,1,0,1,False,True,False,False,...,False,True,False,False,False,True,False,True,False,False
1,61.0,202.21,,1,0,0,True,False,False,False,...,False,False,True,False,True,False,False,False,True,False
2,80.0,105.92,32.5,1,0,1,False,True,False,False,...,False,True,False,False,True,False,False,False,True,False
3,49.0,171.23,34.4,1,0,0,True,False,False,False,...,False,True,False,False,False,True,False,False,False,True
4,79.0,174.12,24.0,1,1,0,True,False,False,False,...,False,False,True,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,80.0,83.75,,0,1,0,True,False,False,False,...,False,True,False,False,False,True,False,False,True,False
5106,81.0,125.20,40.0,0,0,0,True,False,False,False,...,False,False,True,False,False,True,False,False,True,False
5107,35.0,82.99,30.6,0,0,0,True,False,False,False,...,False,False,True,False,True,False,False,False,True,False
5108,51.0,166.29,25.6,0,0,0,False,True,False,False,...,False,True,False,False,True,False,False,True,False,False


### Normalize data

In [71]:
columns_to_normalize = ['age', 'avg_glucose_level', 'bmi']
scaler = StandardScaler()
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

### Remove NaN rows

In [72]:
case = "AVERAGE"
if case == "REMOVE":
    df = df.dropna(subset=['bmi'])
if case == "AVERAGE":
    mean_stroke_1 = df[df['stroke'] == 1]['bmi'].dropna().mean()
    mean_stroke_0 = df[df['stroke'] == 0]['bmi'].dropna().mean()
    df.loc[df['stroke'] == 1, 'bmi'] = df[df['stroke'] == 1]['bmi'].fillna(mean_stroke_1)
    df.loc[df['stroke'] == 0, 'bmi'] = df[df['stroke'] == 0]['bmi'].fillna(mean_stroke_0)

## Features importance

In [73]:
X = df.drop(columns=["stroke"]).values
y = df["stroke"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=42)

In [74]:
pred = RandomForestClassifier(100, random_state=42)
pred.fit(X_train, y_train)

y_pred_test = pred.predict(X_test)
accuracy_score(y_test, y_pred_test)

0.9366197183098591

In [75]:
confusion_matrix(y_test, y_pred_test)

array([[1197,    1],
       [  80,    0]])

In [76]:
feature_names = df.drop(columns=["stroke"]).columns
sorted(zip(feature_names, pred.feature_importances_), key=lambda x: x[1], reverse=True)

[('avg_glucose_level', 0.253510153888472),
 ('bmi', 0.23258234991079763),
 ('age', 0.23246658784226093),
 ('hypertension', 0.026203306908707022),
 ('heart_disease', 0.02496686543963964),
 ('smoking_status_never smoked', 0.02264569051749789),
 ('work_type_Private', 0.021186373978356234),
 ('Residence_type_Urban', 0.02110814140848908),
 ('Residence_type_Rural', 0.019594878800273573),
 ('smoking_status_formerly smoked', 0.01895719463350839),
 ('gender_Male', 0.0186542261380837),
 ('work_type_Self-employed', 0.018523020902273448),
 ('smoking_status_smokes', 0.01839953273005892),
 ('gender_Female', 0.017947160344298163),
 ('smoking_status_Unknown', 0.016274659523484874),
 ('work_type_Govt_job', 0.014376715637551675),
 ('ever_married_No', 0.010733819416841685),
 ('ever_married_Yes', 0.01070073559351673),
 ('work_type_children', 0.0011103979018837564),
 ('work_type_Never_worked', 5.818848400479053e-05),
 ('gender_Other', 0.0)]