Necessary libraries

In [66]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import Precision, Recall
from imblearn.over_sampling import SMOTE
import numpy as np

Reading data & Cleaning

In [67]:
df = pd.read_csv("dataset.csv")

In [68]:
X_train = df.drop(columns=['id','stroke'])
y_train = df['stroke']

In [69]:
print(X_train.isnull().sum())
print(y_train.isnull().sum())

gender                   0
age                      0
hypertension             0
heart_disease            0
ever_married             0
work_type                0
Residence_type           0
avg_glucose_level        0
bmi                   1462
smoking_status       13292
dtype: int64
0


In [70]:

X_train.fillna({
    'bmi': X_train['bmi'].mean(),
    'smoking_status': X_train['smoking_status'].mode()[0]
}, inplace=True)


One hot encoding and fixing data types for the neural network

In [71]:

X_train = pd.get_dummies(columns=['gender','ever_married','work_type','Residence_type','smoking_status'], data= X_train)
print(X_train.dtypes)

age                               float64
hypertension                        int64
heart_disease                       int64
avg_glucose_level                 float64
bmi                               float64
gender_Female                        bool
gender_Male                          bool
gender_Other                         bool
ever_married_No                      bool
ever_married_Yes                     bool
work_type_Govt_job                   bool
work_type_Never_worked               bool
work_type_Private                    bool
work_type_Self-employed              bool
work_type_children                   bool
Residence_type_Rural                 bool
Residence_type_Urban                 bool
smoking_status_formerly smoked       bool
smoking_status_never smoked          bool
smoking_status_smokes                bool
dtype: object


In [72]:
X_train = X_train.astype('float32')
print(X_train.dtypes)

age                               float32
hypertension                      float32
heart_disease                     float32
avg_glucose_level                 float32
bmi                               float32
gender_Female                     float32
gender_Male                       float32
gender_Other                      float32
ever_married_No                   float32
ever_married_Yes                  float32
work_type_Govt_job                float32
work_type_Never_worked            float32
work_type_Private                 float32
work_type_Self-employed           float32
work_type_children                float32
Residence_type_Rural              float32
Residence_type_Urban              float32
smoking_status_formerly smoked    float32
smoking_status_never smoked       float32
smoking_status_smokes             float32
dtype: object


Scaling and Splitting

In [73]:

scaler = StandardScaler()

X_train,X_test = train_test_split(X_train, random_state= 32, test_size= 0.2)
y_train,y_test = train_test_split(y_train,random_state=32,test_size=0.2)
X_train[['age', 'avg_glucose_level', 'bmi']] = scaler.fit_transform(X_train[['age', 'avg_glucose_level', 'bmi']])
X_test[['age', 'avg_glucose_level', 'bmi']] = scaler.transform(X_test[['age', 'avg_glucose_level', 'bmi']])


Converting to numpy arrays

In [74]:
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

Building the model

In [75]:
model = Sequential([
    Input(shape = ((X_train.shape[1],))),
    Dense(150, activation = 'relu'),
    Dense(100, activation = 'relu'),
    Dense(50, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])
model.compile(loss = tf.keras.losses.BinaryCrossentropy(), optimizer = tf.keras.optimizers.Adam(0.001), metrics = ['accuracy', Precision(), Recall()])
model.summary()


Training

In [76]:
model.fit(X_train, y_train, epochs = 10, batch_size = 64)

Epoch 1/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9692 - loss: 0.1269 - precision_3: 0.0151 - recall_3: 0.0094
Epoch 2/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9820 - loss: 0.0752 - precision_3: 0.0000e+00 - recall_3: 0.0000e+00
Epoch 3/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9818 - loss: 0.0760 - precision_3: 0.0000e+00 - recall_3: 0.0000e+00
Epoch 4/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9813 - loss: 0.0763 - precision_3: 0.0000e+00 - recall_3: 0.0000e+00
Epoch 5/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9812 - loss: 0.0763 - precision_3: 0.0000e+00 - recall_3: 0.0000e+00
Epoch 6/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9827 - loss: 0.0718 - precision_3: 0.0000e+00 - recall_3: 0.000

<keras.src.callbacks.history.History at 0x1a589f06dd0>

Something is really really wrong here obviously... Honestly I knew this dataset is imbalanced from the start but I wanted to try xD

In [77]:
def check_if_imbalanced(y_train):
    cnt_1 = 0
    cnt_0 = 0
    for example in y_train:
        if int(example) == 1:
            cnt_1 += 1
        else:
            cnt_0 += 1
    print((cnt_1 + cnt_0) == len(y_train))
    print(f"Positive Values: {cnt_1}, Negative Values: {cnt_0}")

check_if_imbalanced(y_train)


True
Positive Values: 625, Negative Values: 34095


Lets solve this by SMOTE oversampling method

In [78]:
sm = SMOTE(random_state=42, k_neighbors=3)
X_train, y_train = sm.fit_resample(X_train, y_train)

check_if_imbalanced(y_train)

True
Positive Values: 34095, Negative Values: 34095


Back to business

In [79]:
model = Sequential([
    Input(shape = ((X_train.shape[1],))),
    Dense(150, activation = 'relu'),
    Dense(100, activation = 'relu'),
    Dense(50, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])
model.compile(loss = tf.keras.losses.BinaryCrossentropy(), optimizer = tf.keras.optimizers.Adam(0.001), metrics = ['accuracy', Precision(), Recall()])

In [80]:
model.fit(X_train, y_train, epochs = 10, batch_size = 32)

Epoch 1/10
[1m2131/2131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.8016 - loss: 0.4255 - precision_4: 0.7682 - recall_4: 0.8640
Epoch 2/10
[1m2131/2131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8777 - loss: 0.2938 - precision_4: 0.8375 - recall_4: 0.9378
Epoch 3/10
[1m2131/2131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9060 - loss: 0.2363 - precision_4: 0.8706 - recall_4: 0.9541
Epoch 4/10
[1m2131/2131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9200 - loss: 0.2063 - precision_4: 0.8870 - recall_4: 0.9611
Epoch 5/10
[1m2131/2131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9284 - loss: 0.1869 - precision_4: 0.8995 - recall_4: 0.9646
Epoch 6/10
[1m2131/2131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9362 - loss: 0.1697 - precision_4: 0.9096 - recall_4: 0.9683
Epoch 7/10
[1m2131/21

<keras.src.callbacks.history.History at 0x1a58a566290>

Recall is amazing. For prediction strokes recall is crucial.

In [81]:
np.unique(y_test, return_counts=True)


(array([0, 1], dtype=int64), array([8522,  158], dtype=int64))

In [82]:
model.evaluate(X_test, y_test)

[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8959 - loss: 0.3892 - precision_4: 0.0403 - recall_4: 0.1714     


[0.3463371694087982,
 0.8987327218055725,
 0.04539722576737404,
 0.2278480976819992]