In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder


In [18]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [19]:
df.stroke.value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [20]:
df = df.dropna()
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [21]:
df_smoke = df.loc[df.smoking_status != 'Unknown']
df_smoke

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5100,68398,Male,82.0,1,0,Yes,Self-employed,Rural,71.97,28.3,never smoked,0
5102,45010,Female,57.0,0,0,Yes,Private,Rural,77.93,21.7,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0


In [22]:
df = df.drop(['id'], axis=1)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [23]:
df = df.astype({'age': int, 'avg_glucose_level': int, 'bmi':int})
df.dtypes

gender               object
age                   int64
hypertension          int64
heart_disease         int64
ever_married         object
work_type            object
Residence_type       object
avg_glucose_level     int64
bmi                   int64
smoking_status       object
stroke                int64
dtype: object

In [24]:
df = pd.get_dummies(df, columns=['gender', 'ever_married', 'work_type', 'Residence_type', 
                                'smoking_status'])
df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67,0,1,228,36,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
2,80,0,1,105,32,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,49,0,0,171,34,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,79,1,0,174,24,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5,81,0,0,186,29,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13,0,0,103,18,0,1,0,0,1,...,0,0,0,1,1,0,1,0,0,0
5106,81,0,0,125,40,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0
5107,35,0,0,82,30,0,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5108,51,0,0,166,25,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0


In [25]:
df = df.sample(frac=1)
df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
4874,41,0,0,87,30,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0
2566,7,0,0,86,17,0,0,1,0,1,...,0,0,0,1,0,1,1,0,0,0
3021,79,0,0,103,22,0,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0
416,3,0,0,73,16,0,1,0,0,1,...,0,0,0,1,0,1,1,0,0,0
72,66,1,0,116,31,1,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2393,41,0,0,140,46,0,1,0,0,0,...,0,1,0,0,0,1,1,0,0,0
1415,37,0,0,120,33,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,1
2763,45,0,0,63,32,0,1,0,0,0,...,0,1,0,0,1,0,1,0,0,0
3552,25,0,0,66,29,0,0,1,0,0,...,0,1,0,0,1,0,1,0,0,0


In [26]:
y = df['stroke']
X = df.drop(['stroke'], axis=1)

In [39]:
X

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
4874,41,0,0,87,30,1,0,0,0,1,...,0,1,0,0,0,1,0,0,1,0
2566,7,0,0,86,17,0,1,0,1,0,...,0,0,0,1,0,1,1,0,0,0
3021,79,0,0,103,22,0,1,0,0,1,...,0,0,1,0,1,0,0,1,0,0
416,3,0,0,73,16,1,0,0,1,0,...,0,0,0,1,0,1,1,0,0,0
72,66,1,0,116,31,1,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2393,41,0,0,140,46,1,0,0,0,1,...,0,1,0,0,0,1,1,0,0,0
1415,37,0,0,120,33,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,1
2763,45,0,0,63,32,1,0,0,0,1,...,0,1,0,0,1,0,1,0,0,0
3552,25,0,0,66,29,0,1,0,0,1,...,0,1,0,0,1,0,1,0,0,0


In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [58]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9581635425156207
Testing Data Score: 0.9560260586319218


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [67]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=1, n_estimators=800).fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.9560260586319218


In [68]:
clf

RandomForestClassifier(n_estimators=800, random_state=1)

In [69]:
import pickle

pickle.dump(clf, open('model.h5', 'wb'))

In [70]:
loaded_model = pickle.load(open('model.h5', 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.9560260586319218


In [71]:
loaded_model

RandomForestClassifier(n_estimators=800, random_state=1)

In [74]:
print(loaded_model.predict(X[1:10]))
print(y[1:10])

[0 0 0 0 0 0 0 0 0]
2566    0
3021    0
416     0
72      1
4452    0
4093    0
5024    0
4611    0
1541    0
Name: stroke, dtype: int64


In [75]:
import tensorflow as tf
nn_model = tf.keras.models.Sequential()

# First hidden layer
nn_model.add(tf.keras.layers.Dense(units=8, activation="relu", input_dim=21))
# Second hidden layer
nn_model.add(tf.keras.layers.Dense(units=6, activation="relu"))

# Output layer
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn_model.summary()

nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=60)



Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 8)                 176       
_________________________________________________________________
dense_13 (Dense)             (None, 6)                 54        
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 7         
Total params: 237
Trainable params: 237
Non-trainable params: 0
_________________________________________________________________
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60

In [76]:
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 0.1480 - accuracy: 0.9560
Loss: 0.14798589050769806, Accuracy: 0.9560260772705078


In [77]:
nn_model.save('nn_model.h5')

In [78]:
from tensorflow.keras.models import load_model
model = load_model("nn_model.h5")
model

<keras.engine.sequential.Sequential at 0x7fca96ddeef0>

In [87]:
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 0.1480 - accuracy: 0.9560
Loss: 0.14798589050769806, Accuracy: 0.9560260772705078


In [95]:
print(model.predict(X[0:6]).round())
print(y)

[[1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [1.]]
4874    0
2566    0
3021    0
416     0
72      1
       ..
2393    0
1415    0
2763    0
3552    0
2335    0
Name: stroke, Length: 4909, dtype: int64
