In [1]:
#importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn import preprocessing
import sklearn.preprocessing

data = pd.read_csv("../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")
data = data.drop("id", axis=1)
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [2]:
#label encoder
le = sklearn.preprocessing.LabelEncoder()
le.fit(data["gender"])
data["gender"] = le.transform(data["gender"])
le.fit(data["ever_married"])
data["ever_married"] = le.transform(data["ever_married"])
le.fit(data["work_type"])
data["work_type"] = le.transform(data["work_type"])
le.fit(data["Residence_type"])
data["Residence_type"] = le.transform(data["Residence_type"])
le.fit(data["smoking_status"])
data["smoking_status"] = le.transform(data["smoking_status"])


print(f"Prev total: {len(data)}")

#Removing missing data
to_remove = pd.isna(data.loc[:, "bmi"])
print(f"Removed {sum(to_remove)}.")
data = data[~to_remove]
print(f"Total: {len(data)}")

Prev total: 5110
Removed 201.
Total: 4909


In [4]:
X = data.drop("stroke", axis=1)
y = data["stroke"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) 


In [5]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier 
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy of Decision Tree:",metrics.accuracy_score(y_test, y_pred))

#Random Forest
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier()
clf2 = clf2.fit(X_train,y_train)
y_pred2 = clf2.predict(X_test)
print("Accuracy of Random Forest:",metrics.accuracy_score(y_test, y_pred2))

# Test with SVM
from sklearn import svm
clf3 = svm.SVC()
clf3.fit(X_train, y_train)
pred3 = clf.predict(X_test)
print("Accuracy of SVM:",metrics.accuracy_score(y_test, pred3))

#Test With KNN
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(X_train, y_train)
X.shape, y.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape
pred4 = neigh.predict(X_test)
print("Accuracy of KNN:",metrics.accuracy_score(y_test, pred4))

#Naive Bayes
from sklearn.naive_bayes import GaussianNB
clf5 = GaussianNB()
clf5.fit(X_train, y_train)
pred5 = clf5.predict(X_test)
print("Accuracy of NB:",metrics.accuracy_score(y_test, pred5))


Accuracy of Decision Tree: 0.9144602851323829
Accuracy of Random Forest: 0.9470468431771895
Accuracy of SVM: 0.9144602851323829
Accuracy of KNN: 0.9419551934826884
Accuracy of NB: 0.890020366598778


## Hyperparameter Tuning The Best Model

In [72]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import sklearn.preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn import metrics 


data = pd.read_csv("../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")
data = data.drop("id", axis=1)

#label encoder
le = sklearn.preprocessing.LabelEncoder()
le.fit(data["gender"])
data["gender"] = le.transform(data["gender"])
le.fit(data["ever_married"])
data["ever_married"] = le.transform(data["ever_married"])
le.fit(data["work_type"])
data["work_type"] = le.transform(data["work_type"])
le.fit(data["Residence_type"])
data["Residence_type"] = le.transform(data["Residence_type"])
le.fit(data["smoking_status"])
data["smoking_status"] = le.transform(data["smoking_status"])


#Removing missing data
to_remove = pd.isna(data.loc[:, "bmi"])
data = data[~to_remove]

X = data.drop("stroke", axis=1)
y = data["stroke"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) 

n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10] 


forest = RandomForestClassifier()

hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

gridF = GridSearchCV(forest, hyperF, cv = 3, verbose = 1, 
                      n_jobs = -1)
best_model = gridF.fit(X_train.values, y_train.values)


Fitting 3 folds for each of 500 candidates, totalling 1500 fits


In [9]:
best_model.best_score_

0.9605296664120194

In [10]:
best_model.best_score_
y_pred = best_model.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9480651731160896


In [45]:
#Confidence Score of Model
best_model.predict_proba(X_test)

array([[0.92932136, 0.07067864],
       [0.98226042, 0.01773958],
       [1.        , 0.        ],
       ...,
       [0.81509864, 0.18490136],
       [0.992     , 0.008     ],
       [0.89992198, 0.10007802]])

In [11]:
#Saving Model
import pickle

with open("StrokeModel.pickle", "wb") as f:
    pickle.dump(best_model, f)