# Stroke prediction based on patient data

In [3]:
import statistics
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
#from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC  
from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline                   #Import Pipeline
from sklearn.preprocessing import StandardScaler 
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold

### Reading and pre-processing the data set

In [5]:
data_set = pd.read_csv ("healthcare-dataset-stroke-data.csv").dropna().to_numpy()

data_set = data_set[:,1:]
#print(data_set)
n,m = data_set.shape


for i in range(n):

    if(data_set[i][0] == "Male"):
        data_set[i][0] = 0
    if(data_set[i][0] == "Female"):
        data_set[i][0] = 1
    if(data_set[i][0] == "Other"):
        data_set[i][0] = 2
    if(data_set[i][4] == "Yes"):
        data_set[i][4] = 1
    if(data_set[i][4] == "No"):
        data_set[i][4] = 0
    if(data_set[i][5] == "Private"):
        data_set[i][5] = 0
    if(data_set[i][5] == "Self-employed"):
        data_set[i][5] = 1
    if(data_set[i][5] == "children"):
        data_set[i][5] = 2
    if(data_set[i][5] == "Govt_job"):
        data_set[i][5] = 3
    if(data_set[i][5] == "Never_worked"):
        data_set[i][5] = 4
    if(data_set[i][6] == "Urban"):
        data_set[i][6] = 0
    if(data_set[i][6] == "Rural"):
        data_set[i][6] = 1
    if(data_set[i][9] == "never smoked"):
        data_set[i][9] = 0
    if(data_set[i][9] == "Unknown"):
        data_set[i][9] = 1
    if(data_set[i][9] == "formerly smoked"):
        data_set[i][9] = 2
    if(data_set[i][9] == "smokes"):
        data_set[i][9] = 3
    if(data_set[i][10] == 0):
        data_set[i][10] = "No"
    if(data_set[i][10] == 1):
        data_set[i][10] = "Yes"

        

In [7]:
np.random.shuffle(data_set)
x = data_set[:,:-1]
y = data_set[:,-1]
oversample = SMOTE()
x,y = oversample.fit_resample(x, y)
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=21)


### Classification of the data with different methods

In [9]:
#Random forest
clf = RandomForestClassifier( n_estimators=100, random_state=21 )
scaler = StandardScaler()
pipe_rt = Pipeline([('scaler', scaler), ('clf', clf)])

#Support Vector Machines
svm = SVC()
scaler = StandardScaler()
pipe_svc = Pipeline([('scaler', scaler), ('clf', svm)])

#k-Nearest Neighbours
ada = KNeighborsClassifier()
scaler = StandardScaler()
pipe_kn = Pipeline([('scaler', scaler), ('clf', ada)])

##### 5-fold cross-validation for the three models

In [13]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)


avg_error_rt_t = []
avg_erro_svm_t = []
avg_error_kn_t = []

avg_error_rt_v = []
avg_erro_svm_v = []
avg_error_kn_v = []

for train_index, test_index in skf.split(X_train, y_train):

    X_tr, X_te = X_train[train_index], X_train[test_index]
    y_tr, y_te = y_train[train_index], y_train[test_index]
    
    pipe_rt.fit(X_tr, y_tr)
    pipe_svc.fit(X_tr, y_tr)
    pipe_kn.fit(X_tr, y_tr)
    
    y_pred_tr = pipe_rt.predict(X_tr)
    avg_error_rt_t.append(accuracy_score(y_tr, y_pred_tr))
    y_pred_v = pipe_rt.predict(X_te)
    avg_error_rt_v.append(accuracy_score(y_te, y_pred_v))
    
    y_pred_tr = pipe_svc.predict(X_tr)
    avg_erro_svm_t.append(accuracy_score(y_tr, y_pred_tr))
    y_pred_v = pipe_svc.predict(X_te)
    avg_erro_svm_v.append(accuracy_score(y_te, y_pred_v))
    
    y_pred_tr = pipe_kn.predict(X_tr)
    avg_error_kn_t.append(accuracy_score(y_tr, y_pred_tr))
    y_pred_v = pipe_kn.predict(X_te)
    avg_error_kn_v.append(accuracy_score(y_te, y_pred_v))
    
    

print("Average training error rate of random forest after 5-fold cross-validation :")
print(1-statistics.mean(avg_error_rt_t))
print("Average error training rate of svm after 5-fold cross-validation:")
print(1-statistics.mean(avg_erro_svm_t))
print("Average error training rate of KNeighborsClassifier after 5-fold cross-validation:")
print(1-statistics.mean(avg_error_kn_t))


print("Average validation error rate of random forest after 5-fold cross-validation :")
print(1-statistics.mean(avg_error_rt_v))
print("Average validation training rate of svm after 5-fold cross-validation:")
print(1-statistics.mean(avg_erro_svm_v))
print("Average validation training rate of KNeighborsClassifier after 5-fold cross-validation:")
print(1-statistics.mean(avg_error_kn_v))


Average training error rate of random forest after 5-fold cross-validation :
0.0
Average error training rate of svm after 5-fold cross-validation:
0.0761303191489362
Average error training rate of KNeighborsClassifier after 5-fold cross-validation:
0.05123005319148943
Average validation error rate of random forest after 5-fold cross-validation :
0.026329787234042556
Average validation training rate of svm after 5-fold cross-validation:
0.09042553191489366
Average validation training rate of KNeighborsClassifier after 5-fold cross-validation:
0.07965425531914894


### Final evaluation with best model (Random Forest)

In [None]:
best = pipe_rt.fit(X_train,y_train)

y_pred_tr = best.predict(X_train)
print(accuracy_score(y_train, y_pred_tr))
y_pred_v =best.predict(X_val)
print(accuracy_score(y_val, y_pred_v))

plot_confusion_matrix(best, X_val,y_val)

plt.show() 
