In [167]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
import numpy as np
%matplotlib inline

from matplotlib import style
style.use('ggplot')

In [69]:
df = pd.read_csv('stroke data.csv')
stroke_data = df.drop(columns=['stroke'])
target = df.stroke
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked


In [70]:
stroke_data.shape

(5110, 11)

In [72]:
stroke_data.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
dtype: int64

In [74]:
target.value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [84]:
stroke_data = pd.get_dummies(stroke_data)

In [95]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
stroke_data = pd.DataFrame(imp_mean.fit_transform(stroke_data))

In [97]:
stroke_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,9046.0,67.0,0.0,1.0,228.69,36.600000,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,51676.0,61.0,0.0,0.0,202.21,28.893237,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,31112.0,80.0,0.0,1.0,105.92,32.500000,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,60182.0,49.0,0.0,0.0,171.23,34.400000,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1665.0,79.0,1.0,0.0,174.12,24.000000,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234.0,80.0,1.0,0.0,83.75,28.893237,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5106,44873.0,81.0,0.0,0.0,125.20,40.000000,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5107,19723.0,35.0,0.0,0.0,82.99,30.600000,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5108,37544.0,51.0,0.0,0.0,166.29,25.600000,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [96]:
stroke_data.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
dtype: int64

In [122]:
smote = SMOTE(random_state=10)
x,y = smote.fit_resample(stroke_data,target)

balanced_stroke_data = pd.DataFrame(x,columns=stroke_data.columns)
balanced_target = pd.Series(y,name='stroke')

In [129]:
b_x_train, b_x_test, b_y_train, b_y_test = train_test_split(balanced_stroke_data,balanced_target,test_size=0.3)


In [141]:
x_train, x_test, y_train, y_test = train_test_split(stroke_data,target,test_size=0.3)


In [144]:
clf = DecisionTreeClassifier()
clf.fit(x_train,y_train)
pred = clf.predict(x_test)
print(accuracy_score(y_test,pred))
print(confusion_matrix(y_test,pred))



0.9041095890410958
[[1372   86]
 [  61   14]]


In [134]:
clf = DecisionTreeClassifier()
clf.fit(b_x_train,b_y_train)
pred = clf.predict(b_x_test)
print(accuracy_score(b_y_test,pred))
print(confusion_matrix(b_y_test,pred))

0.9424065821049022
[[1371   94]
 [  74 1378]]


In [166]:
rf_param_grid = {
    'n_estimators' : np.arange(10,200,10),
    'max_depth': np.arange(3,10,1),
}

In [168]:
rscv = RandomizedSearchCV(estimator=RandomForestClassifier(),param_distributions=rf_param_grid,cv=10,verbose=False)
rscv.fit(b_x_train,b_y_train)
print(rscv.best_params_)

{'n_estimators': 190, 'max_depth': 8}


In [148]:
clf = RandomForestClassifier()
clf.fit(x_train,y_train)
pred = clf.predict(x_test)
print(accuracy_score(y_test,pred))
print(confusion_matrix(y_test,pred))

0.9504240052185258
[[1457    1]
 [  75    0]]


In [179]:
clf = RandomForestClassifier()
clf.fit(b_x_train,b_y_train)
pred = clf.predict(b_x_test)
print(accuracy_score(b_y_test,pred))
print(confusion_matrix(b_y_test,pred))

0.9749742886527254
[[1459    6]
 [  67 1385]]


In [154]:
clf = GradientBoostingClassifier()
clf.fit(x_train,y_train)
pred = clf.predict(x_test)
print(accuracy_score(y_test,pred))
print(confusion_matrix(y_test,pred))



0.9465101108936725
[[1446   12]
 [  70    5]]


In [162]:
clf = GaussianNB()
clf.fit(b_x_train,b_y_train)
pred = clf.predict(b_x_test)
print(accuracy_score(b_y_test,pred))
print(confusion_matrix(b_y_test,pred))



0.7936235858758999
[[ 945  520]
 [  82 1370]]


In [165]:
clf = GaussianNB()
clf.fit(x_train,y_train)
pred = clf.predict(x_test)
print(accuracy_score(y_test,pred))
print(confusion_matrix(y_test,pred))



0.9093281148075668
[[1377   81]
 [  58   17]]


In [None]:
clf = GradientBoostingClassifier()
clf.fit(b_x_train,b_y_train)
pred = clf.predict(b_x_test)
print(accuracy_score(b_y_test,pred))
print(confusion_matrix(b_y_test,pred))



0.9622900239972575
[[1452   13]
 [  97 1355]]
