
# Stroke Predication Model

In [1]:
# Importing Libraries:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# for displaying all feature from dataset:
pd.pandas.set_option('display.max_columns', None)

#### Reading the Preprocessed dataset

In [6]:
# Reading Dataset:
df = pd.read_csv("Stroke_data.csv")
# Top 5 records:
df.head()

Unnamed: 0,gender_Male,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes,stroke
0,1,67.0,0,1,1,0,228.69,36.6,0,1,0,0,1,0,0,1
1,0,61.0,0,0,1,1,202.21,28.1,0,0,1,0,0,1,0,1
2,1,80.0,0,1,1,1,105.92,32.5,0,1,0,0,0,1,0,1
3,0,49.0,0,0,1,0,171.23,34.4,0,1,0,0,0,0,1,1
4,0,79.0,1,0,1,1,174.12,24.0,0,0,1,0,0,1,0,1


## Attribute Information
1) id: unique identifier <br>
2) gender: "Male", "Female" or "Other" <br>
3) age: age of the patient <br>
4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension <br>
5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease <br>
6) ever_married: "No" or "Yes" <br>
7) work_type: "children", "Govt_jov", "Never_worked", "Private" or "Self-employed" <br>
8) Residence_type: "Rural" or "Urban" <br>
9) avg_glucose_level: average glucose level in blood <br>
10) bmi: body mass index <br>
11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"* <br>
12) stroke: 1 if the patient had a stroke or 0 if not <br>
*Note: "Unknown" in smoking_status means that the information is unavailable for this patient

#### # Dependent & Independent Feature:

In [7]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [8]:
X.head()

Unnamed: 0,gender_Male,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes
0,1,67.0,0,1,1,0,228.69,36.6,0,1,0,0,1,0,0
1,0,61.0,0,0,1,1,202.21,28.1,0,0,1,0,0,1,0
2,1,80.0,0,1,1,1,105.92,32.5,0,1,0,0,0,1,0
3,0,49.0,0,0,1,0,171.23,34.4,0,1,0,0,0,0,1
4,0,79.0,1,0,1,1,174.12,24.0,0,0,1,0,0,1,0


#### # Train-Test Split:

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [10]:
print(X_train.shape)
print(X_test.shape)

(4087, 15)
(1022, 15)


In [11]:
# Importing Performance Metrics:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

### # RandomForestClassifier:

In [12]:
from sklearn.ensemble import RandomForestClassifier
RandomForest = RandomForestClassifier()
RandomForest = RandomForest.fit(X_train,y_train)

# Predictions:
y_pred = RandomForest.predict(X_test)

# Performance:
print('Accuracy:', accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 0.9461839530332681
[[967   1]
 [ 54   0]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       968
           1       0.00      0.00      0.00        54

    accuracy                           0.95      1022
   macro avg       0.47      0.50      0.49      1022
weighted avg       0.90      0.95      0.92      1022



In [13]:
# AdaBoostClassifier:
from sklearn.ensemble import AdaBoostClassifier
AdaBoost = AdaBoostClassifier()
AdaBoost = AdaBoost.fit(X_train,y_train)

# Predictions:
y_pred = AdaBoost.predict(X_test)

# Performance:
print('Accuracy:', accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 0.9461839530332681
[[966   2]
 [ 53   1]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       968
           1       0.33      0.02      0.04        54

    accuracy                           0.95      1022
   macro avg       0.64      0.51      0.50      1022
weighted avg       0.92      0.95      0.92      1022



In [14]:
# GradientBoostingClassifier:
from sklearn.ensemble import GradientBoostingClassifier
GradientBoost = GradientBoostingClassifier()
GradientBoost = GradientBoost.fit(X_train,y_train)

# Predictions:
y_pred = GradientBoost.predict(X_test)

# Performance:
print('Accuracy:', accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 0.9461839530332681
[[966   2]
 [ 53   1]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       968
           1       0.33      0.02      0.04        54

    accuracy                           0.95      1022
   macro avg       0.64      0.51      0.50      1022
weighted avg       0.92      0.95      0.92      1022



#### RandomizedSearchCV

In [15]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 20)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 200,20)]
# Minimum number of samples required to split a node
min_samples_split = [1,2,3,4,5,6,7,8,9,10,12,14,16,18,20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,3,4,5,6,7,8,9,10,12,14,16,18,20]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

rf = RandomForestClassifier()
rf_randomcv = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=10,verbose=2,
                               random_state=0,n_jobs=-1)
### fit the randomized model
rf_randomcv.fit(X_train,y_train)

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200], 'min_samples_split': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20], 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20], 'criterion': ['entropy', 'gini']}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [16]:
rf_randomcv.best_params_

{'n_estimators': 120,
 'min_samples_split': 1,
 'min_samples_leaf': 20,
 'max_features': 'sqrt',
 'max_depth': 30,
 'criterion': 'gini'}

In [17]:
RandomForest_RandomCV = RandomForestClassifier(criterion='gini', n_estimators=100, max_depth=130, max_features='auto', min_samples_split=14, min_samples_leaf=16)
RandomForest_RandomCV = RandomForest_RandomCV.fit(X_train,y_train)

# Predictions:
y_pred = RandomForest_RandomCV.predict(X_test)

# Performance:
print('Accuracy:', accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

  warn(


Accuracy: 0.9471624266144814
[[968   0]
 [ 54   0]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       968
           1       0.00      0.00      0.00        54

    accuracy                           0.95      1022
   macro avg       0.47      0.50      0.49      1022
weighted avg       0.90      0.95      0.92      1022



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# ----------------------------------------------------------------------------------------

## SMOTE


**SMOTE (Synthetic Minority Over-sampling Technique) is a popular technique used in data science and machine learning to address class imbalance in datasets. Class imbalance occurs when the classes in the target variable are not represented equally, resulting in a skewed distribution.**

**SMOTE works by generating synthetic samples for the minority class to increase its representation in the dataset. It creates new synthetic instances by interpolating between existing minority class samples. The algorithm selects a random sample from the minority class, identifies its k nearest neighbors, and creates synthetic samples by combining the attributes of the selected sample and its neighbors.**

In [19]:
import delayed

In [20]:
from imblearn.combine import SMOTETomek
smote = SMOTETomek()
X_smote, y_smote = smote.fit_resample(X,y)

In [21]:
from collections import Counter
print('Before SMOTE : ', Counter(y))
print('After SMOTE  : ', Counter(y_smote))

Before SMOTE :  Counter({0: 4860, 1: 249})
After SMOTE  :  Counter({1: 4817, 0: 4817})


In [22]:
# Train Test Split:
X_train, X_test, y_train, y_test = train_test_split(X_smote,y_smote, test_size=0.2, random_state=0)

# RandomForestClassifier:
RandomForest = RandomForestClassifier()
RandomForest = RandomForest.fit(X_train,y_train)

# Predictions:
y_pred = RandomForest.predict(X_test)

# Performance:
print('Accuracy:', accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 0.9486248053969901
[[889  57]
 [ 42 939]]
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       946
           1       0.94      0.96      0.95       981

    accuracy                           0.95      1927
   macro avg       0.95      0.95      0.95      1927
weighted avg       0.95      0.95      0.95      1927



## Over Sampling

In [27]:
from imblearn.over_sampling import RandomOverSampler

oversampler = RandomOverSampler(sampling_strategy=0.4)
x_oversampler, y_oversampler = oversampler.fit_resample(X, y)


In [28]:
print('Before RandomOverSampler : ', Counter(y))
print('After RandomOverSampler  : ', Counter(y_oversampler))

Before RandomOverSampler :  Counter({0: 4860, 1: 249})
After RandomOverSampler  :  Counter({0: 4860, 1: 1944})


- #### We make 60-40% data.

In [29]:
# Train Test Split:
X_train, X_test, y_train, y_test = train_test_split(x_oversampler,y_oversampler, test_size=0.2, random_state=0)

# RandomForestClassifier:
RandomForest = RandomForestClassifier()
RandomForest = RandomForest.fit(X_train,y_train)

# Predictions:
y_pred = RandomForest.predict(X_test)

# Performance:
print('Accuracy:', accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 0.9904481998530492
[[965  11]
 [  2 383]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       976
           1       0.97      0.99      0.98       385

    accuracy                           0.99      1361
   macro avg       0.99      0.99      0.99      1361
weighted avg       0.99      0.99      0.99      1361

