# Stroke Prediction Model

#### Importing the required dependencies

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Show every column when a DataFrame is displayed (None = no column limit):
pd.set_option('display.max_columns', None)

#### Importing the Preprocessed dataset 

In [4]:
# Load the preprocessed stroke data (path is relative to the working directory):
df = pd.read_csv(filepath_or_buffer="Preprocessed_Stroke_data.csv")

#### Let's have a look at the data

In [5]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,gender_Male,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes,stroke
0,1,67.0,0,1,1,0,228.69,36.6,0,1,0,0,1,0,0,1
1,0,61.0,0,0,1,1,202.21,28.1,0,0,1,0,0,1,0,1
2,1,80.0,0,1,1,1,105.92,32.5,0,1,0,0,0,1,0,1
3,0,49.0,0,0,1,0,171.23,34.4,0,1,0,0,0,0,1,1
4,0,79.0,1,0,1,1,174.12,24.0,0,0,1,0,0,1,0,1


**Attribute Information** (raw dataset, before preprocessing; the table above shows the encoded columns)

- id: unique identifier
- gender: "Male", "Female" or "Other"
- age: age of the patient
- hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
- heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
- ever_married: "No" or "Yes"
- work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
- Residence_type: "Rural" or "Urban"
- avg_glucose_level: average glucose level in blood
- bmi: body mass index
- smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
- stroke: 1 if the patient had a stroke or 0 if not

*Note: "Unknown" in smoking_status means that the information is unavailable for this patient*

In [6]:
# Dataset size as a (number of rows, number of columns) tuple
df.shape

(5353, 16)

In [7]:
# Summary statistics of the dataset: count, mean, std, min, quartiles and max per column
df.describe()

Unnamed: 0,gender_Male,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes,stroke
count,5353.0,5353.0,5353.0,5353.0,5353.0,5353.0,5353.0,5353.0,5353.0,5353.0,5353.0,5353.0,5353.0,5353.0,5353.0,5353.0
mean,0.415468,44.34915,0.104614,0.061834,0.667103,0.494676,107.469968,28.865851,0.00411,0.572576,0.167009,0.128339,0.17691,0.369512,0.15524,0.092098
std,0.492849,22.836951,0.306084,0.240877,0.471294,0.500018,46.545976,7.612528,0.063982,0.494751,0.373019,0.334498,0.381628,0.482718,0.362167,0.289191
min,0.0,0.08,0.0,0.0,0.0,0.0,55.12,10.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,26.0,0.0,0.0,0.0,0.0,77.44,23.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,46.0,0.0,0.0,1.0,0.0,92.24,28.1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,62.0,0.0,0.0,1.0,1.0,115.52,32.7,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
max,1.0,82.0,1.0,1.0,1.0,1.0,271.74,97.6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Separating the dependent and independent variables 

In [8]:
# Split the frame by position: every column except the last one is an
# independent feature (X); the last column is the dependent variable (y).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [9]:
# Over Sampling:
from imblearn.over_sampling import RandomOverSampler

# sampling_strategy=0.4: duplicate minority-class rows until the
# minority/majority count ratio reaches 0.4.
# random_state is fixed so the same rows are drawn on every run.
oversampler = RandomOverSampler(sampling_strategy=0.4, random_state=0)
x_oversampler, y_oversampler = oversampler.fit_resample(X, y)

#### Train Test Splitting of the dataset

In [10]:
# Train Test Split:
from sklearn.model_selection import train_test_split

# Split the ORIGINAL rows first. Splitting after oversampling would let copies
# of the same minority row land in both the training and the test set, which
# leaks test data into training and inflates any score computed on X_test.
# stratify=y keeps the class ratio identical in both parts.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Oversample the training part only; the test set keeps the class
# distribution of the loaded data.
X_train, y_train = oversampler.fit_resample(X_train_raw, y_train_raw)

### Now it's time to fit the ML model

In [11]:
# RandomForestClassifier:

from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the fitted forest is the same on every run.
# fit() returns the estimator itself, so construction and training are chained.
RandomForest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

#### Creating a pickle file for the classifier

In [12]:
# Serialize the fitted classifier to disk. The context manager closes the file
# handle even if pickling raises, so the file is never left open.
filename = 'Stroke.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(RandomForest, model_file)