In [1]:
# Mount Google Drive inside the Colab VM at /content/gdrive so the project
# folder used by the following cells is accessible from this notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os
# Point KAGGLE_CONFIG_DIR at the Drive project folder (the directory listing
# further down shows kaggle.json stored there).
# NOTE(review): presumably read by the Kaggle CLI to locate its credentials — confirm.
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/kaggle/StrokePrediction"

In [3]:
# Make the Drive project folder the working directory so the relative file
# names used by the later cells (CSV and pickles) resolve there.
%cd /content/gdrive/My Drive/kaggle/StrokePrediction

/content/gdrive/My Drive/kaggle/StrokePrediction


In [4]:
# List the working directory to check that the dataset CSV is present.
!ls

feature_names.pickle		    model.pickle	  X_test.pickle
healthcare-dataset-stroke-data.csv  model_stacked.pickle  y_re.pickle
kaggle.json			    X_re.pickle		  y_test.pickle


# Loading the data

Importing libraries

In [5]:
import pandas as pd
import numpy as np

Importing the dataset

In [6]:
# Read the stroke dataset CSV from the working directory and display it.
dataset=pd.read_csv("healthcare-dataset-stroke-data.csv")
dataset

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


# Data Preprocessing

Let's see which columns have null values 

In [26]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


But first, let's separate the dataset into the dependent variable and the independent variables.

In [8]:
# Y is the target column; X keeps every other column except the "id"
# identifier, which is dropped together with the target.
Y=dataset["stroke"]#dependent variables
X=dataset.drop(columns=["id","stroke"]) #independent variables

From above we see that only the bmi column has null values, so we need to **impute the missing values** in the bmi column.

### Imputing missing values

We are going to fill the missing values with the mean of BMI column

In [9]:
# Fill missing BMI values with the column mean and assign the result back.
# Calling fillna(inplace=True) on the X['bmi'] selection is chained assignment:
# it warns on pandas 2.x and leaves X unchanged once Copy-on-Write is enabled.
# NOTE(review): the mean is computed over all rows, before the train/test
# split below, so test rows contribute to the imputed value.
X['bmi']=X['bmi'].fillna(X['bmi'].mean())

### Applying dummy encoding

Dummy coding scheme is similar to one-hot encoding. It transforms the categorical variable into a set of binary variables (also known as **dummy variables**). In one-hot encoding, for N categories in a variable, it uses N binary variables. The dummy encoding is a small improvement over one-hot-encoding. Dummy encoding uses N-1 features to represent N labels/categories.

[Source](https://www.analyticsvidhya.com/blog/2020/08/types-of-categorical-data-encoding/)

In [10]:
# Dummy-encode the categorical columns; drop_first=True drops the first level
# of each categorical so N categories become N-1 indicator columns.
X=pd.get_dummies(X,drop_first=True) #experiment with drop_first
# Disabled experiment: also drop the gender_Other indicator column.
#X=X.drop(columns=["gender_Other"])

In [11]:
# Display the encoded feature frame to check the generated indicator columns.
X

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.600000,1,0,1,0,1,0,0,1,1,0,0
1,61.0,0,0,202.21,28.893237,0,0,1,0,0,1,0,0,0,1,0
2,80.0,0,1,105.92,32.500000,1,0,1,0,1,0,0,0,0,1,0
3,49.0,0,0,171.23,34.400000,0,0,1,0,1,0,0,1,0,0,1
4,79.0,1,0,174.12,24.000000,0,0,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,80.0,1,0,83.75,28.893237,0,0,1,0,1,0,0,1,0,1,0
5106,81.0,0,0,125.20,40.000000,0,0,1,0,0,1,0,1,0,1,0
5107,35.0,0,0,82.99,30.600000,0,0,1,0,0,1,0,0,0,1,0
5108,51.0,0,0,166.29,25.600000,1,0,1,0,1,0,0,0,1,0,0


Save the feature names for later

In [12]:
import pickle as pk
# Persist the encoded column names, in frame order, as a plain Python list.
with open("feature_names.pickle","wb") as names_file:
  pk.dump(list(X.columns),names_file)

### Splitting into training set and testing set

After applying dummy encoding we must split the dataset into training set and a test set.

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state=42 fixes the shuffle.
# NOTE(review): the split is not stratified on Y. Given the class imbalance
# described in the oversampling section, consider stratify=Y — confirm first,
# since it changes the reported scores.
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.20,random_state=42)

### Feature scaling

Now we need to scale the values in the numerical columns.

In [14]:
from sklearn.preprocessing import  MinMaxScaler
# Continuous columns to rescale; every other column in X is a 0/1 indicator.
numeric_cols=["age","avg_glucose_level","bmi"]

# The frames returned by train_test_split are derived from X, and writing into
# them triggers pandas' SettingWithCopyWarning. Take explicit copies so the
# assignments below target independent frames.
X_train=X_train.copy()
X_test=X_test.copy()

# Fit the scaler on the training rows only, then apply the same min/max to
# the test rows so no test-set statistics enter the transformation.
min_max=MinMaxScaler()
X_train[numeric_cols]=min_max.fit_transform(X_train[numeric_cols])
X_test[numeric_cols]=min_max.transform(X_test[numeric_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice fro

### Oversampling

Oversampling compensates for the lack of data points in a particular class. In this case, the positive class (1) has far fewer samples than the negative class (0). We will use SVMSMOTE to adjust the class distribution of the data.<br>
**SVMSMOTE** is a variant of the SMOTE algorithm. It uses the SVM algorithm to select the samples from which new synthetic samples are generated.<br>
[Source](https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SVMSMOTE.html)

In [15]:
from imblearn.over_sampling import SVMSMOTE
# Oversample the training split only; X_test / y_test are not resampled.
# random_state=42 seeds the resampler.
sm=SVMSMOTE(random_state=42)
X_re,y_re=sm.fit_resample(X_train,y_train)



# Model building and training

Ensemble learning combines multiple learning algorithms to obtain better predictive performance than the constituent algorithms achieve on their own. Here, we will experiment with two ensembling methods: **blending and stacking**.<br>
[Reference for blending](https://machinelearningmastery.com/blending-ensemble-machine-learning-with-python/) <br>
[Reference for stacking](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html)

### Blending

Importing models

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score,accuracy_score
from numpy import hstack

Initializing base models

In [17]:
def get_models(random_state=None):
  """Build fresh, unfitted base learners for the blending and stacking ensembles.

  Args:
    random_state: Optional seed forwarded to the estimators that accept one
      (logistic regression, random forest and SVM). The default ``None``
      leaves them unseeded, which matches calling this function with no
      arguments as the rest of the notebook does.

  Returns:
    A list of ``(name, estimator)`` tuples in the order lr, rf, nb, svm.
  """
  return [
    ("lr",LogisticRegression(random_state=random_state)),
    ("rf",RandomForestClassifier(random_state=random_state)),
    ("nb",GaussianNB()), # GaussianNB takes no random_state argument
    ("svm",SVC(random_state=random_state)),
  ]

Training function for the blended model

In [18]:
def fit_ensemble(models,X_train,X_val,y_train,y_val):
  """Fit the base models, then train a blender on their validation predictions.

  Every estimator in ``models`` is fitted in place on ``(X_train, y_train)``.
  The output of its ``predict`` on ``X_val`` becomes one feature column of
  the matrix on which a new ``RandomForestClassifier`` blender is trained
  against ``y_val``.

  Args:
    models: Iterable of ``(name, estimator)`` pairs; the estimators are mutated.
    X_train, y_train: Data used to fit the base models.
    X_val, y_val: Held-out data used to build the blender's training set.

  Returns:
    The fitted blender.
  """
  print(50*"-","Phase 1:Fit blender model",50*"-")
  val_columns=[] # one (n_val, 1) column of predictions per base model
  for name,model in models:
    print("Fitting to base model: {} ".format(name))
    model.fit(X_train,y_train)
    val_preds=model.predict(X_val)
    val_columns.append(val_preds.reshape(len(val_preds),1))

  blender_features=hstack(val_columns)
  blender=RandomForestClassifier()
  print("Fitting blender model now!")
  blender.fit(blender_features,y_val)
  return blender

Prediction function for the blended model

In [19]:
def predict_blend(models,blender,X_test):
  """Run the fitted base models and the blender on new data.

  The output of each base model's ``predict`` on ``X_test`` forms one column
  of the matrix that is passed to the blender.

  Args:
    models: Iterable of ``(name, fitted_estimator)`` pairs, in the same order
      that was used when the blender was trained.
    blender: Fitted meta-model exposing ``predict_proba`` and ``predict``.
    X_test: Samples to score.

  Returns:
    A tuple ``(proba, labels)``: column 1 of ``blender.predict_proba`` and
    the output of ``blender.predict`` for the same matrix.
  """
  print(50*"-","Phase 2:Creating final predictions",50*"-")
  test_columns=[] # one (n_samples, 1) column of predictions per base model
  for name,model in models:
    print("Making test set predictions for base model: {}".format(name))
    base_preds=model.predict(X_test)
    test_columns.append(base_preds.reshape(len(base_preds),1))
  blender_features=hstack(test_columns)
  print("Creating final blender predictions now!")
  positive_proba=blender.predict_proba(blender_features)[:,1]
  labels=blender.predict(blender_features)
  return positive_proba,labels

Tying it all together

In [20]:
# Split the resampled training data 60/40 into a base-model training part and
# a validation part on which the blender is trained.
# NOTE: this rebinds X_train / y_train, so from here on they refer to the 60%
# subset of the resampled data rather than the original training split.
X_train,X_val,y_train,y_val=train_test_split(X_re,y_re,test_size=0.4,random_state=1) #splitting full train set into : training set ,validation set
models=get_models() #intialize base models
blender=fit_ensemble(models,X_train,X_val,y_train,y_val) #fit blender model
# yhat holds the blender's positive-class probabilities, y_pred its labels.
yhat,y_pred=predict_blend(models,blender,X_test) #predict the final probabilities
# AUC is computed from the probabilities against the untouched test labels.
auc=roc_auc_score(y_test,yhat)
print("AUC score after blending:",auc)

-------------------------------------------------- Phase 1:Fit blender model --------------------------------------------------
Fitting to base model: lr 
Fitting to base model: rf 
Fitting to base model: nb 
Fitting to base model: svm 
Fitting blender model now!
-------------------------------------------------- Phase 2:Creating final predictions --------------------------------------------------
Making test set predictions for base model: lr
Making test set predictions for base model: rf
Making test set predictions for base model: nb
Making test set predictions for base model: svm
Creating final blender predictions now!
AUC score after blending: 0.8055023521505376


Save the blended model for later

In [21]:
import pickle as pk
# Pickle the fitted blender (meta-model).
# NOTE(review): only the blender is written here. predict_blend also needs the
# fitted base models held in `models`, which this cell does not save.
with open("model.pickle","wb") as f:
  pk.dump(blender,f)

### Stacking

Importing essentials

In [22]:
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold
from sklearn.ensemble import StackingClassifier

Function to create the stacked model

In [23]:
def get_stacked_model(models):
  """Wrap the base models in an unfitted ``StackingClassifier``.

  Args:
    models: List of ``(name, estimator)`` pairs used as the base estimators.

  Returns:
    A ``StackingClassifier`` configured with ``cv=5`` whose final estimator
    is a new ``RandomForestClassifier``.
  """
  return StackingClassifier(
    estimators=models,
    final_estimator=RandomForestClassifier(), # meta model that makes the final predictions
    cv=5,
  )

Combining everything and making predictions 

In [24]:
# Build new, unfitted base models; this rebinds `models`, replacing the fitted
# ones used by the blending section.
models=get_models() # get the base models
stacked=get_stacked_model(models) # create the stacked model
# Fit on the full resampled training data.
stacked.fit(X_re,y_re) #fit to the stacked model
# NOTE: y_pred is rebound here and now holds positive-class probabilities,
# not the class labels it held after the blending cell.
y_pred=stacked.predict_proba(X_test)[:,1]  
auc=roc_auc_score(y_test,y_pred)
print("Final auc score after stacking:",auc)

Final auc score after stacking: 0.7474714381720429


Saving data for hyperparameter optimisation 

In [25]:
# Pickle the resampled training data first, then the test data, one file per
# object, in the working directory.
for file_name,payload in (
  ("X_re.pickle",X_re),
  ("y_re.pickle",y_re),
  ("X_test.pickle",X_test),
  ("y_test.pickle",y_test),
):
  with open(file_name,"wb") as f:
    pk.dump(payload,f)