In [1]:
import numpy as np
import pandas as pd
import seaborn as sb

# Report the library versions this notebook was executed with.
for label, module in (('Numpy:', np), ('Pandas:', pd), ('Seaborn:', sb)):
    print(label, module.__version__)

# Read the stroke dataset from the working directory.
df = pd.read_csv('stroke.csv')

# Preview the first rows as the cell output.
df.head()

Numpy: 1.21.5
Pandas: 1.4.2
Seaborn: 0.11.2


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [2]:
#Deleting features that aren't very useful
#Guarded so that re-running this cell does not raise KeyError once 'id' is gone
if 'id' in df:
    del df['id']

#Imputing missing BMI values
from sklearn.impute import KNNImputer
import copy

#Each missing entry is filled in from the 5 nearest rows
knn_obj = KNNImputer(n_neighbors=5)

#Numeric columns handed to the imputer
#NOTE(review): 'stroke' is the prediction target; including it here lets label
#information influence the imputed feature values -- confirm this is intended
features_to_use = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke']

temp = df[features_to_use].to_numpy()

#Fit the imputer and fill the gaps in a single pass
temp_imputed = knn_obj.fit_transform(temp)

#Work on a copy so the frame loaded from disk stays untouched
df_imputed = copy.deepcopy(df)
#The imputer returns a float array, so every listed column becomes float64
df_imputed[features_to_use] = temp_imputed
#Drop any row that still has a missing value in the remaining columns
df_imputed.dropna(inplace=True)
df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   float64
 3   heart_disease      5110 non-null   float64
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   float64
dtypes: float64(6), object(5)
memory usage: 439.3+ KB


In [3]:
#One hot encoding the categorical features
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

#Only encode columns that are still present, so the cell can be re-run safely
#after the raw categorical columns have already been replaced by their dummies
columns_to_encode = [col for col in categorical_features if col in df_imputed]

if columns_to_encode:
    #get_dummies drops each listed column and appends its '<column>_<value>'
    #indicator columns after the untouched ones, in the order listed
    df_imputed = pd.get_dummies(df_imputed, columns=columns_to_encode)

df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             5110 non-null   float64
 1   hypertension                    5110 non-null   float64
 2   heart_disease                   5110 non-null   float64
 3   avg_glucose_level               5110 non-null   float64
 4   bmi                             5110 non-null   float64
 5   stroke                          5110 non-null   float64
 6   gender_Female                   5110 non-null   uint8  
 7   gender_Male                     5110 non-null   uint8  
 8   gender_Other                    5110 non-null   uint8  
 9   ever_married_No                 5110 non-null   uint8  
 10  ever_married_Yes                5110 non-null   uint8  
 11  work_type_Govt_job              5110 non-null   uint8  
 12  work_type_Never_worked          51

## Splitting Training and Test Sets

In [21]:
from sklearn.model_selection import ShuffleSplit

#Guarded so the target is only extracted (and the features only normalized) once
if 'stroke' in df_imputed:
    #Separate the target from the feature frame
    Y = df_imputed['stroke'].to_numpy()
    del df_imputed['stroke']
    #normalizing data: z-score the continuous columns (subtract mean, divide by std)
    norm_features = ['age', 'bmi', 'avg_glucose_level']
    df_imputed[norm_features] = (df_imputed[norm_features] - df_imputed[norm_features].mean()) / df_imputed[norm_features].std()

#Feature matrix built from every remaining column
X = df_imputed.to_numpy()

num_cv_iterations = 3
num_instances = len(Y)
#Three independent random 80/20 train/test partitions
cv_object = ShuffleSplit(
                        n_splits = num_cv_iterations,
                        test_size = 0.2)

print(cv_object)

ShuffleSplit(n_splits=3, random_state=None, test_size=0.2, train_size=None)


In [30]:
from numpy.linalg import pinv
from sklearn.metrics import accuracy_score
from scipy.special import expit

class BLR:
    """Binary logistic regression trained with batch gradient ascent.

    Parameters
    ----------
    eta : float
        Step size applied to each gradient update.
    iterations : int, default 20
        Number of full-batch updates performed by ``fit``.
    C : float, default 0.001
        Strength of the L2 penalty; the bias weight is not penalised.

    Attributes
    ----------
    w_ : ndarray of shape (n_features + 1, 1)
        Learned weights, bias first. Only present after ``fit``.
    """
    def __init__(self, eta, iterations=20, C=0.001):
        self.eta = eta
        self.iters = iterations
        self.C = C

    def __str__(self):
        """Describe the model, including its weights once it has been trained."""
        if(hasattr(self,'w_')):
            return 'Binary Logistic Regression Object with coefficients:\n' + str(self.w_)
        else:
            return 'Untrained Binary Logistic Regression Object'

    @staticmethod
    def _add_bias(X):
        """Return ``X`` with a leading column of ones for the bias term."""
        return np.hstack((np.ones((X.shape[0],1)),X))

    @staticmethod
    def _sigmoid(theta):
        """Element-wise logistic function."""
        return expit(theta)

    def _get_gradient(self,X,Y):
        """Return the penalised log-likelihood gradient, shaped like ``self.w_``.

        ``X`` must already contain the bias column and ``Y`` is a 1-D array
        of 0/1 labels.
        """
        Ydiff = Y-self.predict_proba(X,add_bias=False).ravel()
        # Average per-sample contribution of each feature to the likelihood gradient
        gradient = np.mean(X*Ydiff[:,np.newaxis], axis=0)

        gradient = gradient.reshape(self.w_.shape)
        # L2 penalty on every weight except the bias (row 0)
        gradient[1:] += -2*self.w_[1:]*self.C

        return gradient

    def predict_proba(self,X,add_bias=True):
        """Return P(y=1 | x) for each row of ``X`` as an (n_samples, 1) array."""
        Xb = self._add_bias(X) if add_bias else X
        return self._sigmoid(Xb @ self.w_)

    def predict(self, X):
        """Return boolean class predictions (probability above 0.5)."""
        return (self.predict_proba(X)>0.5)

    def fit(self,X,Y):
        """Learn the weights from features ``X`` and 0/1 labels ``Y``.

        The weights are reset to zero on every call, so refitting starts fresh.
        """
        Xb = self._add_bias(X)
        num_samples, num_features = Xb.shape

        self.w_ = np.zeros((num_features,1))

        for _ in range(self.iters):
            gradient = self._get_gradient(Xb,Y)
            # Ascent step: move the weights along the gradient
            self.w_ += gradient*self.eta
            

In [31]:
class HessianBLR(BLR):
    """Logistic regression whose update direction is a Newton step.

    Overrides ``_get_gradient`` so that the inherited ``fit`` loop moves the
    weights along (curvature)^-1 @ gradient instead of the plain gradient.
    """
    def _get_gradient(self,X,Y):
        """Return the Newton update direction, shaped like ``self.w_``.

        ``X`` must already contain the bias column and ``Y`` is a 1-D array
        of 0/1 labels.
        """
        g = self.predict_proba(X,add_bias=False).ravel()

        # X^T diag(g(1-g)) X, computed by scaling the rows of X so the
        # n-by-n diagonal matrix is never materialised
        hessian = X.T @ (X*(g*(1-g))[:,np.newaxis])

        # Curvature of the L2 penalty: 2C on the diagonal for every penalised
        # weight. The bias (index 0) is excluded, matching the gradient below.
        penalty = 2*self.C*np.eye(X.shape[1])
        penalty[0,0] = 0
        hessian = hessian + penalty

        Ydiff = Y - g
        # Summed (not averaged) so the gradient is on the same scale as the Hessian
        gradient = np.sum(X*Ydiff[:,np.newaxis], axis=0)
        gradient = gradient.reshape(self.w_.shape)
        gradient[1:] += -2*self.w_[1:]*self.C

        # The pseudo-inverse keeps the step defined even if the curvature matrix is singular
        return pinv(hessian) @ gradient

In [32]:
from sklearn import metrics as mt 
# Newton-step logistic regression, scored on every shuffled train/test split
lr_clf = HessianBLR(eta=0.1, iterations=50, C=0.001)
iter_num = 0

for train_indices, test_indices in cv_object.split(X,Y):
    # Slice out this split's training and held-out partitions
    X_train, Y_train = X[train_indices], Y[train_indices]
    X_test, Y_test = X[test_indices], Y[test_indices]

    # Train on this split, then predict the held-out rows
    lr_clf.fit(X_train,Y_train)
    Y_hat = lr_clf.predict(X_test)

    # Score the predictions against the held-out labels
    acc = mt.accuracy_score(Y_test,Y_hat)
    conf = mt.confusion_matrix(Y_test,Y_hat)

    print("====Iteration",iter_num," ====")
    print("accuracy", acc )
    print("confusion matrix\n",conf)
    iter_num += 1
    

====Iteration 0  ====
accuracy 0.9608610567514677
confusion matrix
 [[982   0]
 [ 40   0]]
====Iteration 1  ====
accuracy 0.9481409001956947
confusion matrix
 [[969   0]
 [ 53   0]]
====Iteration 2  ====
accuracy 0.9608610567514677
confusion matrix
 [[982   0]
 [ 40   0]]
