# Logistic Regression
## Stroke Prediction Dataset

### Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV

### Loading the raw dataset

In [2]:
raw_df = pd.read_csv(r'..\Datasets\healthcare-dataset-stroke-data.csv')
raw_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


### Data Preparation

#### Dropping the ID column

In [3]:
raw_df.drop(['id'], axis = 1, inplace = True)
raw_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


#### Data features

In [4]:
X = raw_df.drop(['stroke'], axis = 1)
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked


#### Data labels

In [5]:
y = raw_df['stroke']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: stroke, dtype: int64

### Creating Model pipeline

#### One Hot Encoding for categorical columns

In [6]:
CATEGORICAL_COLS = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
ohe_categories = [list(raw_df[col_name].unique()) for col_name in CATEGORICAL_COLS]
ohe_categories

[['Male', 'Female', 'Other'],
 ['Yes', 'No'],
 ['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
 ['Urban', 'Rural'],
 ['formerly smoked', 'never smoked', 'smokes', 'Unknown']]

In [7]:
ohe_transformer = ColumnTransformer([('ohe', OneHotEncoder(categories = ohe_categories, drop = 'first'), CATEGORICAL_COLS)], 
    sparse_threshold = 0, remainder='passthrough')

#### Encoding and Imputing Pipeline

In [8]:
ohe_imputer = Pipeline([
    ('ohe_tranformer', ohe_transformer),
    ('knn_imputer', KNNImputer())
])

#### Min max scalar for numerical columns

In [10]:
minmax_scaler = ColumnTransformer([('minmax', MinMaxScaler(), [11, 14, 15])], 
    sparse_threshold = 0, remainder='passthrough')

#### Logistic Regression pipeline

In [11]:
clf_pipeline = Pipeline([
    ('ohe_imputer', ohe_imputer),
    ('minmax_scaler', minmax_scaler),
    ('logreg', LogisticRegression(class_weight = 'balanced'))
])

### Model Fitting

#### Grid Search CV using Stratified K-fold

In [12]:
skf = StratifiedKFold(n_splits = 5)

params = [{
    'ohe_imputer__knn_imputer__n_neighbors': [3, 5, 7],
    'ohe_imputer__knn_imputer__weights': ['uniform', 'distance'],
    'minmax_scaler__minmax__clip': [True, False],
    'logreg__penalty': ['l2', 'none'],
    'logreg__tol': np.logspace(-5, -3, 3),
    'logreg__C': np.logspace(-5, -1, 5),
    'logreg__max_iter': [50, 100, 150],
    'logreg__solver': ['newton-cg', 'lbfgs', 'sag']
},
{
    'ohe_imputer__knn_imputer__n_neighbors': [3, 5, 7],
    'ohe_imputer__knn_imputer__weights': ['uniform', 'distance'],
    'minmax_scaler__minmax__clip': [True, False],
    'logreg__penalty': ['l1', 'l2'],
    'logreg__tol': np.logspace(-5, -3, 3),
    'logreg__C': np.logspace(-5, -1, 5),
    'logreg__max_iter': [50, 100, 150],
    'logreg__solver': ['liblinear', 'saga']
}]

In [13]:
gs = GridSearchCV(clf_pipeline, params, cv = skf, n_jobs = -1)

#### All possible params for Grid Search CV

In [14]:
sorted(gs.get_params().keys())

['cv',
 'error_score',
 'estimator',
 'estimator__logreg',
 'estimator__logreg__C',
 'estimator__logreg__class_weight',
 'estimator__logreg__dual',
 'estimator__logreg__fit_intercept',
 'estimator__logreg__intercept_scaling',
 'estimator__logreg__l1_ratio',
 'estimator__logreg__max_iter',
 'estimator__logreg__multi_class',
 'estimator__logreg__n_jobs',
 'estimator__logreg__penalty',
 'estimator__logreg__random_state',
 'estimator__logreg__solver',
 'estimator__logreg__tol',
 'estimator__logreg__verbose',
 'estimator__logreg__warm_start',
 'estimator__memory',
 'estimator__minmax_scaler',
 'estimator__minmax_scaler__minmax',
 'estimator__minmax_scaler__minmax__clip',
 'estimator__minmax_scaler__minmax__copy',
 'estimator__minmax_scaler__minmax__feature_range',
 'estimator__minmax_scaler__n_jobs',
 'estimator__minmax_scaler__remainder',
 'estimator__minmax_scaler__sparse_threshold',
 'estimator__minmax_scaler__transformer_weights',
 'estimator__minmax_scaler__transformers',
 'estimator__

#### Results

In [15]:
%%time

gs.fit(X, y)

cv_results = pd.DataFrame(gs.cv_results_).sort_values('mean_test_score', ascending = False)
cv_results.head()

Wall time: 12min 27s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logreg__C,param_logreg__max_iter,param_logreg__penalty,param_logreg__solver,param_logreg__tol,param_minmax_scaler__minmax__clip,...,param_ohe_imputer__knn_imputer__weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4396,0.194879,0.030451,0.051863,0.01175,0.001,150,l1,liblinear,1e-05,True,...,uniform,"{'logreg__C': 0.001, 'logreg__max_iter': 150, ...",0.951076,0.951076,0.951076,0.951076,0.952055,0.951272,0.000391,1
3992,0.174532,0.018041,0.051463,0.017037,0.0001,150,l1,liblinear,0.001,False,...,uniform,"{'logreg__C': 0.0001, 'logreg__max_iter': 150,...",0.951076,0.951076,0.951076,0.951076,0.952055,0.951272,0.000391,1
4279,0.202657,0.02647,0.062632,0.014918,0.001,100,l1,liblinear,0.001,False,...,distance,"{'logreg__C': 0.001, 'logreg__max_iter': 100, ...",0.951076,0.951076,0.951076,0.951076,0.952055,0.951272,0.000391,1
4280,0.177526,0.011717,0.05246,0.014529,0.001,100,l1,liblinear,0.001,False,...,uniform,"{'logreg__C': 0.001, 'logreg__max_iter': 100, ...",0.951076,0.951076,0.951076,0.951076,0.952055,0.951272,0.000391,1
4281,0.206847,0.042956,0.062831,0.004593,0.001,100,l1,liblinear,0.001,False,...,distance,"{'logreg__C': 0.001, 'logreg__max_iter': 100, ...",0.951076,0.951076,0.951076,0.951076,0.952055,0.951272,0.000391,1


#### Best Parameters for the model

In [16]:
gs.best_params_

{'logreg__C': 1e-05,
 'logreg__max_iter': 50,
 'logreg__penalty': 'l1',
 'logreg__solver': 'liblinear',
 'logreg__tol': 1e-05,
 'minmax_scaler__minmax__clip': True,
 'ohe_imputer__knn_imputer__n_neighbors': 3,
 'ohe_imputer__knn_imputer__weights': 'uniform'}

#### All parameter combinations with the best score sorted by least time taken to predict

In [17]:
param_cols = [c for c in cv_results.columns if c.startswith('param_')]
best_params_df = cv_results[cv_results['mean_test_score'] == gs.best_score_][param_cols + ['mean_score_time']].sort_values('mean_score_time')
best_params_df.head()

Unnamed: 0,param_logreg__C,param_logreg__max_iter,param_logreg__penalty,param_logreg__solver,param_logreg__tol,param_minmax_scaler__minmax__clip,param_ohe_imputer__knn_imputer__n_neighbors,param_ohe_imputer__knn_imputer__weights,mean_score_time
3240,1e-05,50,l1,liblinear,1e-05,True,3,uniform,0.036705
4295,0.001,100,l1,saga,1e-05,False,7,distance,0.042696
3385,1e-05,100,l1,liblinear,1e-05,True,3,distance,0.042886
4420,0.001,150,l1,liblinear,0.001,True,7,uniform,0.04468
3532,1e-05,150,l1,liblinear,1e-05,True,7,uniform,0.04488


In [18]:
for col_name in param_cols:
    print(f"{col_name:50}", best_params_df[col_name].unique())

param_logreg__C                                    [1e-05 0.001 0.0001]
param_logreg__max_iter                             [50 100 150]
param_logreg__penalty                              ['l1']
param_logreg__solver                               ['liblinear' 'saga']
param_logreg__tol                                  [1e-05 0.001 0.0001]
param_minmax_scaler__minmax__clip                  [True False]
param_ohe_imputer__knn_imputer__n_neighbors        [3 7 5]
param_ohe_imputer__knn_imputer__weights            ['uniform' 'distance']
