In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np


In [121]:
stroke_df = pd.read_csv('stroke_df_sem_out')
stroke_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,28,0,0,1,2,1,79.53,31.1,2,0
1,1,33,0,0,1,2,0,78.44,23.9,1,0
2,0,42,0,0,1,2,0,103.0,40.3,0,0
3,1,56,0,0,1,2,1,64.87,28.8,2,0
4,0,24,0,0,0,2,0,73.36,28.8,2,0


In [122]:
stroke_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,28,0,0,1,2,1,79.53,31.1,2,0
1,1,33,0,0,1,2,0,78.44,23.9,1,0
2,0,42,0,0,1,2,0,103.0,40.3,0,0
3,1,56,0,0,1,2,1,64.87,28.8,2,0
4,0,24,0,0,0,2,0,73.36,28.8,2,0


In [123]:
#features

X = (stroke_df.drop(['stroke'], axis=1))

#target

y = (stroke_df['stroke'])

In [124]:
from sklearn.model_selection import train_test_split
X_train_rs, X_valid_rs, y_train_rs, y_valid_rs = train_test_split(X,y, stratify=y,test_size=0.33,random_state=123)

In [125]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# oversample our minority class, in this case the stroke=1
X_os, y_os = resample(X_train_rs[y_train_rs == 1], y_train_rs[y_train_rs == 1], replace=True
                      , n_samples = X_train_rs[y_train_rs == 0].shape[0] # n_samples == size of majority class
                      , random_state=999
                     )
#X2, y2 = np.vstack((X[y == 0], X_os)), np.hstack((y[y == 0], y_os))
X2 = pd.concat([X_train_rs[y_train_rs == 0], X_os])
y2 = pd.concat([y_train_rs[y_train_rs == 0], y_os])
print('original imbalance:\n', y_train_rs.value_counts())
print('after resampling:\n', y2.value_counts())

original imbalance:
 stroke
0    9397
1     345
Name: count, dtype: int64
after resampling:
 stroke
0    9397
1    9397
Name: count, dtype: int64


In [131]:
num_features = ['avg_glucose_level', 'bmi']
num_tf = Pipeline(steps = [
        ('scaler', StandardScaler())
])


bin_features = ['age']
bins_tf = Pipeline(steps =[
    ('kbins', KBinsDiscretizer(n_bins = 10, strategy='uniform'))
]) 
    
# create preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('num', num_tf, num_features),    
    ('binning', bins_tf, bin_features)
    ])


baseline_lr = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('lr', LogisticRegression(max_iter=1000))
    ])




In [132]:
from sklearn import set_config
set_config(display='diagram')
baseline_lr
    

In [136]:
baseline_lr.fit(X2,y2)



In [137]:
preds = baseline_lr.predict(X_valid_rs)

In [138]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix
yt,yp = y_valid_rs, preds
f1,p,r,roc = f1_score(yt,yp), precision_score(yt,yp), recall_score(yt,yp), roc_auc_score(yt,yp)
print('Validation Set Metrics:')
print(f"f1_score {f1} | precision_score {p} | recall_score {r} | ROC AUC SCORE: {roc}")

Validation Set Metrics:
f1_score 0.17313432835820897 | precision_score 0.09634551495016612 | recall_score 0.8529411764705882 | ROC AUC SCORE: 0.779570609838232


In [139]:
cm=confusion_matrix(yt, yp)
tn, fp, fn, tp = confusion_matrix(yt, yp).ravel()

In [140]:
tn, fp, fn, tp

(3269, 1360, 25, 145)