In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression


In [1]:
import pandas as pd

# the relative path of the stroke csv dataset is saved into the variable data for ease of use
data = '../data/processed/stroke_copy_processed.csv'

# The data is read and saved in a variable (stroke)
stroke_processed = pd.read_csv(data)

stroke_processed = pd.DataFrame(stroke_processed)

stroke_processed

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_category,bmi_category,glucose_category
0,9046,Male,67.0,0,1,Yes,Private,Urban,229,37,formerly smoked,1,Senior,Obesity,Diabetic
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202,34,never smoked,1,Adult,Obesity,Diabetic
2,31112,Male,80.0,0,1,Yes,Private,Rural,106,32,never smoked,1,Senior,Obesity,Healthy
3,60182,Female,49.0,0,0,Yes,Private,Urban,171,34,smokes,1,Adult,Obesity,Pre-Diabetic
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174,24,never smoked,1,Senior,Healthy Weight,Pre-Diabetic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,84,27,never smoked,0,Senior,Overweight,Healthy
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125,40,never smoked,0,Senior,Obesity,Healthy
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,83,31,never smoked,0,Adult,Obesity,Healthy
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166,26,formerly smoked,0,Adult,Overweight,Pre-Diabetic


In [6]:
# stroke_processed.describe(include='all')

stroke_processed


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_category,bmi_category,glucose_category
0,9046,Male,67.0,0,1,Yes,Private,Urban,229,37,formerly smoked,1,Senior,Obesity,Diabetic
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202,34,never smoked,1,Adult,Obesity,Diabetic
2,31112,Male,80.0,0,1,Yes,Private,Rural,106,32,never smoked,1,Senior,Obesity,Healthy
3,60182,Female,49.0,0,0,Yes,Private,Urban,171,34,smokes,1,Adult,Obesity,Pre-Diabetic
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174,24,never smoked,1,Senior,Healthy Weight,Pre-Diabetic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,84,27,never smoked,0,Senior,Overweight,Healthy
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125,40,never smoked,0,Senior,Obesity,Healthy
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,83,31,never smoked,0,Adult,Obesity,Healthy
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166,26,formerly smoked,0,Adult,Overweight,Pre-Diabetic


In [7]:
stroke = stroke_processed.copy()
stroke_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   int64  
 9   bmi                5110 non-null   int64  
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
 12  age_category       5110 non-null   object 
 13  bmi_category       5110 non-null   object 
 14  glucose_category   5110 non-null   object 
dtypes: float64(1), int64(6), object(8)
memory usage: 599.0+ KB


In [5]:


# Define feature groups
numerical_features = ['age', 'avg_glucose_level', 'bmi']
categorical_features = [
    'gender', 'ever_married', 'work_type', 'residence_type',
    'smoking_status', 'age_category', 'bmi_category', 'glucose_category'
]

# Transformers
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Full pipeline with a classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Drop ID column and separate target
X = stroke.drop(columns=['id', 'stroke'])
y = stroke['stroke']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Fit model
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       972
           1       0.00      0.00      0.00        50

    accuracy                           0.95      1022
   macro avg       0.48      0.50      0.49      1022
weighted avg       0.90      0.95      0.93      1022



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NOTE:
A classic case of severe class imbalance, and it's especially common in healthcare datasets where the positive class (stroke occurrence) is rare.

What’s Happening
Model is predicting only the majority class (0 = no stroke). That’s why:
• Precision, recall, and F1-score for class 1 are all 0.00
• Accuracy looks deceptively high (95%) because it’s just predicting “no stroke” for everyone
• The warning means that precision for class 1 is undefined because there are no predicted positives

In healthcare, missing true positives (false negatives) can be dangerous. We want our model to identify stroke risk even if it means tolerating some false positives.

In [9]:
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
print(feature_names)

['num__age' 'num__avg_glucose_level' 'num__bmi' 'cat__gender_Female'
 'cat__gender_Male' 'cat__gender_Other' 'cat__ever_married_No'
 'cat__ever_married_Yes' 'cat__work_type_Govt_job'
 'cat__work_type_Never_worked' 'cat__work_type_Private'
 'cat__work_type_Self-employed' 'cat__work_type_children'
 'cat__residence_type_Rural' 'cat__residence_type_Urban'
 'cat__smoking_status_Unknown' 'cat__smoking_status_formerly smoked'
 'cat__smoking_status_never smoked' 'cat__smoking_status_smokes'
 'cat__age_category_Adult' 'cat__age_category_Children'
 'cat__age_category_Senior' 'cat__age_category_Youth'
 'cat__bmi_category_Healthy Weight' 'cat__bmi_category_Obesity'
 'cat__bmi_category_Overweight' 'cat__bmi_category_Underweight'
 'cat__glucose_category_Diabetic' 'cat__glucose_category_Healthy'
 'cat__glucose_category_Low' 'cat__glucose_category_Pre-Diabetic']
