In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [16]:

# the relative path of the stroke csv dataset is saved into the variable data for ease of use
data = '../data/processed/stroke_copy_processed.csv'

# The data is read and saved in a variable (stroke)
stroke_processed = pd.read_csv(data)

stroke_processed = pd.DataFrame(stroke_processed)

stroke_processed

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_category,bmi_category,glucose_category
0,9046,Male,67.0,0,1,Yes,Private,Urban,229,37,formerly smoked,1,Senior,Obesity,Diabetic
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202,34,never smoked,1,Adult,Obesity,Diabetic
2,31112,Male,80.0,0,1,Yes,Private,Rural,106,32,never smoked,1,Senior,Obesity,Healthy
3,60182,Female,49.0,0,0,Yes,Private,Urban,171,34,smokes,1,Adult,Obesity,Pre-Diabetic
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174,24,never smoked,1,Senior,Healthy Weight,Pre-Diabetic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,84,27,never smoked,0,Senior,Overweight,Healthy
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125,40,never smoked,0,Senior,Obesity,Healthy
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,83,31,never smoked,0,Adult,Obesity,Healthy
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166,26,formerly smoked,0,Adult,Overweight,Pre-Diabetic


In [17]:
stroke = stroke_processed.copy()
stroke_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   int64  
 9   bmi                5110 non-null   int64  
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
 12  age_category       5110 non-null   object 
 13  bmi_category       5110 non-null   object 
 14  glucose_category   5110 non-null   object 
dtypes: float64(1), int64(6), object(8)
memory usage: 599.0+ KB


In [18]:
# Exclude rows with age less than 18
filtered_stroke = stroke[stroke['age'] >= 18]

filtered_stroke

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_category,bmi_category,glucose_category
0,9046,Male,67.0,0,1,Yes,Private,Urban,229,37,formerly smoked,1,Senior,Obesity,Diabetic
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202,34,never smoked,1,Adult,Obesity,Diabetic
2,31112,Male,80.0,0,1,Yes,Private,Rural,106,32,never smoked,1,Senior,Obesity,Healthy
3,60182,Female,49.0,0,0,Yes,Private,Urban,171,34,smokes,1,Adult,Obesity,Pre-Diabetic
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174,24,never smoked,1,Senior,Healthy Weight,Pre-Diabetic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,84,27,never smoked,0,Senior,Overweight,Healthy
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125,40,never smoked,0,Senior,Obesity,Healthy
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,83,31,never smoked,0,Adult,Obesity,Healthy
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166,26,formerly smoked,0,Adult,Overweight,Pre-Diabetic


In [19]:
# Confirmation
print(filtered_stroke['age'].min())


18.0


In [20]:
# Separate positive and negative cases
positive = filtered_stroke[filtered_stroke['stroke'] == 1]
negative = filtered_stroke[filtered_stroke['stroke'] == 0].sample(n=249, random_state=42)

# Combine and shuffle
balanced_df = pd.concat([positive, negative]).sample(frac=1, random_state=42).reset_index(drop=True)

In [21]:
balanced_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_category,bmi_category,glucose_category
0,43478,Male,34.0,0,0,Yes,Private,Urban,60,28,formerly smoked,0,Adult,Overweight,Low
1,50784,Male,63.0,0,0,Yes,Private,Rural,229,27,never smoked,1,Adult,Overweight,Diabetic
2,68356,Female,73.0,0,0,Yes,Self-employed,Urban,71,34,never smoked,1,Senior,Obesity,Healthy
3,62466,Female,80.0,0,0,Yes,Private,Urban,64,45,never smoked,1,Senior,Obesity,Low
4,1836,Female,51.0,1,0,Yes,Private,Urban,88,28,never smoked,1,Adult,Overweight,Healthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,42072,Female,50.0,1,0,Yes,Private,Rural,73,30,formerly smoked,1,Adult,Obesity,Healthy
492,10135,Female,37.0,0,0,No,Private,Rural,112,29,Unknown,0,Adult,Overweight,Healthy
493,1679,Male,35.0,0,0,Yes,Private,Rural,77,29,formerly smoked,0,Adult,Overweight,Healthy
494,8168,Female,34.0,0,0,Yes,Private,Rural,113,23,formerly smoked,0,Adult,Healthy Weight,Healthy


In [11]:


# Define feature groups
numerical_features = ['age', 'avg_glucose_level', 'bmi']
categorical_features = [
    'gender', 'ever_married', 'work_type', 'residence_type',
    'smoking_status', 'age_category', 'bmi_category', 'glucose_category'
]

# Transformers
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Full pipeline with a classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Drop ID column and separate target
X = balanced_df.drop(columns=['id', 'stroke'])
y = balanced_df['stroke']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Fit model
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.80      0.74      0.77        50
           1       0.76      0.82      0.79        50

    accuracy                           0.78       100
   macro avg       0.78      0.78      0.78       100
weighted avg       0.78      0.78      0.78       100



In [28]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Drop ID column and separate target
X = balanced_df.drop(columns=['id', 'stroke'])
y = balanced_df['stroke']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Fit model
pipeline.fit(X_train, y_train)

# Evaluate using cross-validation
scores = cross_val_score(pipeline, X, y, cv=4, scoring='f1')  # or 'accuracy', 'roc_auc', etc.
print("Cross-validated F1 scores:", scores)
print("Mean F1 score:", scores.mean())

Cross-validated F1 scores: [0.72307692 0.72592593 0.7518797  0.77419355]
Mean F1 score: 0.7437690241595165


Rresults:
- Recall for stroke (0.82): Nw we’re catching 82% of actual stroke cases
• Precision for stroke (0.76): About 3 out of 4 predicted stroke cases are correct
• Balanced performance: F1-scores are close for both classes, which is great for fairness and interpretability.

Notes:
Method used here - Undersampling (okay for prototyping), but not for production or fairness-sensitive models.

We can consider:
	◦ SMOTE (synthetic oversampling) with the whole dataset
	◦ Ensemble methods (e.g., Balanced Random Forest)
	◦ Class weighting (preserves full data)
    - Tree-based models - RandomForestClassifier or XGBoost (Good for situations that involve data imbalance)


Critical TODO:
CROSS VALIDATION - choosing the best fold number
Ensemble methods with bootstrapping

Further ToDOs:
 - Track performance across demographic slices (e.g., gender, age_category)
• Consider using Fairlearn or Aequitas to audit bias
• Use precision if we're prioritizing high-risk patients for intervention

Feature Importance + SHAP
Use model.feature_importances_ or SHAP values to understand what’s driving predictions — especially important in healthcare.

Fairness Auditing
Check performance across subgroups (e.g., gender, age_category) to ensure the model isn’t biased.

Notes on Bootstrapping:

Bootstrapping is primarily used for assessment and improvement of the modeling process itself, not for creating a single, larger training set.

1.⁠ ⁠Bootstrapping for Model Evaluation & Validation (The Most Common Use)

This is a robust alternative to a single train/test split.

· How it works:
  1. You create hundreds or thousands of bootstrap samples from your entire dataset.
  2. For each bootstrap sample, you train your model and then test it on the data points not included in that sample (the "out-of-bag" or OOB samples).
  3. This gives you a distribution of performance metrics (e.g., accuracy, F1-score).
· Why it's great: You get a much more reliable estimate of your model's performance and its variance, rather than relying on a single, potentially lucky or unlucky, train/test split.

2.⁠ ⁠Bagging (Bootstrap Aggregating) - A Core Ensemble Method (This is the situation/use case Joshua was trying to refer to )
This is where bootstrapping directly creates the training data for an ensemble of models.
· How it works:
  1. You create multiple bootstrap samples from your original training data.
  2. You train a separate instance of the same model (e.g., a decision tree) on each bootstrap sample.
  3. For prediction, you combine the outputs of all these models (e.g., by majority vote for classification or averaging for regression).
· Example: The Random Forest algorithm is the classic example. It is essentially a bagging algorithm applied to decision trees.
· Why it's great: Bagging reduces variance and helps prevent overfitting. The final "model" is the entire ensemble, which is almost always more accurate and stable than a single model trained on the original data.
