# 03 – Preprocessing and Model Development for Stroke Risk Prediction

This notebook focuses on cleaning, transforming, and preparing the stroke dataset for modeling. It includes data preprocessing, feature selection, model training, and evaluation.


In [1]:
# 04 - Preprocessing and Training Data Development for Stroke Prediction

# 1. Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime
from sb_utils import save_file

## Load Data

In [2]:
df = pd.read_csv('/Users/manuelramirez/Documents/stroke-risk-capstone/stroke_data_step3_features.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
id,9046,51676,31112,60182,1665
age,1.051434,0.78607,1.62639,0.255342,1.582163
hypertension,0,0,0,0,1
heart_disease,1,0,1,0,0
avg_glucose_level,2.706375,2.121559,-0.005028,1.437358,1.501184
bmi,1.005086,-0.098981,0.472536,0.719327,-0.631531
stroke,1,1,1,1,1
smoking_status_formerly smoked,True,False,False,False,False
smoking_status_never smoked,False,True,True,False,True
smoking_status_smokes,False,False,False,True,False


## Extract stroke data

In [5]:
df.shape

(5110, 18)

## 4. One-Hot Encoding for Categorical Features
Use `pd.get_dummies()` to convert categorical variables into numeric.


In [7]:
#3. Create dummy/indicator features for categorical variables
# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()


In [9]:
# Use get_dummies to create dummy variables (drop_first=True to avoid multicollinearity)
df_dummies = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df_dummies.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban
0,9046,1.051434,0,1,2.706375,1.005086,1,True,False,False,True,False,True,False,True,False,False,True
1,51676,0.78607,0,0,2.121559,-0.098981,1,False,True,False,False,False,True,False,False,True,False,False
2,31112,1.62639,0,1,-0.005028,0.472536,1,False,True,False,True,False,True,False,True,False,False,False
3,60182,0.255342,0,0,1.437358,0.719327,1,False,False,True,False,False,True,False,True,False,False,True
4,1665,1.582163,1,0,1.501184,-0.631531,1,False,True,False,False,False,True,False,False,True,False,False


In [10]:
#4. Standardize the magnitude of numeric features
# Identify numeric columns (excluding target)
numeric_cols =df_dummies.drop("stroke", axis=1).select_dtypes(include=[np.number]).columns.tolist()


## 5. Feature Scaling
We scale numeric features using `StandardScaler` to normalize their values.


In [11]:
# Initialize the scaler
scaler = StandardScaler()
# Fit and transform the numeric features
scaled_data = scaler.fit_transform(df_dummies[numeric_cols])

# Convert back to DataFrame and set column names
scaled_df = pd.DataFrame(scaled_data, columns=numeric_cols, index=df_dummies.index)

# Keep the target variable
target = df_dummies["stroke"]

# Combine scaled features and target
final_df = pd.concat([scaled_df, target], axis=1)
final_df.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,-1.298312,1.051434,-0.328602,4.185032,2.706375,1.005086,1
1,0.716371,0.78607,-0.328602,-0.238947,2.121559,-0.098981,1
2,-0.255478,1.62639,-0.328602,4.185032,-0.005028,0.472536,1
3,1.118363,0.255342,-0.328602,-0.238947,1.437358,0.719327,1
4,-1.647136,1.582163,3.043196,-0.238947,1.501184,-0.631531,1


## TRAIN/TEST SPLIT

In [12]:
## 5. Split data into training and testing sets
X = final_df.drop("stroke", axis=1)
y = final_df["stroke"]


In [13]:
# 70-30 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Output shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
print("Preprocessing complete. Datasets saved.")

X_train shape: (3577, 6)
X_test shape: (1533, 6)
y_train shape: (3577,)
y_test shape: (1533,)
Preprocessing complete. Datasets saved.


In [15]:
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance training data only
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)






In [16]:
X_train.dtypes

id                   float64
age                  float64
hypertension         float64
heart_disease        float64
avg_glucose_level    float64
bmi                  float64
dtype: object

In [17]:
X_test.dtypes

id                   float64
age                  float64
hypertension         float64
heart_disease        float64
avg_glucose_level    float64
bmi                  float64
dtype: object

In [18]:
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)
