##Data Cleaning and Processing of Stroke Prediction Data



This notebook is organized to guide the process of data cleaning and processing of the stroke prediction data. The workflow begins with importing and exploring the dataset, followed by thorough data cleaning and preprocessing, including handling missing values and encoding categorical variables. 

In [1]:
#Import libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
#read the data file with pandas

df = pd.read_csv('../data/healthcare-dataset-stroke-data.csv')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
#describe shape of dataframe

df.shape


(5110, 12)

In [4]:
#check the column names of dataframe

df.columns


Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [5]:
#format the column names of dataframe to ensure consistency (lowercase)

df = df.rename(columns=str.lower)

In [6]:
#describe the information of dataframe

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [7]:
#descriptives of columns with numerical values

df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [8]:
#Check the number of values for each column

for col in df.columns:
    print(f"{col}: {df[col].nunique()} unique values")

id: 5110 unique values
gender: 3 unique values
age: 104 unique values
hypertension: 2 unique values
heart_disease: 2 unique values
ever_married: 2 unique values
work_type: 5 unique values
residence_type: 2 unique values
avg_glucose_level: 3979 unique values
bmi: 418 unique values
smoking_status: 4 unique values
stroke: 2 unique values


In [9]:
#Check number of duplicate rows

print(f"Duplicate rows: {df.duplicated().sum()}")


Duplicate rows: 0


In [10]:
#Gender with three values - check the frequency

df.gender.value_counts()


gender
Female    2994
Male      2115
Other        1
Name: count, dtype: int64

In [11]:
#remove data with gender=other

df = df[df.gender != 'Other']

df.gender.value_counts()

gender
Female    2994
Male      2115
Name: count, dtype: int64

##Handling Missing Values of BMI

In [13]:
# check missing values

df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [14]:
df_stroke=df.copy()

In [15]:
# Impute missing 'bmi' values with the median for each stroke group
df_stroke['bmi'] = df_stroke.groupby('stroke')['bmi'].transform(lambda x: x.fillna(x.median()))

# Check for missing values again to confirm imputation
print("Missing values after imputation:")
print(df_stroke.isnull().sum())

Missing values after imputation:
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64


In [None]:
#Show the descriptives of bmi for people with stroke and without stroke
print("Descriptive statistics of BMI by 'Stroke:")
print(df_stroke.groupby('stroke')['bmi'].mean())
print(df_stroke.groupby('stroke')['bmi'].median())


Descriptive statistics of BMI by Gender:
stroke
0    28.797119
1    30.347390
Name: bmi, dtype: float64
stroke
0    28.0
1    29.7
Name: bmi, dtype: float64


##BMI Binning

In [18]:
# BMI Binning
df_stroke['bmi_category'] = pd.cut(
    df_stroke['bmi'],
    bins=[0, 18.5, 24.9, 29.9, df_stroke['bmi'].max()],
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
    right=False
)

bmi_stroke_counts = df_stroke.groupby(['bmi_category', 'stroke']).size().unstack(fill_value=0)
bmi_stroke_percentages = bmi_stroke_counts.apply(lambda x: x / x.sum() * 100, axis=1)
display(bmi_stroke_percentages)

  bmi_stroke_counts = df_stroke.groupby(['bmi_category', 'stroke']).size().unstack(fill_value=0)


stroke,0,1
bmi_category,Unnamed: 1_level_1,Unnamed: 2_level_1
Underweight,99.703264,0.296736
Normal,97.119342,2.880658
Overweight,93.047796,6.952204
Obese,94.807198,5.192802


In [19]:
print(df_stroke.isnull().sum())

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
bmi_category         1
dtype: int64


##Encode categorical features.

In [20]:
df_stroke.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke,bmi_category
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,Obese
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,29.7,never smoked,1,Overweight
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,Obese
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,Obese
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,Normal


In [21]:
df_stroke = pd.get_dummies(df_stroke, columns=['gender', 'ever_married', 'work_type', 'residence_type', 'smoking_status', 'bmi_category'])
df_stroke.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,...,residence_type_Rural,residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,bmi_category_Underweight,bmi_category_Normal,bmi_category_Overweight,bmi_category_Obese
0,67.0,0,1,228.69,36.6,1,False,True,False,True,...,False,True,False,True,False,False,False,False,False,True
1,61.0,0,0,202.21,29.7,1,True,False,False,True,...,True,False,False,False,True,False,False,False,True,False
2,80.0,0,1,105.92,32.5,1,False,True,False,True,...,True,False,False,False,True,False,False,False,False,True
3,49.0,0,0,171.23,34.4,1,True,False,False,True,...,False,True,False,False,False,True,False,False,False,True
4,79.0,1,0,174.12,24.0,1,True,False,False,True,...,True,False,False,False,True,False,False,True,False,False


##Normalize or scale numerical features

In [22]:
# Drop the 'bmi' column from df_stroke
df_stroke = df_stroke.drop('bmi', axis=1)

# Display the first few rows to verify the change
display(df_stroke.head())

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,...,residence_type_Rural,residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,bmi_category_Underweight,bmi_category_Normal,bmi_category_Overweight,bmi_category_Obese
0,67.0,0,1,228.69,1,False,True,False,True,False,...,False,True,False,True,False,False,False,False,False,True
1,61.0,0,0,202.21,1,True,False,False,True,False,...,True,False,False,False,True,False,False,False,True,False
2,80.0,0,1,105.92,1,False,True,False,True,False,...,True,False,False,False,True,False,False,False,False,True
3,49.0,0,0,171.23,1,True,False,False,True,False,...,False,True,False,False,False,True,False,False,False,True
4,79.0,1,0,174.12,1,True,False,False,True,False,...,True,False,False,False,True,False,False,True,False,False


In [23]:
#normalize numerical data

from sklearn.preprocessing import StandardScaler

# Select numerical features to normalize
numerical_features = ['age', 'avg_glucose_level']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler to the numerical data and transform the data
df_stroke[numerical_features] = scaler.fit_transform(df[numerical_features])

df_stroke.head()


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,...,residence_type_Rural,residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,bmi_category_Underweight,bmi_category_Normal,bmi_category_Overweight,bmi_category_Obese
0,1.051242,0,1,2.70645,1,False,True,False,True,False,...,False,True,False,True,False,False,False,False,False,True
1,0.785889,0,0,2.121652,1,True,False,False,True,False,...,True,False,False,False,True,False,False,False,True,False
2,1.626174,0,1,-0.004867,1,False,True,False,True,False,...,True,False,False,False,True,False,False,False,False,True
3,0.255182,0,0,1.437473,1,True,False,False,True,False,...,False,True,False,False,False,True,False,False,False,True
4,1.581949,1,0,1.501297,1,True,False,False,True,False,...,True,False,False,False,True,False,False,True,False,False


In [24]:
missing = df_stroke.isnull().sum()
print(missing)

age                               0
hypertension                      0
heart_disease                     0
avg_glucose_level                 0
stroke                            0
gender_Female                     0
gender_Male                       0
ever_married_No                   0
ever_married_Yes                  0
work_type_Govt_job                0
work_type_Never_worked            0
work_type_Private                 0
work_type_Self-employed           0
work_type_children                0
residence_type_Rural              0
residence_type_Urban              0
smoking_status_Unknown            0
smoking_status_formerly smoked    0
smoking_status_never smoked       0
smoking_status_smokes             0
bmi_category_Underweight          0
bmi_category_Normal               0
bmi_category_Overweight           0
bmi_category_Obese                0
dtype: int64


In [25]:
df_stroke.to_csv('df_stroke_processed.csv', index=False)