In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder


In [2]:
# Read the raw heart-disease table from the project's raw-data folder.
df = pd.read_csv(filepath_or_buffer="../data/raw/heart_disease.csv")

# Preview the first five rows to sanity-check column names and value formats.
df.head(n=5)


Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Heart Disease Status
0,56.0,Male,153.0,155.0,High,Yes,Yes,No,24.991591,Yes,...,No,High,Medium,7.633228,Medium,342.0,,12.969246,12.38725,No
1,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,...,No,Medium,High,8.744034,Medium,133.0,157.0,9.355389,19.298875,No
2,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,...,Yes,Low,Low,4.44044,Low,393.0,92.0,12.709873,11.230926,No
3,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,...,Yes,Low,High,5.249405,High,293.0,94.0,12.509046,5.961958,No
4,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,...,No,Low,High,7.030971,High,263.0,154.0,10.381259,8.153887,No


In [3]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()


Age                       29
Gender                    19
Blood Pressure            19
Cholesterol Level         30
Exercise Habits           25
Smoking                   25
Family Heart Disease      21
Diabetes                  30
BMI                       22
High Blood Pressure       26
Low HDL Cholesterol       25
High LDL Cholesterol      26
Alcohol Consumption     2586
Stress Level              22
Sleep Hours               25
Sugar Consumption         30
Triglyceride Level        26
Fasting Blood Sugar       22
CRP Level                 26
Homocysteine Level        20
Heart Disease Status       0
dtype: int64

In [4]:
# Column groups used by the preprocessing cells below.

# Continuous measurements: median-imputed and standardized later on.
numeric_cols = [
    "Age",
    "Blood Pressure",
    "Cholesterol Level",
    "BMI",
    "Triglyceride Level",
    "Fasting Blood Sugar",
    "CRP Level",
    "Homocysteine Level",
    "Sleep Hours",
]

# Two-valued columns (Yes/No or Male/Female): mapped to 1/0.
binary_cols = [
    "Gender",
    "Smoking",
    "Family Heart Disease",
    "Diabetes",
    "High Blood Pressure",
    "Low HDL Cholesterol",
    "High LDL Cholesterol",
]

# Columns with more than two levels: one-hot encoded.
multiclass_cols = [
    "Exercise Habits",
    "Alcohol Consumption",
    "Stress Level",
    "Sugar Consumption",
]

# Label column.
target_col = "Heart Disease Status"


In [5]:
# Encode the two-valued categorical columns as 1/0. Missing entries stay NaN
# here and are imputed in a later cell.
binary_map = {"Yes": 1, "No": 0, "Male": 1, "Female": 0}

for col in binary_cols:
    # Re-run guard: once a column has been mapped it is numeric, and mapping it
    # again would turn every value into NaN because 1.0/0.0 are not keys of
    # binary_map. Skipping numeric columns keeps the cell idempotent.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    df[col] = df[col].map(binary_map)


In [6]:
# Fill missing values column by column.
# NOTE(review): the medians/modes are computed on the whole frame, i.e. before
# the train/test split done in a later cell, so held-out rows contribute to the
# fill values.

# Numeric features: per-column median.
median_fill = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(median_fill)

# Categorical features (multi-level, then binary): per-column most frequent
# value; .iloc[0] takes the first row of the mode table.
for group in (multiclass_cols, binary_cols):
    most_common = df[group].mode().iloc[0]
    df[group] = df[group].fillna(most_common)


In [7]:
# One-hot encode the multi-level categorical columns. drop_first=True omits the
# first level of each column, leaving k-1 indicator columns per feature.
df_encoded = pd.get_dummies(
    data=df,
    columns=multiclass_cols,
    drop_first=True,
)

# Preview the encoded frame.
df_encoded.head(n=5)


Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,Low HDL Cholesterol,...,Homocysteine Level,Heart Disease Status,Exercise Habits_Low,Exercise Habits_Medium,Alcohol Consumption_Low,Alcohol Consumption_Medium,Stress Level_Low,Stress Level_Medium,Sugar Consumption_Low,Sugar Consumption_Medium
0,56.0,1.0,153.0,155.0,1.0,1.0,0.0,24.991591,1.0,1.0,...,12.38725,No,False,False,False,False,False,True,False,True
1,69.0,0.0,146.0,286.0,0.0,1.0,1.0,25.221799,0.0,1.0,...,19.298875,No,False,False,False,True,False,False,False,True
2,46.0,1.0,126.0,216.0,0.0,0.0,0.0,29.855447,0.0,1.0,...,11.230926,No,True,False,True,False,True,False,True,False
3,32.0,0.0,122.0,293.0,1.0,1.0,0.0,24.130477,1.0,0.0,...,5.961958,No,False,False,True,False,False,False,False,False
4,60.0,1.0,166.0,242.0,1.0,1.0,1.0,20.486289,1.0,0.0,...,8.153887,No,True,False,True,False,False,False,False,False


In [8]:
# Work on an independent copy so the scaling and target encoding applied to
# df_model in the following cells leave df_encoded untouched.
df_model = df_encoded.copy(deep=True)


In [9]:
# Standardize the numeric features (zero mean, unit variance per column).
# NOTE(review): the scaler is fit on every row, before the train/test split in
# a later cell, so held-out rows influence the fitted mean and std.
scaler = StandardScaler().fit(df_model[numeric_cols])
df_model[numeric_cols] = scaler.transform(df_model[numeric_cols])


In [10]:
# Encode the label as 1 ("Yes") / 0 ("No").
# Re-run guard: an already-encoded (numeric) target is left alone, because
# mapping 1/0 through the dict again would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df_model[target_col]):
    df_model[target_col] = df_model[target_col].map({"Yes": 1, "No": 0})

# Any label other than "Yes"/"No" becomes NaN under .map; fail loudly instead
# of passing a partially missing target on to the split and the saved file.
if df_model[target_col].isna().any():
    raise ValueError(
        f"Unmapped or missing values in target column {target_col!r}"
    )


In [11]:
# Separate the label from the predictors.
y = df_model[target_col]
X = df_model.drop(target_col, axis=1)

# 80/20 hold-out split, stratified on the label, with a fixed seed so the
# partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [12]:
# Show (rows, columns) of the training and test feature matrices.
(X_train.shape, X_test.shape)


((8000, 24), (2000, 24))

In [13]:
# Persist the processed frame (imputed, encoded, scaled, numeric target).
output_path = Path("../data/processed/heart_disease_clean.csv")

# to_csv does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout fails on this cell.
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: do not write the row index as an extra CSV column.
df_model.to_csv(output_path, index=False)
