In [3]:
# 01_data_preprocessing.ipynb

# --- Imports ---
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# --- Fetch dataset ---
# Downloads dataset id 45 through ucimlrepo (requires network access).
heart_disease = fetch_ucirepo(id=45)

# --- Data ---
X, y = heart_disease.data.features, heart_disease.data.targets

# --- Combine features + target into one dataframe ---
df = pd.concat([X, y], axis=1)

# --- Impute missing values ---
# `assign` returns a new frame, so the raw `df` stays untouched:
#   * 'ca'   -> missing entries become 0
#   * 'thal' -> missing entries become the most frequent 'thal' value
df_clean = df.assign(
    ca=df['ca'].fillna(0),
    thal=df['thal'].fillna(df['thal'].mode()[0]),
)

# --- One-hot encoding for categorical features ---
# Each entry: (source column, position of its first indicator column, category codes).
# The positions reproduce the existing column layout, so downstream code that
# relies on column order keeps working.
one_hot_specs = [
    ("cp", 3, (1, 2, 3, 4)),
    ("restecg", 10, (0, 1, 2)),
    ("slope", 15, (1, 2, 3)),
    ("thal", 19, (3, 6, 7)),
]

for source_col, first_pos, codes in one_hot_specs:
    for offset, code in enumerate(codes):
        # Encode from df_clean, not the raw df: df_clean holds the imputed
        # 'thal' values, whereas encoding from the raw frame would discard that
        # imputation and leave rows with a missing 'thal' with all thal_*
        # indicators set to 0.
        # A row whose value is not one of `codes` gets 0 in every indicator.
        indicator = np.where(df_clean[source_col] == code, 1, 0)
        df_clean.insert(first_pos + offset, f"{source_col}_{code}", indicator)

# The indicator columns replace the original categorical columns.
df_clean = df_clean.drop(columns=[spec[0] for spec in one_hot_specs])

# --- Standardize continuous features ---
# Columns rescaled by StandardScaler; every other column is left as-is.
continuous_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']

# NOTE(review): the scaler is fitted on every row of df_clean. If a train/test
# split happens later, consider fitting on the training rows only — confirm
# against the downstream notebooks.
scaler = StandardScaler()
scaler.fit(df_clean[continuous_features])
df_clean[continuous_features] = scaler.transform(df_clean[continuous_features])

print("✅ Data Preprocessing Complete. Shape:", df_clean.shape)


✅ Data Preprocessing Complete. Shape: (303, 23)
