In [9]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import joblib

In [19]:
import os

from ucimlrepo import fetch_ucirepo
import pandas as pd

# Fetch dataset (UCI ML repository id 45)
heart_disease = fetch_ucirepo(id=45)

X = heart_disease.data.features
y = heart_disease.data.targets

print("Targets head:")
print(y.head())

# In this dataset, the column is usually named 'num'
print("Target column names:", y.columns.tolist())

# Convert to binary target (0 = no disease, 1 = disease): any 'num' above 0 maps to 1.
# Built as a fresh one-column frame so the fetched targets frame is never mutated.
y_binary = (y['num'] > 0).astype(int).to_frame(name='target')

# Merge features + target
df = pd.concat([X, y_binary], axis=1)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (matters on a fresh checkout).
os.makedirs("data", exist_ok=True)
df.to_csv("data/heart_disease.csv", index=False)

print("Final dataset shape:", df.shape)
print(df.head())


Targets head:
   num
0    0
1    2
2    1
3    0
4    0
Target column names: ['num']
Final dataset shape: (303, 14)
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   1       145   233    1        2      150      0      2.3      3   
1   67    1   4       160   286    0        2      108      1      1.5      2   
2   67    1   4       120   229    0        2      129      1      2.6      2   
3   37    1   3       130   250    0        0      187      0      3.5      3   
4   41    0   2       130   204    0        2      172      0      1.4      1   

    ca  thal  target  
0  0.0   6.0       0  
1  3.0   3.0       1  
2  2.0   7.0       1  
3  0.0   3.0       0  
4  0.0   3.0       0  


In [20]:
# File locations used by the notebook, all relative to the working directory.
DATA_PATH = "data/heart_disease.csv"  # input CSV; update if needed
PROCESSED_PATH = "data/heart_processed.csv"  # where the processed table is written
PIPELINE_PATH = "models/preprocessing_pipeline.pkl"  # where the fitted preprocessor is written

In [21]:
# Read the CSV at DATA_PATH into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print('Initial shape:', df.shape)

Initial shape: (303, 14)


In [22]:
# Quick look at the data: first rows, schema with non-null counts, summary statistics.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
df.info()
print(df.describe())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   1       145   233    1        2      150      0      2.3      3   
1   67    1   4       160   286    0        2      108      1      1.5      2   
2   67    1   4       120   229    0        2      129      1      2.6      2   
3   37    1   3       130   250    0        0      187      0      3.5      3   
4   41    0   2       130   204    0        2      172      0      1.4      1   

    ca  thal  target  
0  0.0   6.0       0  
1  3.0   3.0       1  
2  2.0   7.0       1  
3  0.0   3.0       0  
4  0.0   3.0       0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64

In [24]:

# Split the frame: X holds every column except 'target', y holds the 'target' column.
X = df.drop('target', axis=1)
y = df.loc[:, 'target']

In [27]:
# Start from a dtype-based split of the feature columns.
numeric_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(X.select_dtypes(include=['object', 'category']).columns)

# Some numerically-typed columns are treated as categorical. Each name below is
# moved from the numeric list to the categorical list when it is present.
for c in ('sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal'):
    if c not in numeric_cols:
        continue
    numeric_cols.remove(c)
    cat_cols.append(c)

print('Numeric cols:', numeric_cols)
print('Categorical cols:', cat_cols)

Numeric cols: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
Categorical cols: ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']


In [29]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode into a dense array; handle_unknown='ignore' keeps transform
# from raising on categories that were not seen during fit.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column group through its branch; the 'num' output comes first,
# followed by the 'cat' output.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', cat_transformer, cat_cols),
])

In [30]:
# Fit the preprocessor on X and transform X in the same call.
# NOTE(review): no train/test split is visible before this fit, so the imputation,
# scaling and encoding statistics appear to be learned from every row — confirm
# that is intended before evaluating a model on data derived from this output.
X_processed = preprocessor.fit_transform(X)

In [32]:
import os

# Recover the one-hot output column names from the fitted encoder.
# Errors are deliberately not swallowed here: substituting the raw categorical
# column names would give a list of the wrong length whenever a column expands
# into more than one indicator, which only defers the failure to DataFrame().
ohe_cols = []
if cat_cols:
    ohe_cols = (
        preprocessor.named_transformers_['cat']['onehot']
        .get_feature_names_out(cat_cols)
        .tolist()
    )

# Column order mirrors the ColumnTransformer: numeric branch first, then one-hot.
processed_cols = numeric_cols + ohe_cols

# Fail early with a readable message if names and transformed columns disagree.
if X_processed.shape[1] != len(processed_cols):
    raise ValueError(
        f"Transformed data has {X_processed.shape[1]} columns but "
        f"{len(processed_cols)} column names were derived"
    )

X_proc_df = pd.DataFrame(X_processed, columns=processed_cols)
# Reset both indexes so features and target are aligned by position.
processed_df = pd.concat([X_proc_df.reset_index(drop=True), y.reset_index(drop=True)], axis=1)

# Ensure the parent directory of every output file exists. dirname() is '' for
# a bare filename, and os.makedirs('') raises, so skip that case.
for out_path in (PROCESSED_PATH, PIPELINE_PATH):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

processed_df.to_csv(PROCESSED_PATH, index=False)
joblib.dump(preprocessor, PIPELINE_PATH)
print('Saved processed data to', PROCESSED_PATH)
print('Saved preprocessing pipeline to', PIPELINE_PATH)

Saved processed data to data/heart_processed.csv
Saved preprocessing pipeline to models/preprocessing_pipeline.pkl
