In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style
sns.set(style="whitegrid")

In [24]:
try:
    # Let pandas automatically detect the header row
    df = pd.read_csv('../data/heart_disease_uci.csv')
    print("Dataset loaded successfully using the standard method!")

    # Drop any unnamed columns that might have been created
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    print("\nInitial Data Info:")
    df.info()

except FileNotFoundError:
    print("Error: 'heart_disease_uci.csv' not found in the 'data' folder.")
    df = pd.DataFrame()

if not df.empty:
    df.head()

Dataset loaded successfully using the standard method!

Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [25]:
if not df.empty:
    # Standardize the target column name to 'target'
    if 'num' in df.columns:
        df.rename(columns={'num': 'target'}, inplace=True)

    # Ensure the target is binary (0 for no disease, 1 for disease)
    df['target'] = (df['target'] > 0).astype(int)

    print("Target variable processed. Distribution:")
    print(df['target'].value_counts())

Target variable processed. Distribution:
target
1    509
0    411
Name: count, dtype: int64


In [26]:
# Select columns with 'object' dtype, which indicates text
categorical_text_cols = df.select_dtypes(include=['object']).columns

if not categorical_text_cols.empty:
    print(f"Identified text-based columns to encode: {list(categorical_text_cols)}")
    # One-hot encode these columns, creating new binary (0/1) columns
    df = pd.get_dummies(df, columns=categorical_text_cols, drop_first=True)
    print("Categorical columns successfully one-hot encoded.")
else:
    print("No text-based categorical columns found to encode.")

print(f"\nDataFrame shape after encoding: {df.shape}")
df.head()

Identified text-based columns to encode: ['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
Categorical columns successfully one-hot encoded.

DataFrame shape after encoding: (920, 23)


Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,target,sex_Male,dataset_Hungary,...,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,slope_flat,slope_upsloping,thal_normal,thal_reversable defect
0,1,63,145.0,233.0,150.0,2.3,0.0,0,True,False,...,False,True,True,False,False,False,False,False,False,False
1,2,67,160.0,286.0,108.0,1.5,3.0,1,True,False,...,False,False,False,False,False,True,True,False,True,False
2,3,67,120.0,229.0,129.0,2.6,2.0,1,True,False,...,False,False,False,False,False,True,True,False,False,True
3,4,37,130.0,250.0,187.0,3.5,0.0,0,True,False,...,True,False,False,True,False,False,False,False,True,False
4,5,41,130.0,204.0,172.0,1.4,0.0,0,False,False,...,False,False,False,False,False,False,False,True,True,False


In [27]:
# Cell 5 (Corrected)

# Impute any remaining missing values with the median of each column
for col in df.columns:
    if df[col].isnull().any():
        # This is the updated, clearer syntax
        df[col] = df[col].fillna(df[col].median())

print(f"\nTotal missing values after final imputation: {df.isnull().sum().sum()}")


Total missing values after final imputation: 0


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features (X) and target (y)
X = df.drop('target', axis=1)
y = df['target']

# Split data into 80% training and 20% testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Identify original numerical columns that need scaling
cols_to_scale = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']

# Ensure all columns to be scaled actually exist in the dataframe
cols_to_scale = [col for col in cols_to_scale if col in X_train.columns]

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler ONLY on the training data and transform it
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])

# Use the SAME fitted scaler to transform the test data
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

print("Numerical features scaled successfully.")
print("Training set shape:", X_train.shape)

Numerical features scaled successfully.
Training set shape: (736, 22)


In [29]:
import os

# Create a directory for our processed data
output_dir = '../data/processed'
os.makedirs(output_dir, exist_ok=True)

# Save the fully processed dataframes
X_train.to_csv(os.path.join(output_dir, 'X_train.csv'), index=False)
X_test.to_csv(os.path.join(output_dir, 'X_test.csv'), index=False)
y_train.to_csv(os.path.join(output_dir, 'y_train.csv'), index=False)
y_test.to_csv(os.path.join(output_dir, 'y_test.csv'), index=False)

print(f"\n✅ Fully cleaned and processed data saved to the '{output_dir}' folder.")


✅ Fully cleaned and processed data saved to the '../data/processed' folder.
