# 01 - Data Preprocessing and Cleaning

This notebook covers:
1. Loading the Heart Disease UCI dataset
2. Exploratory Data Analysis (EDA)
3. Handling missing values
4. Data encoding and scaling
5. Data visualization
6. Train/test split and saving the processed data and fitted scaler


In [None]:
# Import necessary libraries
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from ucimlrepo import fetch_ucirepo
# Silence every warning category so cell output stays readable
warnings.simplefilter('ignore')

# Plot appearance: apply the matplotlib style sheet first, then the seaborn colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")


In [None]:
# Fetch the Heart Disease dataset from the UCI repository (dataset id 45)
print("Loading Heart Disease UCI dataset...")
heart_disease = fetch_ucirepo(id=45)

# Features and targets exactly as delivered by the repository client
X, y = heart_disease.data.features, heart_disease.data.targets

# Print the descriptive sections that ship with the dataset object
for heading, section in (
    ("\nDataset Metadata:", heart_disease.metadata),
    ("\nVariable Information:", heart_disease.variables),
):
    print(heading)
    print(section)


In [None]:
# Combine features and target into a single DataFrame.
# y is concatenated last, so its column is the last one (assumes a single
# target column). Every later cell addresses the label as 'target', so the
# name is normalised here.
# NOTE(review): the UCI release is expected to call this column 'num' with
# values 0-4 — confirm against the "Variable Information" output above.
df = pd.concat([X, y], axis=1)
df = df.rename(columns={df.columns[-1]: 'target'})

# Collapse the label to binary (0 = no disease, >0 = disease) so it matches the
# "0=No, 1=Yes" axis labels used by the plots and the stratified split.
# Re-running this cell on an already binary label is a no-op.
df['target'] = (df['target'] > 0).astype(int)

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

print("\nDataset Info:")
df.info()  # info() writes its report itself and returns None; wrapping it in print() adds a stray "None"

print("\nBasic Statistics:")
print(df.describe())


In [None]:
# Check for missing values
print("Missing Values:")
print(df.isnull().sum())

# Check data types
print("\nData Types:")
print(df.dtypes)

# Check unique values in each column
print("\nUnique Values per Column:")
for col in df.columns:
    n_unique = df[col].nunique()  # nunique() does not count NaN
    print(f"{col}: {n_unique} unique values")
    if n_unique < 20:  # Show values if less than 20 unique
        # Drop NaN before sorting: NaN compares False against every value, so
        # sorted() would give an unreliable order for columns with gaps.
        # Missing entries are already reported by the isnull() summary above.
        print(f"  Values: {sorted(df[col].dropna().unique())}")
    print()


In [None]:
# Exploratory Data Analysis - Visualizations
# One 2x3 figure: three category-count bar charts and three histograms.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# (axes, column, title, x-label, bar colour(s))
count_panels = [
    (axes[0, 0], 'target', 'Target Variable Distribution',
     'Heart Disease (0=No, 1=Yes)', ['skyblue', 'lightcoral']),
    (axes[0, 2], 'sex', 'Sex Distribution',
     'Sex (0=Female, 1=Male)', ['pink', 'lightblue']),
    (axes[1, 0], 'cp', 'Chest Pain Type Distribution',
     'Chest Pain Type', 'orange'),
]

# (axes, column, title, x-label, fill colour)
hist_panels = [
    (axes[0, 1], 'age', 'Age Distribution', 'Age', 'lightgreen'),
    (axes[1, 1], 'trestbps', 'Resting Blood Pressure Distribution',
     'Resting Blood Pressure (mm Hg)', 'lightcoral'),
    (axes[1, 2], 'chol', 'Cholesterol Distribution',
     'Serum Cholesterol (mg/dl)', 'lightyellow'),
]

for ax, column, title, xlabel, colors in count_panels:
    # value_counts() orders by frequency; sort_index() puts the bars in category
    # order so the positional colour list and the "0=..., 1=..." labels line up.
    counts = df[column].value_counts().sort_index()
    counts.plot(kind='bar', color=colors, rot=0, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')

for ax, column, title, xlabel, color in hist_panels:
    ax.hist(df[column], bins=20, color=color, alpha=0.7, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()


In [None]:
# Correlation Analysis
# numeric_only=True keeps corr() from raising if a non-numeric column is ever
# present; on an all-numeric frame the result is unchanged.
correlation_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": .8}, ax=ax)
ax.set_title('Correlation Matrix of Heart Disease Dataset')
fig.tight_layout()
plt.show()

# Top correlations with target variable (absolute value, strongest first)
target_corr = correlation_matrix['target'].abs().sort_values(ascending=False)
print("Top correlations with target variable:")
# Remove the self-correlation by label rather than by position, so the output
# stays right even if another column ties with the target at the top.
print(target_corr.drop('target'))


In [None]:
# Box plot of each continuous feature, grouped by the target class
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
plt.figure(figsize=(15, 10))

for position, column in enumerate(numerical_features, start=1):
    panel = plt.subplot(2, 3, position)  # 2x3 grid, five slots used
    sns.boxplot(data=df, x='target', y=column, ax=panel)
    panel.set_title(f'{column} by Heart Disease')
    panel.set_xlabel('Heart Disease (0=No, 1=Yes)')

plt.tight_layout()
plt.show()


In [None]:
# Data Preprocessing Steps

# 1. Handle missing values (if any)
print("Missing values before preprocessing:")
print(df.isnull().sum().sum())

# 2. Create a copy for preprocessing so the raw frame stays untouched
df_processed = df.copy()

# 3. Handle any potential missing values (median for numeric columns, mode otherwise).
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# df_processed[col] operates on an intermediate object, which is deprecated and
# leaves df_processed unchanged under pandas Copy-on-Write.
# NOTE(review): fill values are computed on all rows, before the train/test
# split — consider computing them on the training rows only.
for col in df_processed.columns:
    if pd.api.types.is_numeric_dtype(df_processed[col]):
        fill_value = df_processed[col].median()
    else:
        fill_value = df_processed[col].mode()[0]
    df_processed[col] = df_processed[col].fillna(fill_value)

print("Missing values after preprocessing:")
print(df_processed.isnull().sum().sum())

# 4. Separate features and target
X = df_processed.drop('target', axis=1)
y = df_processed['target']

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")


In [None]:
# 5. Feature scaling
# Fit StandardScaler on the training rows only, so test-set statistics do not
# leak into the mean/std used for scaling.
# The split arguments here must stay identical to the train-test split cell
# (test_size=0.2, random_state=42, stratify=y): with the same row count, labels
# and integer random_state, train_test_split selects the same rows, so X_fit
# holds exactly the rows that end up in X_train.
X_fit = train_test_split(X, test_size=0.2, random_state=42, stratify=y)[0]

scaler = StandardScaler()
scaler.fit(X_fit)

# Transform every row with the train-fitted scaler; keep X's index so the
# scaled rows stay aligned with y.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

print("Feature scaling completed using StandardScaler")
print(f"Scaled features shape: {X_scaled.shape}")

# Display scaled features statistics.
# Means/stds over all rows are close to, but not exactly, 0 and 1 because the
# scaler parameters come from the training rows only.
print("\nScaled features statistics:")
print(X_scaled.describe())


In [None]:
# 6. Train-test split (stratified on the target so both parts keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Sanity check: the split must not drop or duplicate rows
assert len(X_train) + len(X_test) == len(X_scaled)

print("Train-test split completed:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Training target distribution: {y_train.value_counts().to_dict()}")
print(f"Test target distribution: {y_test.value_counts().to_dict()}")

# Save the processed data (os and joblib are imported in the notebook's import cell)

# Create directories if they don't exist
os.makedirs('../data', exist_ok=True)
os.makedirs('../models', exist_ok=True)

# Single mapping of destination path -> object, so the files written and the
# list printed below cannot drift apart.
artifacts = {
    '../data/X_train.pkl': X_train,
    '../data/X_test.pkl': X_test,
    '../data/y_train.pkl': y_train,
    '../data/y_test.pkl': y_test,
    '../models/scaler.pkl': scaler,
}
for path, obj in artifacts.items():
    joblib.dump(obj, path)

print("\nProcessed data saved successfully!")
print("Files saved:")
for path in artifacts:
    print(f"- {path}")


In [None]:
# Box plots of the continuous features grouped by target class.
# NOTE(review): this cell repeats the box-plot cell that appears before the
# preprocessing steps, and it reads the same `df`; consider removing one of them.
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
plt.figure(figsize=(15, 10))

for slot, name in enumerate(numerical_features):
    plt.subplot(2, 3, slot + 1)  # subplot positions are 1-based
    axis = sns.boxplot(data=df, x='target', y=name)
    axis.set_title(f'{name} by Heart Disease')
    axis.set_xlabel('Heart Disease (0=No, 1=Yes)')

plt.tight_layout()
plt.show()
