# Exploratory Data Analysis: Heart Disease UCI Dataset

**Author:** MLOps Team  
**Date:** January 2026  
**Dataset:** UCI Heart Disease (Combined: Cleveland, Hungarian, Switzerland, VA)

---

## Objective

This notebook performs comprehensive exploratory data analysis (EDA) on the Heart Disease dataset to:

1. Understand the data distribution and quality
2. Identify missing values and outliers
3. Analyze feature correlations
4. Examine class balance
5. Generate insights for model development

---

## 1. Setup and Data Loading

In [None]:
# Import libraries
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence library warnings so the cell outputs below stay readable
warnings.filterwarnings('ignore')

# Plot defaults: seaborn grid theme, wide default figure, base font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

print("Libraries imported successfully!")

In [None]:
# Load configuration
import sys
sys.path.append('..')

from src.config import (
    PROCESSED_DATA_PATH,
    TARGET_COLUMN,
    NUMERIC_FEATURES,
    CATEGORICAL_FEATURES
)

# Echo the settings imported from src.config so the run is self-describing
print(f"Data path: {PROCESSED_DATA_PATH}")
print(f"Target column: {TARGET_COLUMN}")
for kind, features in (("Numeric", NUMERIC_FEATURES), ("Categorical", CATEGORICAL_FEATURES)):
    print(f"\n{kind} features ({len(features)}): {features}")

In [None]:
# Load the processed dataset; literal '?' entries are parsed as missing values (NaN)
df = pd.read_csv(PROCESSED_DATA_PATH, na_values='?')

print("Dataset loaded successfully!")
# The multiplication sign was mis-encoded in the original message
print(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns")

## 2. Initial Data Inspection

In [None]:
# Display first few rows: preview the first 10 rows of the loaded DataFrame
df.head(10)

In [None]:
# Dataset info: print pandas' summary of the DataFrame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Descriptive statistics for the configured numeric columns, rounded to 2 decimals
numeric_summary = df[NUMERIC_FEATURES].describe()
numeric_summary.round(2)

## 3. Missing Values Analysis

In [None]:
# Count missing values per column and express them as a share of all rows
missing_counts = df.isnull().sum()
missing_pct = (missing_counts / len(df) * 100).round(2)

missing_df = pd.DataFrame({
    'Missing Count': missing_counts,
    'Percentage': missing_pct,
})

# Keep only the affected columns, worst first
missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing Count', ascending=False)

if missing_df.empty:
    print("\n✅ No missing values found in the dataset!")
else:
    print("\n⚠️  Missing Values Detected:")
    display(missing_df)

    # Visualize missing values as one horizontal bar per affected column
    fig, ax = plt.subplots(figsize=(10, 4))
    missing_df['Missing Count'].plot(kind='barh', ax=ax, color='coral')
    ax.set_xlabel('Number of Missing Values')
    ax.set_title('Missing Values by Feature')
    plt.tight_layout()
    plt.show()

## 4. Target Variable Analysis

In [None]:
# Class distribution: samples per target class and their share of all rows
class_counts = df[TARGET_COLUMN].value_counts().sort_index()
class_pct = (class_counts / len(df) * 100).round(2)

print(f"Target Variable: {TARGET_COLUMN}\n")
print("Class Distribution:")
for cls, count in class_counts.items():
    print(f"  Class {cls}: {count} samples ({class_pct[cls]}%)")

# Ratio of the rarest to the most common class (1.0 = perfectly balanced);
# computed once and reused for the verdict below
balance_ratio = class_counts.min() / class_counts.max()
print(f"\nBalance Ratio: {balance_ratio:.2f}")
if balance_ratio > 0.8:
    print("✅ Dataset is well balanced")
elif balance_ratio > 0.5:
    print("⚠️  Slight imbalance, but acceptable")
else:
    print("❌ Significant class imbalance detected")

In [None]:
# Visualize class balance: absolute counts as bars (left), shares as a pie (right)
class_labels = ['No Disease (0)', 'Disease (1)']
palette = ['#2ecc71', '#e74c3c']
heading = {'fontsize': 14, 'fontweight': 'bold'}

fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: bar chart of counts
class_counts.plot(kind='bar', ax=bar_ax, color=palette, edgecolor='black', alpha=0.8)
bar_ax.set_title('Class Distribution (Count)', **heading)
bar_ax.set_xlabel('Class', fontsize=12)
bar_ax.set_ylabel('Count', fontsize=12)
bar_ax.set_xticklabels(class_labels, rotation=0)
bar_ax.grid(axis='y', alpha=0.3)

# Write each count just above its bar
for position, n_samples in enumerate(class_counts):
    bar_ax.text(
        position,
        n_samples + 5,
        str(n_samples),
        ha='center',
        va='bottom',
        fontweight='bold',
        fontsize=11,
    )

# Right panel: pie chart of class shares
pie_options = {
    'labels': class_labels,
    'autopct': '%1.1f%%',
    'colors': palette,
    'startangle': 90,
    'explode': (0.05, 0.05),
    'textprops': {'fontsize': 11, 'fontweight': 'bold'},
}
pie_ax.pie(class_counts, **pie_options)
pie_ax.set_title('Class Distribution (Percentage)', **heading)

plt.tight_layout()
plt.show()

## 5. Numeric Features Distribution

In [None]:
# Histograms for all numeric features (only those actually present in the frame)
numeric_cols = [col for col in NUMERIC_FEATURES if col in df.columns]

# Grid layout: 3 plots per row, with at least one row so plt.subplots never
# receives a zero-row grid when no numeric column is available
n_cols = 3
n_rows = max(1, (len(numeric_cols) + n_cols - 1) // n_cols)

# squeeze=False always yields a 2-D array of Axes, so flattening works for a
# single row as well (wrapping the 1-D array in a list broke indexing there)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    # Histogram of the non-missing values
    ax.hist(df[col].dropna(), bins=30, edgecolor='black', alpha=0.7, color='steelblue')

    # Mark the mean with a dashed vertical line
    mean_val = df[col].mean()
    ax.axvline(mean_val, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_val:.1f}')

    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

# Hide the grid cells that have no feature to show
for ax in axes[len(numeric_cols):]:
    ax.axis('off')

plt.suptitle('Numeric Features: Distribution Analysis', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

In [None]:
# Statistical summary with skewness and kurtosis.
# Named aggregations give every output row a distinct label (two anonymous
# lambdas would both be labelled '<lambda>').
stats_df = df[numeric_cols].agg(
    ['mean', 'median', 'std', 'min', 'max', 'skew', 'kurt']
).T

stats_df.columns = ['Mean', 'Median', 'Std Dev', 'Min', 'Max', 'Skewness', 'Kurtosis']
stats_df = stats_df.round(2)

print("\n📊 Numeric Features: Statistical Summary")
display(stats_df)

# pandas reports *excess* kurtosis (Fisher's definition, normal distribution == 0),
# so the reference point for heavy vs. light tails is 0, not 3.
print("\n💡 Interpretation:")
print("  - Skewness: < -1 or > 1 indicates high skew, -0.5 to 0.5 is fairly symmetric")
print("  - Kurtosis (excess): > 0 indicates heavier tails than a normal distribution (outliers), < 0 indicates lighter tails")

## 6. Categorical Features Analysis

In [None]:
# Value counts for categorical features (only those actually present in the frame)
categorical_cols = [col for col in CATEGORICAL_FEATURES if col in df.columns]

print("\n📋 Categorical Features: Value Counts\n")

# Percentages are relative to all rows; value_counts() skips missing values,
# so a column with NaNs will sum to less than 100%
total_rows = len(df)

for col in categorical_cols:
    print(f"\n{col.upper()}:")
    value_counts = df[col].value_counts().sort_index()
    for val, count in value_counts.items():
        pct = count / total_rows * 100
        print(f"  {val}: {count:4d} ({pct:5.1f}%)")

In [None]:
# Visualize categorical features: one bar chart per feature, 4 per row.
# At least one row is requested so plt.subplots never receives a zero-row grid.
n_cols = 4
n_rows = max(1, (len(categorical_cols) + n_cols - 1) // n_cols)

# squeeze=False always yields a 2-D array of Axes, so flattening works for a
# single row as well (wrapping the 1-D array in a list broke indexing there)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, n_rows * 3), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, categorical_cols):
    value_counts = df[col].value_counts().sort_index()

    value_counts.plot(kind='bar', ax=ax, color='teal', edgecolor='black', alpha=0.7)
    ax.set_title(f'{col}', fontsize=11, fontweight='bold')
    ax.set_xlabel('')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=0)

    # Add count labels just above each bar
    for i, v in enumerate(value_counts):
        ax.text(i, v + 2, str(v), ha='center', va='bottom', fontsize=9)

# Hide the grid cells that have no feature to show
for ax in axes[len(categorical_cols):]:
    ax.axis('off')

plt.suptitle('Categorical Features: Distribution', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:
# Correlation matrix over the numeric features, plus the target when the frame has it
corr_cols = (numeric_cols + [TARGET_COLUMN]) if TARGET_COLUMN in df.columns else numeric_cols
correlation_matrix = df[corr_cols].corr()

# Annotated heatmap on a diverging palette centred at zero
heatmap_style = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 0.5,
    'cbar_kws': {'shrink': 0.8},
}

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_style)

ax.set_title('Feature Correlation Heatmap', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Top correlations with target variable
if TARGET_COLUMN in correlation_matrix.columns:
    # Correlation of every feature with the target, strongest positive first
    target_corr = correlation_matrix[TARGET_COLUMN].drop(TARGET_COLUMN).sort_values(ascending=False)

    print(f"\n🎯 Top Features Correlated with {TARGET_COLUMN}:\n")

    # Split by sign before taking the top 5. With only a handful of features,
    # head(5)/tail(5) overlap and would list negative values as "positive".
    top_positive = target_corr[target_corr > 0].nlargest(5)
    top_negative = target_corr[target_corr < 0].nsmallest(5)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    panels = (
        (ax1, top_positive, 'forestgreen', 'Top 5 Positive Correlations'),
        (ax2, top_negative, 'crimson', 'Top 5 Negative Correlations'),
    )
    for ax, series, color, title in panels:
        if series.empty:
            # pandas refuses to plot an empty Series, so show a placeholder instead
            ax.text(0.5, 0.5, 'None', ha='center', va='center', transform=ax.transAxes)
        else:
            series.plot(kind='barh', ax=ax, color=color, edgecolor='black')
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.set_xlabel('Correlation Coefficient')
        ax.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("\nPositive Correlations (top 5):")
    if top_positive.empty:
        print("  (none)")
    for feat, corr in top_positive.items():
        print(f"  {feat:15s}: {corr:6.3f}")

    print("\nNegative Correlations (top 5):")
    if top_negative.empty:
        print("  (none)")
    for feat, corr in top_negative.items():
        print(f"  {feat:15s}: {corr:6.3f}")

In [None]:
# Feature-to-feature correlations (excluding target).
# errors='ignore' keeps this cell working when the target is not part of the
# matrix, which the correlation cell explicitly allows for.
feature_corr = correlation_matrix.drop(index=TARGET_COLUMN, columns=TARGET_COLUMN, errors='ignore')

# Keep only the strict upper triangle so each pair appears once and the
# diagonal (self-correlation) is excluded
upper_tri = np.triu(np.ones(feature_corr.shape, dtype=bool), k=1)
feature_pairs = feature_corr.where(upper_tri).stack().reset_index()
feature_pairs.columns = ['Feature 1', 'Feature 2', 'Correlation']
# Drop masked/undefined entries explicitly rather than relying on stack() to do it
feature_pairs = feature_pairs.dropna(subset=['Correlation'])
feature_pairs['Abs_Correlation'] = feature_pairs['Correlation'].abs()

# Top correlated feature pairs, ranked by absolute strength
top_pairs = feature_pairs.nlargest(10, 'Abs_Correlation')

print("\n🔗 Top 10 Correlated Feature Pairs:\n")
for feat_a, feat_b, corr in zip(
    top_pairs['Feature 1'], top_pairs['Feature 2'], top_pairs['Correlation']
):
    print(f"  {feat_a:12s} ↔ {feat_b:12s}: {corr:6.3f}")

## 8. Bivariate Analysis: Features vs Target

In [None]:
# Box plots: Numeric features by target class.
# The grid is recomputed here for the numeric features. Reusing n_rows/n_cols
# from an earlier cell would pick up the categorical layout (4 columns, sized
# for a different feature list).
n_cols = 3
n_rows = max(1, (len(numeric_cols) + n_cols - 1) // n_cols)

# squeeze=False always yields a 2-D array of Axes, so flattening also works for one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    df.boxplot(column=col, by=TARGET_COLUMN, ax=ax, patch_artist=True,
               boxprops=dict(facecolor='lightblue', color='blue'),
               medianprops=dict(color='red', linewidth=2))

    ax.set_title(f'{col} by {TARGET_COLUMN}', fontsize=11, fontweight='bold')
    ax.set_xlabel(f'{TARGET_COLUMN} (0=No Disease, 1=Disease)')
    ax.set_ylabel(col)
    # Boxes are drawn at positions 1 and 2; relabel them (two classes expected)
    ax.set_xticks([1, 2])
    ax.set_xticklabels(['No Disease (0)', 'Disease (1)'])

# Hide the grid cells that have no feature to show
for ax in axes[len(numeric_cols):]:
    ax.axis('off')

# Set after the loop so it replaces the title df.boxplot(by=...) puts on the figure
plt.suptitle('Numeric Features: Distribution by Target Class', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

In [None]:
# Statistical comparison by target class
print("\n📊 Mean Values by Target Class:\n")

# Rows = numeric features, columns = target classes in sorted order.
# The two column labels below assume exactly two classes (0 and 1).
comparison_df = df.groupby(TARGET_COLUMN)[numeric_cols].mean().T
comparison_df.columns = ['No Disease (0)', 'Disease (1)']
comparison_df['Difference'] = comparison_df['Disease (1)'] - comparison_df['No Disease (0)']
comparison_df = comparison_df.round(2)

display(comparison_df)

print("\n💡 Interpretation:")
print("  - Positive difference: Higher values associated with disease")
print("  - Negative difference: Lower values associated with disease")

## 9. Outlier Detection

In [None]:
# Identify outliers using IQR method
print("\n🔍 Outlier Detection (IQR Method):\n")

summary_columns = ['Feature', 'Lower Bound', 'Upper Bound', 'Outliers', 'Percentage']
outlier_summary = []

for col in numeric_cols:
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # Fences at 1.5 * IQR beyond the quartiles
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # NaN compares False on both sides, so missing values are never counted
    n_outliers = int(((values < lower_bound) | (values > upper_bound)).sum())
    pct_outliers = n_outliers / len(df) * 100

    outlier_summary.append({
        'Feature': col,
        'Lower Bound': round(lower_bound, 2),
        'Upper Bound': round(upper_bound, 2),
        'Outliers': n_outliers,
        'Percentage': round(pct_outliers, 2),
    })

# Explicit columns keep the frame well-formed (and the sum below valid) even
# when there are no numeric features to summarize
outlier_df = pd.DataFrame(outlier_summary, columns=summary_columns)
display(outlier_df)

# Sum of per-feature counts: a row that is extreme in several features is counted once per feature
total_outliers = outlier_df['Outliers'].sum()
print(f"\nTotal outlier data points: {total_outliers}")
if total_outliers < len(df) * 0.05:
    print("✅ Outliers are minimal (< 5% of data)")
else:
    print("⚠️  Consider outlier treatment or robust scaling")

## 10. Key Insights and Recommendations

In [None]:
# Print a consolidated summary of the findings computed in the earlier cells.
# Relies on class_counts, class_pct, stats_df, total_outliers, numeric_cols,
# categorical_cols and (when the target is in the correlation matrix) target_corr.
rule = "=" * 70
print("\n" + rule)
print("📌 KEY INSIGHTS FROM EDA")
print(rule)

# Dataset overview
print("\n1. DATASET OVERVIEW:")
print(f"   - Total samples: {len(df):,}")
print(f"   - Features: {len(df.columns) - 1} ({len(numeric_cols)} numeric, {len(categorical_cols)} categorical)")
print(f"   - Target: {TARGET_COLUMN} (binary classification)")

# Class balance (same thresholds as the target-variable section)
balance_ratio = class_counts.min() / class_counts.max()
print("\n2. CLASS BALANCE:")
print(f"   - No Disease (0): {class_counts[0]} ({class_pct[0]}%)")
print(f"   - Disease (1): {class_counts[1]} ({class_pct[1]}%)")
print(f"   - Balance ratio: {balance_ratio:.2f}")
if balance_ratio > 0.8:
    print("   ✅ Well-balanced dataset, no special handling needed")
elif balance_ratio > 0.5:
    print("   ⚠️  Slight imbalance, but acceptable")
else:
    print("   ❌ Significant class imbalance detected")

# Missing values
total_missing = df.isnull().sum().sum()
print("\n3. DATA QUALITY:")
if total_missing == 0:
    print("   ✅ No missing values detected")
else:
    print(f"   ⚠️  Missing values: {total_missing} ({total_missing/df.size*100:.2f}%)")

# Top predictors: split by sign first so a negative value is never listed as
# "positive" (head/tail overlap when there are only a few features)
if TARGET_COLUMN in correlation_matrix.columns:
    top_3_positive = target_corr[target_corr > 0].nlargest(3)
    top_3_negative = target_corr[target_corr < 0].nsmallest(3)

    print("\n4. TOP PREDICTIVE FEATURES:")
    print("   Positive correlations:")
    if top_3_positive.empty:
        print("     - (none)")
    for feat, corr in top_3_positive.items():
        print(f"     - {feat}: {corr:.3f}")
    print("   Negative correlations:")
    if top_3_negative.empty:
        print("     - (none)")
    for feat, corr in top_3_negative.items():
        print(f"     - {feat}: {corr:.3f}")

# Feature distributions
print("\n5. FEATURE DISTRIBUTIONS:")
highly_skewed = stats_df[stats_df['Skewness'].abs() > 1].index.tolist()
if highly_skewed:
    print(f"   ⚠️  Highly skewed features: {', '.join(highly_skewed)}")
    print("      → Consider log transformation or robust scaling")
else:
    print("   ✅ Most features are fairly symmetric")

# Recommendations
print("\n6. MODELING RECOMMENDATIONS:")
print("   ✅ Use stratified train/test split to maintain class balance")
print("   ✅ Apply StandardScaler for numeric features")
print("   ✅ Use OneHotEncoder for categorical features")
print("   ✅ Consider both linear (LogReg) and non-linear (RF) models")
print("   ✅ Use ROC-AUC as primary evaluation metric")

if highly_skewed:
    print("   ⚠️  Consider PowerTransformer for skewed features")

if total_outliers > 0:
    print("   ⚠️  Monitor outlier impact on model performance")

print("\n" + rule)
print("✅ EDA COMPLETE")
print(rule + "\n")

## 11. Save EDA Outputs (Optional)

In [None]:
# Create EDA artifacts directory
from src.config import ARTIFACTS_DIR

# Path() accepts a str or a Path, so the '/' join works whichever type the
# config value has
eda_dir = Path(ARTIFACTS_DIR) / 'eda'
eda_dir.mkdir(parents=True, exist_ok=True)

print(f"\n💾 Saving EDA outputs to: {eda_dir}\n")

# Save summary statistics (index = feature names, so it is kept)
stats_df.to_csv(eda_dir / 'numeric_features_stats.csv')
print("   ✅ Saved: numeric_features_stats.csv")

# Save correlation matrix (index = feature names, so it is kept)
correlation_matrix.to_csv(eda_dir / 'correlation_matrix.csv')
print("   ✅ Saved: correlation_matrix.csv")

# Save outlier summary
outlier_df.to_csv(eda_dir / 'outlier_summary.csv', index=False)
print("   ✅ Saved: outlier_summary.csv")

# Save class distribution as a flat table (class label, count, percentage)
class_dist = pd.DataFrame({
    'Class': class_counts.index,
    'Count': class_counts.values,
    'Percentage': class_pct.values,
})
class_dist.to_csv(eda_dir / 'class_distribution.csv', index=False)
print("   ✅ Saved: class_distribution.csv")

print("\n✅ All EDA outputs saved successfully!")

---

## Summary

This EDA notebook analyzed the Heart Disease UCI dataset and revealed:

- **Well-balanced** binary classification problem
- **Clean data** with minimal missing values
- **Strong predictive features** identified through correlation analysis
- **Appropriate preprocessing** strategy defined (scaling + encoding)
- **Ready for modeling** with confidence in data quality

**Next Steps:**
1. Feature engineering pipeline implementation
2. Model training (Logistic Regression, Random Forest)
3. Model evaluation and comparison
4. MLflow experiment tracking

---