In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults for every figure drawn below.
# NOTE(review): the 'seaborn-v0_8-*' style-sheet names presumably require
# matplotlib >= 3.6 — confirm against the environment this notebook targets.
plt.style.use('seaborn-v0_8-darkgrid')
# NOTE(review): presumably this must stay after plt.style.use(), since both
# calls appear to set the default colour cycle — verify before reordering.
sns.set_palette("husl")

In [None]:
# Load the dataset
# `df` is the frame every later cell reads from.
df = pd.read_csv('../data/heart.csv')

# One print call, newline-separated: the shape line, a blank line, then the
# heading for the preview table rendered as the cell's output.
print(f"Dataset shape: {df.shape}", "\nFirst 5 rows:", sep="\n")
df.head(5)

In [None]:
# Dataset info
# Prints three summaries: column dtypes / non-null counts, per-column missing
# value totals, and descriptive statistics (rendered as the cell's output).
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
df.info()
print("\nMissing values:")
print(df.isnull().sum())
print("\nBasic statistics:")
df.describe()

In [None]:
# Target distribution (Class Balance)
# value_counts() orders its result by frequency, not by class value, so the
# counts are re-sorted by class to guarantee that each bar is paired with the
# label and colour that belong to it (previously the majority class always
# landed under 'No Disease (0)', whichever class it was).
target_counts = df['target'].value_counts().sort_index()

# Display name and bar colour per class value; any unexpected class value
# falls back to its own string form and a neutral grey.
class_names = {0: 'No Disease (0)', 1: 'Disease (1)'}
class_colors = {0: '#2ecc71', 1: '#e74c3c'}
bar_labels = [class_names.get(cls, str(cls)) for cls in target_counts.index]
bar_colors = [class_colors.get(cls, '#95a5a6') for cls in target_counts.index]

plt.figure(figsize=(8, 5))
plt.bar(bar_labels, target_counts.values, color=bar_colors)
plt.title('Heart Disease Class Distribution', fontsize=16, fontweight='bold')
plt.ylabel('Number of Patients', fontsize=12)
plt.xlabel('Target Class', fontsize=12)
# Write each count just above the top of its bar.
for i, v in enumerate(target_counts.values):
    plt.text(i, v + 5, str(v), ha='center', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig('class_balance.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Feature distributions (Histograms)
# One 20-bin histogram per listed column, laid out on a 3x3 grid.
features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca', 'cp', 'sex', 'fbs']
fig, axes = plt.subplots(3, 3, figsize=(15, 12))

# axes.flat walks the grid row by row, so the n-th feature is drawn in
# panel (n // 3, n % 3).
for ax, feature in zip(axes.flat, features):
    ax.hist(df[feature], bins=20, color='skyblue', edgecolor='black', alpha=0.7)
    ax.set_xlabel(feature, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.set_title(f'{feature.capitalize()} Distribution', fontsize=12, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('feature_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Correlation heatmap
# Pairwise correlations between all columns of df; the matrix is kept in
# `correlation_matrix` because the insights cell reads its 'target' column.
correlation_matrix = df.corr()

# Rendering options for the annotated heatmap, gathered in one place.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Key insights
# Text summary of the EDA: sample and feature counts, class split, total
# missing values, and the five features most correlated with the target.
# Reads `df`, `target_counts` and `correlation_matrix` from the cells above.
print("=" * 60)
print("KEY EDA INSIGHTS")
print("=" * 60)
print(f"1. Total samples: {len(df)}")
print(f"2. Features: {df.shape[1] - 1} (plus 1 target)")
print("3. Class distribution:")
print(f"   - No disease: {target_counts[0]} ({target_counts[0]/len(df)*100:.1f}%)")
print(f"   - Disease: {target_counts[1]} ({target_counts[1]/len(df)*100:.1f}%)")
print(f"4. Missing values: {df.isnull().sum().sum()}")
print("5. Top correlated features with target:")
# Remove the target's self-correlation by label rather than by skipping the
# first sorted entry: if another column ties at |r| = 1.0 the sort order is
# not guaranteed to put 'target' first, and a positional slice would then
# list the target itself while dropping a real feature.
target_corr = (
    correlation_matrix['target']
    .drop(labels='target')
    .abs()
    .sort_values(ascending=False)
    .head(5)
)
for feat, corr in target_corr.items():
    print(f"   - {feat}: {corr:.3f}")
print("=" * 60)