# Heart Disease Prediction - Exploratory Data Analysis

**Comprehensive analysis of heart disease dataset with visualizations**

---

## Table of Contents
1. [Import Libraries](#1-import-libraries)
2. [Load Data](#2-load-data)
3. [Data Overview](#3-data-overview)
4. [Data Quality Assessment](#4-data-quality-assessment)
5. [Statistical Summary](#5-statistical-summary)
6. [Target Variable Analysis](#6-target-variable-analysis)
7. [Numerical Features](#7-numerical-features)
8. [Categorical Features](#8-categorical-features)
9. [Correlation Analysis](#9-correlation-analysis)
10. [Bivariate Analysis](#10-bivariate-analysis)
11. [Age & Gender Analysis](#11-age--gender-analysis)
12. [Key Findings](#12-key-findings)

## 1. Import Libraries

In [None]:
# Data processing
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Statistical analysis
from scipy import stats
from scipy.stats import chi2_contingency

# Settings
import warnings
# Suppress every warning category for the rest of the session. This also hides
# deprecation notices from the plotting/data libraries, so comment it out
# when debugging unexpected output.
warnings.filterwarnings('ignore')
# Style sheet applied to all static matplotlib figures in this notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need a recent
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
# Default colour palette for seaborn (and matplotlib's colour cycle).
sns.set_palette("husl")
# IPython magic (not plain Python): render matplotlib figures inline.
%matplotlib inline

# Create output directory.
# exist_ok=True makes the call idempotent: re-running the cell, or another
# process creating the directory between a check and the create, can no
# longer raise FileExistsError.
import os
os.makedirs('eda_outputs', exist_ok=True)

print("✓ All libraries imported successfully!")

## 2. Load Data

In [None]:
# Load dataset.
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv('heart_disease.csv')

# Report the table size (rows × columns) as a quick sanity check on the load.
print("Dataset loaded successfully!")
print(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns")

## 3. Data Overview

In [None]:
# Display first few rows.
# The bare expression on the last line is what the notebook renders as the
# cell's output, so it must stay last.
print("First 10 rows:")
df.head(10)

In [None]:
# Data types and info: per-column dtype and non-null count, printed by pandas
# directly to stdout (info() itself returns None).
print("Dataset Information:")
df.info()

In [None]:
# Split the columns by dtype: numeric ones versus object-typed ones.
# Both lists are reused by the plotting and testing cells further down.
numeric_frame = df.select_dtypes(include=[np.number])
object_frame = df.select_dtypes(include=['object'])
numerical_cols = numeric_frame.columns.tolist()
categorical_cols = object_frame.columns.tolist()

# Same two-line report for each group: heading with the count, then the names.
for heading, names in (("Numerical", numerical_cols), ("Categorical", categorical_cols)):
    print(f"\n{heading} Columns ({len(names)}):")
    print(names)

## 4. Data Quality Assessment

In [None]:
# Missing values: count the NaNs per column once, then derive the share of rows.
null_counts = df.isnull().sum().values
null_share = (null_counts / len(df) * 100).round(2)
missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts,
    'Missing_Percentage': null_share,
})

# Keep only the columns that actually have gaps, worst offender first.
has_gaps = missing_data['Missing_Count'] > 0
missing_data = missing_data[has_gaps].sort_values('Missing_Count', ascending=False)

print("Missing Values Analysis:")
missing_data

In [None]:
# Horizontal bar chart of the missing-value share per feature.
# Sorting ascending puts the feature with the most gaps at the top of the chart.
missing_sorted = missing_data.sort_values('Missing_Percentage')
bar_options = dict(
    x='Missing_Percentage',
    y='Column',
    orientation='h',
    title='Missing Data Analysis',
    labels={'Missing_Percentage': 'Missing Percentage (%)', 'Column': 'Feature'},
    color='Missing_Percentage',
    color_continuous_scale='Reds',
)
fig = px.bar(missing_sorted, **bar_options)
fig.update_layout(height=500, showlegend=False)
fig.show()

In [None]:
# Duplicates: rows that repeat an earlier row in every column.
# `duplicates` is reused by the key-findings summary at the end.
duplicates = df.duplicated().sum()
duplicate_share = duplicates / len(df) * 100
print(f"Duplicate Rows: {duplicates} ({duplicate_share:.2f}%)")

## 5. Statistical Summary

In [None]:
# Descriptive statistics, transposed so each feature is a row, with a colour
# gradient applied by the pandas Styler. The styled table is the cell output.
print("Numerical Features Summary:")
summary_stats = df.describe().transpose()
summary_stats.style.background_gradient(cmap='coolwarm')

In [None]:
# Per-category frequency table and cardinality for every categorical column.
print("Categorical Features Summary:\n")
separator = "-" * 50
for col in categorical_cols:
    column = df[col]
    print(f"\n{col}:")
    print(column.value_counts())
    print(f"Unique values: {column.nunique()}")
    print(separator)

## 6. Target Variable Analysis

In [None]:
# Name of the label column; every later cell refers to it through `target`.
target = 'Heart Disease Status'

print(f"Target Variable: {target}\n")

# Absolute class counts and their relative share in percent.
target_column = df[target]
target_counts = target_column.value_counts()
target_pct = target_column.value_counts(normalize=True).mul(100)

# The two Series share the class labels as index, so they line up row by row.
target_summary = pd.DataFrame({'Count': target_counts, 'Percentage': target_pct})
print(target_summary)

In [None]:
# Interactive target distribution: class counts as bars (left) and as a pie (right).
class_labels = target_counts.index
class_sizes = target_counts.values

count_bar = go.Bar(
    x=class_labels,
    y=class_sizes,
    marker_color=['#2ecc71', '#e74c3c'],
    text=class_sizes,
    textposition='auto',
)
share_pie = go.Pie(
    labels=class_labels,
    values=class_sizes,
    marker_colors=['#2ecc71', '#e74c3c'],
)

# A pie trace needs a 'pie'-typed subplot cell, hence the explicit specs.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'bar'}, {'type': 'pie'}]],
    subplot_titles=('Count', 'Percentage'),
)
fig.add_trace(count_bar, row=1, col=1)
fig.add_trace(share_pie, row=1, col=2)

fig.update_layout(title_text="Heart Disease Status Distribution",
                  showlegend=False, height=400)
fig.show()

## 7. Numerical Features

In [None]:
# Distribution plots: one histogram per numerical feature, with mean/median markers.
# The grid is sized from the number of features (3 per row, 4 inches of height
# per row) instead of a fixed 3x3, so more than nine numeric columns no longer
# raise IndexError, and panels without a feature are hidden instead of being
# drawn as empty axes.
grid_cols = 3
grid_rows = max(1, -(-len(numerical_cols) // grid_cols))  # ceiling division
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(18, 4 * grid_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    ax.hist(df[col].dropna(), bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    ax.set_title(f'{col} Distribution', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

    # Add mean and median lines
    mean_val = df[col].mean()
    median_val = df[col].median()
    ax.axvline(mean_val, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_val:.2f}')
    ax.axvline(median_val, color='green', linestyle='--', linewidth=2, label=f'Median: {median_val:.2f}')
    ax.legend(fontsize=8)

# Hide extra subplots
for ax in axes[len(numerical_cols):]:
    ax.axis('off')

plt.tight_layout()
plt.savefig('eda_outputs/numerical_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Box plots for outlier detection: one panel per numerical feature.
# The grid grows with the number of features (3 per row) instead of a fixed
# 3x3, which raised IndexError for more than nine numeric columns; panels
# without a feature are hidden.
grid_cols = 3
grid_rows = max(1, -(-len(numerical_cols) // grid_cols))  # ceiling division
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(18, 4 * grid_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    df.boxplot(column=col, ax=ax)
    ax.set_title(f'{col} - Box Plot', fontweight='bold')
    ax.set_ylabel(col)

# Hide extra subplots
for ax in axes[len(numerical_cols):]:
    ax.axis('off')

plt.tight_layout()
plt.savefig('eda_outputs/boxplots_outliers.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Categorical Features

In [None]:
# Categorical feature distributions (excluding target).
# `categorical_features` is reused by the bivariate and chi-square cells below.
categorical_features = [col for col in categorical_cols if col != target]

# Size the grid from the feature count (3 per row, 4 inches per row) rather
# than a fixed 4x3, which raised IndexError for more than twelve features.
grid_cols = 3
grid_rows = max(1, -(-len(categorical_features) // grid_cols))  # ceiling division
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(18, 4 * grid_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, categorical_features):
    value_counts = df[col].value_counts()
    value_counts.plot(kind='bar', ax=ax, color='teal', alpha=0.7)
    ax.set_title(f'{col} Distribution', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

# Hide extra subplots
for ax in axes[len(categorical_features):]:
    ax.axis('off')

plt.tight_layout()
plt.savefig('eda_outputs/categorical_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

## 9. Correlation Analysis

In [None]:
# Pairwise correlation between the numerical features.
# `correlation_matrix` is reused by the two correlation cells that follow.
numeric_frame = df[numerical_cols]
correlation_matrix = numeric_frame.corr()

# Static heatmap: annotated cells, colour scale centred on zero.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap - Numerical Features', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('eda_outputs/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Interactive version of the correlation heatmap (hover shows the exact value).
imshow_options = dict(
    text_auto='.2f',
    color_continuous_scale='RdBu_r',
    title='Interactive Correlation Heatmap',
    aspect='auto',
)
fig = px.imshow(correlation_matrix, **imshow_options)
fig.update_layout(height=700)
fig.show()

In [None]:
# Find highly correlated feature pairs.
# Only the strict upper triangle is scanned, so each unordered pair is visited
# once and the diagonal (self-correlation) is skipped.
feature_names = correlation_matrix.columns
upper_rows, upper_cols = np.triu_indices(len(feature_names), k=1)
high_corr = [
    {
        'Feature1': feature_names[i],
        'Feature2': feature_names[j],
        'Correlation': correlation_matrix.iloc[i, j],
    }
    for i, j in zip(upper_rows, upper_cols)
    if abs(correlation_matrix.iloc[i, j]) > 0.5
]

if not high_corr:
    print("No highly correlated features found (|r| > 0.5)")
else:
    # Strongest relationships first, regardless of sign.
    high_corr_df = pd.DataFrame(high_corr).sort_values('Correlation', key=abs, ascending=False)
    print("Highly Correlated Features (|r| > 0.5):")
    display(high_corr_df)

## 10. Bivariate Analysis

In [None]:
# Numerical features vs Target: one grouped box plot per numerical feature.
# The grid is sized from the feature count (3 per row) instead of a fixed 3x3,
# which raised IndexError for more than nine numeric columns; panels without a
# feature are hidden.
grid_cols = 3
grid_rows = max(1, -(-len(numerical_cols) // grid_cols))  # ceiling division
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(18, 4 * grid_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    df.boxplot(column=col, by=target, ax=ax)
    ax.set_title(f'{col} by {target}', fontweight='bold')
    ax.set_xlabel(target)
    ax.set_ylabel(col)
    # pandas adds a figure-level "Boxplot grouped by ..." title on every
    # grouped boxplot call; blank it so it does not overlap the panel titles.
    ax.get_figure().suptitle('')

# Hide extra subplots
for ax in axes[len(numerical_cols):]:
    ax.axis('off')

plt.tight_layout()
plt.savefig('eda_outputs/numerical_vs_target.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Categorical features vs Target: share of each target class within every
# category of a feature (rows of the crosstab sum to 100%).
# The grid is sized from the feature count (3 per row) instead of a fixed 4x3,
# which raised IndexError for more than twelve features.
grid_cols = 3
grid_rows = max(1, -(-len(categorical_features) // grid_cols))  # ceiling division
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(18, 4 * grid_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, categorical_features):
    ct = pd.crosstab(df[col], df[target], normalize='index') * 100
    ct.plot(kind='bar', stacked=False, ax=ax, color=['#2ecc71', '#e74c3c'])
    ax.set_title(f'{col} vs {target}', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Percentage (%)')
    ax.legend(title=target, loc='best')
    ax.tick_params(axis='x', rotation=45)

# Hide extra subplots
for ax in axes[len(categorical_features):]:
    ax.axis('off')

plt.tight_layout()
plt.savefig('eda_outputs/categorical_vs_target.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Chi-square test of independence between each categorical feature and the target.
chi_results = []
for col in categorical_features:
    # NaNs become their own 'Missing' category so those rows stay in the table.
    observed = pd.crosstab(df[col].fillna('Missing'), df[target])
    statistic, p_value, _dof, _expected = chi2_contingency(observed)
    # This label is matched verbatim by the key-findings summary; keep it as is.
    verdict = 'Yes ‚úì' if p_value < 0.05 else 'No'
    chi_results.append({
        'Feature': col,
        'Chi2': statistic,
        'P-Value': p_value,
        'Significant': verdict,
    })

# Smallest p-value (strongest evidence of association) first.
chi_df = pd.DataFrame(chi_results).sort_values('P-Value')
print("Chi-Square Tests (Categorical Features vs Target):")
chi_df.style.background_gradient(subset=['P-Value'], cmap='RdYlGn_r')

## 11. Age & Gender Analysis

In [None]:
# Age analysis: histogram, box plot and violin plot of Age per target class.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Collect the classes once. Rows with a missing target are dropped first:
# NaN never compares equal to itself, so a NaN status would select an empty
# subset, which the violin plot cannot draw.
statuses = df[target].dropna().unique()
ages_by_status = [df.loc[df[target] == status, 'Age'].dropna() for status in statuses]

# Distribution by disease status
for status, ages in zip(statuses, ages_by_status):
    axes[0].hist(ages, alpha=0.6, bins=30, label=status)
axes[0].set_title('Age Distribution by Heart Disease Status', fontweight='bold')
axes[0].set_xlabel('Age')
axes[0].set_ylabel('Frequency')
axes[0].legend()

# Box plot
df.boxplot(column='Age', by=target, ax=axes[1])
axes[1].set_title('Age by Heart Disease Status', fontweight='bold')
# Blank the figure-level title pandas adds for grouped box plots.
axes[1].get_figure().suptitle('')

# Violin plot: one call per class so each violin keeps its own colour.
for position, ages in enumerate(ages_by_status):
    if ages.empty:
        # A class whose ages are all missing has nothing to draw.
        continue
    axes[2].violinplot([ages], positions=[position], showmeans=True, showmedians=True)
axes[2].set_xticks(range(len(statuses)))
axes[2].set_xticklabels(statuses)
axes[2].set_title('Age Distribution (Violin Plot)', fontweight='bold')
axes[2].set_ylabel('Age')

plt.tight_layout()
plt.savefig('eda_outputs/age_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Age groups.
# Bins are left-closed ([0, 30), [30, 40), ..., [60, inf)) so every label
# matches its range: with pandas' default right-closed bins a 30-year-old
# landed in '<30', and ages above 100 fell outside the last bin and became NaN.
# NOTE: this adds a derived 'Age_Group' column to df.
df['Age_Group'] = pd.cut(df['Age'], bins=[0, 30, 40, 50, 60, np.inf], right=False,
                         labels=['<30', '30-40', '40-50', '50-60', '60+'])

# Share of each target class within every age group (rows sum to 100%).
age_disease = pd.crosstab(df['Age_Group'], df[target], normalize='index') * 100
print("Heart Disease by Age Group (%):")
display(age_disease)

# Visualization
age_disease.plot(kind='bar', figsize=(10, 6), color=['#2ecc71', '#e74c3c'])
plt.title('Heart Disease Prevalence by Age Group', fontweight='bold', fontsize=14)
plt.xlabel('Age Group')
plt.ylabel('Percentage (%)')
plt.xticks(rotation=0)
plt.legend(title=target)
plt.tight_layout()
# Saved like every other static figure in this notebook.
plt.savefig('eda_outputs/age_group_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Gender analysis: target class share within each gender (rows sum to 100%).
gender_disease = pd.crosstab(df['Gender'], df[target], normalize='index') * 100
print("Heart Disease by Gender (%):")
display(gender_disease)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel shows absolute counts, right panel the row-normalised percentages.
# Both panels share the same styling, so they are drawn by one loop.
gender_counts = pd.crosstab(df['Gender'], df[target])
panels = (
    (axes[0], gender_counts, 'Heart Disease by Gender - Count', 'Count'),
    (axes[1], gender_disease, 'Heart Disease by Gender - Percentage', 'Percentage (%)'),
)
for ax, table, panel_title, y_label in panels:
    table.plot(kind='bar', ax=ax, color=['#2ecc71', '#e74c3c'])
    ax.set_title(panel_title, fontweight='bold')
    ax.set_xlabel('Gender')
    ax.set_ylabel(y_label)
    ax.legend(title=target)
    ax.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.savefig('eda_outputs/gender_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 12. Key Findings

In [None]:
# Summary of the headline numbers collected by the cells above.
print("="*80)
print("KEY FINDINGS FROM EDA")
print("="*80)

# 'Age_Group' is a column derived from 'Age' by the age-group cell. Leave it
# out here so the feature count and missing-value totals describe the dataset
# as loaded, consistent with numerical_cols / categorical_cols.
# errors='ignore' keeps this cell working if that cell was not run.
source_df = df.drop(columns=['Age_Group'], errors='ignore')

print("\n📊 Dataset Overview:")
print(f"   - Total Records: {len(source_df):,}")
print(f"   - Total Features: {len(source_df.columns)}")
print(f"   - Numerical Features: {len(numerical_cols)}")
print(f"   - Categorical Features: {len(categorical_cols)}")

print("\n📋 Data Quality:")
total_missing = source_df.isnull().sum().sum()
total_cells = len(source_df) * len(source_df.columns)
print(f"   - Total Missing Values: {total_missing:,} ({total_missing/total_cells*100:.2f}%)")
print(f"   - Duplicate Rows: {duplicates:,}")
print(f"   - Features with >25% missing: {len(missing_data[missing_data['Missing_Percentage'] > 25])}")

print("\n🎯 Target Variable:")
for status, count in df[target].value_counts().items():
    print(f"   - {status}: {count:,} ({count/len(df)*100:.1f}%)")

print("\n🔍 Statistical Insights:")
print(f"   - Average Age: {df['Age'].mean():.1f} years")
print(f"   - Age Range: {df['Age'].min():.0f} - {df['Age'].max():.0f} years")
print(f"   - Average BMI: {df['BMI'].mean():.2f}")
print(f"   - Average Blood Pressure: {df['Blood Pressure'].mean():.1f} mmHg")
print(f"   - Average Cholesterol: {df['Cholesterol Level'].mean():.1f} mg/dL")

print("\n📈 Significant Predictors (p < 0.05):")
# Select on the numeric p-value rather than on the decorated 'Significant'
# label, so the result does not depend on the exact text of that label.
# chi_df is sorted by p-value, so the first five are the strongest.
significant_features = chi_df.loc[chi_df['P-Value'] < 0.05, 'Feature'].tolist()
for feat in significant_features[:5]:  # Show top 5
    print(f"   - {feat}")

print("\n" + "="*80)
print("✓ EDA COMPLETED SUCCESSFULLY!")
print("All visualizations saved to 'eda_outputs/' directory")
print("="*80)