# Exploratory Data Analysis (EDA)
## Heart Disease Prediction Dataset

This notebook provides a comprehensive exploratory data analysis (EDA) of the heart disease dataset.

## 1. Import Libraries and Configure Plotting

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Add parent directory to path to import custom modules
sys.path.append('..')
from src import data_preprocessing as dp
from src import eda
from config import DATASET_PATH, PROCESSED_DATA_PATH, OUTPUT_FIGURES_DIR

# Plot defaults: seaborn 'whitegrid' theme and a (12, 6) default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 2. Load and Preprocess Data

In [None]:
# Load raw data
# dp.load_data() is called with no arguments, so the source location is decided
# inside the project module (presumably DATASET_PATH from config -- confirm there).
df = dp.load_data()
print(f'Dataset shape: {df.shape}')
# Trailing expression: the notebook renders the first rows as the cell output.
df.head()

In [None]:
# Run the project's validation helper on the raw frame and print its
# 'shape', summed 'missing_values' and 'duplicates' entries as one report.
validation = dp.validate_data(df)
print(
    'Validation Results:',
    f'Shape: {validation["shape"]}',
    f'Missing values: {sum(validation["missing_values"].values())}',
    f'Duplicates: {validation["duplicates"]}',
    sep='\n',
)

In [None]:
# Preprocess data
# The raw frame `df` is passed to the project's preprocessing helper; every
# later cell in this notebook works on the returned `df_processed`.
df_processed = dp.preprocess_data(df)
print(f'Processed dataset shape: {df_processed.shape}')

# Save processed data
# No path is passed, so the destination is chosen inside dp.save_processed_data
# (presumably PROCESSED_DATA_PATH from config -- confirm in the module).
dp.save_processed_data(df_processed)
# Trailing expression: the notebook renders the first rows as the cell output.
df_processed.head()

## 3. Dataset Overview

In [None]:
# Get dataset overview
# `overview` must expose .items(); each key/value pair is printed on its own line.
overview = eda.get_dataset_overview(df_processed)
print('Dataset Overview:')
for key, value in overview.items():
    print(f'{key}: {value}')

In [None]:
# Summary statistics
# Computed by the project helper on the processed frame.
summary_stats = dp.get_summary_statistics(df_processed)
# Trailing expression: the notebook renders the result as the cell output.
summary_stats

## 4. Missing Values Analysis

In [None]:
# Ask the EDA helper for the missing-value analysis of the processed frame and
# report its 'total_missing' entry and 'percentage_missing' (two decimals).
missing_analysis = eda.analyze_missing_values(df_processed)
print(
    f'Total missing values: {missing_analysis["total_missing"]}',
    f'Percentage missing: {missing_analysis["percentage_missing"]:.2f}%',
    sep='\n',
)

## 5. Target Variable Analysis

In [None]:
# Analyze the target variable with the EDA helper and print the
# 'value_counts', 'percentages' and 'is_balanced' entries of its result.
target_analysis = eda.analyze_target_variable(df_processed)
print(
    'Target Variable Analysis:',
    f'Value counts: {target_analysis["value_counts"]}',
    f'Percentages: {target_analysis["percentages"]}',
    f'Is balanced: {target_analysis["is_balanced"]}',
    sep='\n',
)

In [None]:
# Draw the target distribution with the EDA helper, then tighten the layout of
# the current figure and render it.
target_fig = eda.visualize_target_distribution(df_processed)
plt.gcf().tight_layout()
plt.show()

## 6. Histograms (Minimum 3)

In [None]:
# Collect the numeric columns of the processed frame, excluding the 'target'
# column, so they can be plotted as histograms.
numeric_cols = df_processed.select_dtypes(include=[np.number]).columns.tolist()
if 'target' in numeric_cols:
    numeric_cols.remove('target')

# Plot every numeric column. The former slice bound, max(3, len(numeric_cols)),
# can never be smaller than the list length, so it always selected all columns;
# a plain copy states that directly.
histogram_cols = list(numeric_cols)
if len(histogram_cols) < 3:
    # This section calls for at least 3 histograms; say so when the data cannot supply them.
    print(f'Note: only {len(histogram_cols)} numeric column(s) available (expected at least 3)')
print(f'Creating histograms for: {histogram_cols}')

# Draw the histograms with the project helper, then tidy the layout and render.
hist_fig = eda.create_histograms(df_processed, columns=histogram_cols)
plt.tight_layout()
plt.show()

## 7. Boxplots (Minimum 3)

In [None]:
# Create boxplots for numeric columns
# NOTE: `numeric_cols` is defined by the histogram cell above; run that cell first.
# Plot every numeric column. The former slice bound, max(3, len(numeric_cols)),
# can never be smaller than the list length, so it always selected all columns;
# a plain copy states that directly.
boxplot_cols = list(numeric_cols)
if len(boxplot_cols) < 3:
    # This section calls for at least 3 boxplots; say so when the data cannot supply them.
    print(f'Note: only {len(boxplot_cols)} numeric column(s) available (expected at least 3)')
print(f'Creating boxplots for: {boxplot_cols}')

# Draw the boxplots with the project helper, then tidy the layout and render.
box_fig = eda.create_boxplots(df_processed, columns=boxplot_cols)
plt.tight_layout()
plt.show()

## 8. Correlation Heatmap

In [None]:
# Build the correlation heatmap with the EDA helper, then tighten the layout of
# the current figure and render it.
corr_fig = eda.create_correlation_heatmap(df_processed)
plt.gcf().tight_layout()
plt.show()

## 9. Key Insights

In [None]:
# Generate the EDA insights and print them: a title with a 60-character rule,
# then each section name in upper case (preceded by a blank line) and its content.
insights = eda.generate_eda_insights(df_processed)
print('Key Insights:', '=' * 60, sep='\n')
for section, content in insights.items():
    print(f'\n{section.upper()}:', content, sep='\n')

## Summary

This notebook has completed a comprehensive exploratory data analysis including:

1. ✅ Data loading and validation
2. ✅ Data preprocessing and saving processed data
3. ✅ Dataset overview and summary statistics
4. ✅ Missing values analysis
5. ✅ Target variable distribution analysis
6. ✅ Histograms (minimum 3)
7. ✅ Boxplots (minimum 3)
8. ✅ Correlation heatmap
9. ✅ Key insights generation

The processed dataset has been saved to `data/processed/heart_disease_processed.csv` and can be used for model training.