# 01 – Exploratory Data Analysis (EDA)

This notebook performs exploratory data analysis on the UCI Hospital Readmission dataset.
It loads the raw data, creates the binary 30-day readmission target, drops identifier columns,
and produces initial insights and visualizations.

In [None]:
import sys
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme to every figure drawn in this notebook.
# `set_theme` is the preferred interface; `sns.set` is only kept as an alias.
sns.set_theme(style="whitegrid")

# Add src/ to Python path
# NOTE(review): PROJECT_ROOT is the parent of the kernel's working directory,
# so this assumes the notebook is started from a direct child of the project
# root - confirm if the kernel is launched from elsewhere.
PROJECT_ROOT = Path().resolve().parent
SRC_DIR = str(PROJECT_ROOT / 'src')
# Guard the append so re-running this cell does not pile up duplicate entries.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from utils import load_raw_data, add_binary_target, drop_id_and_leaky_columns


## Load Data

In [None]:
# Read the raw UCI file, derive the binary `readmitted_30` column from
# `readmitted`, and pass the result through `drop_id_and_leaky_columns`.
df_raw = load_raw_data()
df = drop_id_and_leaky_columns(
    add_binary_target(
        df_raw,
        source_col="readmitted",
        target_col="readmitted_30",
    )
)

# Preview the first rows of the prepared frame.
df.head()

## Dataset Overview

In [None]:
# Dimensions of the prepared frame as (rows, columns).
df.shape

In [None]:
# Data types of the first 20 columns.
df.dtypes.iloc[:20]

## Target Variable Distribution

In [None]:
# Count encounters per target class, then swap the 0/1 codes for readable labels.
class_labels = {0: 'No (<30)', 1: 'Yes (<30)'}
target_counts = df['readmitted_30'].value_counts()
target_counts = target_counts.rename(index=class_labels)
target_counts

In [None]:
# Bar chart of the per-class encounter counts held in `target_counts`.
ax = target_counts.plot.bar()
ax.set_ylabel('Number of encounters')
ax.set_title('Distribution of 30-Day Readmission Target')
plt.show()

## Missing Values

In [None]:
# Fraction of missing cells per column, largest first.
# NOTE(review): only values pandas already treats as NA are counted here; if
# the raw file marks missing data with a placeholder string, it would not show
# up unless load_raw_data converts it - confirm.
missing_pct = df.isna().mean()
missing_pct = missing_pct.sort_values(ascending=False)
missing_pct.iloc[:20]

In [None]:
# Bar chart of (up to) the 20 columns with the largest missing fraction.
# Columns without any missing values are filtered out before taking the top 20.
cols_with_missing = missing_pct[missing_pct > 0].head(20)

if cols_with_missing.empty:
    # pandas raises when asked to draw a bar chart from an empty Series, so
    # report the situation instead of letting the cell fail on Run All.
    print('No columns contain missing values - nothing to plot.')
else:
    # Draw on an explicit Axes rather than whatever figure happens to be current.
    fig, ax = plt.subplots(figsize=(8, 5))
    cols_with_missing.plot(kind='bar', ax=ax)
    ax.set_title('Top 20 Columns by Missingness')
    ax.set_ylabel('Fraction missing')
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

## Feature Exploration

In [None]:
# Compare the distribution of `time_in_hospital` between the two target classes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x='readmitted_30', y='time_in_hospital', data=df, ax=ax)
ax.set(
    title='Length of Stay vs 30-Day Readmission',
    xlabel='Readmitted within 30 days (0=no, 1=yes)',
    ylabel='Time in hospital (days)',
)
plt.show()

In [None]:
# Encounter counts per `age` value, split by the readmission target.
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x='age', hue='readmitted_30', data=df, ax=ax)
ax.set_title('Readmission by Age Band')
ax.set_xlabel('Age band')
ax.set_ylabel('Number of encounters')
# Tilt the tick labels so neighbouring age bands do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Only draw this plot when the column is present in the prepared frame.
has_prior_inpatient = 'number_inpatient' in df.columns
if has_prior_inpatient:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x='readmitted_30', y='number_inpatient', data=df, ax=ax)
    ax.set(
        title='Prior Inpatient Visits vs 30-Day Readmission',
        xlabel='Readmitted within 30 days (0=no, 1=yes)',
        ylabel='Number of prior inpatient visits',
    )
    plt.show()

## Correlation Heatmap (Numeric Features)

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
# 'number' selects every numeric dtype (e.g. int32 or float32 as well), not
# only int64/float64, so a numeric column is not silently left out of the
# matrix just because of its storage width.
numeric_df = df.select_dtypes(include='number')

fig, ax = plt.subplots(figsize=(10, 8))
# center=0 anchors the diverging colormap at zero correlation.
sns.heatmap(numeric_df.corr(), cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()