# Exploratory Data Analysis - Patient Appointment Prediction

This notebook performs exploratory data analysis on the medical center appointment dataset to understand the data patterns, distributions, and relationships that might affect patient no-show behavior.

## Table of Contents
1. [Data Loading and Overview](#data-loading)
2. [Data Quality Assessment](#data-quality)
3. [Target Variable Analysis](#target-analysis)
4. [Feature Analysis](#feature-analysis)
5. [Correlation Analysis](#correlation-analysis)
6. [Temporal Analysis](#temporal-analysis)
7. [Key Insights](#insights)


In [None]:
# Import necessary libraries
# Data handling (pandas / numpy) and plotting (matplotlib / seaborn).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Installs a blanket "ignore" filter: every warning category is silenced for
# the rest of the session, including deprecation notices from the libraries
# imported above.
warnings.filterwarnings('ignore')

# Set plotting style
# Start from matplotlib's default style, then switch seaborn's colour cycle
# to the "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Import utility functions
# '../src' is a relative path, so it is resolved against the current working
# directory. The append has to run before the `from utils import ...` line.
# NOTE(review): assumes the notebook is launched from its own folder - confirm.
import sys
sys.path.append('../src')
# NOTE(review): neither helper is called in the cells visible here.
from utils import plot_data_distribution, plot_correlation_heatmap

print("Libraries imported successfully!")


## 1. Data Loading and Overview <a id="data-loading"></a>


In [None]:
# Read the raw appointment records into a DataFrame
df_raw = pd.read_csv('../data/raw/MedicalCentre.csv')

# Report dimensions and column names in one go, then preview the data
print(
    "Dataset Overview:",
    f"Shape: {df_raw.shape}",
    f"Columns: {list(df_raw.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
# Last expression of the cell, so the notebook renders the preview table
df_raw.head()


In [None]:
# High-level facts about the dataset: size, memory footprint, dtypes, gaps
n_records, n_features = df_raw.shape
memory_mb = df_raw.memory_usage(deep=True).sum() / 1024**2

print("Dataset Information:")
print("=" * 50)
print(f"Total records: {n_records:,}")
print(f"Total features: {n_features}")
print(f"Memory usage: {memory_mb:.2f} MB")

print("\nData Types:")
print(df_raw.dtypes)

# Only columns that actually contain missing entries are listed
print("\nMissing Values:")
missing_values = df_raw.isna().sum()
print(missing_values.loc[missing_values > 0])


## 2. Data Quality Assessment <a id="data-quality"></a>


In [None]:
# Fully duplicated rows
duplicates = df_raw.duplicated().sum()
print(f"Duplicate records: {duplicates:,}")

# Ages below zero cannot be valid, so count them
age = df_raw['Age']
negative_ages = age.lt(0).sum()
print(f"Negative ages: {negative_ages:,}")

# Range and central tendency of the age column
print("\nAge Statistics:")
print(f"Min age: {age.min()}")
print(f"Max age: {age.max()}")
print(f"Mean age: {age.mean():.2f}")
print(f"Median age: {age.median():.2f}")

# Cardinality of each categorical column
print("\nUnique values in categorical columns:")
categorical_cols = ['Gender', 'Neighbourhood', 'No-show']
for col, n_unique in df_raw[categorical_cols].nunique().items():
    print(f"{col}: {n_unique} unique values")


## 3. Target Variable Analysis <a id="target-analysis"></a>


In [None]:
# Absolute and relative frequency of each outcome in the target column
target = df_raw['No-show']
no_show_counts = target.value_counts()
no_show_percentages = target.value_counts(normalize=True) * 100

print("No-Show Distribution:")
print("=" * 30)
for value in no_show_counts.index:
    count = no_show_counts.loc[value]
    percentage = no_show_percentages.loc[value]
    print(f"{value}: {count:,} ({percentage:.2f}%)")

# Pie chart of the same distribution
colors = ['lightblue', 'lightcoral']
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    no_show_counts.values,
    labels=no_show_counts.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
ax.set_title('Distribution of No-Show Appointments')
ax.axis('equal')  # keep the pie circular
plt.show()


## 4. Feature Analysis <a id="feature-analysis"></a>


In [None]:
# Age distribution: histogram on the left, box plot on the right
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

ax_hist.hist(df_raw['Age'], bins=50, color='skyblue', alpha=0.7, edgecolor='black')
ax_hist.set_title('Age Distribution')
ax_hist.set_xlabel('Age')
ax_hist.set_ylabel('Frequency')
ax_hist.grid(True, alpha=0.3)

df_raw.boxplot(column='Age', ax=ax_box)
ax_box.set_title('Age Box Plot')
ax_box.set_ylabel('Age')

fig.tight_layout()
plt.show()


In [None]:
# How many appointments belong to each gender
gender_counts = df_raw['Gender'].value_counts()
print("Gender Distribution:")
print(gender_counts)

# Bar chart of the counts printed above
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(gender_counts.index, gender_counts.values, color=['lightblue', 'lightpink'])
ax.set_title('Gender Distribution')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Analyze medical conditions
# One pie chart per indicator column, showing the share of each recorded value.
medical_conditions = ['Scholarship', 'Hypertension', 'Diabetes', 'Alcoholism', 'Handicap', 'SMS_received']

# Display name and colour for the two expected indicator values.
binary_labels = {0: 'No', 1: 'Yes'}
binary_colors = {0: 'lightblue', 1: 'lightcoral'}

plt.figure(figsize=(15, 10))
for i, condition in enumerate(medical_conditions, 1):
    plt.subplot(2, 3, i)
    # value_counts() orders by frequency, so the labels must be derived from
    # the index rather than assumed to be ['No', 'Yes']. Sorting by value
    # keeps the slice order the same in every chart.
    condition_counts = df_raw[condition].value_counts().sort_index()
    levels = list(condition_counts.index)
    # Any value other than 0/1 is labelled with the value itself, so a column
    # with more than two levels still plots instead of raising on a
    # label-count mismatch.
    slice_labels = [binary_labels.get(level, str(level)) for level in levels]
    if set(levels) <= set(binary_colors):
        slice_colors = [binary_colors[level] for level in levels]
    else:
        # Fall back to matplotlib's colour cycle so no two slices share a colour
        slice_colors = None
    plt.pie(condition_counts.values, labels=slice_labels, autopct='%1.1f%%',
            colors=slice_colors)
    plt.title(f'{condition} Distribution')
    plt.axis('equal')

plt.tight_layout()
plt.show()


## 5. Correlation Analysis <a id="correlation-analysis"></a>


In [None]:
# Analyze no-show rates by different features
def analyze_no_show_by_feature(df, feature):
    """Return the share of each 'No-show' outcome within every group of `feature`.

    `feature` is passed straight to `DataFrame.groupby`. The result has one
    row per group and one column per 'No-show' value; each cell holds that
    outcome's proportion within the group, with 0 where a combination never
    occurs.
    """
    outcome_shares = df.groupby(feature)['No-show'].value_counts(normalize=True)
    # Pivot the outcome level into columns; absent combinations come out as NaN
    rate_table = outcome_shares.unstack()
    return rate_table.fillna(0)

# Analyze by gender
print("No-Show Rates by Gender:")
gender_no_show = analyze_no_show_by_feature(df_raw, 'Gender')
print(gender_no_show)

# Analyze by age groups
# pd.cut builds right-closed intervals and excludes the lowest edge by
# default, which would leave age 0 without a group. include_lowest=True keeps
# age 0 in '0-18', and the open-ended last edge keeps ages above 100 in '65+'.
# Negative ages still fall outside every bin and get no AgeGroup.
df_raw['AgeGroup'] = pd.cut(df_raw['Age'], bins=[0, 18, 35, 50, 65, np.inf],
                           labels=['0-18', '19-35', '36-50', '51-65', '65+'],
                           include_lowest=True)
age_no_show = analyze_no_show_by_feature(df_raw, 'AgeGroup')
print("\nNo-Show Rates by Age Group:")
print(age_no_show)


In [None]:
# Bar charts of the no-show rate ('Yes' share) for several features, laid out
# on a 2x3 grid. Each panel is (title suffix, rate table, bar colour, tick rotation).
panels = [
    ('Gender', gender_no_show, 'lightcoral', 0),
    ('Age Group', age_no_show, 'lightgreen', 45),
]
# Indicator columns all share the same colour and unrotated tick labels
for condition in ['Scholarship', 'Hypertension', 'Diabetes']:
    condition_no_show = analyze_no_show_by_feature(df_raw, condition)
    panels.append((condition, condition_no_show, 'lightblue', 0))

plt.figure(figsize=(15, 10))
for position, (label, rate_table, bar_color, tick_rotation) in enumerate(panels, start=1):
    plt.subplot(2, 3, position)
    rate_table['Yes'].plot(kind='bar', color=bar_color)
    plt.title(f'No-Show Rate by {label}')
    plt.ylabel('No-Show Rate')
    plt.xticks(rotation=tick_rotation)

plt.tight_layout()
plt.show()


## 6. Temporal Analysis <a id="temporal-analysis"></a>


In [None]:
# Convert date columns to datetime
df_raw['ScheduledDay'] = pd.to_datetime(df_raw['ScheduledDay'])
df_raw['AppointmentDay'] = pd.to_datetime(df_raw['AppointmentDay'])

# Calculate waiting time in whole calendar days.
# Both timestamps are truncated to midnight before subtracting. Taking
# `.dt.days` of a raw timestamp difference floors it, so a booking made at
# 10:00 for an appointment stamped 00:00 on the same date would come out as
# -1 instead of 0.
# NOTE(review): this matters only if the columns carry a time of day - confirm
# against the CSV; for date-only values the result is unchanged.
scheduled_date = df_raw['ScheduledDay'].dt.normalize()
appointment_date = df_raw['AppointmentDay'].dt.normalize()
df_raw['WaitingDays'] = (appointment_date - scheduled_date).dt.days

# Analyze waiting time distribution: histogram on the left, box plot on the right
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(df_raw['WaitingDays'], bins=50, color='skyblue', alpha=0.7, edgecolor='black')
plt.title('Distribution of Waiting Days')
plt.xlabel('Waiting Days')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
df_raw.boxplot(column='WaitingDays', ax=plt.gca())
plt.title('Waiting Days Box Plot')
plt.ylabel('Waiting Days')

plt.tight_layout()
plt.show()


In [None]:
# Analyze no-show rates by waiting time
# pd.cut intervals are right-closed, so the edges are chosen to match the labels:
#   (-1, 0] -> 0 days ('Same Day'), (0, 7] -> 1-7, (7, 30] -> 8-30,
#   (30, 90] -> 31-90, (90, inf) -> everything longer.
# Starting the edges at 0 would leave 0-day waits in no group and label 1-day
# waits 'Same Day'. The open-ended last edge keeps waits longer than a year.
# Negative waits (appointment dated before the booking) fall outside every bin
# and are excluded from the rates.
waiting_groups = pd.cut(df_raw['WaitingDays'], bins=[-1, 0, 7, 30, 90, np.inf],
                       labels=['Same Day', '1-7 Days', '8-30 Days', '31-90 Days', '90+ Days'])
waiting_no_show = analyze_no_show_by_feature(df_raw, waiting_groups)

plt.figure(figsize=(10, 6))
waiting_no_show['Yes'].plot(kind='bar', color='orange')
plt.title('No-Show Rate by Waiting Time')
plt.ylabel('No-Show Rate')
plt.xlabel('Waiting Time Group')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.show()

print("No-Show Rates by Waiting Time:")
print(waiting_no_show)


## 7. Key Insights <a id="insights"></a>


In [None]:
# Print a textual recap of the rates computed in the earlier cells
print("KEY INSIGHTS FROM EDA:")
print("=" * 50)

overall_rate = df_raw['No-show'].value_counts(normalize=True)['Yes'] * 100
print(f"1. Dataset Size: {len(df_raw):,} appointments")
print(f"2. Overall No-Show Rate: {overall_rate:.1f}%")

# The 'Yes' column of each rate table is the no-show share per group
gender_impact = gender_no_show['Yes']
age_impact = age_no_show['Yes']
waiting_impact = waiting_no_show['Yes']

grouped_sections = (
    ("3. Gender Impact:", gender_impact),
    ("4. Age Impact:", age_impact),
    ("5. Waiting Time Impact:", waiting_impact),
)
for heading, impact in grouped_sections:
    print(f"\n{heading}")
    for group, rate in impact.items():
        print(f"   - {group}: {rate*100:.1f}% no-show rate")

# Indicator columns: report the rate where the flag is 0 and where it is 1
print("\n6. Medical Conditions Impact:")
for condition in ['Scholarship', 'Hypertension', 'Diabetes', 'Alcoholism', 'Handicap', 'SMS_received']:
    condition_impact = analyze_no_show_by_feature(df_raw, condition)
    print(f"   - {condition}:")
    for level, answer in ((0, 'No'), (1, 'Yes')):
        print(f"     * {answer}: {condition_impact.loc[level, 'Yes']*100:.1f}% no-show rate")
