# Chest X-Ray Dataset - Exploratory Data Analysis

This notebook performs comprehensive EDA on the Chest X-Ray dataset to understand:
- Class distribution and imbalance
- Image properties (size, format, quality)
- Visual patterns across classes
- Preprocessing requirements


In [None]:
# Standard imports
import sys
from pathlib import Path

# Add project root to path (presumably so project-local packages can be imported
# from this notebook).
# NOTE(review): assumes the notebook is started from a directory located directly
# under the project root (e.g. `notebooks/`) — confirm; otherwise `project_root`
# and every path derived from it will be wrong.
project_root = Path.cwd().parent
# Membership check keeps `sys.path` free of duplicates when the cell is re-run.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from collections import Counter
from tqdm.notebook import tqdm

# Set style: global Matplotlib style sheet plus the default seaborn colour palette.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# Matplotlib release — confirm against the pinned environment.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

%matplotlib inline


## 1. Dataset Overview


In [None]:
# Define paths
DATA_DIR = project_root / 'data' / 'raw' / 'chest-xray-dataset'
CLASS_NAMES = ['Normal', 'Pneumonia', 'Tuberculosis']
# Lower-case suffixes that count as images; compared case-insensitively below.
IMAGE_EXTENSIONS = {'.png', '.jpg', '.jpeg'}

# Check if data exists
if not DATA_DIR.exists():
    print(f"⚠️ Data not found at {DATA_DIR}")
    print("Please run: python -m src.data.download_dataset")
else:
    print(f"✓ Data directory found: {DATA_DIR}")
    for class_name in CLASS_NAMES:
        class_dir = DATA_DIR / class_name
        if not class_dir.is_dir():
            # Report a missing class folder instead of silently skipping it,
            # otherwise an incomplete download goes unnoticed.
            print(f"  - {class_name}: ⚠️ directory missing ({class_dir})")
            continue
        # Count image files only, so hidden files (e.g. .DS_Store) and
        # sub-directories do not inflate the per-class totals.
        num_images = sum(
            1
            for entry in class_dir.iterdir()
            if entry.is_file() and entry.suffix.lower() in IMAGE_EXTENSIONS
        )
        print(f"  - {class_name}: {num_images} images")


In [None]:
# Collect image metadata
def get_image_info(image_path):
    """Extract metadata from an image.

    Args:
        image_path: ``pathlib.Path`` pointing at the image file (the function
            uses its ``.name`` and ``.stat()``).

    Returns:
        A dict with the keys ``path``, ``filename``, ``width``, ``height``,
        ``mode``, ``format``, ``aspect_ratio`` (width / height) and
        ``file_size_kb``, or ``None`` when the file cannot be opened or
        inspected. A warning naming the file and the cause is emitted in the
        failure case so unreadable images are not dropped silently.
    """
    import warnings  # local import: keeps the function usable on its own

    try:
        with Image.open(image_path) as img:
            width, height = img.size
            return {
                'path': str(image_path),
                'filename': image_path.name,
                'width': width,
                'height': height,
                'mode': img.mode,
                'format': img.format,
                'aspect_ratio': width / height,
                'file_size_kb': image_path.stat().st_size / 1024,
            }
    except Exception as exc:
        # Deliberately best-effort: one corrupt file must not abort the scan,
        # but the failure is reported rather than swallowed.
        warnings.warn(
            f"Skipping unreadable image {image_path}: {exc!r}",
            stacklevel=2,
        )
        return None

# Build dataset
# Lower-case suffixes treated as images. Matching is case-insensitive so files
# such as `scan.JPG` or `scan.PNG` are not dropped on case-sensitive filesystems.
image_suffixes = {'.png', '.jpg', '.jpeg'}
image_data = []

for class_name in CLASS_NAMES:
    class_dir = DATA_DIR / class_name
    if not class_dir.exists():
        print(f"⚠️ Skipping missing class directory: {class_dir}")
        continue

    # Sorted so the row order of `df` is identical on every machine; seeded
    # sampling later in the notebook depends on row position.
    image_files = sorted(
        entry
        for entry in class_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in image_suffixes
    )

    for img_path in tqdm(image_files, desc=class_name):
        info = get_image_info(img_path)
        if info:
            info['class'] = class_name
            image_data.append(info)

df = pd.DataFrame(image_data)
print(f"\nTotal valid images: {len(df)}")
if df.empty:
    # Without any rows `df` has no columns, so the cells below cannot run.
    print(f"⚠️ No readable images found under {DATA_DIR}; later cells will fail.")
df.head()


## 2. Class Distribution Analysis


In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart
class_counts = df['class'].value_counts()
# Bind each colour to a class name. `value_counts()` orders by frequency, so a
# positional colour list would recolour classes whenever the ranking changes.
class_palette = dict(zip(CLASS_NAMES, ['#2ecc71', '#e74c3c', '#f39c12']))
colors = [class_palette.get(name, '#95a5a6') for name in class_counts.index]
bars = axes[0].bar(class_counts.index, class_counts.values, color=colors, edgecolor='black')
axes[0].set_title('Class Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Class')
axes[0].set_ylabel('Number of Images')
# `bar_label` pads in points, so labels sit just above the bars whatever the
# magnitude of the counts (a fixed data-unit offset does not scale).
axes[0].bar_label(
    bars,
    labels=[f'{val}\n({val/len(df)*100:.1f}%)' for val in class_counts.values],
    padding=3,
    fontsize=10,
)
axes[0].margins(y=0.15)  # head-room so the two-line labels are not clipped

# Pie chart
# `explode` must have one entry per wedge; size it from the data so the cell
# still works when a class is absent from `df`.
axes[1].pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%',
            colors=colors, explode=[0.02] * len(class_counts), shadow=True)
axes[1].set_title('Class Proportions', fontsize=14, fontweight='bold')

plt.tight_layout()
# Create the output directory on first run; `savefig` does not create parents.
figures_dir = project_root / 'reports' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'class_distribution.png', dpi=150)
plt.show()

# Calculate class imbalance ratio (largest class count / smallest class count;
# classes with no images do not appear in `class_counts` and are not considered)
imbalance_ratio = class_counts.max() / class_counts.min()
print(f"\nClass Imbalance Ratio: {imbalance_ratio:.2f}x")


## 3. Sample Images Visualization


In [None]:
samples_per_class = 5
fig, axes = plt.subplots(len(CLASS_NAMES), samples_per_class, figsize=(16, 10), squeeze=False)

for row, class_name in enumerate(CLASS_NAMES):
    class_rows = df[df['class'] == class_name]
    # `sample` raises when asked for more rows than exist, so cap the request.
    class_images = class_rows.sample(min(samples_per_class, len(class_rows)), random_state=42)

    # Remove ticks and frame instead of calling `axis('off')`: turning the axis
    # off also hides the y-label, which would make the class names invisible.
    for ax in axes[row]:
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_frame_on(False)
    axes[row, 0].set_ylabel(class_name, fontsize=14, fontweight='bold')

    for col, (_, img_row) in enumerate(class_images.iterrows()):
        # Context manager closes the file handle once the pixels are drawn.
        with Image.open(img_row['path']) as img:
            axes[row, col].imshow(img, cmap='gray' if img.mode == 'L' else None)

plt.suptitle('Sample X-Ray Images by Class', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
# Create the output directory on first run; `savefig` does not create parents.
figures_dir = project_root / 'reports' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'sample_images.png', dpi=150, bbox_inches='tight')
plt.show()


## 4. Key Findings & Recommendations

### Findings:
1. **Class Imbalance**: The classes are unevenly represented (see the imbalance ratio computed in Section 2), so the imbalance must be addressed during training
2. **Image Sizes**: Images vary in size, requiring standardization
3. **Image Modes**: Most images are grayscale, so they must be converted to 3-channel RGB before being fed to ImageNet-pretrained models for transfer learning

### Recommendations:
1. **Handling Imbalance**:
   - Use class-weighted loss function
   - Apply weighted random sampling during training
   
2. **Preprocessing**:
   - Resize all images to 224x224 (standard for ImageNet pretrained models)
   - Convert to RGB for transfer learning
   - Apply ImageNet normalization

3. **Data Augmentation**:
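   - Horizontal flip (chest X-rays are roughly left–right symmetric; note that flipping mirrors anatomical laterality such as heart position, so verify that it does not degrade validation performance)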
   - Small rotations (±15°)
   - Brightness/contrast adjustments
