In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from pathlib import Path
from PIL import Image
import warnings
# Suppress every warning category so library warnings do not clutter cell output
warnings.filterwarnings(action='ignore')

# Plot defaults shared by all figures below: seaborn 'whitegrid' theme and a
# (12, 6) default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("‚úÖ Libraries imported successfully!")

## 1. Dataset Overview

The Chest X-Ray Images (Pneumonia) dataset contains:
- **Normal**: Chest X-rays from healthy patients
- **Pneumonia**: Chest X-rays showing pneumonia (bacterial and viral)

### Dataset Structure
```
data/raw/
├── NORMAL/
│   ├── image1.jpeg
│   ├── image2.jpeg
│   └── ...
└── PNEUMONIA/
    ├── image1.jpeg
    ├── image2.jpeg
    └── ...
```

In [None]:
# Directory the rest of the notebook reads class sub-folders from.
# IMPORTANT: edit this path so it points at your local copy of the dataset
DATA_DIR = Path("../data/raw/chest_xray/train")  # Adjust this path

# Report whether the configured directory is present; later cells are guarded
# by the same DATA_DIR.exists() check and do nothing when it is missing.
if DATA_DIR.exists():
    print("‚úÖ Data directory found!")
    print(f"Path: {DATA_DIR}")
else:
    print("‚ùå Data directory not found!")
    print(f"Please download the dataset and update DATA_DIR to point to: {DATA_DIR}")
    print("\nDataset can be downloaded from:")
    print("https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia")

In [None]:
# Count images in each category
def count_images(directory, extensions=('.jpeg', '.jpg', '.png')):
    """Count image files in each immediate subdirectory of ``directory``.

    Each non-hidden subdirectory is treated as one class. Only files directly
    inside a class directory are counted (no recursion).

    Args:
        directory: ``pathlib.Path`` whose subdirectories are the classes.
        extensions: File suffixes (including the leading dot) that count as
            images. Matching is case-insensitive, so ``.JPG`` and ``.jpg``
            are both counted.

    Returns:
        dict mapping class directory name -> number of image files, with keys
        inserted in sorted directory order.
    """
    wanted_suffixes = {ext.lower() for ext in extensions}
    counts = {}

    # Sort so the key order (and everything plotted from it) does not depend
    # on the order in which the filesystem happens to list directories.
    for class_dir in sorted(directory.iterdir()):
        # Skip plain files and hidden folders such as .ipynb_checkpoints
        if not class_dir.is_dir() or class_dir.name.startswith('.'):
            continue

        counts[class_dir.name] = sum(
            1
            for entry in class_dir.iterdir()
            if entry.is_file()
            and not entry.name.startswith('.')
            and entry.suffix.lower() in wanted_suffixes
        )

    return counts

if DATA_DIR.exists():
    # Per-class image counts; reused by the class-distribution cell below
    image_counts = count_images(DATA_DIR)
    total = sum(image_counts.values())
    separator = "=" * 50

    print("\nüìä Dataset Statistics:")
    print(separator)
    for class_name, count in image_counts.items():
        print(f"{class_name}: {count:,} images")
    print(f"\nTotal: {total:,} images")
    print(separator)

## 2. Class Distribution Visualization

In [None]:
if DATA_DIR.exists():
    # Name-sorted class order so bars, pie wedges and colors line up the same
    # way on every run, regardless of directory listing order.
    classes = sorted(image_counts)
    counts = [image_counts[name] for name in classes]

    # Colors are keyed by class name rather than by position, so NORMAL is
    # always green and PNEUMONIA always red; any unexpected extra class gets
    # a neutral grey instead of breaking the plot.
    class_colors = {'NORMAL': '#2ecc71', 'PNEUMONIA': '#e74c3c'}
    colors = [class_colors.get(name, '#95a5a6') for name in classes]

    if sum(counts) == 0:
        # A pie chart of all-zero (or no) values is undefined; say so instead
        # of failing inside matplotlib.
        print(f"No images found under {DATA_DIR}; skipping class distribution plots.")
    else:
        # Plot class distribution
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        # Bar chart
        bars = ax1.bar(classes, counts, color=colors, alpha=0.7, edgecolor='black')
        ax1.set_ylabel('Number of Images', fontsize=12)
        ax1.set_title('Class Distribution', fontsize=14, fontweight='bold')

        # Add value labels on top of each bar
        for bar in bars:
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2., height,
                    f'{int(height):,}',
                    ha='center', va='bottom', fontweight='bold')

        # Pie chart; explode needs exactly one entry per wedge, so size it
        # from the data rather than assuming two classes.
        ax2.pie(counts, labels=classes, autopct='%1.1f%%', colors=colors,
                startangle=90, explode=[0.05] * len(classes))
        ax2.set_title('Class Proportions', fontsize=14, fontweight='bold')

        plt.tight_layout()
        plt.show()

        # Calculate imbalance ratio (largest class / smallest class)
        smallest = min(counts)
        if smallest == 0:
            # Avoid ZeroDivisionError when a class directory holds no images
            print("\nCannot compute class imbalance ratio: at least one class has no images.")
        else:
            imbalance_ratio = max(counts) / smallest
            print(f"\n‚öñÔ∏è Class Imbalance Ratio: {imbalance_ratio:.2f}:1")

            if imbalance_ratio > 2:
                print("‚ö†Ô∏è Dataset is imbalanced. Consider using:")
                print("   - Class weights during training")
                print("   - Oversampling minority class")
                print("   - Undersampling majority class")
                print("   - Focal loss function")

## 3. Sample Image Visualization

In [None]:
def display_sample_images(data_dir, n_samples=8):
    """Display a grid of sample images, one row per class directory.

    Args:
        data_dir: ``pathlib.Path`` whose non-hidden subdirectories are the
            classes.
        n_samples: Maximum number of images to show per class (grid columns).

    Returns:
        None. The figure is shown with ``plt.show()``. If there are no class
        directories, or ``n_samples`` is less than 1, a message is printed and
        nothing is drawn.

    Images with a ``.jpeg``, ``.jpg`` or ``.png`` suffix (case-insensitive)
    are considered, taken in sorted filename order. Files that OpenCV cannot
    decode are skipped so they do not consume a grid cell.
    """
    image_suffixes = {'.jpeg', '.jpg', '.png'}

    # Resolve the class folders first so the row index always refers to a
    # real class, even if data_dir also contains stray files.
    class_dirs = sorted(
        entry for entry in data_dir.iterdir()
        if entry.is_dir() and not entry.name.startswith('.')
    )

    if not class_dirs or n_samples < 1:
        print(f"No sample images to display from {data_dir}.")
        return

    # squeeze=False keeps `axes` two-dimensional even for a single row/column
    fig, axes = plt.subplots(
        len(class_dirs), n_samples,
        figsize=(20, 3 * len(class_dirs)),
        squeeze=False,
    )

    # Hide every axis up front so cells without an image stay blank
    for ax in axes.ravel():
        ax.axis('off')

    for row_idx, class_dir in enumerate(class_dirs):
        candidates = sorted(
            path for path in class_dir.iterdir()
            if path.is_file()
            and not path.name.startswith('.')
            and path.suffix.lower() in image_suffixes
        )

        shown = 0
        for img_path in candidates:
            if shown == n_samples:
                break

            img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread signals an unreadable file by returning None
                continue

            axes[row_idx, shown].imshow(img, cmap='gray')
            shown += 1

        # Label the row on its first cell, even if no image could be loaded
        axes[row_idx, 0].set_title(
            class_dir.name,
            fontsize=14,
            fontweight='bold',
            loc='left'
        )

    plt.suptitle('Sample X-Ray Images from Each Class',
                fontsize=16, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.show()

if DATA_DIR.exists():
    display_sample_images(DATA_DIR)

## 4. Image Properties Analysis

In [None]:
def analyze_image_properties(data_dir, sample_size=100):
    """Collect size and intensity statistics for a sample of images per class.

    Args:
        data_dir: ``pathlib.Path`` whose non-hidden subdirectories are the
            classes.
        sample_size: Maximum number of image files examined per class. The
            first ``sample_size`` files in sorted filename order are used, so
            repeated runs look at the same files.

    Returns:
        ``pandas.DataFrame`` with one row per successfully loaded image and
        the columns ``class``, ``width``, ``height``, ``mean_intensity``,
        ``std_intensity``, ``min_intensity`` and ``max_intensity``. The
        columns are present even when no image could be loaded.

    Files with a ``.jpeg``, ``.jpg`` or ``.png`` suffix (case-insensitive)
    are considered. Files OpenCV cannot decode are skipped, and the number
    skipped is printed.
    """
    columns = [
        'class',
        'width',
        'height',
        'mean_intensity',
        'std_intensity',
        'min_intensity',
        'max_intensity',
    ]
    image_suffixes = {'.jpeg', '.jpg', '.png'}

    records = []
    unreadable = 0

    for class_dir in sorted(data_dir.iterdir()):
        # Skip plain files and hidden folders such as .ipynb_checkpoints
        if not class_dir.is_dir() or class_dir.name.startswith('.'):
            continue

        image_files = sorted(
            path for path in class_dir.iterdir()
            if path.is_file()
            and not path.name.startswith('.')
            and path.suffix.lower() in image_suffixes
        )[:sample_size]

        for img_path in image_files:
            img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)

            if img is None:
                # cv2.imread signals an unreadable file by returning None
                unreadable += 1
                continue

            # Array shape is (rows, columns) == (height, width)
            height, width = img.shape[:2]
            records.append({
                'class': class_dir.name,
                'width': width,
                'height': height,
                'mean_intensity': np.mean(img),
                'std_intensity': np.std(img),
                'min_intensity': np.min(img),
                'max_intensity': np.max(img),
            })

    if unreadable:
        print(f"Skipped {unreadable} unreadable image file(s).")

    # Passing `columns` keeps the schema stable when `records` is empty
    return pd.DataFrame(records, columns=columns)

if DATA_DIR.exists():
    print("üìä Analyzing image properties (this may take a moment)...")
    # Per-image measurements; reused by the histogram cell below
    props_df = analyze_image_properties(data_dir=DATA_DIR)

    print("\n‚úÖ Analysis complete!")
    print("\nüìè Image Dimensions Summary:")
    dimension_summary = props_df.loc[:, ['width', 'height']].describe()
    print(dimension_summary)

In [None]:
if DATA_DIR.exists():
    # 2x2 grid: one histogram panel per measured property, with the classes
    # overlaid semi-transparently in each panel.
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # (axis, DataFrame column, x-axis label, panel title)
    panels = [
        (axes[0, 0], 'width', 'Width (pixels)', 'Image Width Distribution'),
        (axes[0, 1], 'height', 'Height (pixels)', 'Image Height Distribution'),
        (axes[1, 0], 'mean_intensity', 'Mean Pixel Intensity', 'Mean Intensity Distribution'),
        (axes[1, 1], 'std_intensity', 'Std Dev Pixel Intensity', 'Intensity Std Dev Distribution'),
    ]

    for ax, column, xlabel, title in panels:
        for class_name in props_df['class'].unique():
            class_data = props_df[props_df['class'] == class_name]
            ax.hist(class_data[column], alpha=0.6, label=class_name, bins=30)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')
        ax.set_title(title)
        ax.legend()

    plt.tight_layout()
    plt.show()

## 5. Key Findings and Recommendations

Based on the analysis above, document:

### Observations:
- Class distribution and imbalance
- Image dimension variability
- Intensity distribution differences between classes

### Recommendations:
1. **Preprocessing:**
   - Resize all images to consistent dimensions (e.g., 224x224)
   - Apply CLAHE for contrast enhancement
   - Normalize pixel intensities

2. **Data Augmentation:**
   - Rotation (±15 degrees)
   - Small shifts and zooms
   - Brightness/contrast adjustments

3. **Class Imbalance:**
   - Use class weights during training
   - Consider focal loss

4. **Next Steps:**
   - Proceed to preprocessing notebook
   - Create train/validation/test splits
   - Implement data augmentation pipeline

In [None]:
# Closing checklist, printed one entry per line
closing_lines = (
    "‚úÖ Data exploration complete!",
    "\nüìå Next Steps:",
    "1. Review the visualizations and statistics above",
    "2. Open notebook: 02_preprocessing.ipynb",
    "3. Implement data preprocessing pipeline",
    "4. Create train/validation/test splits",
)
for line in closing_lines:
    print(line)