# 🏥 Retinal Disease Dataset - Exploratory Data Analysis
## RFMiD Dataset: Multi-Label Disease Classification

**Objective:** Comprehensive analysis of retinal fundus images for 45 disease classes

**Dataset:** RFMiD (Retinal Fundus Multi-Disease Dataset)
- Training: 1,920 images
- Validation: 640 images
- Testing: 640 images
- Total: 3,200 images
- Classes: 45 retinal diseases

## 1. Import Libraries and Setup

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2
from PIL import Image
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Sklearn
from sklearn.metrics import confusion_matrix

# Set style.
# The "seaborn-v0_8-*" style names were introduced in matplotlib 3.6; older
# releases ship the same sheet under "seaborn-darkgrid".  Use whichever name
# this installation provides so the setup cell does not crash on an older
# matplotlib.  If neither exists, matplotlib's default style is kept.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette("husl")

# Display settings: never truncate columns, print at most 100 rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Confirmation banner plus the library versions, for reproducibility
print("✓ All libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

## 2. Load Dataset

In [None]:
# Define base path.
# The original author's local directory stays as the default; set the
# RFMID_BASE_PATH environment variable to point the notebook at the dataset
# on another machine without editing this cell.
import os

base_path = Path(os.environ.get(
    "RFMID_BASE_PATH",
    "/home/darkhorse/Downloads/Compressed/Multi Rentinal Disease Model/A. RFMiD_All_Classes_Dataset",
))

# Load label files (one CSV per split)
groundtruth_dir = base_path / "2. Groundtruths"
train_labels = pd.read_csv(groundtruth_dir / "a. RFMiD_Training_Labels.csv")
val_labels = pd.read_csv(groundtruth_dir / "b. RFMiD_Validation_Labels.csv")
test_labels = pd.read_csv(groundtruth_dir / "c. RFMiD_Testing_Labels.csv")

# Record the column count as it is in the CSV, before the bookkeeping
# 'split' column is added, so the reported number is not inflated by it.
n_label_file_columns = train_labels.shape[1]

# Add split identifier so rows stay traceable after concatenation
train_labels['split'] = 'train'
val_labels['split'] = 'val'
test_labels['split'] = 'test'

# Combine all splits into one frame with a fresh 0..N-1 index
all_labels = pd.concat([train_labels, val_labels, test_labels], ignore_index=True)

print("✓ Dataset loaded successfully!")
print(f"\nTraining samples: {len(train_labels):,}")
print(f"Validation samples: {len(val_labels):,}")
print(f"Testing samples: {len(test_labels):,}")
print(f"Total samples: {len(all_labels):,}")
print(f"\nFeatures: {n_label_file_columns}")

In [None]:
# Preview the top of the training label table; the bare expression on the
# last line is what the notebook renders as the cell output.
print("First 5 samples from training set:")
train_labels.iloc[:5]

## 3. Dataset Overview and Disease Statistics

In [None]:
# Every column that is not one of the three non-disease columns below is
# treated as a per-disease label column.
non_disease_columns = ('ID', 'Disease_Risk', 'split')
disease_columns = [
    column_name
    for column_name in train_labels.columns
    if column_name not in non_disease_columns
]

print(f"Number of disease classes: {len(disease_columns)}")
print("\nDisease classes:")
# Numbered listing, starting at 1
for position, disease_code in enumerate(disease_columns, start=1):
    print(f"{position:2d}. {disease_code}")

In [None]:
# Column-wise sum of the label columns in the training split, sorted so the
# most frequent disease comes first.
disease_counts = train_labels[disease_columns].sum().sort_values(ascending=False)

heavy_rule = "=" * 80
print(heavy_rule)
print("TOP 20 MOST COMMON DISEASES (Training Set)")
print(heavy_rule)
print(f"{'Rank':<6} {'Code':<10} {'Count':<10} {'Prevalence'}")
print("-" * 80)

# One table row per disease: rank, column code, count, share of training rows
n_train = len(train_labels)
for rank, (code, n_positive) in enumerate(disease_counts.head(20).items(), start=1):
    share = (n_positive / n_train) * 100
    print(f"{rank:<6} {code:<10} {n_positive:<10} {share:5.2f}%")

In [None]:
# Row-wise sum of the disease label columns: how many labels each training
# sample carries.
labels_per_sample = train_labels[disease_columns].sum(axis=1)

banner = "=" * 60
print(banner)
print("MULTI-LABEL STATISTICS")
print(banner)

# Descriptive statistics of the per-sample label count, one per line
stat_lines = (
    f"Min labels per sample: {labels_per_sample.min()}",
    f"Max labels per sample: {labels_per_sample.max()}",
    f"Mean labels per sample: {labels_per_sample.mean():.2f}",
    f"Median labels per sample: {labels_per_sample.median():.1f}",
    f"Std labels per sample: {labels_per_sample.std():.2f}",
)
print("\n".join(stat_lines))

# Frequency of each label count, ordered by the count itself
print("\nLabel distribution:")
print(labels_per_sample.value_counts().sort_index())

## 4. Visualization: Disease Distribution

In [None]:
# Create visualization: a 2x2 figure summarising the label data.
#   [0, 0] top-20 disease counts          [0, 1] label totals per split
#   [1, 0] labels-per-sample histogram    [1, 1] correlation of top-15 labels
# The figure is written to EDA_Disease_Distribution.png and then displayed.
fig, axes = plt.subplots(2, 2, figsize=(20, 12))

# 1. Top 20 diseases bar plot
ax1 = axes[0, 0]
top_20 = disease_counts.head(20)
colors = plt.cm.viridis(np.linspace(0, 1, len(top_20)))
ax1.barh(range(len(top_20)), top_20.values, color=colors)
ax1.set_yticks(range(len(top_20)))
ax1.set_yticklabels(top_20.index, fontsize=9)
ax1.set_xlabel('Number of Samples', fontsize=12, fontweight='bold')
ax1.set_title('Top 20 Most Common Retinal Diseases', fontsize=14, fontweight='bold', pad=20)
ax1.invert_yaxis()  # most frequent disease at the top
ax1.grid(axis='x', alpha=0.3)

# Add value labels just past the end of each bar (+5 is in data units)
for row, count in enumerate(top_20.values):
    ax1.text(count + 5, row, str(count), va='center', fontsize=9, fontweight='bold')

# 2. Disease distribution by split: total positive labels in each split
ax2 = axes[0, 1]
split_data = [
    all_labels.loc[all_labels['split'] == split, disease_columns].sum().sum()
    for split in ('train', 'val', 'test')
]

splits = ['Training', 'Validation', 'Testing']
colors_split = ['#2ecc71', '#3498db', '#e74c3c']
split_bars = ax2.bar(splits, split_data, color=colors_split, edgecolor='black', linewidth=2)
ax2.set_ylabel('Total Disease Instances', fontsize=12, fontweight='bold')
ax2.set_title('Disease Instances by Dataset Split', fontsize=14, fontweight='bold', pad=20)
ax2.grid(axis='y', alpha=0.3)

# Add value labels centred above each bar
for bar in split_bars:
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height, f'{int(height):,}',
            ha='center', va='bottom', fontweight='bold', fontsize=11)

# 3. Labels per sample distribution
ax3 = axes[1, 0]
max_labels = int(labels_per_sample.max())
# The data are integer counts, so put the bin edges at k - 0.5 / k + 0.5.
# Each bar is then centred on its integer k, and the mean/median markers
# line up with the bars they describe rather than with bar edges.
ax3.hist(labels_per_sample, bins=np.arange(max_labels + 2) - 0.5,
        color='coral', edgecolor='black', alpha=0.7)
ax3.set_xticks(range(max_labels + 1))
ax3.axvline(labels_per_sample.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {labels_per_sample.mean():.2f}')
ax3.axvline(labels_per_sample.median(), color='blue', linestyle='--', linewidth=2, label=f'Median: {labels_per_sample.median():.1f}')
ax3.set_xlabel('Number of Diseases per Sample', fontsize=12, fontweight='bold')
ax3.set_ylabel('Frequency', fontsize=12, fontweight='bold')
ax3.set_title('Distribution of Multi-Label Instances', fontsize=14, fontweight='bold', pad=20)
ax3.legend(fontsize=10)
ax3.grid(axis='y', alpha=0.3)

# 4. Disease co-occurrence heatmap: pairwise correlation of the 15 most
# frequent label columns in the training split
ax4 = axes[1, 1]
top_15_diseases = disease_counts.head(15).index
corr_matrix = train_labels[top_15_diseases].corr()

# The colour scale is limited to [-0.5, 0.5]; values outside that range
# (including the diagonal) are drawn in the end colours.
im = ax4.imshow(corr_matrix, cmap='coolwarm', aspect='auto', vmin=-0.5, vmax=0.5)
ax4.set_xticks(range(len(top_15_diseases)))
ax4.set_yticks(range(len(top_15_diseases)))
ax4.set_xticklabels(top_15_diseases, rotation=45, ha='right', fontsize=9)
ax4.set_yticklabels(top_15_diseases, fontsize=9)
ax4.set_title('Disease Co-occurrence Correlation Matrix (Top 15)', fontsize=14, fontweight='bold', pad=20)

# Add colorbar
cbar = plt.colorbar(im, ax=ax4)
cbar.set_label('Correlation', fontsize=10, fontweight='bold')

# Save before show()
plt.tight_layout()
plt.savefig('EDA_Disease_Distribution.png', dpi=300, bbox_inches='tight')
print("\n✓ Saved: EDA_Disease_Distribution.png")
plt.show()

## 5. Class Imbalance Analysis

In [None]:
# Imbalance metrics for the training split.  Classes with a zero count are
# excluded when picking the rarest disease, so the ratio never divides by 0.
total_samples = len(train_labels)
observed_counts = disease_counts[disease_counts > 0]
max_count = disease_counts.max()
min_count = observed_counts.min()
imbalance_ratio = max_count / min_count

divider = "=" * 80
print(divider)
print("CLASS IMBALANCE ANALYSIS")
print(divider)
print(f"\nImbalance Ratio: {imbalance_ratio:.2f}:1")
print(f"Most common disease: {disease_counts.idxmax()} ({max_count} samples, {max_count/total_samples*100:.2f}%)")
print(f"Least common disease: {observed_counts.idxmin()} ({min_count} samples, {min_count/total_samples*100:.2f}%)")

# Prevalence cut-offs (1%, 5%, 10% of the training rows) as sample counts,
# and one boolean mask per cut-off.
below_1_pct = disease_counts < total_samples * 0.01
below_5_pct = disease_counts < total_samples * 0.05
below_10_pct = disease_counts < total_samples * 0.10

# Each band is "at or above the lower cut-off and below the upper one"
very_common_diseases = disease_counts[~below_10_pct]
common_diseases = disease_counts[~below_5_pct & below_10_pct]
uncommon_diseases = disease_counts[~below_1_pct & below_5_pct]
rare_diseases = disease_counts[below_1_pct]

print("\nDisease Categories by Prevalence:")
print(f"  Very Common (>10%):  {len(very_common_diseases)} diseases")
print(f"  Common (5-10%):       {len(common_diseases)} diseases")
print(f"  Uncommon (1-5%):      {len(uncommon_diseases)} diseases")
print(f"  Rare (<1%):           {len(rare_diseases)} diseases")

## 6. Summary Report

In [None]:
# Generate summary report: build the text report line by line, print it,
# and write it to EDA_Summary_Report.txt in the working directory.
# Relies on all_labels, train/val/test_labels, disease_columns,
# labels_per_sample, disease_counts and imbalance_ratio from earlier cells.
heavy_rule = "=" * 80
light_rule = "-" * 80

n_total = len(all_labels)
n_unlabelled = (labels_per_sample == 0).sum()      # training rows with no positive label
observed_counts = disease_counts[disease_counts > 0]  # ignore zero-count classes

report_lines = [
    heavy_rule,
    "RFMiD RETINAL DISEASE DATASET - EDA SUMMARY REPORT",
    heavy_rule,
    "",
    "DATASET OVERVIEW",
    light_rule,
    f"Total Samples         : {n_total:,}",
    f"Training Samples      : {len(train_labels):,} ({len(train_labels)/n_total*100:.1f}%)",
    f"Validation Samples    : {len(val_labels):,} ({len(val_labels)/n_total*100:.1f}%)",
    f"Testing Samples       : {len(test_labels):,} ({len(test_labels)/n_total*100:.1f}%)",
    f"Number of Classes     : {len(disease_columns)}",
    "",
    "MULTI-LABEL CHARACTERISTICS",
    light_rule,
    f"Labels per Sample     : {labels_per_sample.mean():.2f} (average)",
    f"                       {labels_per_sample.min():.0f} (min) to {labels_per_sample.max():.0f} (max)",
    f"Samples with 0 labels : {n_unlabelled} ({n_unlabelled/len(train_labels)*100:.2f}%)",
    "",
    "CLASS IMBALANCE METRICS",
    light_rule,
    f"Most Common Disease   : {disease_counts.idxmax()} ({disease_counts.max()} samples)",
    f"Least Common Disease  : {observed_counts.idxmin()} ({observed_counts.min()} samples)",
    f"Imbalance Ratio       : {imbalance_ratio:.1f}:1",
    "",
    heavy_rule,
    "EDA Analysis Complete",
    heavy_rule,
]

report = "\n".join(report_lines)
print(report)

# Save report.  The encoding is given explicitly so the file does not depend
# on the platform's default locale encoding.
with open('EDA_Summary_Report.txt', 'w', encoding='utf-8') as f:
    f.write(report)
print("\n✓ Saved: EDA_Summary_Report.txt")

## 🎉 EDA Analysis Complete!

### Generated Outputs:
1. **EDA_Disease_Distribution.png** - Comprehensive disease distribution visualizations
2. **EDA_Summary_Report.txt** - Text summary report

### Key Findings:
- 3,200 total images across 45 disease classes
- Severe class imbalance (133:1 ratio)
- Average 1.2 labels per image (multi-label classification)
- Training: 1,920 | Validation: 640 | Testing: 640

### Next Steps:
1. Proceed to model development (Vision Transformer, EfficientNet, GCN)
2. Implement focal loss for class imbalance
3. Use data augmentation for rare diseases
4. Evaluate with multi-label metrics (F1, AUC-ROC)