## 1. Import Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from pathlib import Path
from collections import Counter, defaultdict
from tqdm import tqdm
import yaml

# Global plot appearance: seaborn "whitegrid" theme, 100 dpi for on-screen
# figures and 300 dpi for figures written to disk.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.dpi': 100, 'savefig.dpi': 300})

## 2. Configuration

**Note**: 
- The dataset must be prepared first using the `process_bdd100k_to_yolo_dataset.py` script
- Integrity checks (image-label matching) are handled during dataset preparation
- This notebook focuses on **visualization** using pre-computed statistics from metadata files

In [None]:
# Resolve dataset locations relative to the current working directory and
# decide which dataset feeds statistics and which feeds visualizations.
BASE_DIR = Path(os.getcwd())

# Full dataset: preferred source for statistics
FULL_DATASET_ROOT = BASE_DIR / 'bdd100k_yolo'
FULL_DATA_YAML = FULL_DATASET_ROOT / 'data.yaml'

# Limited dataset: preferred source for visual examples
LIMITED_DATASET_ROOT = BASE_DIR / 'bdd100k_yolo_limited'
LIMITED_DATA_YAML = LIMITED_DATASET_ROOT / 'data.yaml'

# A dataset is considered available when its data.yaml is present
full_exists = FULL_DATA_YAML.exists()
limited_exists = LIMITED_DATA_YAML.exists()

config_rule = "=" * 70
print(config_rule)
print("DATASET AVAILABILITY")
print(config_rule)
for dataset_label, dataset_root, dataset_found in (
    ("Full", FULL_DATASET_ROOT, full_exists),
    ("Limited", LIMITED_DATASET_ROOT, limited_exists),
):
    availability = '✓ Found' if dataset_found else '✗ Not found'
    print(f"{dataset_label} dataset ({dataset_root.name}): {availability}")

# Nothing to analyze at all: stop early with instructions
if not (full_exists or limited_exists):
    raise FileNotFoundError(
        "No datasets found!\n\n"
        "Please run the dataset preparation script first:\n"
        "  python3 process_bdd100k_to_yolo_dataset.py\n"
    )

# Statistics prefer the full dataset, falling back to the limited one
if full_exists:
    ANALYSIS_DATASET_ROOT, ANALYSIS_DATA_YAML = FULL_DATASET_ROOT, FULL_DATA_YAML
else:
    ANALYSIS_DATASET_ROOT, ANALYSIS_DATA_YAML = LIMITED_DATASET_ROOT, LIMITED_DATA_YAML

# Visualizations prefer the limited dataset, falling back to the full one
if limited_exists:
    VIZ_DATASET_ROOT, VIZ_DATA_YAML = LIMITED_DATASET_ROOT, LIMITED_DATA_YAML
else:
    VIZ_DATASET_ROOT, VIZ_DATA_YAML = FULL_DATASET_ROOT, FULL_DATA_YAML

print("\n" + config_rule)
print("ANALYSIS CONFIGURATION")
print(config_rule)
print(f"Statistics from: {ANALYSIS_DATASET_ROOT.name}")
print(f"Visualizations from: {VIZ_DATASET_ROOT.name}")
print(config_rule)

print("\n✓ Configuration loaded successfully")

### Strategy

**This notebook uses TWO datasets:**
1. **FULL DATASET** (`bdd100k_yolo/`) - For comprehensive statistics and analysis
2. **LIMITED DATASET** (`bdd100k_yolo_limited/`) - For visualizations with representative samples

The limited dataset includes diverse representative samples covering:
- All attribute combinations (weather × scene × time)
- All object classes with adequate coverage
- All individual attribute values
- Class × attribute combinations

This ensures visualizations show all important scenarios while keeping the notebook responsive.

## 3. Load Metadata and Statistics

### 3.1 Load Metadata Files

Metadata files contain pre-computed statistics and representative sample information generated during dataset extraction.

In [None]:
import json


def _load_split_json(metadata_dir, splits=('train', 'val', 'test')):
    """Load the per-split metadata and performance-analysis JSON files.

    Args:
        metadata_dir: Directory (a ``Path``) holding ``<split>_metadata.json``
            and ``<split>_performance_analysis.json``.
        splits: Split names to look for.

    Returns:
        A ``(metadata_by_split, performance_by_split)`` pair of dicts keyed by
        split name. A file that does not exist is recorded as ``None``.
    """
    metadata_by_split = {}
    performance_by_split = {}

    for split in splits:
        metadata_file = metadata_dir / f'{split}_metadata.json'
        if metadata_file.exists():
            # JSON is UTF-8 by specification; do not depend on the platform default
            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata_by_split[split] = json.load(f)
            print(f"✓ Loaded {split} metadata: {metadata_file.name}")
        else:
            print(f"⚠ Missing {split} metadata: {metadata_file.name}")
            metadata_by_split[split] = None

        # Performance analysis data (per-image details with attributes)
        performance_file = metadata_dir / f'{split}_performance_analysis.json'
        if performance_file.exists():
            with open(performance_file, 'r', encoding='utf-8') as f:
                performance_by_split[split] = json.load(f)
            print(f"✓ Loaded {split} performance data: {performance_file.name}")
        else:
            print(f"⚠ Missing {split} performance data")
            performance_by_split[split] = None

    return metadata_by_split, performance_by_split


# Load metadata files for FULL dataset
print("=" * 90)
print("LOADING FULL DATASET METADATA")
print("=" * 90)

full_metadata_dir = FULL_DATASET_ROOT / 'representative_json'
full_metadata_by_split, full_performance_by_split = _load_split_json(full_metadata_dir)

# The statistics cells cannot run without at least one full-dataset metadata file
if not any(full_metadata_by_split.values()):
    raise FileNotFoundError(
        "No full dataset metadata found!\n"
        "Please run: python3 process_bdd100k_to_yolo_dataset.py"
    )

# Load metadata files for LIMITED dataset
print("\n" + "=" * 90)
print("LOADING LIMITED DATASET METADATA")
print("=" * 90)

limited_metadata_dir = LIMITED_DATASET_ROOT / 'representative_json'

if limited_metadata_dir.exists():
    limited_metadata_by_split, limited_performance_by_split = _load_split_json(limited_metadata_dir)
else:
    print("⚠ Limited dataset not found")
    limited_metadata_by_split = {'train': None, 'val': None, 'test': None}
    limited_performance_by_split = {'train': None, 'val': None, 'test': None}

# Load class names from data.yaml. ANALYSIS_DATA_YAML equals FULL_DATA_YAML when
# the full dataset's data.yaml exists and otherwise falls back to the limited one.
with open(ANALYSIS_DATA_YAML, 'r', encoding='utf-8') as f:
    dataset_config = yaml.safe_load(f)

_names = dataset_config['names']
# `names` may be a list or an {index: name} mapping; later cells iterate
# class_names and use the entries as keys, so normalize to an ordered list.
if isinstance(_names, dict):
    class_names = [_names[idx] for idx in sorted(_names)]
else:
    class_names = list(_names)

print(f"\n✓ Class names loaded: {class_names}")
# Only claim availability when at least one performance file was actually loaded
if any(full_performance_by_split.values()) or any(limited_performance_by_split.values()):
    print("✓ Performance data available for per-image attribute analysis")
else:
    print("⚠ No performance data loaded - per-image attribute analysis unavailable")
print("=" * 90)

### 3.2 Display Metadata Summary

In [None]:
# Display a metadata summary for BOTH datasets and build the aggregate
# dictionaries `full_stats` and `limited_stats` (the latter is None when no
# limited metadata was loaded).


def _pct(part, whole):
    """Return `part` as a percentage of `whole`, or 0.0 when `whole` is zero."""
    return (part / whole * 100) if whole else 0.0


def _add_class_counts(totals, by_class):
    """Add the counts in `by_class` to `totals`, tolerating classes not yet in `totals`."""
    for cls, count in by_class.items():
        totals[cls] = totals.get(cls, 0) + count


print("=" * 90)
print("FULL DATASET METADATA (3 splits: train/val/test)")
print("=" * 90)

# Aggregate statistics from full dataset metadata (3 files).
# Representative-sample figures are nested under statistics.representative_samples.
full_stats = {
    'total_images': 0,
    'total_objects': 0,
    'representative_samples': 0,
    'representative_objects': 0,
    'by_split': {},
    'total_objects_by_class': {cls: 0 for cls in class_names},
    'representative_objects_by_class': {cls: 0 for cls in class_names}
}

for split in ['train', 'val', 'test']:
    meta = full_metadata_by_split[split]
    if not meta:
        continue

    stats = meta['statistics']
    config = meta['configuration']
    repr_stats = stats['representative_samples']

    total_images = stats['total_files_analyzed']      # ALL images in split
    total_objects = sum(stats['by_class'].values())   # ALL objects in split

    repr_samples = repr_stats['total_selected']
    repr_objects = sum(repr_stats['by_class'].values())

    full_stats['by_split'][split] = {
        'images': total_images,
        'objects': total_objects,
        'representative_samples': repr_samples,
        'representative_objects': repr_objects,
        'objects_by_class': stats['by_class'],                   # ALL objects
        'representative_objects_by_class': repr_stats['by_class']  # Representative objects
    }
    full_stats['total_images'] += total_images
    full_stats['total_objects'] += total_objects
    full_stats['representative_samples'] += repr_samples
    full_stats['representative_objects'] += repr_objects

    # A class present in the metadata but absent from data.yaml must not abort the summary
    _add_class_counts(full_stats['total_objects_by_class'], stats['by_class'])
    _add_class_counts(full_stats['representative_objects_by_class'], repr_stats['by_class'])

    print(f"\n{split.upper()} SPLIT")
    print("-" * 90)
    print(f"  Generated: {meta.get('generation_date', 'N/A')}")
    print(f"  Total images in full dataset: {total_images:,}")
    print(f"  Total objects in full dataset: {total_objects:,}")
    print("\n  Representative sample selection (for limited dataset):")
    # _pct guards against an empty split (0 images / 0 objects)
    print(f"    Samples selected: {repr_samples:,} ({_pct(repr_samples, total_images):.2f}% of this split)")
    print(f"    Objects in samples: {repr_objects:,} ({_pct(repr_objects, total_objects):.2f}% of this split's objects)")

    print("\n  Selection Configuration:")
    print(f"    - Samples per attribute combo: {config['samples_per_attribute_combo']}")
    print(f"    - Min samples per class: {config['min_samples_per_class']}")
    print(f"    - Min samples per attribute value: {config['min_samples_per_attribute_value']}")
    print(f"    - Min samples per (class×attribute): {config['min_samples_per_class_attribute_combo']}")

    # Same fallback as the limited branch: use class_names when 'classes' is not in the metadata
    num_classes = len(meta.get('classes', class_names))
    print("\n  Coverage Statistics (all images):")
    print(f"    - Classes with objects: {sum(1 for v in stats['by_class'].values() if v > 0)}/{num_classes}")
    print(f"    - Weather types covered: {sum(1 for v in stats['by_weather'].values() if v > 0)}")
    print(f"    - Scene types covered: {sum(1 for v in stats['by_scene'].values() if v > 0)}")
    print(f"    - Time of day covered: {sum(1 for v in stats['by_timeofday'].values() if v > 0)}")
    print(f"    - Attribute combinations: {len(stats['by_attribute_combo'])}")

print(f"\n{'='*90}")
print("FULL DATASET TOTALS:")
print(f"  Total images (ALL): {full_stats['total_images']:,}")
print(f"  Total objects (ALL): {full_stats['total_objects']:,}")
print("\n  Representative samples info (for limited dataset creation):")
print(f"    Samples selected: {full_stats['representative_samples']:,} ({_pct(full_stats['representative_samples'], full_stats['total_images']):.2f}% of full)")
print(f"    Objects in those samples: {full_stats['representative_objects']:,} ({_pct(full_stats['representative_objects'], full_stats['total_objects']):.2f}% of full)")
print("    Note: These samples were physically copied to create the limited dataset")
print(f"  Number of classes: {len(class_names)}")
print("=" * 90)

# Display metadata summary for LIMITED dataset (3 files)
if any(limited_metadata_by_split.values()):
    print("\n\n" + "=" * 90)
    print("LIMITED DATASET METADATA (3 splits: train/val/test)")
    print("=" * 90)
    print("Note: Limited dataset contains ONLY the representative samples from full dataset")
    print("=" * 90)

    # Aggregate statistics from limited dataset metadata (3 files)
    limited_stats = {
        'total_images': 0,
        'total_selected': 0,
        'by_split': {},
        'total_objects_by_class': {cls: 0 for cls in class_names}
    }

    for split in ['train', 'val', 'test']:
        meta = limited_metadata_by_split[split]
        if not meta:
            continue

        stats = meta['statistics']

        # In the limited dataset every image is a representative sample,
        # so both figures come from total_samples
        total_images = meta['total_samples']
        total_selected = meta['total_samples']

        limited_stats['by_split'][split] = {
            'images': total_images,
            'selected': total_selected,
            'objects_by_class': stats['by_class']
        }
        limited_stats['total_images'] += total_images
        limited_stats['total_selected'] += total_selected

        _add_class_counts(limited_stats['total_objects_by_class'], stats['by_class'])

        print(f"\n{split.upper()} SPLIT")
        print("-" * 90)
        print(f"  Source: {meta.get('data_source', 'N/A')}")
        print(f"  Total images: {total_images:,} (all are representative samples)")
        print(f"  Total objects detected: {sum(stats['by_class'].values()):,}")

        # Use class_names if 'classes' not in metadata
        num_classes = len(meta.get('classes', class_names))
        classes_with_objects = sum(1 for v in stats['by_class'].values() if v > 0)
        print(f"  Classes with objects: {classes_with_objects}/{num_classes}")

    print(f"\n{'='*90}")
    print("LIMITED DATASET TOTALS:")
    print(f"  Total images (all representative samples): {limited_stats['total_images']:,}")
    print(f"  Total objects across all splits: {sum(limited_stats['total_objects_by_class'].values()):,}")
    print("  Composition: Representative samples with comprehensive attribute coverage")
    print("=" * 90)
else:
    limited_stats = None

# Report how many metadata files were actually loaded instead of assuming all six
full_files_loaded = sum(1 for m in full_metadata_by_split.values() if m)
limited_files_loaded = sum(1 for m in limited_metadata_by_split.values() if m)
metadata_files_loaded = full_files_loaded + limited_files_loaded

print("\n" + "=" * 90)
if metadata_files_loaded == 6:
    print("✓ All 6 metadata files loaded successfully")
else:
    print(f"⚠ Loaded {metadata_files_loaded} of 6 metadata files")
print(f"  Full dataset metadata: {full_files_loaded} files (train/val/test_metadata.json)")
print("    - Contains statistics for ALL 100K images")
print("    - Tracks which samples were selected as representative")
print(f"  Limited dataset metadata: {limited_files_loaded} files (train/val/test_metadata.json)")
print("    - Contains ONLY the representative samples (~2.3K)")
print("    - IS the physical copy of representative samples from full dataset")
print("=" * 90)

## 4. Statistical Analysis and Visualizations

### 4.1 Class Distribution Analysis

**Comparison:** Full Dataset (ALL 100K images) vs Limited Dataset (Representative ~2.3K samples)

**Important:** The limited dataset IS the representative samples - a physical copy of carefully selected diverse samples from the full dataset. There is no separate "Full Dataset Representative" - the representative samples exist ONLY in the limited dataset.

In [None]:
# CLASS DISTRIBUTION ANALYSIS - Full Dataset (ALL) vs Limited Dataset (Representative)
# Draws three figures (per-split counts, per-split comparison, overall totals)
# and prints a per-class summary table. Missing splits or a missing limited
# dataset yield empty panels / zero counts instead of an exception.


def _annotate_barh_counts(ax, bars, fontsize=9):
    """Write each non-zero horizontal bar's integer value at the end of the bar."""
    for bar in bars:
        bar_len = bar.get_width()
        if bar_len > 0:
            ax.text(bar_len, bar.get_y() + bar.get_height() / 2,
                    f'{int(bar_len):,}', ha='left', va='center', fontsize=fontsize)


def _plot_class_count_panel(ax, by_class, color, title):
    """Draw one horizontal bar chart of object counts per class on `ax`."""
    bars = ax.barh(list(by_class.keys()), list(by_class.values()), color=color, alpha=0.8)
    ax.set_xlabel('Total Objects', fontsize=11, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)
    ax.tick_params(axis='both', labelsize=10)
    _annotate_barh_counts(ax, bars)


print("=" * 90)
print("CLASS DISTRIBUTION ANALYSIS - Full Dataset (ALL) vs Limited Dataset (Representative)")
print("=" * 90)

# Extract class statistics
# Full dataset: Use ALL objects (by_class)
# Limited dataset: IS the representative samples physically copied
class_stats_full_all = {split: full_metadata_by_split[split]['statistics']['by_class']
                        for split in ['train', 'val', 'test'] if full_metadata_by_split[split]}

class_stats_limited = {split: limited_metadata_by_split[split]['statistics']['by_class']
                       for split in ['train', 'val', 'test'] if limited_metadata_by_split[split]}

# limited_stats is None when no limited metadata was loaded; use empty stand-ins
limited_by_split = limited_stats['by_split'] if limited_stats else {}
limited_objects_by_class = limited_stats['total_objects_by_class'] if limited_stats else {}
limited_image_total = limited_stats['total_images'] if limited_stats else 0

image_share = (limited_image_total / full_stats['total_images'] * 100) if full_stats['total_images'] else 0

print(f"Full Dataset: {full_stats['total_images']:,} images, {full_stats['total_objects']:,} objects")
print(f"Limited Dataset: {limited_image_total:,} images, {sum(limited_objects_by_class.values()):,} objects")
print(f"Note: Limited dataset contains representative samples from full dataset ({image_share:.2f}%)")
if not limited_stats:
    print("⚠ Limited dataset metadata not available - limited-side charts will be empty")
print("=" * 90)

# Visualization 1: Side-by-side object counts per split
fig, axes = plt.subplots(3, 2, figsize=(14, 10))
fig.suptitle('Object Counts Per Class - Full Dataset (ALL) vs Limited Dataset (Representative)', fontsize=15, fontweight='bold')

colors = {'full': '#3498db', 'limited': '#e74c3c'}

for idx, split in enumerate(['train', 'val', 'test']):
    # Full dataset (ALL images); panel is left blank when the split has no metadata
    full_by_class = class_stats_full_all.get(split)
    if full_by_class:
        _plot_class_count_panel(
            axes[idx, 0], full_by_class, colors['full'],
            f'Full Dataset - {split.upper()} (ALL)\n({full_stats["by_split"][split]["images"]:,} images)')

    # Limited dataset
    limited_by_class = class_stats_limited.get(split)
    if limited_by_class:
        _plot_class_count_panel(
            axes[idx, 1], limited_by_class, colors['limited'],
            f'Limited Dataset - {split.upper()}\n({limited_by_split[split]["images"]:,} images)')

plt.tight_layout(rect=[0, 0, 1, 0.99])
plt.show()

# Visualization 2: Direct comparison - Full (ALL) vs Limited for each split
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Full Dataset (ALL) vs Limited Dataset (Representative) - Object Count Comparison by Split', fontsize=14, fontweight='bold')

x = np.arange(len(class_names))
width = 0.35

for idx, split in enumerate(['train', 'val', 'test']):
    ax = axes[idx]

    # A split without metadata contributes zero-height bars
    full_counts = [class_stats_full_all.get(split, {}).get(cls, 0) for cls in class_names]
    limited_counts = [class_stats_limited.get(split, {}).get(cls, 0) for cls in class_names]

    ax.bar(x - width/2, full_counts, width, label='Full (ALL)', color=colors['full'], alpha=0.8)
    ax.bar(x + width/2, limited_counts, width, label='Limited (Representative)', color=colors['limited'], alpha=0.8)

    ax.set_xlabel('Classes', fontsize=12, fontweight='bold')
    ax.set_ylabel('Total Objects', fontsize=12, fontweight='bold')
    ax.set_title(f'{split.upper()} Split', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(class_names, rotation=45, ha='right', fontsize=10)
    ax.legend(fontsize=10)
    ax.grid(axis='y', alpha=0.3)
    ax.tick_params(axis='y', labelsize=10)

plt.tight_layout(rect=[0, 0, 1, 0.99])
plt.show()

# Visualization 3: Total object counts across all splits - Full (ALL) vs Limited
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6))

# Look totals up by class name so the bars always line up with class_names,
# independent of the aggregate dicts' key order or extra keys
full_totals = [full_stats['total_objects_by_class'].get(cls, 0) for cls in class_names]
limited_totals = [limited_objects_by_class.get(cls, 0) for cls in class_names]

ax1.bar(x - width/2, full_totals, width, label='Full (ALL)', color=colors['full'], alpha=0.8)
ax1.bar(x + width/2, limited_totals, width, label='Limited (Representative)', color=colors['limited'], alpha=0.8)

ax1.set_xlabel('Classes', fontsize=14, fontweight='bold')
ax1.set_ylabel('Total Objects (All Splits)', fontsize=14, fontweight='bold')
ax1.set_title('Total Object Counts - Full (ALL) vs Limited (Representative)', fontsize=14, fontweight='bold')
ax1.set_xticks(x)
ax1.set_xticklabels(class_names, rotation=45, ha='right', fontsize=9)
ax1.legend(fontsize=9)
ax1.grid(axis='y', alpha=0.3)

# Percentage representation
percentages = [(lim / full * 100) if full > 0 else 0
               for lim, full in zip(limited_totals, full_totals)]

bars = ax2.barh(class_names, percentages, color='#9b59b6', alpha=0.8)
ax2.set_xlabel('Limited as % of Full (%)', fontsize=14, fontweight='bold')
ax2.set_title('Limited Dataset (Representative) as % of Full Dataset (ALL)', fontsize=14, fontweight='bold')
ax2.grid(axis='x', alpha=0.3)
ax2.tick_params(axis='both', labelsize=11)

for bar, lim, full in zip(bars, limited_totals, full_totals):
    pct_width = bar.get_width()
    if pct_width > 0 and full > 0:
        ax2.text(pct_width, bar.get_y() + bar.get_height()/2,
                 f'{pct_width:.1f}%\n({lim:,}/{full:,})',
                 ha='left', va='center', fontsize=9)

plt.tight_layout(rect=[0, 0, 1, 0.99])
plt.show()

# Print detailed comparison summary
print("\nDetailed Class Distribution Summary:")
print("=" * 90)
print(f"\n{'Class':<20} | {'Full Dataset (ALL)':^50} | {'Limited (Representative)':^25}")
print(f"{'':20} | {'Train':>10} {'Val':>10} {'Test':>10} {'Total':>15} | {'Total':>15}")
print("-" * 90)

for cls in class_names:
    # Use ALL objects from full dataset
    full_train = class_stats_full_all.get('train', {}).get(cls, 0)
    full_val = class_stats_full_all.get('val', {}).get(cls, 0)
    full_test = class_stats_full_all.get('test', {}).get(cls, 0)
    full_total = full_train + full_val + full_test

    limited_total = sum(class_stats_limited.get(split, {}).get(cls, 0) for split in ['train', 'val', 'test'])

    # Classes with no objects in either dataset are omitted from the table
    if full_total > 0 or limited_total > 0:
        pct = (limited_total / full_total * 100) if full_total > 0 else 0
        print(f"{cls:<20} | {full_train:>10,} {full_val:>10,} {full_test:>10,} {full_total:>15,} | {limited_total:>15,} ({pct:5.1f}%)")

print("-" * 90)
# Use ALL objects from full dataset for comparison
full_grand_total = sum(full_stats['total_objects_by_class'].values())
limited_grand_total = sum(limited_objects_by_class.values())
grand_pct = (limited_grand_total / full_grand_total * 100) if full_grand_total > 0 else 0
print(f"{'TOTAL':<20} | {'':<10} {'':<10} {'':<10} {full_grand_total:>15,} | {limited_grand_total:>15,} ({grand_pct:5.1f}%)")

print("\n✓ Class distribution analysis complete")
print(f"Note: Limited dataset is a representative sample of {grand_pct:.1f}% of full dataset objects")
print("=" * 90)

### 4.2 Attribute Distribution Analysis - Full Dataset (ALL) vs Limited Dataset (Representative)

Analyze the distribution of weather, scene, and time-of-day attributes. The limited dataset IS the set of representative samples drawn from the full dataset.

In [None]:
# Comprehensive attribute distribution analysis - FULL DATASET (ALL) vs LIMITED DATASET (Representative)
# One figure per attribute (weather, scene, time of day): top row shows the full
# dataset per split, bottom row the limited dataset per split. A text summary
# table is printed for every attribute afterwards.


def _plot_attribute_panel(ax, counts_by_value, color, xlabel, title_prefix, unit):
    """Draw one horizontal bar chart of image counts per attribute value.

    Leaves `ax` untouched when `counts_by_value` is empty.
    """
    if not counts_by_value:
        return
    values = sorted(counts_by_value.keys())
    counts = [counts_by_value[v] for v in values]

    bars = ax.barh(values, counts, color=color, alpha=0.8)
    ax.set_xlabel(xlabel, fontsize=12, fontweight='bold')
    ax.set_title(f'{title_prefix}\n({sum(counts):,} {unit})', fontsize=14, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)
    ax.tick_params(axis='both', labelsize=11)

    # Label each non-empty bar with its count
    for bar in bars:
        bar_len = bar.get_width()
        if bar_len > 0:
            ax.text(bar_len, bar.get_y() + bar.get_height()/2,
                    f'{int(bar_len):,}', ha='left', va='center', fontsize=10)


def _plot_attribute_figure(attribute_label, full_by_split, limited_by_split, split_colors, figsize):
    """Show a 2x3 figure: full dataset (row 0) vs limited dataset (row 1) per split."""
    fig, axes = plt.subplots(2, 3, figsize=figsize)
    fig.suptitle(f'{attribute_label} Attribute Distribution - Full Dataset (ALL) vs Limited Dataset (Representative)',
                 fontsize=14, fontweight='bold')

    for idx, split in enumerate(['train', 'val', 'test']):
        _plot_attribute_panel(axes[0, idx], full_by_split[split], split_colors[idx],
                              'Number of Images', f'Full (ALL) - {split.upper()}', 'images')
        _plot_attribute_panel(axes[1, idx], limited_by_split[split], split_colors[idx],
                              'Number of Samples', f'Limited (Representative) - {split.upper()}', 'samples')

    plt.tight_layout(rect=[0, 0, 1, 0.99])
    plt.show()


def _print_attribute_table(attribute_label, full_by_split, limited_by_split):
    """Print per-split counts of every attribute value for both datasets."""
    print(f"\n{attribute_label} Distribution:")
    print(f"{'Attribute Value':<20} | {'Full (ALL) Train':>15} {'Val':>12} {'Test':>12} | {'Limited Train':>15} {'Val':>12} {'Test':>12}")
    print("-" * 90)

    # Union of values seen in any split of either dataset
    all_values = set()
    for split_data in full_by_split.values():
        all_values.update(split_data.keys())
    for split_data in limited_by_split.values():
        all_values.update(split_data.keys())

    for value in sorted(all_values):
        full_train = full_by_split['train'].get(value, 0)
        full_val = full_by_split['val'].get(value, 0)
        full_test = full_by_split['test'].get(value, 0)
        limited_train = limited_by_split['train'].get(value, 0)
        limited_val = limited_by_split['val'].get(value, 0)
        limited_test = limited_by_split['test'].get(value, 0)

        print(f"{value:<20} | {full_train:>15,} {full_val:>12,} {full_test:>12,} | {limited_train:>15,} {limited_val:>12,} {limited_test:>12,}")


print("=" * 90)
print("ATTRIBUTE DISTRIBUTION ANALYSIS - Full Dataset (ALL) & Limited Dataset (Representative)")
print("=" * 90)

# Extract attribute statistics for FULL DATASET (ALL images)
full_weather_stats = {split: {} for split in ['train', 'val', 'test']}
full_scene_stats = {split: {} for split in ['train', 'val', 'test']}
full_timeofday_stats = {split: {} for split in ['train', 'val', 'test']}

# Extract attribute statistics for LIMITED DATASET (which IS the representative samples)
limited_weather_stats = {split: {} for split in ['train', 'val', 'test']}
limited_scene_stats = {split: {} for split in ['train', 'val', 'test']}
limited_timeofday_stats = {split: {} for split in ['train', 'val', 'test']}

for split in ['train', 'val', 'test']:
    if full_metadata_by_split[split]:
        stats = full_metadata_by_split[split]['statistics']

        # Full dataset attributes (ALL images)
        if 'full_dataset_attributes' in stats:
            full_attrs = stats['full_dataset_attributes']
            full_weather_stats[split] = full_attrs.get('by_weather', {})
            full_scene_stats[split] = full_attrs.get('by_scene', {})
            full_timeofday_stats[split] = full_attrs.get('by_timeofday', {})
        else:
            # Make the resulting blank panels explainable
            print(f"⚠ {split}: 'full_dataset_attributes' missing from full metadata - panels will be empty")

    # Limited dataset attributes are read as a list: [weather_dict, scene_dict, timeofday_dict]
    if limited_metadata_by_split[split]:
        stats = limited_metadata_by_split[split]['statistics']
        limited_attrs = stats.get('attributes')
        if isinstance(limited_attrs, list) and len(limited_attrs) >= 3:
            limited_weather_stats[split] = limited_attrs[0]    # First element is weather
            limited_scene_stats[split] = limited_attrs[1]      # Second is scene
            limited_timeofday_stats[split] = limited_attrs[2]  # Third is timeofday
        else:
            print(f"⚠ {split}: 'attributes' missing or malformed in limited metadata - panels will be empty")

print("\nFull Dataset (ALL) Attribute Coverage:")
for split in ['train', 'val', 'test']:
    if full_weather_stats[split]:
        print(f"  {split.upper()}: {sum(full_weather_stats[split].values()):,} images with attributes")

print("\nLimited Dataset (Representative) Attribute Coverage:")
for split in ['train', 'val', 'test']:
    if limited_weather_stats[split]:
        print(f"  {split.upper()}: {sum(limited_weather_stats[split].values()):,} samples")

print("=" * 90)

# One color per split column (train, val, test)
colors_splits = ['#3498db', '#e67e22', '#2ecc71']

# Visualization 1: Weather distribution
_plot_attribute_figure('Weather', full_weather_stats, limited_weather_stats, colors_splits, (15, 9))

# Visualization 2: Scene distribution
_plot_attribute_figure('Scene', full_scene_stats, limited_scene_stats, colors_splits, (15, 9))

# Visualization 3: Time of Day distribution
_plot_attribute_figure('Time of Day', full_timeofday_stats, limited_timeofday_stats, colors_splits, (15, 8))

# Summary tables (one per attribute)
print("\n" + "=" * 90)
print("ATTRIBUTE DISTRIBUTION SUMMARY")
print("=" * 90)

_print_attribute_table('Weather', full_weather_stats, limited_weather_stats)
_print_attribute_table('Scene', full_scene_stats, limited_scene_stats)
_print_attribute_table('Time of Day', full_timeofday_stats, limited_timeofday_stats)

print("\n✓ Attribute distribution analysis complete")
print("Note: Limited dataset IS the representative sample from full dataset")
print("=" * 90)

### 4.3 Dataset Comparison - Full vs Limited

Compare statistics between full and limited datasets.

In [None]:
# Comprehensive dataset comparison - Full vs Limited using CORRECT metadata
print("=" * 90)
print("DATASET COMPARISON - FULL VS LIMITED")
print("=" * 90)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


def _plot_share_bars(ax, split_names, limited_counts, full_counts, title):
    """Draw one horizontal bar per split showing the limited count as a % of the full count."""
    # An empty split in the full dataset yields 0% instead of a ZeroDivisionError
    percentages = [
        _safe_ratio(limited, full) * 100
        for limited, full in zip(limited_counts, full_counts)
    ]
    bars = ax.barh(split_names, percentages, color=['#1f77b4', '#ff7f0e', '#2ca02c'], alpha=0.8)
    ax.set_xlabel('Percentage (%)', fontsize=12, fontweight='bold')
    ax.set_ylabel('Split', fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)
    ax.tick_params(axis='both', labelsize=11)

    # Annotate each bar with the percentage and the raw limited/full counts
    for bar, limited, full in zip(bars, limited_counts, full_counts):
        width = bar.get_width()
        ax.text(width, bar.get_y() + bar.get_height() / 2,
                f'{width:.2f}%\n({limited:,}/{full:,})',
                ha='left', va='center', fontsize=8, fontweight='bold')


if limited_stats:
    # Prepare data
    splits = ['train', 'val', 'test']
    full_images = [full_stats['by_split'][split]['images'] for split in splits]
    limited_images = [limited_stats['by_split'][split]['images'] for split in splits]
    full_objects = [sum(full_stats['by_split'][split]['objects_by_class'].values()) for split in splits]
    limited_objects = [sum(limited_stats['by_split'][split]['objects_by_class'].values()) for split in splits]

    # Visualization: Percentage comparison (cleaner than raw counts)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5))
    _plot_share_bars(ax1, splits, limited_images, full_images,
                     'Limited Dataset Images as % of Full Dataset')
    _plot_share_bars(ax2, splits, limited_objects, full_objects,
                     'Limited Dataset Objects as % of Full Dataset')

    plt.tight_layout(rect=[0, 0, 1, 0.99])
    plt.show()

    # Summary (concise - detailed counts already in Section 3.2 & 4.1)
    full_total_img = full_stats['total_images']
    limited_total_img = limited_stats['total_images']
    full_total_obj = sum(full_stats['total_objects_by_class'].values())
    limited_total_obj = sum(limited_stats['total_objects_by_class'].values())

    print("\n" + "=" * 90)
    print("COMPARISON SUMMARY")
    print("=" * 90)
    if limited_total_img > 0:
        print(f"Dataset Reduction: {full_total_img / limited_total_img:.1f}x smaller")
    else:
        # A reduction factor is undefined when the limited dataset is empty
        print("Dataset Reduction: n/a (limited dataset has no images)")
    print(f"Object Coverage: {(_safe_ratio(limited_total_obj, full_total_obj) * 100):.1f}% of full dataset objects")
    print(f"Object Density: {_safe_ratio(limited_total_obj, limited_total_img):.1f} objects/image (limited) vs "
          f"{_safe_ratio(full_total_obj, full_total_img):.1f} objects/image (full)")
    print("=" * 90)
else:
    print("⚠ Limited dataset not available for comparison")

print("\n✓ Dataset comparison complete")
print("=" * 90)

## 5. Visualizations

### 5.1 Sample Images with Annotations

Representative samples showing diverse attributes (weather, scene, time) for each class.

### 5.2 Complex Multi-Object Scenes

Examples of images with multiple objects and diverse classes.

In [None]:
# Helper function to draw bounding boxes - can filter by target class
def draw_yolo_boxes(img_path, label_path, class_names, target_class_id=None):
    """
    Draw bounding boxes on image with class labels.

    Each usable label line has at least five whitespace-separated fields:
    ``class_id x_center y_center width height``. The four coordinates are
    multiplied by the image width/height, i.e. they are treated as fractions
    of the image size. Lines with fewer than five fields or with non-numeric
    fields are skipped.

    Args:
        img_path: Path to image
        label_path: Path to label file
        class_names: List of class names
        target_class_id: If specified, only draw boxes for this class. If None, draw all classes.

    Returns:
        Tuple ``(img, num_objects)``: the RGB image with boxes drawn and the
        number of boxes drawn, or ``(None, 0)`` if the image file is missing
        or cannot be read. A missing label file yields the unannotated image
        and a count of 0.
    """
    if not img_path.exists():
        return None, 0

    img = cv2.imread(str(img_path))
    if img is None:
        return None, 0

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    h, w = img.shape[:2]

    if not label_path.exists():
        return img, 0

    # Loop invariants: colour palette and font are the same for every box
    palette = plt.cm.tab10.colors
    font = cv2.FONT_HERSHEY_SIMPLEX

    num_objects = 0
    with open(label_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 5:
                continue

            try:
                class_id = int(parts[0])
                x_center, y_center, width, height = map(float, parts[1:5])
            except ValueError:
                # Malformed line: skip it, same as lines that are too short
                continue

            # Skip if target_class_id is specified and this isn't it
            if target_class_id is not None and class_id != target_class_id:
                continue

            # Convert to pixel coordinates
            x1 = int((x_center - width / 2) * w)
            y1 = int((y_center - height / 2) * h)
            x2 = int((x_center + width / 2) * w)
            y2 = int((y_center + height / 2) * h)

            # Draw box
            color = tuple(int(c * 255) for c in palette[class_id % len(palette)])
            cv2.rectangle(img, (x1, y1), (x2, y2), color, 3)

            # Add class label only; ids outside class_names get a generic
            # label instead of raising IndexError (or, for negative ids,
            # silently picking a name from the end of the list)
            if 0 <= class_id < len(class_names):
                label = class_names[class_id]
            else:
                label = f"class_{class_id}"
            (text_w, text_h), _ = cv2.getTextSize(label, font, 0.7, 2)

            # Put the label banner above the box; if that would fall off the
            # top of the image, put it just inside the box instead
            banner_bottom = y1 if y1 - text_h - 10 >= 0 else y1 + text_h + 10
            cv2.rectangle(img, (x1, banner_bottom - text_h - 10),
                          (x1 + text_w + 10, banner_bottom), color, -1)
            cv2.putText(img, label, (x1 + 5, banner_bottom - 5), font, 0.7, (255, 255, 255), 2)

            num_objects += 1

    return img, num_objects


# Organize samples by class from metadata
def _locate_sample_files(images_dir, labels_dir, split, basename):
    """Return (img_path, label_path) for a sample, or None if the image or label file is missing."""
    label_path = labels_dir / split / f"{basename}.txt"
    for ext in ['.jpg', '.png', '.jpeg']:
        candidate = images_dir / split / f"{basename}{ext}"
        if candidate.exists():
            # First matching extension wins; the label must exist as well
            return (candidate, label_path) if label_path.exists() else None
    return None


print("=" * 90)
print("ORGANIZING REPRESENTATIVE SAMPLES FROM METADATA")
print("=" * 90)

# Prefer the limited dataset directory when it exists, otherwise use the full one
viz_root = LIMITED_DATASET_ROOT if LIMITED_DATASET_ROOT.exists() else FULL_DATASET_ROOT
viz_images_dir = viz_root / 'images'
viz_labels_dir = viz_root / 'labels'

samples_by_class = {class_id: [] for class_id in range(len(class_names))}

# Use performance_analysis data (works for both full and limited datasets)
performance_source = limited_performance_by_split if any(limited_performance_by_split.values()) else full_performance_by_split

# Name -> id lookup; setdefault keeps the first index for a repeated name
class_id_by_name = {}
for class_id, name in enumerate(class_names):
    class_id_by_name.setdefault(name, class_id)

# Number of samples already collected per (class_id, split); capped at 5
collected = Counter()

for split in ['train', 'val', 'test']:
    perf_data = performance_source[split]
    if not perf_data:
        continue

    # Group images by class from performance data
    for img_info in perf_data['images']:
        basename = img_info['basename']
        classes_present = img_info['classes_present']

        # Files are looked up at most once per image, and only if some class still needs it
        files_checked = False
        sample_files = None

        # Add this image to samples for each class it contains
        for class_name in classes_present:
            class_id = class_id_by_name.get(class_name)
            if class_id is None:
                continue

            # Limit samples per class per split
            if collected[(class_id, split)] >= 5:
                continue

            if not files_checked:
                sample_files = _locate_sample_files(viz_images_dir, viz_labels_dir, split, basename)
                files_checked = True

            if sample_files is None:
                continue

            img_path, label_path = sample_files
            samples_by_class[class_id].append({
                'img_path': img_path,
                'label_path': label_path,
                'split': split
            })
            collected[(class_id, split)] += 1

print("✓ Organized samples for visualization")

# Display samples for each class - ONLY ANNOTATE TARGET CLASS
print("\n" + "=" * 90)
print("VISUALIZING REPRESENTATIVE SAMPLES PER CLASS")
print("=" * 90)

for class_id, class_name in enumerate(class_names):
    samples = samples_by_class[class_id]

    if not samples:
        continue

    print(f"\nClass: {class_name.upper()}")

    # At most 9 samples, laid out three per row
    shown = samples[:9]
    cols = 3
    rows = -(-len(shown) // cols)  # ceiling division

    # squeeze=False always returns a 2-D axes array, even for a single row
    fig, axes = plt.subplots(rows, cols, figsize=(14, 3 * rows), squeeze=False)
    grid_cells = axes.ravel()  # row-major: cell k is (k // cols, k % cols)

    fig.suptitle(f"Class: {class_name}", fontsize=14, fontweight='bold', y=0.998)

    for ax, sample in zip(grid_cells, shown):
        # ONLY draw boxes for this specific class
        img_with_boxes, num_objects = draw_yolo_boxes(
            sample['img_path'],
            sample['label_path'],
            class_names,
            target_class_id=class_id  # Filter to only this class
        )

        if img_with_boxes is None:
            ax.text(0.5, 0.5, 'Image not found', ha='center', va='center', fontsize=10)
        else:
            ax.imshow(img_with_boxes)
            ax.set_title(
                f"{sample['img_path'].name}\nSplit: {sample['split'].upper()}",
                fontsize=10, pad=3
            )

        ax.axis('off')

    # Hide unused subplots
    for ax in grid_cells[len(shown):]:
        ax.axis('off')

    plt.tight_layout(rect=[0, 0, 1, 0.99])
    plt.show()

print("\n" + "=" * 90)
print("✓ Visualization complete")
print("=" * 90)

In [None]:
# Find images with multiple objects
print("=" * 90)
print("FINDING COMPLEX MULTI-OBJECT SCENES")
print("=" * 90)

multi_object_samples = []

for split in ['train', 'val', 'test']:
    split_labels_dir = viz_labels_dir / split

    if not split_labels_dir.exists():
        continue

    for label_file in split_labels_dir.glob("*.txt"):
        # Count objects per class in a single pass. Only lines with at least
        # five fields and an integer class id count as objects, so blank or
        # truncated lines no longer inflate the total (draw_yolo_boxes skips
        # short lines too, so ranking now agrees with the displayed count).
        class_counts = Counter()
        with open(label_file, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) < 5:
                    continue
                try:
                    class_counts[int(parts[0])] += 1
                except ValueError:
                    continue

        num_objects = sum(class_counts.values())
        if num_objects < 5:  # At least 5 objects
            continue

        # Find corresponding image (first matching extension wins)
        candidates = (viz_images_dir / split / f"{label_file.stem}{ext}"
                      for ext in ['.jpg', '.png', '.jpeg'])
        img_path = next((path for path in candidates if path.exists()), None)
        if img_path is None:
            continue

        multi_object_samples.append({
            'img_path': img_path,
            'label_path': label_file,
            'num_objects': num_objects,
            'num_classes': len(class_counts),
            'class_counts': class_counts,  # reused for the title, avoids re-reading the file
            'split': split
        })

# Sort by diversity (more classes first), then by number of objects
multi_object_samples.sort(key=lambda x: (x['num_classes'], x['num_objects']), reverse=True)

print(f"✓ Found {len(multi_object_samples)} images with 5+ objects")

# Display top 5 most complex scenes - ANNOTATE ALL CLASSES
num_to_display = min(5, len(multi_object_samples))

print(f"\nDisplaying {num_to_display} most complex scenes:")
print("=" * 90)

for scene_no, sample in enumerate(multi_object_samples[:num_to_display], start=1):
    # Draw ALL classes in multi-object scenes (target_class_id=None)
    img_with_boxes, num_objects = draw_yolo_boxes(
        sample['img_path'],
        sample['label_path'],
        class_names,
        target_class_id=None  # Show all classes
    )

    if img_with_boxes is None:
        continue

    # Per-class breakdown; ids outside class_names get a generic name
    # instead of raising IndexError
    class_summary = ", ".join(
        f"{class_names[cid] if 0 <= cid < len(class_names) else f'class_{cid}'}({cnt})"
        for cid, cnt in sorted(sample['class_counts'].items())
    )

    # Display
    fig, ax = plt.subplots(1, 1, figsize=(14, 8))
    ax.imshow(img_with_boxes)
    ax.set_title(
        f"Scene #{scene_no} | Total Objects: {num_objects} | Unique Classes: {sample['num_classes']} | "
        f"Split: {sample['split'].upper()}\n{class_summary}",
        fontsize=14, fontweight='bold', pad=12
    )
    ax.axis('off')
    plt.tight_layout(rect=[0, 0, 1, 0.99])
    plt.show()

    print(f"✓ Scene #{scene_no}: {num_objects} objects, {sample['num_classes']} unique classes")

print("=" * 90)

## 6. Final Summary

In [None]:
# Final recap: class list, where the metadata files live, and closing notes
print("=" * 90)
print("BDD100K DATASET SUMMARY")
print("=" * 90)

print("\n1. CLASSES")
print("-" * 90)
for class_idx, class_label in enumerate(class_names):
    print(f"  {class_idx}. {class_label}")

print("\n2. METADATA FILES")
print("-" * 90)

# The limited dataset is listed only when its statistics were loaded
metadata_roots = [("Full dataset", FULL_DATASET_ROOT)]
if limited_stats:
    metadata_roots.append(("Limited dataset", LIMITED_DATASET_ROOT))

for dataset_label, dataset_root in metadata_roots:
    print(f"  {dataset_label}: {dataset_root / 'representative_json'}")
    for split in ('train', 'val', 'test'):
        print(f"    - {split}_metadata.json")

print("\n" + "=" * 90)
print("✅ Dataset exploration complete!")
print("=" * 90)

print("\nNotes:")
closing_notes = (
    "All file counts and statistics shown in Section 3.2",
    "Integrity checks (image-label matching) performed during dataset preparation",
    "Statistics loaded from pre-computed metadata files",
    "Visualizations use representative samples with comprehensive coverage",
    "Dataset ready for YOLO training!",
)
for note in closing_notes:
    print(f"  - {note}")
print("=" * 90)

## 6. Per-Image Attribute Data

Performance metadata containing per-image attributes and class information for model evaluation.

In [None]:
# Display sample of per-image performance data
def _print_performance_report(perf_data, *, heading, sample_note, coverage_title, unit, footer_lines):
    """
    Print a text report for one split's per-image performance metadata.

    Args:
        perf_data: Dict with 'total_images', 'generation_date' and 'images'
            (a list of per-image dicts with 'basename', 'weather', 'scene',
            'timeofday', 'classes_present', 'total_objects' and
            'objects_per_class').
        heading: First line of the report, printed as-is.
        sample_note: Text inside the "Sample entries (...)" line.
        coverage_title: Banner title for the attribute coverage section.
        unit: Noun printed after each attribute count.
        footer_lines: Lines printed between the closing rules.
    """
    print(heading)
    print(f"  Total images: {perf_data['total_images']}")
    print(f"  Generation date: {perf_data['generation_date']}")

    # Show sample entries
    print(f"\nSample entries ({sample_note}):")
    print("-" * 90)

    for entry_no, img_data in enumerate(perf_data['images'][:5], start=1):
        print(f"\n{entry_no}. Image: {img_data['basename']}")
        print(f"   Weather: {img_data['weather']}")
        print(f"   Scene: {img_data['scene']}")
        print(f"   Time of day: {img_data['timeofday']}")
        print(f"   Classes present: {', '.join(img_data['classes_present']) if img_data['classes_present'] else 'None'}")
        print(f"   Total objects: {img_data['total_objects']}")
        if img_data['objects_per_class']:
            # Only the first three classes are shown to keep the line short
            print(f"   Objects per class: {dict(list(img_data['objects_per_class'].items())[:3])}")

    # Statistics about attributes
    print("\n" + "=" * 90)
    print(coverage_title)
    print("=" * 90)

    attribute_sections = (
        ("Weather conditions", 'weather'),
        ("Scene types", 'scene'),
        ("Time of day", 'timeofday'),
    )
    # Tally every attribute before printing any of them
    tallies = {
        key: Counter(img_data[key] for img_data in perf_data['images'])
        for _, key in attribute_sections
    }

    for section_title, key in attribute_sections:
        print(f"\n{section_title}: {len(tallies[key])} types")
        # most_common() orders by descending count, ties in first-seen order
        for value, count in tallies[key].most_common():
            print(f"  {value}: {count} {unit}")

    print("\n" + "=" * 90)
    for footer_line in footer_lines:
        print(footer_line)
    print("=" * 90)


print("=" * 90)
print("PER-IMAGE ATTRIBUTE DATA (Performance Analysis Metadata)")
print("=" * 90)

# Check if performance data is available
if full_performance_by_split.get('test'):
    _print_performance_report(
        full_performance_by_split['test'],
        heading="\nFull Dataset - Test Split:",
        sample_note="first 5 images",
        coverage_title="ATTRIBUTE COVERAGE IN PERFORMANCE DATA",
        unit="images",
        footer_lines=(
            "✓ Performance metadata ready for YOLO model evaluation",
            "  Use this data in yolo_test notebook for attribute-based performance analysis",
        ),
    )
else:
    print("\n⚠️  Performance data not available for test split")
    print("Run: python3 process_bdd100k_to_yolo_dataset.py")

# Check limited dataset performance data
if limited_performance_by_split.get('test'):
    _print_performance_report(
        limited_performance_by_split['test'],
        heading="\n\nLimited Dataset - Test Split:",
        sample_note="first 5 representative images",
        coverage_title="ATTRIBUTE COVERAGE IN LIMITED DATASET PERFORMANCE DATA",
        unit="representative samples",
        footer_lines=(
            "✓ Limited dataset performance metadata ready for YOLO model evaluation",
            "  Use this data in yolo_test notebook for attribute-based performance analysis",
            "  Representative samples ensure comprehensive coverage across all attributes",
        ),
    )
else:
    print("\n\n⚠️  Performance data not available for limited dataset test split")


## Exploration Complete

This notebook has successfully:
1. ✓ Loaded metadata from both full and limited datasets with complete file counts (Section 3.2)
2. ✓ Displayed comprehensive pre-computed statistics from metadata files for both datasets
3. ✓ Visualized class distribution across splits (train/val/test) for both datasets (Section 4.1)
4. ✓ Analyzed attribute distributions (weather, scene, time) across splits (Section 4.2)
5. ✓ Created comparison charts between full and limited datasets (Section 4.3)
6. ✓ Visualized representative samples per class with **only target class annotated** (Section 5.1)
7. ✓ Showed complex multi-object scenes with **all classes annotated** (Section 5.2)
8. ✓ Loaded per-image performance metadata with attributes and class information (Section 6)

**Key Statistical Analyses Performed:**
- **Class Distribution**: Bar charts showing class counts per split with reduced font sizes
- **Split Comparison**: Charts comparing train/val/test distributions
- **Attribute Analysis**: Weather, scene, and time of day distributions
- **Dataset Comparison**: Full vs Limited dataset with percentages shown alongside counts
- **Coverage Analysis**: Comprehensive coverage verification across all dimensions
- **Performance Metadata**: Per-image attributes ready for model evaluation

**Annotation Strategy:**
- **Per-Class Visualization**: Only the target class is annotated (clean, focused view)
- **Multi-Object Scenes**: All classes are annotated (comprehensive scene understanding)

**Performance Analysis Features:**
- **Per-Image Attributes**: Weather, scene, timeofday for each image
- **Class Distribution**: Objects per class for each image
- **Evaluation Ready**: Data structured for attribute-based performance analysis in test notebook

**Key Points:**
- **Full Dataset**: ~100k images for comprehensive statistics and training
- **Limited Dataset**: Representative samples ensuring comprehensive coverage
- **Metadata Files**: Pre-computed statistics generated during dataset preparation
- **Performance Data**: Per-image details for attribute-based model evaluation
- **Integrity Checks**: Performed automatically during dataset preparation
- **Font Sizes**: Reduced for better readability and professional appearance

The dataset is now ready for YOLO model training with complete understanding of its composition and distribution!