# FSC-147 Dataset Exploration

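Explore the FSC-147 object-counting dataset for sequential counting: per-split statistics, object-count distributions, the most common object categories, sample visualizations with numbered markers, and a comparison of spatial point orderings.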

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from collections import Counter
from PIL import Image

from dataset_fsc147 import FSC147Dataset, get_dataset_stats
from utils import VisualMarker

# Default figure size for plots in this notebook that do not pass their own figsize.
plt.rcParams['figure.figsize'] = (14, 8)

## 1. Dataset Overview

In [None]:
DATASET_ROOT = '/media/M2SSD/FSC147'

# Statistics for every split, keyed by split name.
stats = get_dataset_stats(DATASET_ROOT)

print("FSC-147 Dataset Statistics")
print("=" * 80)
for name, info in stats.items():
    # Assemble the whole per-split report, then emit it in one go.
    report = [
        f"\n{name.upper()}:",
        f"  Images: {info['num_images']}",
        f"  With annotations: {info['num_with_annotations']}",
        f"  Avg objects/image: {info['avg_count']:.1f}",
        f"  Median: {info['median_count']:.0f}",
        f"  Range: [{info['min_count']}, {info['max_count']}]",
        f"  Unique object types: {len(info['object_types'])}",
    ]
    print("\n".join(report))

## 2. Object Count Distribution

In [None]:
# One histogram of objects-per-image for each split, side by side.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for split_name, ax in zip(['train', 'val', 'test'], axes):
    counts = stats[split_name]['object_counts']

    ax.hist(counts, bins=50, alpha=0.7, edgecolor='black')
    ax.set_title(f"{split_name.capitalize()} (n={len(counts)})")
    ax.set_xlabel('Objects per Image')
    ax.set_ylabel('Frequency')

    # Mean/median are undefined for an empty split (NumPy warns and returns
    # nan), and legend() complains when there is nothing labelled to show.
    # len() is used instead of truthiness so a NumPy array works here too.
    if len(counts) > 0:
        # Compute each statistic once and reuse it for the line and its label.
        mean_count = np.mean(counts)
        median_count = np.median(counts)
        ax.axvline(mean_count, color='red', linestyle='--', label=f'Mean: {mean_count:.1f}')
        ax.axvline(median_count, color='green', linestyle='--', label=f'Median: {median_count:.1f}')
        ax.legend()

    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Top Object Categories

In [None]:
top_n = 20
divider = "=" * 60

# Print the most frequent object types of each split as an aligned table.
for split_name in ('train', 'val', 'test'):
    ranked_types = stats[split_name]['object_types'].most_common(top_n)

    print(f"\n{split_name.upper()} - Top {top_n} Object Types:")
    print(divider)
    for type_name, image_count in ranked_types:
        print(f"{type_name:<30} {image_count:>5} images")

## 4. Load the Training Dataset

In [None]:
# Object-count bounds for the samples kept in this notebook. They are named
# once so the dataset filter and the log message below cannot drift apart.
MIN_OBJECTS = 5
MAX_OBJECTS = 50

# Load the train split, requesting points in reading order and restricting
# samples to the object-count bounds above.
dataset = FSC147Dataset(
    dataset_root=DATASET_ROOT,
    split='train',
    spatial_order='reading_order',
    min_objects=MIN_OBJECTS,
    max_objects=MAX_OBJECTS,
)

print(f"Loaded {len(dataset)} training samples with {MIN_OBJECTS}-{MAX_OBJECTS} objects")

## 5. Sample Visualization with Numbered Markers

In [None]:
# Shared marker instance; the visualization functions in this notebook call
# marker.mark_image(image_array, points) on it.
marker = VisualMarker(strategy='numbers', alpha=0.7)

def visualize_sample(idx=0, num_to_mark=None):
    """Show one dataset sample with its first ``num_to_mark`` points marked.

    Reads the notebook-level ``dataset`` and ``marker`` objects.

    Args:
        idx: Index of the sample in ``dataset``.
        num_to_mark: Number of leading points to mark. ``None`` marks every
            point; any other value is clamped to ``[0, len(points)]``.

    The title lists the object type, the total and marked counts, and, while
    unmarked points remain, the coordinates of the next point to be marked.
    """
    img, points, meta = dataset[idx]
    total = len(points)

    if num_to_mark is None:
        num_to_mark = total

    # Clamp on both sides: a negative value would slip past the upper clamp,
    # display a negative "Marked" count, and make points[num_to_mark] below
    # pick a point from the END of the list as the "Next" one.
    num_to_mark = max(0, min(num_to_mark, total))

    img_np = np.array(img)
    if num_to_mark > 0:
        marked_img = marker.mark_image(img_np, points[:num_to_mark])
    else:
        # Nothing to mark: show the raw image.
        marked_img = img_np

    plt.figure(figsize=(12, 8))
    plt.imshow(marked_img)

    title = f"Object Type: {meta['object_type']}\n"
    title += f"Total: {total} | Marked: {num_to_mark}"
    if num_to_mark < total:
        next_pt = points[num_to_mark]
        title += f" | Next: ({next_pt[0]:.0f}, {next_pt[1]:.0f})"

    plt.title(title, fontsize=14)
    plt.axis('off')
    plt.tight_layout()
    plt.show()

# Preview up to five samples: first unmarked, then with half of the points marked.
preview_count = min(5, len(dataset))
for sample_no in range(preview_count):
    visualize_sample(sample_no, num_to_mark=0)
    _, sample_points, _ = dataset[sample_no]
    visualize_sample(sample_no, num_to_mark=len(sample_points) // 2)

## 6. Interactive Sample Browser

In [None]:
from ipywidgets import interact, IntSlider

def explore_sample(sample_idx=0, num_marked=0):
    """Render one sample for the interactive browser.

    Reads the notebook-level ``dataset`` and ``marker`` objects.

    Args:
        sample_idx: Index of the sample; capped at the last valid index.
        num_marked: Number of leading points to mark; clamped to
            ``[0, len(points)]``.

    Shows the image with the requested points marked, titles it with the
    object type, counts, and either the next point's coordinates or "DONE",
    then prints the image name and size from the sample metadata.
    """
    sample_idx = min(sample_idx, len(dataset) - 1)
    img, points, meta = dataset[sample_idx]
    total = len(points)

    # Clamp on both sides: a negative value would otherwise be shown as a
    # negative "Marked" count and index points from the end for "Next".
    num_marked = max(0, min(num_marked, total))

    img_np = np.array(img)
    if num_marked > 0:
        marked_img = marker.mark_image(img_np, points[:num_marked])
    else:
        # Nothing to mark: show the raw image.
        marked_img = img_np

    plt.figure(figsize=(12, 8))
    plt.imshow(marked_img)

    title = f"{meta['object_type']} | Total: {total} | Marked: {num_marked}"
    if num_marked < total:
        next_pt = points[num_marked]
        title += f" | Next: ({next_pt[0]:.0f}, {next_pt[1]:.0f})"
    else:
        title += " | DONE"

    plt.title(title, fontsize=14)
    plt.axis('off')
    plt.tight_layout()
    plt.show()

    print(f"Image: {meta['image_name']}")
    print(f"Size: {meta['image_size']}")

# Sliders driving explore_sample: one over every sample index, one over the
# number of marked points (0 to 50).
sample_slider = IntSlider(value=0, min=0, max=len(dataset) - 1, step=1, description='Sample:')
marked_slider = IntSlider(value=0, min=0, max=50, step=1, description='Marked:')

interact(explore_sample, sample_idx=sample_slider, num_marked=marked_slider)

## 7. Compare Spatial Orderings

In [None]:
from dataset_fsc147 import SpatialSorter

def compare_orderings(sample_idx=0):
    """Show the same image marked under several spatial orderings.

    Reads the notebook-level ``dataset`` and ``marker`` objects and draws one
    panel per ordering produced by ``SpatialSorter``.

    Args:
        sample_idx: Index into ``dataset.examples``.
    """
    # Points as stored on the example record.
    # NOTE(review): presumably not yet spatially sorted - confirm against FSC147Dataset.
    example = dataset.examples[sample_idx]
    raw_points = example['points']

    # Close the source file deterministically; convert() returns an
    # independent image that stays usable after the file is closed.
    with Image.open(example['image_path']) as src:
        img = src.convert('RGB')

    sorter = SpatialSorter()
    orderings = {
        'Reading Order': sorter.reading_order(raw_points),
        'Left to Right': sorter.left_to_right(raw_points),
        'Nearest Neighbor': sorter.nearest_neighbor(raw_points),
    }

    # One panel per ordering, so adding an entry above needs no layout change.
    # squeeze=False keeps `axes` two-dimensional even for a single panel.
    fig, axes = plt.subplots(1, len(orderings), figsize=(18, 6), squeeze=False)

    for ax, (title, ordered_points) in zip(axes[0], orderings.items()):
        # Fresh array per panel, in case mark_image draws in place - TODO confirm.
        img_np = np.array(img)
        marked_img = marker.mark_image(img_np, ordered_points)

        ax.imshow(marked_img)
        ax.set_title(f"{title}\n{example['object_type']} (n={len(ordered_points)})")
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Render the ordering comparison for up to the first three samples.
comparison_count = min(3, len(dataset))
for sample_no in range(comparison_count):
    compare_orderings(sample_no)

## 8. Export Dataset Summary

In [None]:
# Build a JSON-serializable summary of the per-split statistics.
# Every numeric field is cast to a built-in int/float: json.dump cannot
# serialize NumPy scalar integers, and the statistics may be NumPy-derived.
summary = {
    'dataset': 'FSC-147',
    'total_images': int(sum(s['num_images'] for s in stats.values())),
    'splits': {
        split_name: {
            'num_images': int(split_stats['num_images']),
            'avg_count': float(split_stats['avg_count']),
            'median_count': float(split_stats['median_count']),
            'min_count': int(split_stats['min_count']),
            'max_count': int(split_stats['max_count']),
            'unique_object_types': len(split_stats['object_types']),
        }
        for split_name, split_stats in stats.items()
    },
}

# Explicit encoding so the output does not depend on the platform default.
with open('fsc147_dataset_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print("Dataset summary saved to fsc147_dataset_summary.json")
print(f"\nTotal images: {summary['total_images']:,}")