In [None]:
"""
README: YOLO City & Class Distribution Counter

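This script scans the YOLO-format label files of each split (train/valid/test)
and counts, per city, the number of label files (reported as images) and the
occurrences of each class. Cities are identified by keyword substrings in the
label filenames; files matching no keyword are counted under 'unknown'.
Results are printed as a readable summary.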

How to use:
- Set the 'base_dir', 'splits', and 'label_dirname' as appropriate.
- Adjust 'city_keywords' and 'class_names' as needed for your dataset.

Author: Bahadir Akin Akgul
Date: 13.07.2025
"""

import os
from pathlib import Path
from collections import defaultdict

# === Settings ===
# Label files are read from <base_dir>/<split>/<label_dirname>/*.txt
base_dir = Path("PATH_TO_DATASET_ROOT")
splits = ["train", "valid", "test"]
label_dirname = "labels"

# City matching: city key -> substrings searched for in the lowercased filename
city_keywords = {
    "istanbul": [
        "libadiye",
        "levent",
        "taksim",
        "ciragan",
        "barbaros",
        "dolmabahce",
        "bagdat",
        "muallim",
        "katar",
    ],
    "paris": ["paris-champs"],
    "munich": ["munih"],
    "marseille": ["marsilya"],
}

# City key -> display name used in the printed summary
city_translation = {
    "istanbul": "Istanbul",
    "paris": "Paris",
    "munich": "Munich",
    "marseille": "Marseille",
    "unknown": "Unknown",
}

# Class names: YOLO class ID -> readable name; IDs not listed here are ignored
class_names = {
    0: "pedestrian",
    1: "road",
    2: "vehicle",
}

# Result containers
# city key -> number of label files seen for that city
city_image_counts = defaultdict(int)
# city key -> class ID -> number of label lines with that class
city_class_counts = defaultdict(lambda: defaultdict(int))

# City extractor
def get_city(filename, keywords_by_city=None):
    """Return the city key whose keywords match *filename*.

    Matching is a case-insensitive substring test. Cities are tried in the
    mapping's iteration order and the first city with a matching keyword wins.

    Args:
        filename: Name to classify (the caller passes the label file's stem).
        keywords_by_city: Optional mapping of city key -> iterable of keyword
            strings. Defaults to the module-level ``city_keywords`` table.

    Returns:
        The matching city key, or ``'unknown'`` when no keyword matches.
    """
    if keywords_by_city is None:
        keywords_by_city = city_keywords

    name = filename.lower()
    for city, keywords in keywords_by_city.items():
        for keyword in keywords:
            # Lowercase the keyword as well so a mixed-case entry in the table
            # can still match; skip empty keywords, which would otherwise be a
            # substring of every filename.
            if keyword and keyword.lower() in name:
                return city
    return 'unknown'

# === Counting ===
# For every split, visit each label file once: bump the per-city file count,
# then tally the class ID (first field) of every non-blank line.
for split in splits:
    label_path = base_dir / split / label_dirname
    if not label_path.is_dir():
        # Report the problem instead of quietly contributing zero counts,
        # e.g. when base_dir still holds the placeholder path.
        print(f"Warning: label directory not found, skipping: {label_path}")
        continue

    # Sorted so the scan order, and with it the order in which cities first
    # appear in the summary, does not depend on directory listing order.
    for txt_file in sorted(label_path.glob("*.txt")):
        city = get_city(txt_file.stem)
        city_image_counts[city] += 1

        # Explicit encoding keeps decoding independent of the platform default.
        with open(txt_file, encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                parts = line.split()
                if not parts:
                    continue  # blank line
                try:
                    class_id = int(parts[0])
                except ValueError:
                    # One malformed line should not abort the whole scan.
                    print(
                        f"Warning: {txt_file}:{line_no}: "
                        f"invalid class ID {parts[0]!r}, line skipped"
                    )
                    continue
                # Class IDs missing from class_names are deliberately ignored.
                if class_id in class_names:
                    city_class_counts[city][class_id] += 1

# === Output ===
# Grand total first, then one line per city, then the per-class breakdown.
total_images = sum(city_image_counts.values())
print(f"\nTotal Images: {total_images}\n")

# Resolve every city's display name once; keys missing from city_translation
# fall back to the capitalized city key.
display_names = {
    city_key: city_translation.get(city_key, city_key.capitalize())
    for city_key in city_image_counts
}

for city_key, n_images in city_image_counts.items():
    print(f"{display_names[city_key]}: {n_images} images")

print("\nClass Counts per City:")
for city_key, shown_name in display_names.items():
    print(f"\n{shown_name}:")
    # Every class in class_names is listed, including those with a zero count.
    for cls_id, cls_name in class_names.items():
        print(f"  - {cls_name}: {city_class_counts[city_key][cls_id]}")
