In [None]:
"""
Hard Hat / PPE Detection Dataset - Data Collection and Validation
==================================================================
This notebook validates and explores the Hard Hat Workers dataset.
The dataset contains 3 classes: head, helmet, person
"""

from pathlib import Path
import os
from collections import Counter
import yaml

# ==== CONFIGURATION ====
# The project root is taken to be the parent of the kernel's working directory
# (assumes the notebook is launched from a folder one level below the root -- TODO confirm).
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / "data"
DATA_YAML = DATA_DIR / "data.yaml"

print(f"Project root: {PROJECT_ROOT}")
print(f"Data directory: {DATA_DIR}")
print(f"Data YAML exists: {DATA_YAML.exists()}")

# Load dataset configuration.
# `config` is always bound to a dict so that later cells can call config.get(...)
# even when data.yaml is missing, empty (safe_load returns None) or not a mapping.
config = {}
if DATA_YAML.exists():
    with open(DATA_YAML, 'r', encoding='utf-8') as f:
        loaded_config = yaml.safe_load(f)
    if isinstance(loaded_config, dict):
        config = loaded_config
    print(f"\nDataset Configuration:")
    print(f"  Classes: {config.get('nc', 'N/A')}")
    print(f"  Class names: {config.get('names', [])}")
    print(f"  Train: {config.get('train', 'N/A')}")
    print(f"  Val: {config.get('val', 'N/A')}")
    print(f"  Test: {config.get('test', 'N/A')}")
else:
    print("‚ö†Ô∏è  data.yaml not found!")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ==== STEP 1: Validate Dataset Structure ====
from glob import glob

# Lower-case file suffixes (dot included) that are counted as images.
IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.bmp', '.webp'}

# Split sub-directories looked up under DATA_DIR.
splits = ['train', 'valid', 'test']
dataset_stats = {}

for split in splits:
    split_dir = DATA_DIR / split
    images_dir = split_dir / 'images'
    labels_dir = split_dir / 'labels'

    # Without an images folder the split is recorded as entirely absent.
    if not images_dir.exists():
        dataset_stats[split] = {'images': 0, 'labels': 0, 'images_dir': False, 'labels_dir': False}
        continue

    has_labels_dir = labels_dir.exists()
    # Recursive scan; the suffix comparison is case-insensitive.
    images = [p for p in images_dir.rglob('*') if p.suffix.lower() in IMAGE_EXTS]
    labels = list(labels_dir.rglob('*.txt')) if has_labels_dir else []

    dataset_stats[split] = {
        'images': len(images),
        'labels': len(labels),
        'images_dir': True,
        'labels_dir': has_labels_dir,
    }

# Per-split table framed by two rules.
rule = "=" * 60
print("\nüìä Dataset Statistics:")
print(rule)
for split, stats in dataset_stats.items():
    print(f"{split.upper():10s} | Images: {stats['images']:5d} | Labels: {stats['labels']:5d}")
print(rule)

# Grand totals over all splits, accumulated in a single pass.
total_images = 0
total_labels = 0
for entry in dataset_stats.values():
    total_images += entry['images']
    total_labels += entry['labels']
print(f"\nTotal: {total_images} images, {total_labels} labels")

📥 Downloading 5000 samples from COCO-2017 (all classes)...
Downloading split 'train' to '/home/omar/fiftyone/coco-2017/train' if necessary
Found annotations at '/home/omar/fiftyone/coco-2017/raw/instances_train2017.json'
Sufficient images already downloaded
Existing download of split 'train' is sufficient
You are running the oldest supported major version of MongoDB. Please refer to https://deprecation.voxel51.com for deprecation notices. You can suppress this exception by setting your `database_validation` config parameter to `False`. See https://docs.voxel51.com/user_guide/config.html#configuring-a-mongodb-connection for more information
Loading 'coco-2017' split 'train'
 100% |███████████████| 5000/5000 [29.7s elapsed, 0s remaining, 186.3 samples/s]      
Dataset 'coco-mini-all-5000' created
✅ Dataset 'coco-mini-all' loaded with 5000 samples.


In [None]:
# ==== STEP 2: Validate Label-Image Pairs ====
import re

def parse_yolo_label(path):
    """Parse a YOLO label file and return the class id of every annotation.

    Each annotation line is expected to hold at least five whitespace-separated
    fields, the first being the integer class id. Blank lines and lines with
    fewer than five fields are ignored.

    A line whose class id is not an integer is reported and skipped, so the
    remaining annotations in the file are still counted.

    Args:
        path: Location of the ``.txt`` label file (str or path-like).

    Returns:
        list[int]: One class id per parsed annotation, in file order. The list
        is empty when the file cannot be opened or decoded; that failure is
        printed rather than raised.
    """
    classes = []
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                # str.split() with no argument strips and splits on any
                # whitespace run; a blank line yields [] and is skipped below.
                parts = line.split()
                if len(parts) < 5:
                    continue
                try:
                    classes.append(int(parts[0]))
                except ValueError:
                    print(f"Error reading {path}: line {line_no}: invalid class id {parts[0]!r}")
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: undecodable bytes
        # (UnicodeDecodeError) or an invalid path string.
        print(f"Error reading {path}: {e}")
    return classes

# Pair every image with its label file (matched by file stem) and tally classes.
class_counts = Counter()
missing_labels = []   # (split, stem) of images that have no label file
missing_images = []   # (split, stem) of label files that have no image

for split in splits:
    split_dir = DATA_DIR / split
    images_dir = split_dir / 'images'
    labels_dir = split_dir / 'labels'

    # Both folders are required; otherwise the split is not examined at all.
    if not (images_dir.exists() and labels_dir.exists()):
        continue

    # Index both sides by stem so that membership tests are dictionary lookups.
    images = {p.stem: p for p in images_dir.rglob('*') if p.suffix.lower() in IMAGE_EXTS}
    labels = {p.stem: p for p in labels_dir.rglob('*.txt')}

    missing_labels.extend((split, stem) for stem in images if stem not in labels)

    for stem, lbl_path in labels.items():
        if stem in images:
            # Only labels that have a matching image contribute to the class tally.
            class_counts.update(parse_yolo_label(lbl_path))
        else:
            missing_images.append((split, stem))

print(f"\n‚úÖ Label-Image Pair Validation:")
print(f"  Missing labels: {len(missing_labels)}")
print(f"  Missing images: {len(missing_images)}")

# List at most five offenders of each kind; a kind with no offenders prints nothing.
for heading, orphans in (
    (f"\n‚ö†Ô∏è  Images without labels (first 5):", missing_labels),
    (f"\n‚ö†Ô∏è  Labels without images (first 5):", missing_images),
):
    if not orphans:
        continue
    print(heading)
    for split, stem in orphans[:5]:
        print(f"    {split}/{stem}")

# Class distribution
# Resolve the class-name table. `config` may be None (empty data.yaml), and
# `names` may be either a list or an {id: name} mapping; both are normalised to
# a list of strings indexed by class id so later cells can index and join it.
names_cfg = (config or {}).get('names')
if names_cfg is None:
    names_cfg = ['head', 'helmet', 'person']
if isinstance(names_cfg, dict):
    # assumes mapping keys are integer-like class ids -- TODO confirm against data.yaml
    names_by_id = {int(k): v for k, v in names_cfg.items()}
    class_names = [
        str(names_by_id.get(i, f"class_{i}"))
        for i in range(max(names_by_id, default=-1) + 1)
    ]
else:
    class_names = [str(n) for n in names_cfg]

print(f"\nüìà Class Distribution:")
for cls_id, count in sorted(class_counts.items()):
    # The lower bound matters: a negative id would otherwise index from the end
    # of the list and silently report the wrong class name.
    cls_name = class_names[cls_id] if 0 <= cls_id < len(class_names) else f"class_{cls_id}"
    print(f"  {cls_name:10s} (ID {cls_id}): {count:6d} instances")

Total images in dataset: 5000


In [None]:
# ==== STEP 3: Sample Visualization ====
from PIL import Image
import matplotlib.pyplot as plt
import random
import numpy as np

%matplotlib inline

def yolo_to_xyxy(x, y, w, h, img_w, img_h):
    """Turn a YOLO box (centre x, centre y, width, height) into pixel corners.

    The four box values are scaled by the image size, converted from
    centre/size to corner form, clamped to the image and truncated to int.

    Args:
        x, y: Box centre, multiplied by ``img_w`` / ``img_h`` respectively.
        w, h: Box width and height, multiplied by ``img_w`` / ``img_h``.
        img_w, img_h: Image width and height in pixels.

    Returns:
        tuple[int, int, int, int]: ``(x1, y1, x2, y2)`` with ``x1, y1 >= 0``,
        ``x2 <= img_w - 1`` and ``y2 <= img_h - 1``.
    """
    center_x, center_y = x * img_w, y * img_h
    box_w, box_h = w * img_w, h * img_h

    left = center_x - box_w / 2
    top = center_y - box_h / 2
    right = center_x + box_w / 2
    bottom = center_y + box_h / 2

    # Clamp first, then truncate towards zero.
    return (
        int(max(0, left)),
        int(max(0, top)),
        int(min(img_w - 1, right)),
        int(min(img_h - 1, bottom)),
    )

# Visualize a few random samples
num_samples = 4
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
axes = axes.flatten()

# Gather, per split, every image that has a label file directly under
# <split>/labels, in random order. Only labelled images are candidates, so a
# split is never dropped just because one random pick happened to be unlabelled.
candidates = {}
for split in splits:
    split_dir = DATA_DIR / split
    images_dir = split_dir / 'images'
    labels_dir = split_dir / 'labels'

    if not images_dir.exists():
        continue

    pairs = []
    for img_path in images_dir.rglob('*'):
        if img_path.suffix.lower() not in IMAGE_EXTS:
            continue
        lbl_path = labels_dir / f"{img_path.stem}.txt"
        if lbl_path.exists():
            pairs.append((img_path, lbl_path))

    if pairs:
        random.shuffle(pairs)
        candidates[split] = pairs

# Take one sample from each split in turn until the grid is full or the
# candidates run out. With fewer splits than panels this still fills every
# panel, which a strict one-image-per-split pick cannot do.
chosen = []
while candidates and len(chosen) < num_samples:
    for split in list(candidates):
        if len(chosen) >= num_samples:
            break
        img_path, lbl_path = candidates[split].pop()
        chosen.append((split, img_path, lbl_path))
        if not candidates[split]:
            del candidates[split]

if not chosen:
    print("No labelled images found to visualize.")

samples_shown = 0
# zip() stops at the shorter sequence, so the grid size is never exceeded.
for ax, (split, sample_img, sample_lbl) in zip(axes, chosen):
    # Load inside a context manager so the file handle is released promptly;
    # convert() returns an independent, fully loaded copy.
    with Image.open(sample_img) as opened:
        img = opened.convert('RGB')
    img_w, img_h = img.size

    ax.imshow(img)
    ax.set_title(f"{split.upper()}: {sample_img.name}", fontsize=10)

    # Draw bounding boxes
    with open(sample_lbl, 'r') as f:
        for line in f:
            parts = line.split()
            if len(parts) < 5:
                continue
            try:
                cls_id = int(parts[0])
                x, y, w, h = map(float, parts[1:5])
            except ValueError:
                # A malformed annotation should not abort the whole figure.
                continue
            x1, y1, x2, y2 = yolo_to_xyxy(x, y, w, h, img_w, img_h)

            # Lower bound guards against negative ids indexing from the end.
            cls_name = class_names[cls_id] if 0 <= cls_id < len(class_names) else f"class_{cls_id}"
            color = 'lime' if cls_id == 1 else ('red' if cls_id == 0 else 'cyan')

            rect = plt.Rectangle((x1, y1), x2-x1, y2-y1, fill=False,
                                 edgecolor=color, linewidth=2)
            ax.add_patch(rect)
            ax.text(x1, y1-5, cls_name, color='white', fontsize=8,
                    bbox=dict(facecolor=color, alpha=0.7, pad=2))

    ax.axis('off')
    samples_shown += 1

# Hide unused subplots
for ax in axes[samples_shown:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

print(f"\n‚úÖ Dataset validation complete!")
print(f"   Ready for training with {total_images} images across {len(splits)} splits")

📦 Exporting to YOLO format...
Directory '/home/omar/Desktop/object-detection-video/object_detection_project/data/yolo_dataset' already exists; export will be merged with existing files
 100% |███████████████| 5000/5000 [15.5s elapsed, 0s remaining, 345.7 samples/s]      
✅ Export complete!
Your dataset is ready at: /home/omar/Desktop/object-detection-video/object_detection_project/data/yolo_dataset


In [None]:
# ==== STEP 4: Dataset Summary ====
# Recap of the figures gathered by the earlier cells, framed by horizontal rules.
banner = "=" * 60

print("\n" + banner)
print("DATASET SUMMARY - Hard Hat / PPE Detection")
print(banner)
print(f"Classes: {', '.join(class_names)}")
print(f"Total images: {total_images}")
print(f"Total labels: {total_labels}")
print(f"\nSplit distribution:")
for split, stats in dataset_stats.items():
    n_images = stats['images']
    # Share of all images held by this split; 0 when the dataset is empty.
    if total_images > 0:
        pct = n_images / total_images * 100
    else:
        pct = 0
    print(f"  {split:10s}: {n_images:5d} images ({pct:5.1f}%)")
print("\n‚úÖ Dataset is ready for training!")
print(banner)

Number of samples loaded in FiftyOne dataset: 5000


In [None]:
# ==== STEP 5: Verify YOLO Format Compatibility ====
print("\nüîç Verifying YOLO format compatibility...")


def _classify_yolo_line(line):
    """Classify one non-blank label line as 'valid', 'range' or 'format'.

    'format': fewer than five fields, a class id that is not a non-negative
              integer, or a non-numeric coordinate.
    'range':  well-formed, but one of the four box values lies outside [0, 1].
    'valid':  well-formed with all four box values inside [0, 1].
    """
    parts = line.split()
    if len(parts) < 5:
        return 'format'
    try:
        cls_id = int(parts[0])
        coords = [float(v) for v in parts[1:5]]
    except ValueError:
        return 'format'
    if cls_id < 0:
        return 'format'
    # The same closed interval applies to x, y, w and h alike.
    if all(0 <= c <= 1 for c in coords):
        return 'valid'
    return 'range'


# Check one randomly chosen label file per split (at most three valid ones)
sample_checked = 0
for split in splits:
    if sample_checked >= 3:
        break

    split_dir = DATA_DIR / split
    labels_dir = split_dir / 'labels'

    if not labels_dir.exists():
        continue

    labels = list(labels_dir.rglob('*.txt'))
    if not labels:
        continue

    sample_lbl = random.choice(labels)
    with open(sample_lbl, 'r') as f:
        lines = [l.strip() for l in f if l.strip()]

    # A label file with no annotations produces no report line.
    if not lines:
        continue

    # Inspect every annotation, not only the first; report the worst finding.
    verdicts = {_classify_yolo_line(line) for line in lines}
    if 'format' in verdicts:
        print(f"  ‚ùå {split}/{sample_lbl.name}: Invalid format")
    elif 'range' in verdicts:
        print(f"  ‚ö†Ô∏è  {split}/{sample_lbl.name}: Coordinates out of range")
    else:
        print(f"  ‚úÖ {split}/{sample_lbl.name}: Valid YOLO format")
        sample_checked += 1

print("\n‚úÖ Dataset format validation complete!")

Notebook sessions cannot wait
