### Data preprocessing
This notebook processes a dataset of LEGO images and annotations, preparing it for training an object detection model using YOLO. 

The script:
1. Parses XML annotations in PASCAL VOC format
2. Filters problematic data (unusual boxes, duplicates, etc.)
3. Converts annotations to YOLO format
4. Splits data into training, validation, and test sets

All LEGO pieces are treated as a single class for object detection.

In [34]:
import os
import xml.etree.ElementTree as ET
from collections import Counter
import shutil
from sklearn.model_selection import train_test_split
import random
import glob

In [35]:
# Set paths
img_dir = 'dataset_20210629145407_top_600/images'
ann_dir = 'dataset_20210629145407_top_600/annotations'
output_dir = "dataset_yolo"

In [36]:
# Step 1: Parse XML annotations and limit the number for faster processing
def parse_annotations(annotation_dir, max_annotations=15000):
    """
    Parse XML annotations, filter nonexistent files, and limit the number of annotations.
    Args:
        annotation_dir: Directory containing XML annotation files
        max_annotations: Maximum number of annotations to parse
    Returns: List of parsed annotation dictionaries
    """
    annotations = []
    count = 0
    
    for file in os.listdir(annotation_dir):
        count += 1
        # limit the number of annotations
        if count >= max_annotations:
            break
        
        # confirm the file is an XML file
        if not file.endswith('.xml'):
            continue
        
        try:
            # parse the XML file
            file_path = os.path.join(annotation_dir, file)
            tree = ET.parse(file_path)
            root = tree.getroot()
            
            # Skip files with missing required elements
            if root.find('filename') is None or root.find('size/width') is None:
                continue
                
            image_data = {
                'filename': root.find('filename').text,
                'xml_file': file,
                'size': {
                    'width': int(root.find('size/width').text),
                    'height': int(root.find('size/height').text),
                },
                'objects': []
            }
            
            for obj in root.findall('object'):
                # Skip objects with missing required elements
                if obj.find('name') is None or obj.find('bndbox') is None:
                    continue
                    
                bbox = obj.find('bndbox')
                if (bbox.find('xmin') is None or bbox.find('ymin') is None or
                    bbox.find('xmax') is None or bbox.find('ymax') is None):
                    continue
                
                # Get difficult flag if available, default to 0
                difficult = 0
                if obj.find('difficult') is not None:
                    difficult = int(obj.find('difficult').text)
                
                obj_data = {
                    'name': obj.find('name').text,
                    'difficult': difficult,
                    'bbox': {
                        'xmin': int(float(bbox.find('xmin').text)),
                        'ymin': int(float(bbox.find('ymin').text)),
                        'xmax': int(float(bbox.find('xmax').text)),
                        'ymax': int(float(bbox.find('ymax').text)),
                    }
                }
                image_data['objects'].append(obj_data)
            
            # Only add images with at least one valid object
            if image_data['objects']:
                annotations.append(image_data)
                
        except Exception as e:
            print(f"Error parsing {file}: {e}")
    
    print(f"\nParsed {len(annotations)} valid annotations out of {count} files")
    return annotations

# Parse annotations
annotations = parse_annotations(ann_dir)



Parsed 14999 valid annotations out of 15000 files


In [37]:
# Step 2: Filter problematic annotations
# Filter unusual bounding box sizes
def check_bounding_box(annotations, min_area=100, max_area=50000, min_aspect_ratio=0.2, max_aspect_ratio=5.0):
    """
    Check bounding box areas and filter unusual sizes, such as too small or too large,
    too wide or too narrow.
    Args:
        annotations: List of parsed annotations
        min_area: Minimum bounding box area, default 100 (10x10)
        max_area: Maximum bounding box area, default 50000 (200x250)
        min_aspect_ratio: Minimum bounding box aspect ratio, default 0.2 (1:5)
        max_aspect_ratio: Maximum bounding box aspect ratio, default 5.0 (5:1)
    Returns: List of filtered annotations
    """
    annotations_filtered = []
    for ann in annotations:
        valid = True
        for obj in ann['objects']:
            bbox = obj['bbox']
            width = bbox['xmax'] - bbox['xmin']
            height = bbox['ymax'] - bbox['ymin']
            area = width * height
            aspect_ratio = width / height if height > 0 else 0
            
            if area < min_area or area > max_area or aspect_ratio < min_aspect_ratio or aspect_ratio > max_aspect_ratio:
                valid = False
                break
        if valid:
            annotations_filtered.append(ann)
    print(f"\nFiltered {len(annotations) - len(annotations_filtered)} annotations with unusual bounding box sizes")
    print(f"Remaining annotations: {len(annotations_filtered)}")
    return annotations_filtered

# Filter annotations
annotations = check_bounding_box(annotations)


Filtered 4096 annotations with unusual bounding box sizes
Remaining annotations: 10903


In [38]:
# Check intersection over union (IoU) between bounding boxes and filter high IoU
def calculate_iou(bbox1, bbox2):
    """
    Calculate intersection over union (IoU) between two bounding boxes.
    Args:
        bbox1: First bounding box coordinates (xmin, ymin, xmax, ymax)
        bbox2: Second bounding box coordinates (xmin, ymin, xmax, ymax)
    Returns: IoU value
    """
    x_left = max(bbox1['xmin'], bbox2['xmin'])
    y_top = max(bbox1['ymin'], bbox2['ymin'])
    x_right = min(bbox1['xmax'], bbox2['xmax'])
    y_bottom = min(bbox1['ymax'], bbox2['ymax'])
    
    intersection = max(0, x_right - x_left) * max(0, y_bottom - y_top)
    area1 = (bbox1['xmax'] - bbox1['xmin']) * (bbox1['ymax'] - bbox1['ymin'])
    area2 = (bbox2['xmax'] - bbox2['xmin']) * (bbox2['ymax'] - bbox2['ymin'])
    union = area1 + area2 - intersection
    
    return intersection / union if union > 0 else 0.0

def check_iou(annotations, max_iou=0.9):
    """
    Check intersection over union (IoU) between bounding boxes and filter annotations with duplicates.
    Args:
        annotations: List of parsed annotations
        max_iou: Maximum IoU threshold to consider boxes as duplicates, default 0.9
    Returns: List of filtered annotations without duplicated bounding boxes
    """
    annotations_filtered = []
    duplicate = 0
    for ann in annotations:
        valid = True
        objects = ann['objects']
        num_objects = len(objects)
        
        # Skip IoU if there is only one object
        if num_objects < 2:
            annotations_filtered.append(ann)
            continue
        
        # Calculate IoU for all pairs of objects
        for i in range(num_objects):
            for j in range(i + 1, num_objects):
                iou = calculate_iou(objects[i]['bbox'], objects[j]['bbox'])
                if iou > max_iou:
                    duplicate += 1
                    valid = False
                    break
            if not valid:
                break
        if valid:
            annotations_filtered.append(ann)
    
    print(f"\nFiltered {duplicate} annotations with duplicate boxes (IoU > {max_iou})")
    print(f"Remaining annotations: {len(annotations_filtered)}")
    return annotations_filtered

annotations = check_iou(annotations)


Filtered 0 annotations with duplicate boxes (IoU > 0.9)
Remaining annotations: 10903


In [39]:
# Check image files and filter annotations with missing images
def check_image(annotations, img_dir):
    """
    Check if image files exist for each annotation.
    Args:
        annotations: List of parsed annotations
        img_dir: Directory containing image files
    Returns: List of filtered annotations with existing image files
    """
    annotations_filtered = []
    for ann in annotations:
        img = ann['filename']
        if os.path.exists(os.path.join(img_dir, img)):
            annotations_filtered.append(ann)
    print(f"\nFiltered {len(annotations) - len(annotations_filtered)} annotations with missing image files")
    print(f"Remaining annotations: {len(annotations_filtered)}")
    return annotations_filtered


annotations = check_image(annotations, img_dir)


Filtered 1 annotations with missing image files
Remaining annotations: 10902


In [40]:
# Create directories
def create_directory_structure(base_dir):
    """
    Create the directory structure for the YOLO dataset.
    Args:
        base_dir: Base directory for the dataset
    Returns: Dictionary of directory paths
    """
    
    # Create base directory
    os.makedirs(base_dir, exist_ok=True)
    
    # Create split directories
    dirs = {}
    for split in ['train', 'val', 'test']:
        # Create main split directory
        split_dir = os.path.join(base_dir, split)
        os.makedirs(split_dir, exist_ok=True)
        
        # Create images and labels directories
        images_dir = os.path.join(split_dir, 'images')
        labels_dir = os.path.join(split_dir, 'labels')
        os.makedirs(images_dir, exist_ok=True)
        os.makedirs(labels_dir, exist_ok=True)
        
        # Store directory paths
        dirs[f'{split}_dir'] = split_dir
        dirs[f'{split}_images_dir'] = images_dir
        dirs[f'{split}_labels_dir'] = labels_dir
    return dirs

dirs = create_directory_structure(output_dir)


In [41]:
# Step 3: Convert annotations to YOLO format and process splits
def normalize_bbox(bbox, img_width, img_height):
    """
    Convert a bounding box to YOLO format.
    Args:
        bbox: Dictionary with xmin, ymin, xmax, ymax keys
        img_width: Width of the image
        img_height: Height of the image
    Returns: Tuple of (x_center, y_center, width, height) normalized to [0, 1]
    """
    # Calculate center coordinates and dimensions
    x_center = (bbox['xmin'] + bbox['xmax']) / 2 / img_width
    y_center = (bbox['ymin'] + bbox['ymax']) / 2 / img_height
    width = (bbox['xmax'] - bbox['xmin']) / img_width
    height = (bbox['ymax'] - bbox['ymin']) / img_height
    
    # Ensure values are within [0, 1]
    x_center = max(0, min(1, x_center))
    y_center = max(0, min(1, y_center))
    width = max(0, min(1, width))
    height = max(0, min(1, height))
    
    return x_center, y_center, width, height

def process_split(split_annotations, images_dir, labels_dir, source_img_dir, class_id=0):
    """
    Process a data split by copying images and creating label files in YOLO format.
    
    Args:
        split_annotations: List of annotations for this split
        images_dir: Destination directory for images
        labels_dir: Destination directory for labels
        source_img_dir: Source directory containing original images
        class_id: Class ID for YOLO format (default: 0 for all LEGO pieces)
    """
    
    for ann in split_annotations:
        # Copy image
        src_img_path = os.path.join(source_img_dir, ann['filename'])
        dst_img_path = os.path.join(images_dir, ann['filename'])
        shutil.copy2(src_img_path, dst_img_path)
        
        # Create label file
        filename = os.path.splitext(ann['filename'])[0]
        label_path = os.path.join(labels_dir, f"{filename}.txt")
        
        with open(label_path, 'w') as f:
            img_width = ann['size']['width']
            img_height = ann['size']['height']
            
            for obj in ann['objects']:
                # Convert to YOLO format
                x_center, y_center, width, height = normalize_bbox(obj['bbox'], img_width, img_height)
                
                # Write YOLO format: class_id x_center y_center width height
                f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")

print("Splitting dataset into train, validation, and test sets...")
random.seed(42)
random.shuffle(annotations)

# Split into train (70%), validation (15%), and test (15%)
train_annotations, temp_annotations = train_test_split(annotations, test_size=0.3, random_state=42)
val_annotations, test_annotations = train_test_split(temp_annotations, test_size=0.5, random_state=42)

splits = {
        'train': len(train_annotations),
        'val': len(val_annotations),
        'test': len(test_annotations)
    }

print(f"Train set: {splits['train']} annotations")
print(f"Validation set: {splits['val']} annotations")
print(f"Test set: {splits['test']} annotations")

# Process splits
process_split(train_annotations, dirs['train_images_dir'], dirs['train_labels_dir'], img_dir)
process_split(val_annotations, dirs['val_images_dir'], dirs['val_labels_dir'], img_dir)
process_split(test_annotations, dirs['test_images_dir'], dirs['test_labels_dir'], img_dir)
print("Complete process splits")

Splitting dataset into train, validation, and test sets...
Train set: 7631 annotations
Validation set: 1635 annotations
Test set: 1636 annotations
Complete process splits


In [42]:
# Step 4: Create a YAML configuration file for YOLOv8
def create_yaml_config(output_dir, splits):
    """
    Create a YAML configuration file for YOLOv8.
    Args:
        output_dir: Output directory for the dataset
        splits: Dictionary containing split statistics
    """
    yaml_content = f"""
path: {os.path.abspath(output_dir)}
train: train/images
val: val/images
test: test/images
nc: 1  # number of classes
names: ['lego']  # class names
"""
    # Save the YAML content to a file
    yaml_path = os.path.join(output_dir, 'dataset.yaml')
    with open(yaml_path, 'w') as f:
        f.write(yaml_content)
    
    print(f"\nYAML configuration saved to {yaml_path}")

# Create YAML configuration file
create_yaml_config(output_dir, splits)



YAML configuration saved to dataset_yolo/dataset.yaml


In [43]:
def verify_dataset(dirs):
    """
    Verify the dataset structure and content.
    Args:
        dirs: Dictionary of directory paths
    """
    # Count files in each directory
    for split in ['train', 'val', 'test']:
        images_count = len(os.listdir(dirs[f'{split}_images_dir']))
        labels_count = len(os.listdir(dirs[f'{split}_labels_dir']))
        
        print(f"{split.capitalize()} images: {images_count}")
        print(f"{split.capitalize()} labels: {labels_count}")
        
        # Ensure counts match
        if images_count != labels_count:
            print(f"Warning: Mismatch between {split} images and labels!")
    
    # Check a random label file to confirm format
    split = random.choice(['train', 'val', 'test'])
    label_files = glob.glob(os.path.join(dirs[f'{split}_labels_dir'], '*.txt'))
    
    if label_files:
        sample_label = random.choice(label_files)
        print(f"\nSample label file ({os.path.basename(sample_label)}):")
        with open(sample_label, 'r') as f:
            print(f.read())

# Verify the dataset
verify_dataset(dirs)

Train images: 7631
Train labels: 7631
Val images: 1635
Val labels: 1635
Test images: 1636
Test labels: 1636

Sample label file (ff0fd59e-da2d-11eb-90ee-3497f683a169.txt):
0 0.630000 0.675000 0.143333 0.086667
0 0.037500 0.628333 0.075000 0.076667
0 0.915833 0.424167 0.168333 0.148333
0 0.215833 0.289167 0.108333 0.128333

