# Annotations xml to txt 

In [5]:
import os
import xml.etree.ElementTree as ET

def voc_to_yolo_and_classes(xml_folder, output_folder):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Set to store unique class names
    classes = set()

    # Collect all class names first
    for xml_file in os.listdir(xml_folder):
        if xml_file.endswith('.xml'):
            xml_path = os.path.join(xml_folder, xml_file)
            tree = ET.parse(xml_path)
            root = tree.getroot()

            for obj in root.findall('object'):
                class_name = obj.find('name').text
                classes.add(class_name)

    # Sort classes and create class-to-ID mapping
    sorted_classes = sorted(classes)
    class_to_id = {class_name: i for i, class_name in enumerate(sorted_classes)}

    # Save class names to file
    classes_file = os.path.join(output_folder, "class_names.txt")
    with open(classes_file, "w") as f:
        f.write("\n".join(sorted_classes))

    print(f"Saved class names to: {classes_file}")

    # Process each XML file again to generate YOLO labels
    for xml_file in os.listdir(xml_folder):
        if not xml_file.endswith('.xml'):
            continue

        xml_path = os.path.join(xml_folder, xml_file)
        tree = ET.parse(xml_path)
        root = tree.getroot()

        # Get image dimensions
        size = root.find('size')
        img_width = int(size.find('width').text)
        img_height = int(size.find('height').text)

        # Prepare YOLO annotations
        yolo_annotations = []
        for obj in root.findall('object'):
            class_name = obj.find('name').text
            class_id = class_to_id[class_name]  # ✅ Get correct ID

            bbox = obj.find('bndbox')
            xmin = float(bbox.find('xmin').text)
            ymin = float(bbox.find('ymin').text)
            xmax = float(bbox.find('xmax').text)
            ymax = float(bbox.find('ymax').text)

            # Convert to YOLO format
            center_x = ((xmin + xmax) / 2) / img_width
            center_y = ((ymin + ymax) / 2) / img_height
            box_width = (xmax - xmin) / img_width
            box_height = (ymax - ymin) / img_height

            yolo_annotations.append(f"{class_id} {center_x:.6f} {center_y:.6f} {box_width:.6f} {box_height:.6f}")

        # Save YOLO annotation to .txt file
        yolo_file = os.path.join(output_folder, xml_file.replace('.xml', '.txt'))
        with open(yolo_file, 'w') as f:
            f.write("\n".join(yolo_annotations))

    print(f"Conversion completed. YOLO annotations saved to '{output_folder}'.")

# Example Usage
xml_folder = "/mnt/f/final2/China_MotorBike/China_MotorBike/train/annotations/xmls"
output_folder = "/mnt/f/final2/China_MotorBike/China_MotorBike/train/labels"  

voc_to_yolo_and_classes(xml_folder, output_folder)


Saved class names to: /mnt/f/final2/China_MotorBike/China_MotorBike/train/labels/class_names.txt
Conversion completed. YOLO annotations saved to '/mnt/f/final2/China_MotorBike/China_MotorBike/train/labels'.


# Class Mapping

In [6]:
import os

# Path to your YOLO annotation files
annotation_folder = "/mnt/f/final2/China_MotorBike/China_MotorBike/train/labels"

# Updated class mapping
class_mapping = {
    "D00": "0",
    "D01": "5",
    "D0w0": "6",
    "D10": "1",
    "D11": "4",
    "D20": "2",
    "D40": "3",
    "D43": "7",
    "D44": "8",
    "D50": "9"
}

# Function to update YOLO annotations
def update_class_names(annotation_folder, class_mapping):
   
    if not os.path.exists(annotation_folder):
        print(f"Annotation folder {annotation_folder} does not exist.")
        return

    for file_name in os.listdir(annotation_folder):
        if file_name.endswith(".txt"):
            file_path = os.path.join(annotation_folder, file_name)

            with open(file_path, "r") as file:
                lines = file.readlines()

            updated_lines = []
            for line in lines:
                parts = line.strip().split()
                if parts:  # Ensure line is not empty
                    old_class_name = parts[0]
                    if old_class_name in class_mapping:
                        parts[0] = class_mapping[old_class_name]  # Update class ID
                        updated_lines.append(" ".join(parts))

            # Only update the file if changes were made
            if updated_lines:
                with open(file_path, "w") as file:
                    file.write("\n".join(updated_lines))

                print(f"Updated annotations in {file_name}")

update_class_names(annotation_folder, class_mapping)


Updated annotations in class_names.txt


In [8]:
import os
from collections import defaultdict


ANNOTATIONS_DIR = "/mnt/f/final2/China_MotorBike/China_MotorBike/train/labels" 


class_counts = defaultdict(int)

for file in os.listdir(ANNOTATIONS_DIR):
    if file.endswith(".txt"):  
        with open(os.path.join(ANNOTATIONS_DIR, file), "r") as f:
            lines = f.readlines()
            for line in lines:
                class_id = line.split()[0]  
                class_counts[class_id] += 1

# Print class distribution
for class_id, count in class_counts.items():
    print(f"Class {class_id}: {count} annotations")

Class 4: 277 annotations
Class 2: 1476 annotations
Class 0: 9429 annotations
Class 3: 371 annotations
Class 1: 4392 annotations


# Data Spliting (Train-Val-Test)

In [9]:
import os
import shutil
import random

# Define source directories
SOURCE_IMAGE_DIR = "/mnt/f/final2/China_MotorBike/China_MotorBike/train/images"  
SOURCE_LABEL_DIR = "/mnt/f/final2/China_MotorBike/China_MotorBike/train/labels"  

# Define base output directory
OUTPUT_DIR = "/mnt/f/final2/DATASET"

# Define train, val, and test subdirectories dynamically
SPLITS = ["train", "val", "test"]
for split in SPLITS:
    os.makedirs(os.path.join(OUTPUT_DIR, split, "images"), exist_ok=True)
    os.makedirs(os.path.join(OUTPUT_DIR, split, "labels"), exist_ok=True)

# Get all image files (assuming .jpg and .png images)
image_files = [f for f in os.listdir(SOURCE_IMAGE_DIR) if f.endswith((".jpg", ".png"))]
random.shuffle(image_files)  # Shuffle to ensure random distribution

# Calculate split indices
total_images = len(image_files)
train_split = int(0.7 * total_images)
val_split = int(0.2 * total_images)

train_files = image_files[:train_split]
val_files = image_files[train_split:train_split + val_split]
test_files = image_files[train_split + val_split:]

# Function to move images and corresponding labels
def move_files(files, src_img_dir, src_label_dir, dest_img_dir, dest_label_dir):
    for file in files:
        # Move image
        shutil.move(os.path.join(src_img_dir, file), os.path.join(dest_img_dir, file))

        # Move corresponding label file (if it exists)
        label_file = file.replace(".jpg", ".txt").replace(".png", ".txt")
        label_path = os.path.join(src_label_dir, label_file)
        if os.path.exists(label_path):
            shutil.move(label_path, os.path.join(dest_label_dir, label_file))

# Move files to respective folders
move_files(train_files, SOURCE_IMAGE_DIR, SOURCE_LABEL_DIR, os.path.join(OUTPUT_DIR, "train/images"), os.path.join(OUTPUT_DIR, "train/labels"))
move_files(val_files, SOURCE_IMAGE_DIR, SOURCE_LABEL_DIR, os.path.join(OUTPUT_DIR, "val/images"), os.path.join(OUTPUT_DIR, "val/labels"))
move_files(test_files, SOURCE_IMAGE_DIR, SOURCE_LABEL_DIR, os.path.join(OUTPUT_DIR, "test/images"), os.path.join(OUTPUT_DIR, "test/labels"))

# Summary of the split
print("Dataset split completed!")
print(f" Train: {len(train_files)} images")
print(f" Validation: {len(val_files)} images")
print(f" Test: {len(test_files)} images")


Dataset split completed!
 Train: 7759 images
 Validation: 2217 images
 Test: 1109 images


# Removing Irrelevant Classes

# Train Set

In [10]:
import os

# Define the path to the validation labels
VAL_LABELS_DIR = "/mnt/f/final2/DATASET/train/labels"

# Define the classes to remove
UNWANTED_CLASSES = {"7", "8", "9", "Repair"}

def clean_labels(label_dir):
    
    for label_file in os.listdir(label_dir):
        if label_file.endswith(".txt"):
            file_path = os.path.join(label_dir, label_file)
            
            with open(file_path, "r") as f:
                lines = f.readlines()

            # Keep only lines that don't have unwanted class labels
            filtered_lines = [line for line in lines if line.split()[0] not in UNWANTED_CLASSES]

            # Overwrite the file with cleaned data
            with open(file_path, "w") as f:
                f.writelines(filtered_lines)

            # Print info if file was modified
            if len(lines) != len(filtered_lines):
                print(f"Cleaned {label_file}: Removed {len(lines) - len(filtered_lines)} instances")

# Run the cleaning function
clean_labels(VAL_LABELS_DIR)

print("\nCleaning completed. Run the label check script again to verify!")


Cleaning completed. Run the label check script again to verify!


In [14]:
import os

def merge_and_clean_labels(labels_folder):
    for label_file in os.listdir(labels_folder):
        if label_file.endswith(".txt"):
            file_path = os.path.join(labels_folder, label_file)

            # Read original labels
            with open(file_path, "r") as f:
                lines = f.readlines()

            new_lines = []
            for line in lines:
                parts = line.strip().split()
                if not parts:
                    continue  # Skip empty lines

                class_id = parts[0]

                # Convert to integer
                if class_id.endswith(".0"):
                    class_id = class_id[:-2]  # Remove .0

                # Remove class 4
                if class_id == "4":
                    continue  

                # Reconstruct the YOLO annotation line
                new_line = f"{class_id} " + " ".join(parts[1:])
                new_lines.append(new_line)

            # Overwrite file with cleaned labels
            with open(file_path, "w") as f:
                f.write("\n".join(new_lines))

    print("Label merging and cleaning completed!")

# Example usage
labels_folder = "/mnt/f/final2/DATASET/train/labels"  
merge_and_clean_labels(labels_folder)


Label merging and cleaning completed!


In [15]:
import os
from collections import Counter

# Path to your validation labels directory
VAL_LABELS_DIR = "/mnt/f/final2/DATASET/train/labels"

def count_labels(label_dir):
    class_counter = Counter()

    if not os.path.exists(label_dir):
        print(f"Directory {label_dir} does not exist!")
        return

    # Read each annotation file
    for label_file in os.listdir(label_dir):
        if label_file.endswith(".txt"):
            with open(os.path.join(label_dir, label_file), "r") as f:
                lines = f.readlines()
                for line in lines:
                    parts = line.strip().split()
                    if parts:
                        class_label = parts[0]  # First item is the class ID
                        class_counter[class_label] += 1

    return class_counter

# Run the function on val set
label_counts = count_labels(VAL_LABELS_DIR)

# Print the label distribution
if label_counts:
    print("\nClass Label Distribution in Train Set:")
    for class_label, count in sorted(label_counts.items()):
        print(f"Class {class_label}: {count} instances")
else:
    print("No labels found in Train set!")


Class Label Distribution in Train Set:
Class 0: 8251 instances
Class 1: 3884 instances
Class 2: 2620 instances
Class 3: 3114 instances


# Test Set

In [16]:
import os

# Define the path to the validation labels
VAL_LABELS_DIR = "/mnt/f/final2/DATASET/test/labels"

# Define the classes to remove
UNWANTED_CLASSES = {"7", "8", "9", "Repair"}

def clean_labels(label_dir):
    
    for label_file in os.listdir(label_dir):
        if label_file.endswith(".txt"):
            file_path = os.path.join(label_dir, label_file)
            
            with open(file_path, "r") as f:
                lines = f.readlines()

            # Keep only lines that don't have unwanted class labels
            filtered_lines = [line for line in lines if line.split()[0] not in UNWANTED_CLASSES]

            # Overwrite the file with cleaned data
            with open(file_path, "w") as f:
                f.writelines(filtered_lines)

            # Print info if file was modified
            if len(lines) != len(filtered_lines):
                print(f"Cleaned {label_file}: Removed {len(lines) - len(filtered_lines)} instances")

# Run the cleaning function
clean_labels(VAL_LABELS_DIR)

print("\n✅ Cleaning completed. Run the label check script again to verify!")


✅ Cleaning completed. Run the label check script again to verify!


In [18]:
import os

def merge_and_clean_labels(labels_folder):
    for label_file in os.listdir(labels_folder):
        if label_file.endswith(".txt"):
            file_path = os.path.join(labels_folder, label_file)

            # Read original labels
            with open(file_path, "r") as f:
                lines = f.readlines()

            new_lines = []
            for line in lines:
                parts = line.strip().split()
                if not parts:
                    continue  # Skip empty lines

                class_id = parts[0]

                # Convert to integer
                if class_id.endswith(".0"):
                    class_id = class_id[:-2]  # Remove .0

                # Remove class 4
                if class_id == "4":
                    continue  

                # Reconstruct the YOLO annotation line
                new_line = f"{class_id} " + " ".join(parts[1:])
                new_lines.append(new_line)

            # Overwrite file with cleaned labels
            with open(file_path, "w") as f:
                f.write("\n".join(new_lines))

    print("Label merging and cleaning completed!")

# Example usage
labels_folder = "/mnt/f/final2/DATASET/test/labels"  
merge_and_clean_labels(labels_folder)


Label merging and cleaning completed!


In [19]:
import os
from collections import Counter

# Path to your validation labels directory
VAL_LABELS_DIR = "/mnt/f/final2/DATASET/test/labels"

def count_labels(label_dir):
    class_counter = Counter()

    if not os.path.exists(label_dir):
        print(f"Directory {label_dir} does not exist!")
        return

    # Read each annotation file
    for label_file in os.listdir(label_dir):
        if label_file.endswith(".txt"):
            with open(os.path.join(label_dir, label_file), "r") as f:
                lines = f.readlines()
                for line in lines:
                    parts = line.strip().split()
                    if parts:
                        class_label = parts[0]  # First item is the class ID
                        class_counter[class_label] += 1

    return class_counter

# Run the function on val set
label_counts = count_labels(VAL_LABELS_DIR)

# Print the label distribution
if label_counts:
    print("\nClass Label Distribution in Test Set:")
    for class_label, count in sorted(label_counts.items()):
        print(f"Class {class_label}: {count} instances")
else:
    print("No labels found in test set!")


Class Label Distribution in Test Set:
Class 0: 1248 instances
Class 1: 572 instances
Class 2: 362 instances
Class 3: 395 instances


# Val Set

In [20]:
import os

# Define the path to the validation labels
VAL_LABELS_DIR = "/mnt/f/final2/DATASET/val/labels"

# Define the classes to remove
UNWANTED_CLASSES = {"7", "8", "9", "Repair"}

def clean_labels(label_dir):
    
    for label_file in os.listdir(label_dir):
        if label_file.endswith(".txt"):
            file_path = os.path.join(label_dir, label_file)
            
            with open(file_path, "r") as f:
                lines = f.readlines()

            # Keep only lines that don't have unwanted class labels
            filtered_lines = [line for line in lines if line.split()[0] not in UNWANTED_CLASSES]

            # Overwrite the file with cleaned data
            with open(file_path, "w") as f:
                f.writelines(filtered_lines)

            # Print info if file was modified
            if len(lines) != len(filtered_lines):
                print(f"Cleaned {label_file}: Removed {len(lines) - len(filtered_lines)} instances")

# Run the cleaning function
clean_labels(VAL_LABELS_DIR)

print("\n Cleaning completed. Run the label check script again to verify!")


 Cleaning completed. Run the label check script again to verify!


In [24]:
import os

def merge_and_clean_labels(labels_folder):
    for label_file in os.listdir(labels_folder):
        if label_file.endswith(".txt"):
            file_path = os.path.join(labels_folder, label_file)

            # Read original labels
            with open(file_path, "r") as f:
                lines = f.readlines()

            new_lines = []
            for line in lines:
                parts = line.strip().split()
                if not parts:
                    continue  # Skip empty lines

                class_id = parts[0]

                # Convert to integer
                if class_id.endswith(".0"):
                    class_id = class_id[:-2]  # Remove .0

                # Remove class 4
                if class_id == "4":
                    continue  

                # Reconstruct the YOLO annotation line
                new_line = f"{class_id} " + " ".join(parts[1:])
                new_lines.append(new_line)

            # Overwrite file with cleaned labels
            with open(file_path, "w") as f:
                f.write("\n".join(new_lines))

    print("Label merging and cleaning completed!")

# Example usage
labels_folder = "/mnt/f/final2/DATASET/val/labels"  
merge_and_clean_labels(labels_folder)


Label merging and cleaning completed!


In [25]:
import os
from collections import Counter

# Path to your validation labels directory
VAL_LABELS_DIR = "/mnt/f/final2/DATASET/val/labels"

def count_labels(label_dir):
    class_counter = Counter()

    if not os.path.exists(label_dir):
        print(f"Directory {label_dir} does not exist!")
        return

    # Read each annotation file
    for label_file in os.listdir(label_dir):
        if label_file.endswith(".txt"):
            with open(os.path.join(label_dir, label_file), "r") as f:
                lines = f.readlines()
                for line in lines:
                    parts = line.strip().split()
                    if parts:
                        class_label = parts[0]  # First item is the class ID
                        class_counter[class_label] += 1

    return class_counter

# Run the function on val set
label_counts = count_labels(VAL_LABELS_DIR)

# Print the label distribution
if label_counts:
    print("\nClass Label Distribution in Validation Set:")
    for class_label, count in sorted(label_counts.items()):
        print(f"Class {class_label}: {count} instances")
else:
    print("No labels found in validation set!")


Class Label Distribution in Validation Set:
Class 0: 2438 instances
Class 1: 1227 instances
Class 2: 748 instances
Class 3: 906 instances
