In [None]:
import os
import shutil
import random
import pandas as pd
import cv2
from glob import glob

# Input locations: the annotation CSV and the root folder searched for images.
csv_file = 'ABOShipsDataset/Labels/Vesibussi_Labels.csv'
images_root = 'ABOShipsDataset/Seaships'

# Output directories for YOLO: one image folder and one label folder per split.
output_images_train, output_images_val, output_images_test = (
    'datasets/ABOShipsDataset/images/train',
    'datasets/ABOShipsDataset/images/val',
    'datasets/ABOShipsDataset/images/test',
)
output_labels_train, output_labels_val, output_labels_test = (
    'datasets/ABOShipsDataset/labels/train',
    'datasets/ABOShipsDataset/labels/val',
    'datasets/ABOShipsDataset/labels/test',
)

# Create every output folder up front; existing folders are left as they are.
for _out_dir in (
    output_images_train,
    output_images_val,
    output_images_test,
    output_labels_train,
    output_labels_val,
    output_labels_test,
):
    os.makedirs(_out_dir, exist_ok=True)

# Read the CSV annotations
df = pd.read_csv(csv_file)

# Define class mapping. IDs follow the order in which each class name first
# appears in the CSV (pandas `unique()` preserves order of appearance), so any
# class-name list kept elsewhere has to use that same order.
classes = {class_name: idx for idx, class_name in enumerate(df['class'].unique())}

# Gather all image file paths (search recursively for PNGs)
image_files = glob(os.path.join(images_root, '**', '*.png'), recursive=True)

# Create a lookup dictionary based on the base filename (without extension).
# Two images with the same base name in different subfolders cannot both be
# represented; the later one wins (as before), but the clash is now reported
# instead of being silently ignored.
image_dict = {}
for _path in image_files:
    _stem = os.path.splitext(os.path.basename(_path))[0]
    if _stem in image_dict:
        print(f"Warning: duplicate image name '{_stem}': {_path} replaces {image_dict[_stem]}")
    image_dict[_stem] = _path

# Extensions stripped from CSV filenames so they can match the keys of image_dict.
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp')


def _annotation_key(name):
    """Return the image_dict-style key for a filename taken from the CSV.

    Any directory part is dropped, and the extension is removed only when it
    is a known image extension, so names that merely contain a dot are kept
    intact. Names that already are bare stems are returned unchanged.
    """
    base = os.path.basename(str(name))
    stem, ext = os.path.splitext(base)
    return stem if ext.lower() in _IMAGE_EXTENSIONS else base


# Group CSV rows by filename so that each image gets all its boxes together.
# The grouping key is normalized to match the keys of image_dict; without this,
# a CSV that stores "name.png" would never match and every label file would be
# empty. Rows with a missing filename stay NaN and are dropped by groupby.
grouped = df.groupby(df['filename'].map(_annotation_key, na_action='ignore'))

# Set of annotated filenames (from the CSV)
annotated_filenames = set(grouped.groups.keys())

# Create a full list of all image filenames (from the found image files)
all_image_filenames = list(image_dict.keys())

# Shuffle and split into 80% train, 10% val, 10% test.
# glob() returns files in arbitrary, filesystem-dependent order, so the list is
# sorted first; otherwise the fixed seed below would still produce a different
# split on another machine or after the files are re-copied.
# NOTE: images are copied into the split folders without clearing them first.
# If an export made with a different ordering already exists, delete the output
# directories before re-running so that no image ends up in two splits.
all_image_filenames.sort()
random.seed(42)
random.shuffle(all_image_filenames)

n_total = len(all_image_filenames)
n_train = int(0.8 * n_total)
n_val = int(0.1 * n_total)
# The test split takes the remainder, so rounding never drops an image.
n_test = n_total - n_train - n_val

# Sets give constant-time membership checks when each image is routed to a split.
train_filenames = set(all_image_filenames[:n_train])
val_filenames = set(all_image_filenames[n_train:n_train + n_val])
test_filenames = set(all_image_filenames[n_train + n_val:])

def _to_yolo_line(class_id, xmin, xmax, ymin, ymax, img_w, img_h):
    """Convert one pixel-space box into a YOLO label line.

    The corners are put in order (in case min/max are swapped in the CSV) and
    clipped to the image, because YOLO labels must be normalized to the 0-1
    range. For a box that already lies inside the image the result is the
    same as the plain center/size conversion.

    Args:
        class_id: Integer class index written at the start of the line.
        xmin, xmax, ymin, ymax: Box corners in pixels.
        img_w, img_h: Image width and height in pixels.

    Returns:
        The line "<class> <x_center> <y_center> <width> <height>" with six
        decimals per value, or None when the box has missing coordinates or
        no area left inside the image.
    """
    left, right = sorted((xmin, xmax))
    top, bottom = sorted((ymin, ymax))
    left, right = min(max(left, 0), img_w), min(max(right, 0), img_w)
    top, bottom = min(max(top, 0), img_h), min(max(bottom, 0), img_h)

    bbox_w = (right - left) / img_w
    bbox_h = (bottom - top) / img_h
    # Written as a positive test on purpose: comparisons with NaN are False,
    # so boxes with missing coordinates are rejected here as well.
    if not (bbox_w > 0 and bbox_h > 0):
        return None

    x_center = ((left + right) / 2) / img_w
    y_center = ((top + bottom) / 2) / img_h
    return f"{class_id} {x_center:.6f} {y_center:.6f} {bbox_w:.6f} {bbox_h:.6f}"


# Process each image file: build its label lines, then copy the image and write
# the label file into the folder of the split the image was assigned to.
skipped_boxes = 0
for filename in all_image_filenames:
    image_path = image_dict[filename]
    # The image is decoded only to obtain its height and width.
    img = cv2.imread(image_path)
    if img is None:
        print(f"Error reading image: {image_path}")
        continue
    img_h, img_w = img.shape[:2]

    # Prepare annotation lines in YOLO format. Images without CSV rows still
    # get an (empty) label file.
    annotation_lines = []
    if filename in annotated_filenames:
        group = grouped.get_group(filename)
        boxes = zip(group['class'], group['xmin'], group['xmax'], group['ymin'], group['ymax'])
        for class_name, xmin, xmax, ymin, ymax in boxes:
            line = _to_yolo_line(classes[class_name], xmin, xmax, ymin, ymax, img_w, img_h)
            if line is None:
                skipped_boxes += 1
            else:
                annotation_lines.append(line)

    # Decide split (anything not in train or val goes to test)
    if filename in train_filenames:
        dest_img_dir = output_images_train
        dest_label_dir = output_labels_train
    elif filename in val_filenames:
        dest_img_dir = output_images_val
        dest_label_dir = output_labels_val
    else:
        dest_img_dir = output_images_test
        dest_label_dir = output_labels_test

    # Copy image
    image_name = os.path.basename(image_path)
    dest_image_path = os.path.join(dest_img_dir, image_name)
    shutil.copy(image_path, dest_image_path)

    # Write annotation file (same base name as the image, .txt extension)
    label_filename = os.path.splitext(image_name)[0] + '.txt'
    dest_label_path = os.path.join(dest_label_dir, label_filename)
    with open(dest_label_path, 'w') as f:
        f.writelines(line + "\n" for line in annotation_lines)

# Report dropped boxes once at the end rather than discarding them silently.
if skipped_boxes:
    print(f"Skipped {skipped_boxes} box(es) with missing coordinates or no area inside the image")