In [1]:
import os
import shutil
import random
import pandas as pd
from PIL import Image

In [2]:
# ==== CONFIGURATION ====
# Number of images to draw from the annotated set.
N = 1000

# Inputs: folder holding the source images and the annotation table.
images_folder = "images"
annotations_csv = "raw-dataset.csv"

# Outputs: folder names carry the sample size so runs with different N
# do not write into the same directories.
output_images_folder = "images_{}".format(N)
output_labels_folder = "labels_{}".format(N)

# Species names to export. The position of a name in this list is used
# as its numeric class id when the label files are written.
classes = ["Bythograeid crab"]

In [3]:
# ==== PREPARE OUTPUT ====
# Create both destination folders up front; exist_ok keeps re-running this cell safe.
os.makedirs(output_images_folder, exist_ok=True)
os.makedirs(output_labels_folder, exist_ok=True)

# ==== LOAD ANNOTATIONS ====
# The annotation table is semicolon-delimited. Only rows whose species is listed
# in `classes` are kept, so every remaining row maps to a valid class id.
df = pd.read_csv(annotations_csv, sep=';')
df = df[df['name_sp'].isin(classes)]

# ==== SELECT RANDOM IMAGES ====
# unique() preserves first-appearance order, so the candidate list is stable
# for a given CSV.
all_images = df['name_img'].unique().tolist()
if len(all_images) < N:
    # Make the shortfall visible instead of silently returning fewer images.
    print(f"Note: only {len(all_images)} annotated images available (requested {N}); using all of them.")

# Use a dedicated, seeded generator so the same subset is chosen on every run.
# Without a fixed seed each re-run picks different images, and because the
# output folders are reused they would accumulate more than N files.
# Change SAMPLE_SEED to draw a different (but still reproducible) subset.
SAMPLE_SEED = 42
selected_images = random.Random(SAMPLE_SEED).sample(all_images, min(N, len(all_images)))

  df = pd.read_csv(annotations_csv, sep=';')


In [4]:
# ==== PROCESS EACH SELECTED IMAGE ====
# For every selected image: copy it to the output image folder, read its pixel
# size, and write one YOLO-format label file ("class x_center y_center width
# height", all normalized by the image size) into the output label folder.

# Group the annotations once instead of boolean-masking the whole frame for
# every image; row order inside each group is preserved.
annots_by_image = {name: rows for name, rows in df.groupby('name_img', sort=False)}
no_annotations = df.iloc[0:0]  # empty frame with the same columns

processed_count = 0  # images actually copied and labelled (missing ones excluded)
for img_name in selected_images:
    src_img_path = os.path.join(images_folder, img_name)
    dst_img_path = os.path.join(output_images_folder, img_name)

    if not os.path.exists(src_img_path):
        print(f"Warning: Image {img_name} not found.")
        continue

    # Copy image
    shutil.copy(src_img_path, dst_img_path)

    # Open image to get dimensions
    with Image.open(src_img_path) as img:
        img_width, img_height = img.size

    # Annotations for this image (empty frame if none, giving an empty label file)
    annots = annots_by_image.get(img_name, no_annotations)

    # Write YOLO-format txt
    txt_filename = os.path.splitext(img_name)[0] + ".txt"
    txt_path = os.path.join(output_labels_folder, txt_filename)
    with open(txt_path, "w") as f:
        for _, row in annots.iterrows():
            class_id = classes.index(row['name_sp'])
            # Get box in x1, y1, x2, y2 format
            x1, y1, x2, y2 = row['x1'], row['y1'], row['x2'], row['y2']
            if any(pd.isnull([x1, y1, x2, y2])):
                continue  # skip invalid rows

            # Order the corners, then clamp them to the image so the
            # normalized values always stay within [0, 1]. Boxes that are
            # already inside the image are unaffected.
            left, right = sorted((x1, x2))
            top, bottom = sorted((y1, y2))
            left = min(max(left, 0), img_width)
            right = min(max(right, 0), img_width)
            top = min(max(top, 0), img_height)
            bottom = min(max(bottom, 0), img_height)

            x_center = ((left + right) / 2) / img_width
            y_center = ((top + bottom) / 2) / img_height
            width = (right - left) / img_width
            height = (bottom - top) / img_height

            # Write to txt
            f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")

    processed_count += 1

# Report what was really written; images skipped as missing are not counted.
print(f"✅ Done. {processed_count} images and annotations processed.")


✅ Done. 463 images and annotations processed.
