In [58]:
# NOTE(review): none of the visible cells import `ace_tools` — confirm this
# install is actually needed before keeping it. If it is, prefer `%pip` with a
# pinned version so the package is installed into the kernel's environment.
!pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0


In [51]:
import json
import os
import shutil

import cv2
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [52]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used by
# this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
# === USER SETTINGS ===
# Root of the dataset on the mounted Drive; every other path derives from it.
BASE_PATH = "/content/drive/MyDrive/DeepSeaProject/dataset_seanoe_101899"
CSV_NAME = "raw-dataset.csv"      # annotation table, relative to BASE_PATH
IMAGES_FOLDER = "images/Images"   # image folder, relative to BASE_PATH
RANDOM_SEED = 42                  # seed passed to the train/val split
VAL_RATIO = 0.2                   # fraction of images held out for validation

# === DERIVED PATHS ===
CSV_PATH = os.path.join(BASE_PATH, CSV_NAME)
IMAGES_DIR = os.path.join(BASE_PATH, IMAGES_FOLDER)
OUTPUT_DIR = os.path.join(BASE_PATH, "yolo_dataset")

# === SETUP OUTPUT STRUCTURE ===
# Creates images/{train,val} and labels/{train,val} under OUTPUT_DIR;
# directories that already exist are left untouched.
for split in ("train", "val"):
    for subdir in ("images", "labels"):
        os.makedirs(f"{OUTPUT_DIR}/{subdir}/{split}", exist_ok=True)

In [73]:
# Load the annotation table. The file is semicolon-separated; lines the python
# parser cannot handle are skipped instead of raising.
df = pd.read_csv(
    CSV_PATH,
    sep=";",
    engine="python",
    on_bad_lines="skip",
)

# Preview the first rows
df.head()

Unnamed: 0.1,Unnamed: 0,name_img,name_sp,x1,y1,x2,y2,length,middle_x,middle_y,polygon_values
0,0,MOMAR_20140727180039.jpg,Bythograeid crab,815.0,839.0,826.0,792.0,48.0,821.0,816.0,
1,1,MOMAR_20140727180039.jpg,Bythograeid crab,817.0,837.0,817.0,800.0,37.0,817.0,819.0,
2,2,MOMAR_20140727180039.jpg,Other fish,1329.0,153.0,1262.0,234.0,105.0,1296.0,194.0,
3,3,MOMAR_20140727180039.jpg,Bythograeid crab,826.0,790.0,812.0,842.0,54.0,819.0,816.0,
4,4,MOMAR_20140727180039.jpg,Bythograeid crab,814.0,829.0,825.0,794.0,37.0,820.0,812.0,


In [74]:
# Keep only the columns needed for box conversion, then drop every annotation
# that is missing at least one coordinate.
coord_cols = ["x1", "y1", "x2", "y2"]
df = df[["name_img", "name_sp", *coord_cols]]
valid_df = df.dropna(subset=coord_cols)

# === CLASS-WISE SUMMARY ===
# Per species: number of distinct images and number of valid annotations,
# ordered from the most- to the least-annotated species.
per_species = valid_df.groupby("name_sp")["name_img"]
summary = (
    per_species
    .agg(num_images="nunique", num_valid_annotations="count")
    .sort_values("num_valid_annotations", ascending=False)
    .reset_index()
)

# Display summary (same data, human-readable column headers)
summary_display = summary.rename(
    columns={
        "name_sp": "Species",
        "num_images": "Images",
        "num_valid_annotations": "Valid Annotations",
    }
)

In [75]:
from IPython.display import display

# Print a header line, then render the summary table through the notebook's
# rich display instead of a plain-text print.
print("\n✅ Class-wise summary of valid annotations:")
display(summary_display)



✅ Class-wise summary of valid annotations:


Unnamed: 0,Species,Images,Valid Annotations
0,Buccinid snail,3304,98282
1,Spider crab,2588,34803
2,Polynoid worms,2449,12680
3,Zoarcid fish,1715,6185
4,Bythograeid crab,463,2426
5,Polynoid worm,956,1999
6,Cataetyx fish,249,603
7,Brittle star,113,483
8,Other fish,234,455
9,Chimera fish,96,174


In [79]:
# === SELECT CLASSES TO TRAIN ===
# Species labels (values of the `name_sp` column) to keep. Only annotations
# whose species appears in this list survive the filtering step.
SELECTED_CLASSES = [
    "Buccinid snail"
]

In [80]:
# === FILTER DATA FOR SELECTED CLASSES ===
keep_mask = valid_df["name_sp"].isin(SELECTED_CLASSES)
df_filtered = valid_df[keep_mask]

# Class ids follow the sorted order of the species actually present.
classes = sorted(df_filtered["name_sp"].unique())
class2id = dict(zip(classes, range(len(classes))))

# === SPLIT IMAGES INTO TRAIN/VAL ===
# The split is done on image names, so all annotations of one image end up in
# the same split.
unique_imgs = df_filtered["name_img"].unique()
train_imgs, val_imgs = train_test_split(
    unique_imgs,
    test_size=VAL_RATIO,
    random_state=RANDOM_SEED,
)

# Per-image annotation lookup used when the label files are written.
grouped = df_filtered.groupby("name_img")

In [81]:
# === CONVERT TO YOLO FORMAT AND SAVE ===
def _clip(value, upper):
    """Clamp ``value`` into the closed interval [0, upper]."""
    return min(max(float(value), 0.0), float(upper))


def _yolo_line(row, class_id, w, h):
    """Format one annotation row as a YOLO ``class x_c y_c w h`` label line.

    The two annotated points (x1, y1) and (x2, y2) are treated as opposite
    corners of the box. The box is clamped to the image so that every
    normalised value stays inside [0, 1].

    Raises:
        ValueError: if the annotation lies entirely outside the image.
    """
    x_lo, x_hi = sorted((float(row["x1"]), float(row["x2"])))
    y_lo, y_hi = sorted((float(row["y1"]), float(row["y2"])))

    # Clamping a box that never touches the image would invent a label on the
    # image border, so reject it instead.
    if x_hi < 0 or y_hi < 0 or x_lo > w or y_lo > h:
        raise ValueError("annotation lies entirely outside the image")

    x_lo, x_hi = _clip(x_lo, w), _clip(x_hi, w)
    y_lo, y_hi = _clip(y_lo, h), _clip(y_hi, h)

    # NOTE(review): a row with x1 == x2 (or y1 == y2) yields a zero-width
    # (or zero-height) box; confirm whether such labels are intended.
    x_c = (x_lo + x_hi) / 2 / w
    y_c = (y_lo + y_hi) / 2 / h
    bw = (x_hi - x_lo) / w
    bh = (y_hi - y_lo) / h
    return f"{class_id} {x_c:.6f} {y_c:.6f} {bw:.6f} {bh:.6f}"


def process_split(image_list, split_name):
    """Copy the images of one split and write their YOLO label files.

    For each image name the image is read from ``IMAGES_DIR`` to obtain its
    pixel size, copied to ``OUTPUT_DIR/images/<split_name>/``, and its
    annotations (looked up in ``grouped``) are written as normalised
    ``class x_c y_c w h`` lines to ``OUTPUT_DIR/labels/<split_name>/``.

    Missing or unreadable images are reported and skipped. An annotation that
    cannot be converted is reported and skipped without aborting the image.
    No label file is written for an image without any usable annotation.

    Args:
        image_list: iterable of image file names relative to ``IMAGES_DIR``.
        split_name: name of the split sub-folder, e.g. ``"train"`` or ``"val"``.
    """
    for img_name in tqdm(image_list, desc=f"Processing {split_name}"):
        img_path = os.path.join(IMAGES_DIR, img_name)
        if not os.path.exists(img_path):
            print(f"⚠️ Image not found: {img_path}")
            continue

        # The image is decoded only to learn its size for normalisation.
        img = cv2.imread(img_path)
        if img is None:
            print(f"⚠️ Cannot read image: {img_path}")
            continue
        h, w = img.shape[:2]

        shutil.copy(img_path, f"{OUTPUT_DIR}/images/{split_name}/{img_name}")

        try:
            anns = grouped.get_group(img_name)
        except KeyError:
            print(f"⚠️ No annotations for: {img_name}")
            continue

        yolo_lines = []
        for _, row in anns.iterrows():
            # Best effort per annotation: one bad row must not cost the image.
            try:
                class_id = class2id[row["name_sp"]]
                yolo_lines.append(_yolo_line(row, class_id, w, h))
            except Exception as e:
                print(f"⚠️ Skipping bad annotation in {img_name}: {e}")

        if yolo_lines:
            # Swap only the final extension, whatever its spelling or case, so
            # the label file always shares the image's stem.
            label_file = os.path.splitext(img_name)[0] + ".txt"
            with open(f"{OUTPUT_DIR}/labels/{split_name}/{label_file}", "w") as f:
                f.write("\n".join(yolo_lines))


In [82]:
# Convert the training split first, then the validation split.
for split_name, image_list in (("train", train_imgs), ("val", val_imgs)):
    process_split(image_list, split_name)

Processing train: 100%|██████████| 2643/2643 [02:03<00:00, 21.34it/s]
Processing val: 100%|██████████| 661/661 [00:31<00:00, 21.16it/s]


In [83]:
# === WRITE data.yaml ===
# Values are serialised with json.dumps: a JSON string or array is also a valid
# YAML double-quoted scalar or flow sequence, so a path or class name that
# contains quotes, '#', ': ' or backslashes cannot produce a malformed file
# (raw interpolation and the Python list repr could).
yaml_path = os.path.join(OUTPUT_DIR, "data.yaml")
yaml_lines = [
    f"path: {json.dumps(OUTPUT_DIR)}",
    "train: images/train",
    "val: images/val",
    f"nc: {len(classes)}",
    f"names: {json.dumps(classes)}",
]
with open(yaml_path, "w", encoding="utf-8") as f:
    f.write("\n".join(yaml_lines) + "\n")

In [84]:
# Final report: split sizes, class list and location of the dataset config.
report = (
    "\n✅ Dataset is ready!",
    f"→ Train images: {len(train_imgs)} | Val images: {len(val_imgs)}",
    f"→ Classes: {classes}",
    f"→ data.yaml created at: {yaml_path}",
)
for line in report:
    print(line)


✅ Dataset is ready!
→ Train images: 2643 | Val images: 661
→ Classes: ['Buccinid snail']
→ data.yaml created at: /content/drive/MyDrive/DeepSeaProject/dataset_seanoe_101899/yolo_dataset/data.yaml
