In [1]:
import pandas as pd
import os
import shutil
import cv2
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
from yolo_preparation import (
    read_annotation_csv,
    setup_output_structure,
    filter_and_split_dataset,
    process_split,
    create_data_yaml,
)

from config import (
    BASE_PATH, CSV_PATH, IMAGES_DIR, DATASET_DIR,
    SELECTED_CLASSES, SAFE_CLASS_NAMES, VAL_RATIO, RANDOM_SEED
)

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime; the dataset paths configured
# later in this notebook all live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# === USER SETTINGS ===
# NOTE: BASE_PATH, CSV_PATH, IMAGES_DIR, VAL_RATIO and RANDOM_SEED are also
# imported from `config` earlier in this notebook; the assignments below
# replace those imported values for the rest of the session.

# Split parameters (later forwarded to filter_and_split_dataset).
RANDOM_SEED = 42
VAL_RATIO = 0.2

# Dataset root on the mounted Drive and the entries located inside it.
BASE_PATH = "/content/drive/MyDrive/DeepSeaProject/dataset_seanoe_101899"
CSV_NAME = "raw-dataset.csv"
IMAGES_FOLDER = "images/Images"

# Full paths derived from the settings above.
CSV_PATH = os.path.join(BASE_PATH, CSV_NAME)
IMAGES_DIR = os.path.join(BASE_PATH, IMAGES_FOLDER)
OUTPUT_DIR = os.path.join(BASE_PATH, "yolo_dataset")

In [None]:
# Load the raw annotation table. The file is semicolon-separated, and
# on_bad_lines='skip' means malformed lines are dropped without raising,
# so the resulting row count may be lower than the file's line count.
csv_read_options = {
    "sep": ';',
    "on_bad_lines": 'skip',
    "engine": 'python',
}
df = pd.read_csv(CSV_PATH, **csv_read_options)

# Preview the first rows
df.head()

Unnamed: 0.1,Unnamed: 0,name_img,name_sp,x1,y1,x2,y2,length,middle_x,middle_y,polygon_values
0,0,MOMAR_20140727180039.jpg,Bythograeid crab,815.0,839.0,826.0,792.0,48.0,821.0,816.0,
1,1,MOMAR_20140727180039.jpg,Bythograeid crab,817.0,837.0,817.0,800.0,37.0,817.0,819.0,
2,2,MOMAR_20140727180039.jpg,Other fish,1329.0,153.0,1262.0,234.0,105.0,1296.0,194.0,
3,3,MOMAR_20140727180039.jpg,Bythograeid crab,826.0,790.0,812.0,842.0,54.0,819.0,816.0,
4,4,MOMAR_20140727180039.jpg,Bythograeid crab,814.0,829.0,825.0,794.0,37.0,820.0,812.0,


In [None]:
# Keep only the image name, the species label and the four coordinate columns;
# an annotation counts as "valid" when none of its coordinates is missing.
coordinate_columns = ["x1", "y1", "x2", "y2"]
df = df.loc[:, ["name_img", "name_sp", *coordinate_columns]]
valid_df = df.dropna(subset=coordinate_columns)

# === CLASS-WISE SUMMARY ===
# Per species: number of distinct images it appears in and number of valid
# annotation rows, ordered from the most to the least annotated species.
summary = (
    valid_df
    .groupby("name_sp")
    .agg(
        num_images=("name_img", "nunique"),
        num_valid_annotations=("name_img", "count"),
    )
    .sort_values("num_valid_annotations", ascending=False)
    .reset_index()
)

# Display summary: same data under human-readable column headers
summary_display = summary.rename(
    columns={
        "name_sp": "Species",
        "num_images": "Images",
        "num_valid_annotations": "Valid Annotations",
    }
)

In [None]:
from IPython.display import display

# Print a heading (preceded by a blank line), then render the summary table
# through the notebook's rich display instead of a plain-text print.
summary_heading = "\n✅ Class-wise summary of valid annotations:"
print(summary_heading)
display(summary_display)



✅ Class-wise summary of valid annotations:


Unnamed: 0,Species,Images,Valid Annotations
0,Buccinid snail,3304,98282
1,Spider crab,2588,34803
2,Polynoid worms,2449,12680
3,Zoarcid fish,1715,6185
4,Bythograeid crab,463,2426
5,Polynoid worm,956,1999
6,Cataetyx fish,249,603
7,Brittle star,113,483
8,Other fish,234,455
9,Chimera fish,96,174


In [None]:
# === SELECT CLASSES TO TRAIN ===
SELECTED_CLASSES = [
    "Buccinid snail"
]

In [None]:
# === PATHS ===
# The output folder is named after the selected classes: names are joined with
# underscores and every space is turned into an underscore as well.
safe_class_names = "_".join(SELECTED_CLASSES).replace(" ", "_")
CSV_PATH = os.path.join(BASE_PATH, CSV_NAME)
IMAGES_DIR = os.path.join(BASE_PATH, IMAGES_FOLDER)
OUTPUT_DIR = os.path.join(BASE_PATH, f"training_{safe_class_names}")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# === PROCESS ===
# Re-read the annotations through the project helper (this replaces the `df`
# built by the exploratory cells), prepare the output folder, then split.
df = read_annotation_csv(CSV_PATH)
setup_output_structure(OUTPUT_DIR)
split_result = filter_and_split_dataset(
    df, SELECTED_CLASSES, val_ratio=VAL_RATIO, random_seed=RANDOM_SEED
)
grouped, class2id, train_imgs, val_imgs, classes = split_result

# Export the training split first, then the validation split.
for split_name, split_images in (("train", train_imgs), ("val", val_imgs)):
    process_split(split_images, split_name, IMAGES_DIR, OUTPUT_DIR, grouped, class2id)

create_data_yaml(OUTPUT_DIR, classes)