In [None]:
"""
README / Script Info
--------------------

Script Name: split_symlink_train_yolov8_955.ipynb

Description:
------------
This script automates the process of:
1. Collecting all images from a dataset structured with 'train', 'valid', and 'test' folders.
2. Creating new splits (train/valid/test) with a specified train ratio (e.g., "95-5" = 95% train; the remaining 5% is divided evenly between valid and test).
3. Using symlinks to efficiently organize the new split data folders without duplicating files.
4. Generating a YOLO-compatible data.yaml file.
5. Training a YOLOv8 model on the new split using the Ultralytics YOLO package.

Features:
---------
- Customizable split ratios.
- Uses symlinks for storage efficiency.
- Compatible with Ultralytics YOLOv8 models.
- Includes automatic training with chosen hyperparameters.

How to Use:
-----------
1. Set the BASE_DIR to your dataset root folder.
2. Adjust the split ratio(s) in the 'splits' dictionary.
3. Run the script (recommended: as a Jupyter Notebook or via CLI).

Note:
-----
- Paths in this script are placeholders; replace with your own dataset locations.
- This script requires Ultralytics YOLOv8, PyYAML, tqdm, and scikit-learn.

Author: Bahadir Akin Akgul
Date: 13.07.2025
"""

import os
import random
import shutil
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import yaml
from ultralytics import YOLO

# === CONFIGURATION ===
# Dataset root. Images are read from <BASE_DIR>/<subset>/images and the new
# splits are written below <BASE_DIR>/splits/.
BASE_DIR = "/your/data/root"  # <-- CHANGE THIS to your dataset root folder

# Sub-folders of BASE_DIR whose images are pooled together before re-splitting.
ORIGINAL_DIRS = ['train', 'valid', 'test']
# File-name suffixes that collect_data() treats as images.
IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

# Define your split ratios here.
# Each entry maps a split name (used for the output folder "split_<name>" and
# the training run name) to the fraction of all images assigned to train; the
# remainder is divided equally between valid and test.
splits = {
    "95-5": 0.95,  # e.g. 95% train, 5% (valid + test)
    # Add more splits if needed: "80-20": 0.8, etc.
}

def collect_data(data_dir, subsets=None, extensions=None):
    """Collect the paths of all image files below a dataset root.

    For every subset folder, ``<data_dir>/<subset>/images`` is listed and each
    entry whose name ends with one of the image extensions is kept.

    Args:
        data_dir: Dataset root folder.
        subsets: Names of the subset folders to scan. Defaults to the
            module-level ``ORIGINAL_DIRS``.
        extensions: Iterable of file-name suffixes (including the dot) that
            identify images. Defaults to the module-level ``IMG_EXTENSIONS``.
            Matching is case-insensitive, so ``photo.JPG`` is accepted when
            ``.jpg`` is listed.

    Returns:
        A list of image paths. Subsets appear in the order given and the files
        of each subset are sorted by name, so the result is deterministic.
        Subsets without an ``images`` directory are skipped silently.
    """
    if subsets is None:
        subsets = ORIGINAL_DIRS
    if extensions is None:
        extensions = IMG_EXTENSIONS
    # str.endswith needs a tuple; lower-casing both sides makes the match
    # independent of how the camera/exporter capitalised the suffix.
    suffixes = tuple(ext.lower() for ext in extensions)

    img_files = []
    for folder in subsets:
        img_folder = os.path.join(data_dir, folder, "images")
        if not os.path.isdir(img_folder):
            continue
        # os.listdir returns entries in arbitrary order; sorting keeps any
        # seeded split built from this list reproducible.
        img_files.extend(
            os.path.join(img_folder, fname)
            for fname in sorted(os.listdir(img_folder))
            if fname.lower().endswith(suffixes)
        )
    return img_files

def _label_path_for(img_path):
    """Return the label file paired with *img_path*.

    Only the last ``images`` directory component is swapped for ``labels`` so
    that an unrelated ancestor folder that happens to be called ``images`` is
    left alone; the file extension is replaced by ``.txt``.
    """
    marker = f"{os.sep}images{os.sep}"
    head, found, tail = img_path.rpartition(marker)
    if found:
        img_path = f"{head}{os.sep}labels{os.sep}{tail}"
    return os.path.splitext(img_path)[0] + ".txt"


def link_data(files, dest_dir):
    """Symlink images and their label files into ``dest_dir``.

    For each image, a symlink is created at ``<dest_dir>/images/<name>`` and,
    when the paired label file exists, at ``<dest_dir>/labels/<stem>.txt``.
    An existing entry at a destination is replaced. Images without a label
    file are linked on their own.

    Args:
        files: Iterable of image paths.
        dest_dir: Folder that receives the ``images`` and ``labels``
            sub-folders (both are created if missing).

    Returns:
        None. Filesystem errors for an individual link are reported with a
        printed warning and do not abort the loop.
    """
    img_dir = os.path.join(dest_dir, "images")
    label_dir = os.path.join(dest_dir, "labels")
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(label_dir, exist_ok=True)

    for img_path in tqdm(files, desc=f"Linking to {dest_dir}"):
        # A symlink target is resolved relative to the link's own directory,
        # so a relative source path would produce a dangling link.
        img_src = os.path.abspath(img_path)
        label_src = _label_path_for(img_src)
        pairs = (
            (img_src, os.path.join(img_dir, os.path.basename(img_src))),
            (label_src, os.path.join(label_dir, os.path.basename(label_src))),
        )

        for src, dst in pairs:
            try:
                # lexists is also true for a broken symlink left by an earlier run.
                if os.path.lexists(dst):
                    os.remove(dst)
                if os.path.exists(src):
                    os.symlink(src, dst)
            except OSError as e:
                print(f"WARNING: {e} -> {dst}")

def write_data_yaml(split_dir, names=None):
    """Write a YOLO dataset description (``data.yaml``) into ``split_dir``.

    Args:
        split_dir: Folder containing the ``train``, ``valid`` and ``test``
            sub-folders; ``data.yaml`` is written directly inside it.
        names: Class names in class-index order. Defaults to
            ``['pedestrian', 'road', 'vehicle']``. ``nc`` is derived from the
            length of this list so the two can never disagree.

    Returns:
        The path of the written ``data.yaml`` file.
    """
    if names is None:
        names = ['pedestrian', 'road', 'vehicle']
    names = list(names)

    # Absolute image folders keep the file usable regardless of the working
    # directory or of how the training framework resolves relative entries.
    root = os.path.abspath(split_dir)
    yaml_path = os.path.join(split_dir, 'data.yaml')
    data_yaml = {
        'train': os.path.join(root, 'train', 'images'),
        'val': os.path.join(root, 'valid', 'images'),
        'test': os.path.join(root, 'test', 'images'),
        'nc': len(names),
        'names': names,
    }
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(data_yaml, f)
    print(f"data.yaml created: {yaml_path}")
    return yaml_path

def create_split_and_train(base_dir, ratio_name, train_ratio, **train_overrides):
    """Build a fresh train/valid/test split with symlinks and train YOLOv8 on it.

    The split is created in ``<base_dir>/splits/split_<ratio_name>``; an
    existing folder of that name is deleted first. All images returned by
    ``collect_data(base_dir)`` are pooled, ``train_ratio`` of them go to
    train and the rest is halved into valid and test.

    Args:
        base_dir: Dataset root folder.
        ratio_name: Label of the split, e.g. ``"95-5"``; used in the output
            folder name and (with dashes removed) in the default run name.
        train_ratio: Fraction of the pooled images assigned to train.
        **train_overrides: Optional keyword arguments forwarded to
            ``YOLO.train``; they replace the defaults below (for example
            ``device=0`` on a single-GPU machine or ``batch=8``).

    Raises:
        ValueError: If no images are found under ``base_dir``.
    """
    print(f"\nCreating split '{ratio_name}' and training...")

    split_dir = os.path.join(base_dir, "splits", f"split_{ratio_name}")
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)

    # Create every target folder up front so they exist even for an empty subset.
    for sub in ['train', 'valid', 'test']:
        for t in ['images', 'labels']:
            os.makedirs(os.path.join(split_dir, sub, t), exist_ok=True)

    # Collect all images and split.
    # Sorting gives train_test_split a stable input order; together with the
    # fixed random_state the same files end up in the same subset on every
    # run. (train_test_split shuffles by itself, so no extra shuffle is needed.)
    all_imgs = sorted(collect_data(base_dir))
    if not all_imgs:
        raise ValueError(
            f"No images found under '{base_dir}'; expected <subset>/images folders."
        )
    train_imgs, testvalid_imgs = train_test_split(all_imgs, train_size=train_ratio, random_state=42)
    valid_imgs, test_imgs = train_test_split(testvalid_imgs, test_size=0.5, random_state=42)

    # Symlink to folders
    link_data(train_imgs, os.path.join(split_dir, "train"))
    link_data(valid_imgs, os.path.join(split_dir, "valid"))
    link_data(test_imgs, os.path.join(split_dir, "test"))

    # Write YAML
    yaml_path = write_data_yaml(split_dir)

    # Train YOLOv8
    train_args = {
        "data": yaml_path,
        "epochs": 100,
        "imgsz": 1024,
        "batch": 20,
        "device": [0, 1],  # Adjust GPU IDs as needed
        "workers": 8,
        "optimizer": "SGD",
        "save_period": 10,
        "name": f"exp_{ratio_name.replace('-', '')}",
    }
    train_args.update(train_overrides)

    model = YOLO('yolov8l.pt')
    model.train(**train_args)

    print(f"Training finished: {ratio_name}\nResults in: runs/detect/{train_args['name']}")

# MAIN EXECUTION
if __name__ == "__main__":
    # Configure the PyTorch CUDA caching allocator before any training starts.
    os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})
    # Build and train one split per configured entry, in definition order.
    for split_name in splits:
        create_split_and_train(BASE_DIR, split_name, splits[split_name])



🔧 95-5 oranında bölünüyor ve eğitiliyor...


Linking to /truba/home/baakgul/roadtr-14032025/splits/split_95-5/train: 100%|██████████| 8491/8491 [00:39<00:00, 215.35it/s]
Linking to /truba/home/baakgul/roadtr-14032025/splits/split_95-5/valid: 100%|██████████| 223/223 [00:01<00:00, 222.65it/s]
Linking to /truba/home/baakgul/roadtr-14032025/splits/split_95-5/test: 100%|██████████| 224/224 [00:00<00:00, 232.69it/s]


✅ data.yaml oluşturuldu: /truba/home/baakgul/roadtr-14032025/splits/split_95-5/data.yaml
New https://pypi.org/project/ultralytics/8.3.111 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.3.91 🚀 Python-3.10.15 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
                                                       CUDA:1 (Tesla P100-PCIE-16GB, 16269MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8l.pt, data=/truba/home/baakgul/roadtr-14032025/splits/split_95-5/data.yaml, epochs=100, time=None, patience=100, batch=20, imgsz=1024, save=True, save_period=10, cache=False, device=[0, 1], workers=8, project=None, name=exp_955, exist_ok=False, pretrained=True, optimizer=SGD, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_

[34m[1mtrain: [0mScanning /truba/home/baakgul/roadtr-14032025/splits/split_95-5/train/labels... 8491 images, 2 backgrounds, 0 corrupt: 100%|██████████| 8491/8491 [00:19<00:00, 432.51it/s]


[34m[1mtrain: [0mNew cache created: /truba/home/baakgul/roadtr-14032025/splits/split_95-5/train/labels.cache
[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, num_output_channels=3, method='weighted_average'), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))


[34m[1mval: [0mScanning /truba/home/baakgul/roadtr-14032025/splits/split_95-5/valid/labels... 223 images, 0 backgrounds, 0 corrupt: 100%|██████████| 223/223 [00:00<00:00, 455.19it/s]


[34m[1mval: [0mNew cache created: /truba/home/baakgul/roadtr-14032025/splits/split_95-5/valid/labels.cache
Plotting labels to runs/detect/exp_955/labels.jpg... 
[34m[1moptimizer:[0m SGD(lr=0.01, momentum=0.937) with parameter groups 97 weight(decay=0.0), 104 weight(decay=0.00046875), 103 bias(decay=0.0)


2025/04/19 08:16:17 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2025/04/19 08:16:17 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
2025/04/19 08:16:17 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


[34m[1mMLflow: [0mlogging run_id(f26b015631f14c35999361cd0f2c059a) to runs/mlflow
[34m[1mMLflow: [0mview at http://127.0.0.1:5000 with 'mlflow server --backend-store-uri runs/mlflow'
[34m[1mMLflow: [0mdisable with 'yolo settings mlflow=False'
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 1024 train, 1024 val
Using 16 dataloader workers
Logging results to [1mruns/detect/exp_955[0m
Starting training for 100 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      1/100      14.9G      1.284     0.9904      1.139        175       1024: 100%|██████████| 425/425 [12:08<00:00,  1.71s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:07<00:00,  1.58it/s]


                   all        223       3438       0.74      0.638      0.633      0.376

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      2/100      14.9G      1.218     0.8133      1.095        130       1024: 100%|██████████| 425/425 [11:59<00:00,  1.69s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:07<00:00,  1.57it/s]


                   all        223       3438      0.749      0.605      0.627      0.392

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      3/100      15.3G       1.23     0.8434      1.114        155       1024: 100%|██████████| 425/425 [11:55<00:00,  1.68s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:07<00:00,  1.60it/s]


                   all        223       3438      0.752      0.603      0.615      0.385

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      4/100      14.7G      1.259     0.8696       1.13        378       1024:  62%|██████▏   | 262/425 [07:21<04:36,  1.70s/it]