In [1]:
"""
README: YOLOv10 Multi-GPU Training Resume Script

This script:
- Continues YOLOv10 training from a previous checkpoint (e.g. 'last.pt') with DataParallel (device=[0,1]).
- Expands dynamic CUDA memory allocation for large models or large batch sizes.
- Trains using the provided data.yaml file.
- After training, evaluates the model on the test split.

Key differences from fresh training:
- Loads the model from a checkpoint path (e.g. 'runs/detect/.../weights/last.pt').
- Uses 'resume=True' to continue training instead of starting over.

How to use:
- Change the 'data' and 'weights' paths to match your dataset and checkpoint.
- Adjust batch size, epochs, and other parameters as needed for your hardware.

Author: Bahadir Akin Akgul
Date: 13.07.2025
"""

import torch
import os
from ultralytics import YOLO

# Enable dynamic CUDA memory expansion
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Ensure multiple GPUs are detected
num_gpus = torch.cuda.device_count()
assert num_gpus > 1, "Multi-GPU setup not detected!"
print(f"Using {num_gpus} GPUs for training...")

# Load YOLOv10 model from checkpoint (resume training)
model = YOLO('runs/detect/yolov10-70-30/weights/last.pt')

# Train the model with DataParallel (device=[0,1]), resume from checkpoint
model.train(
    data="/PATH/TO/your/data.yaml",      # <-- CHANGE THIS to your data.yaml
    epochs=100,
    imgsz=1024,                          # Lower if OOM errors occur
    batch=12,                            # Adjust based on your VRAM
    device=[0, 1],                       # DataParallel for multi-GPU
    optimizer="SGD",
    save_period=10,
    workers=16,
    cache=False,
    name='yolov10-70-30',
    resume=True
)

# Evaluate the model on the test split after training
test_results = model.val(split='test')


Using 2 GPUs for training...
New https://pypi.org/project/ultralytics/8.3.117 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.3.91 🚀 Python-3.10.15 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
                                                       CUDA:1 (Tesla P100-PCIE-16GB, 16269MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=runs/detect/yolov10-70-30/weights/last.pt, data=/truba/home/baakgul/roadtr-14032025/data.yaml, epochs=100, time=None, patience=100, batch=12, imgsz=1024, save=True, save_period=10, cache=False, device=[0, 1], workers=16, project=None, name=yolov10-70-303, exist_ok=False, pretrained=True, optimizer=SGD, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=runs/detect/yolov10-70-30/weights/last.pt, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hyb

[34m[1mtrain: [0mScanning /truba/home/baakgul/roadtr-14032025/train/labels.cache... 6299 images, 0 backgrounds, 0 corrupt: 100%|██████████| 6299/6299 [00:00<?, ?it/s]


[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, num_output_channels=3, method='weighted_average'), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))


[rank1]: Traceback (most recent call last):
[rank1]:   File "/arf/home/baakgul/.config/Ultralytics/DDP/_temp_cj21km3s23322666605760.py", line 13, in <module>
[rank1]:     results = trainer.train()
[rank1]:   File "/arf/home/baakgul/.local/lib/python3.10/site-packages/ultralytics/engine/trainer.py", line 211, in train
[rank1]:     self._do_train(world_size)
[rank1]:   File "/arf/home/baakgul/.local/lib/python3.10/site-packages/ultralytics/engine/trainer.py", line 326, in _do_train
[rank1]:     self._setup_train(world_size)
[rank1]:   File "/arf/home/baakgul/.local/lib/python3.10/site-packages/ultralytics/engine/trainer.py", line 318, in _setup_train
[rank1]:     self.resume_training(ckpt)
[rank1]:   File "/arf/home/baakgul/.local/lib/python3.10/site-packages/ultralytics/engine/trainer.py", line 757, in resume_training
[rank1]:     assert start_epoch > 0, (
[rank1]: AssertionError: runs/detect/yolov10-70-30/weights/last.pt training to 100 epochs is finished, nothing to resume.
[rank1]: S

CalledProcessError: Command '['/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python', '-m', 'torch.distributed.run', '--nproc_per_node', '2', '--master_port', '55085', '/arf/home/baakgul/.config/Ultralytics/DDP/_temp_cj21km3s23322666605760.py']' returned non-zero exit status 1.