# STAGE III - Run Model Training/Finetuning and Convert Resulting Checkpoint to ONNX

We now fine-tune the model on the previously downloaded dataset. Once training has finished, the resulting checkpoint is exported to ONNX format.

In [None]:
!pip install ultralytics torch

In [None]:
# load libraries
try:
    import os
    import torch
    import torch.cuda as tc
    import ultralytics
except Exception as e:
    # Report the failure, then re-raise: the rest of the notebook depends on
    # these modules, so carrying on would only resurface later as a
    # confusing NameError far away from the real cause.
    print(f"Caught Exception: {e}")
    raise

In [None]:
# detect accelerator
def detectAccelerator() -> "tuple[str, torch.dtype]":
    """Pick the device name and tensor dtype to use.

    Returns:
        A ``(accelerator, dtype)`` tuple: ``("cuda", torch.float16)`` when
        ``torch.cuda`` reports an available device, otherwise
        ``("cpu", torch.float32)``.
    """
    # Imported locally so this function does not rely on another cell for it.
    import subprocess

    # CPU with full precision is the fallback when no CUDA device is found
    accelerator = "cpu"
    dtype = torch.float32

    # check whether a CUDA device is available
    if tc.is_available():
        print("CUDA Accelerator Available")
        accelerator = "cuda"
        dtype = torch.float16

        # Print the GPU status for the log. This is informational only, so a
        # missing or failing nvidia-smi must not abort accelerator detection.
        try:
            report = subprocess.run(
                ["nvidia-smi"], capture_output=True, text=True, check=False
            )
            print(report.stdout + report.stderr, end="")
        except OSError as err:
            print(f"Could not run nvidia-smi: {err}")

    # return
    return (accelerator, dtype)

In [None]:
# declare global setting variables
# These variables have no sensible default. Fail early with a clear message
# instead of letting None reach the path-building code below.
_REQUIRED_ENV = ("PERSISTENCE_DIR", "YOLO_CHECKPOINT", "YOLO_CONFIG")
_missing_env = [name for name in _REQUIRED_ENV if name not in os.environ]
if _missing_env:
    raise RuntimeError(
        f"Required environment variable(s) not set: {', '.join(_missing_env)}"
    )

# root directory holding both the downloaded model and the dataset
PERSISTENCE_DIR: str = os.environ["PERSISTENCE_DIR"]
ULTRALYTICS_DIR: str = os.path.join(PERSISTENCE_DIR, "ultralytics")
# file name of the original checkpoint and the model configuration to build from
CHECKPOINT_NAME: str = os.environ["YOLO_CHECKPOINT"]
CHECKPOINT_CONFIG: str = os.environ["YOLO_CONFIG"]
YOLO_MODEL_PATH: str = os.path.join(ULTRALYTICS_DIR, "Ultralytics/YOLO11")
YOLO_ORIGINAL_MODEL: str = os.path.join(YOLO_MODEL_PATH, CHECKPOINT_NAME)
DATASET_NAME: str = "mario"
TRAINING_DATASET_PATH: str = os.path.join(PERSISTENCE_DIR, "data")

print(f"Using YOLO Model Original Checkpoint at: {YOLO_ORIGINAL_MODEL}")
print(f"Dataset '{DATASET_NAME}' will be loaded from {TRAINING_DATASET_PATH}")

## Setup the training job

Make sure the checkpoint is available, then load it onto the available accelerator (CPU or GPU).

In [None]:
# Training Parameters
# -- values that can be overridden through environment variables
JOB = os.getenv("JOB_TYPE", "detect")
RUN_NAME = os.getenv("JOB_NAME", "train")
OPTIMIZER = os.getenv("OPTIMIZER", "AdamW")
EPOCHS = int(os.getenv("EPOCHS", "20"))
IMG_SIZE = int(os.getenv("IMG_SIZE", "640"))
BATCH = int(os.getenv("BATCH", "2"))

# -- fixed values
CHECKPOINT = "last.pt"
LR = 1e-4
AUGMENT = True

In [None]:
# Copy Dataset


In [None]:
# choose the device to run on (the matching dtype is returned as well)
accelerator, dtype = detectAccelerator()

# build the model from its configuration, then load the original checkpoint into it
print(f"Loading checkpoint {YOLO_ORIGINAL_MODEL}...")
yolo_model = ultralytics.YOLO(CHECKPOINT_CONFIG)
yolo_model = yolo_model.load(YOLO_ORIGINAL_MODEL)
yolo_model.to(accelerator)

# value for the `resume` argument of the training call
resume = False

In [None]:
# start training!
# this does not seem to when run in a pipeline.
#ultralytics.settings.update({'datasets_dir': TRAINING_DATASET_PATH})
# UGLY workaround: copy datasets to default path
!mkdir -p /opt/app-root/src/datasets
!cp -r $TRAINING_DATASET_PATH/mario /opt/app-root/src/datasets/

# train
yolo_model.train(data=f"{TRAINING_DATASET_PATH}/{DATASET_NAME}/mario.yaml",
                 epochs=EPOCHS, lr0=LR, imgsz=IMG_SIZE, batch=BATCH,
                 resume=resume, optimizer=OPTIMIZER, augment=AUGMENT)

In [None]:
# convert checkpoint
# export() reports where it wrote the ONNX file. Use that path rather than a
# guessed one: the run directory is chosen by ultralytics and need not match
# RUN_NAME, so a guessed path could point at a stale file from an earlier run.
exported_path = yolo_model.export(format="onnx")

# make sure checkpoint exists
# fall back to the conventional location only if export() returned nothing
latest_checkpoint: str = (
    str(exported_path)
    if exported_path
    else f"runs/{JOB}/{RUN_NAME}/weights/best.onnx"
)

# validate
if not os.path.exists(latest_checkpoint):
    raise Exception(f"Checkpoint {latest_checkpoint} not found in filesystem")