# Fine tune YOLOv11

Fine-tuning a YOLOv11 model to recognize new images and new categories. This notebook outlines a general implementation.

The outcome is to:
- Get and save a pretrained YOLO checkpoint from huggingface hub
- Decompress and load a custom dataset
- Train/Fine-tune the model
- Test the new checkpoint
- Save the checkpoint for further use

We'll need a few libraries:
- *pandas* in order to deal with structured annotation formats
- *pytorch* to deal with training functions and data types
- *torchvision* to read image files into tensors
- *ultralytics* to actually train the model
- *huggingface_hub* to pull the pretrained checkpoint

In [None]:
# Install the notebook's dependencies, then print the names of the matching installed packages.
# NOTE(review): `!pip` runs in a subshell and may target a different environment than the
# running kernel; IPython's `%pip` magic is the kernel-aware alternative - confirm before switching.
!pip install torch torchvision ultralytics pandas numpy huggingface_hub
!pip list | awk '/torch|ultralytics|pandas|numpy|huggingface/ {print $1}'

In [None]:
# declare global setting variables
# Directory holding the pretrained YOLO weights. The trailing slash is left as is:
# the same value is passed as the copy destination when the checkpoint is saved.
YOLO_MODEL_PATH: str = "../model_checkpoints/Ultralytics/YOLO11/"
# File name of the pretrained weights inside YOLO_MODEL_PATH.
YOLO_MODEL_CHECKPOINT: str = "yolo11x.pt"
# Model definition file handed to ultralytics.YOLO().
YOLO_MODEL_FILE: str = "yolo11x.yaml"
# Strip the trailing slash before joining so the resulting path has no doubled "//".
YOLO_ORIGINAL_MODEL: str = "/".join((YOLO_MODEL_PATH.rstrip("/"), YOLO_MODEL_CHECKPOINT))
# Directory where the dataset archive is looked up.
TRAINING_DATASET_PATH: str = "../datasets"
DATASET_NAME: str = "mario"
DATASET_ARCHIVE: str = "mario-dataset.tar.gz"
# Configuration file passed to loadConfig().
CONFIG_FILE: str = "../parameters.yaml.local"

# setup ultralytics
from ultralytics import settings
# Point the Ultralytics "datasets_dir" setting at the current directory.
settings.update({"datasets_dir": "."})

# Report the resolved checkpoint and dataset locations.
print(f"Using YOLO Model Original Checkpoint at: {YOLO_ORIGINAL_MODEL}")
print(f"Dataset '{DATASET_NAME}' will be loaded from {TRAINING_DATASET_PATH}")

In [None]:
try:
    import os
    import torch
    from torch.utils.data import Dataset
    from torch import float16, float32
    import torch.cuda as tc
    import torch.backends.mps as apple_mps
    from pathlib import Path
    from torchvision.io import read_image
    import matplotlib.pyplot as plt
    from pandas import DataFrame, read_table
    import ultralytics
except Exception as e:
    print(f"Cannot load pytorch: {e}")

try:
    from libs.huggingface import pullFromHuggingfaceHub
    from libs.parameters import loadConfig, Parameters
except Exception as e:
    print(f"Cannot load custom python module: {e}")

# Decompress dataset

Decompress the data set tarball in the current directory for further use.

The dataset is composed of
- images/{train,validate} folders: where actual training images are stored for training and validation purposes
- labels/{train,validate} folders: where label annotations are stored, one txt file per source image
- mario.yaml: the training task descriptor

In [None]:
# decompress tarball
def decompressDataset(datasetPath:str, destination: str) -> None:
    """
        Extract a dataset tarball into `destination`, skipping entries that already exist.

        datasetPath: path of the tar archive (any compression tarfile can auto-detect)
        destination: directory to extract into; created, including parents, when missing

        Raises whatever tarfile/os raise for an unreadable archive or an unwritable
        destination. When the running Python provides tarfile extraction filters, the
        "data" filter is applied so members that would land outside `destination` are
        rejected instead of being written.
    """
    import os
    import tarfile

    # check destination path (makedirs also creates missing parent directories)
    if not os.path.isdir(destination):
        print(f"Creating destination dir {destination}...")
        os.makedirs(destination, exist_ok=True)

    # extraction filters are not available on every Python version, so probe for them
    extract_options = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    # decompress file
    with tarfile.open(datasetPath) as archive:
        # iterate the member objects directly: extracting by name would look the
        # member up again in the archive index on every call
        for member in archive.getmembers():
            if os.path.exists(os.path.join(destination, member.name)):
                continue
            print(f"Extracting {member.name}...")
            archive.extract(member, destination, **extract_options)

# Load the training dataset and explore it

Now we define a custom Dataset class that will hold our training and validation data.
The custom class scans the dataset path for images and annotation files and organizes them in a data structure

The dataset class is indexable (it implements `__len__` and `__getitem__`), and each index returns a data point tuple:
- a *torch.Tensor* object holding the image pixel data
- a *pandas.DataFrame* object holding the label annotations for that specific image


In [None]:
# create a new dataset class that holds training data information
class CustomDataset(Dataset):
    """
        Map-style dataset over a directory laid out as images/<step> and labels/<step>.

        Every *.jpg found under images/<step> must have a label file named
        <image stem>.txt directly inside labels/<step>; construction fails otherwise.
        Indexing returns the image tensor together with its annotations.
    """
    def __init__(self, path: str, step: str = "train") -> None:
        """
            path: dataset root directory
            step: sub-folder name under images/ and labels/ (defaults to "train")

            Raises FileNotFoundError when an image has no matching label file.
        """
        root = Path(path)
        self.image_datapath: Path = root / "images" / step
        self.labels_datapath: Path = root / "labels" / step

        # load objects (sorted, because glob order depends on the filesystem)
        self.imgs: list = sorted(self.image_datapath.glob("**/*.jpg"))
        self.labels: list = sorted(self.labels_datapath.glob("**/*.txt"))

        # validate: a set makes each membership test constant time
        known_labels = set(self.labels)
        for fname in self.imgs:
            img_id = fname.stem
            if self.labels_datapath / f"{img_id}.txt" not in known_labels:
                raise FileNotFoundError(f"Missing labels file for image id {img_id}")

    def __len__(self) -> int:
        """Number of images in the dataset."""
        return len(self.imgs)

    def __getitem__(self, idx) -> "tuple[torch.Tensor, DataFrame]":
        """
            Return (image tensor, annotations DataFrame) for the image at position idx.

            The annotations are read as a space-separated table without header; an empty
            label file yields an empty five-column DataFrame.
        """
        from pandas.errors import EmptyDataError

        image_file: Path = self.imgs[idx]
        # get image at index 'idx' (passed as a plain string path)
        image: torch.Tensor = read_image(str(image_file))

        # load related annotations
        label_file = self.labels_datapath / f"{image_file.stem}.txt"
        try:
            labels: DataFrame = read_table(label_file, sep=" ", header=None)
        except EmptyDataError:
            # one bounding box per row, so an empty file means zero boxes
            labels = DataFrame(columns=range(5))

        # return datapoint
        return (image, labels)

# load labels and ids
def loadLabels(descriptorFile: str) -> dict:
    """
        Read the class id -> class name mapping from the "names" entry of a YAML descriptor.

        descriptorFile: path of the YAML descriptor file

        Returns a dict keyed by class id. A "names" entry written as a list is converted
        to a dict keyed by list position. An unreadable or unparsable file is reported on
        stdout and yields an empty dict, as does a descriptor without a usable "names" entry.
    """
    import yaml

    try:
        with open(descriptorFile, encoding="utf-8") as yf:
            descriptor_contents = yaml.safe_load(yf)
    except (OSError, yaml.YAMLError) as e:
        print(f"Caught YAML Exception: {e}")
        # nothing was loaded: return early instead of touching an unbound variable
        return {}

    # an empty document parses to None; only a mapping can carry a "names" entry
    if not isinstance(descriptor_contents, dict):
        return {}

    names = descriptor_contents.get("names")
    if isinstance(names, (list, tuple)):
        return dict(enumerate(names))
    return names if isinstance(names, dict) else {}

In [None]:
# datapoint visualization
def plotDataPoint(img_rgb: torch.Tensor, annotations: DataFrame, labels: dict) -> None:
    """
        plot an image with its relative object bounding boxes in overlay

        img_rgb: image tensor in (C,H,W) layout
        annotations: one row per bounding box with five values: class id, box center x,
                     box center y, box width, box height (the last four are fractions
                     of the image size)
        labels: mapping from class id to the text drawn above each box

        Prints every bounding box and shows the figure; returns nothing.
    """
    # imported once here rather than on every loop iteration
    from matplotlib.patches import Rectangle

    # permute dimensions (C,H,W) -> (H,W,C) and get image sizes
    img = img_rgb.permute(1,2,0)
    img_h, img_w, _ = img.shape

    # plot image
    plt.title(f"Dataset Point: {len(annotations)} objects")
    plt.imshow(img)

    # calculate bounding boxes
    axes = plt.gca()
    # iterate rows positionally so the DataFrame index labels do not matter
    for label, center_x, center_y, bounding_w, bounding_h in annotations.itertuples(index=False):
        # class ids are integers; use an int for printing and for the dict lookup
        label = int(label)
        print(f"Bounding Box: {label}, {center_x}, {center_y}, {bounding_w}, {bounding_h}")

        # scale coordinates - BOX CENTER
        cx, cy = center_x * img_w, center_y * img_h
        # scale coordinates - BOX DIMENSIONS
        bw, bh = (bounding_w * img_w), (bounding_h * img_h)

        # top-left corner of the box, which is what Rectangle expects
        left, top = cx - bw/2, cy - bh/2

        # add bounding box
        axes.add_patch(Rectangle((left, top), bw, bh, color="white", fill=False))
        # add label, 10 pixels above the box
        axes.text(left, top - 10, labels.get(label), color="white", fontsize=12)

    # show datapoint
    plt.show()

# Dataset test

A datapoint is a tuple containing:
- A torch.Tensor
- A pandas DataFrame

The tensor represents the image. Its shape is by default in the format (C,H,W):
- C: number of channels
- H: image height
- W: image width

To plot the image with pyplot we need to permute the dimensions from (C,H,W) to (H,W,C).

The DataFrame holds the label annotations, in the format required by the Ultralytics library to train the YOLO model.
Each annotation file is structured like this:

- One bounding box per row
- Each row contains:
  - The label class (int)
  - The x and y coordinates of the bounding box center (float), relative to the image size
  - The width and height of the bounding box (float), relative to the image size

In [None]:
# dataset file path
ds_file_path: str = "/".join((TRAINING_DATASET_PATH, DATASET_ARCHIVE))
ds_target_path: str = "."
# derive both dataset locations from DATASET_NAME rather than repeating the literal name
dataset_path: str = "/".join((ds_target_path, DATASET_NAME))
ds_descriptor_file: str = "/".join((dataset_path, f"{DATASET_NAME}.yaml"))

# decompress dataset in the target directory
try:
    decompressDataset(ds_file_path, ds_target_path)
except Exception as e:
    print(f"Caught exception: {e}")

# load dataset
training_dataset = CustomDataset(dataset_path, step="train")
validation_dataset = CustomDataset(dataset_path, step="validate")
print(f"Training Dataset loaded, contains {len(training_dataset)} images.")
print(f"Validation Dataset loaded, contains {len(validation_dataset)} images.")

# load labels
label_dict: dict = loadLabels(ds_descriptor_file)

# get a data point from data set
try:
    datapoint: tuple = training_dataset[0]

    # image from dataset
    img: torch.Tensor = datapoint[0]

    # datapoint annotations
    annotations: DataFrame = datapoint[1]

    # plot datapoint
    plotDataPoint(img, annotations, label_dict)
except Exception as e:
    print(f"Caught exception: {e}")

# Hardware detection

Now we proceed to determine which hardware we can use for training the model.
Currently (and based on what pytorch supports), these backends are autodetected:

- Plain CPU, no acceleration
- Apple Metal Performance Shaders (mps)
- Nvidia CUDA (or any accelerator labelled as 'cuda' by pytorch)

In [None]:
# detect accelerator
def detectAccelerator() -> "tuple[str, torch.dtype]":
    """
        Pick the training device and a matching dtype.

        Returns ("mps", float16) when Apple Metal Performance Shaders are available,
        else ("cuda", float16) when CUDA is available, else ("cpu", float32).
        In the CUDA case the output of `nvidia-smi` is printed when that tool is
        found on the PATH.
    """
    import shutil
    import subprocess

    accelerator = "cpu"
    dtype = float32

    # check for Apple Metal Performance Shaders first
    if apple_mps.is_available():
        print("Apple Metal Performance Shaders Available")
        accelerator = "mps"
        dtype = float16
    # check for cuda
    elif tc.is_available():
        print("CUDA Accelerator Available")
        accelerator = "cuda"
        dtype = float16
        # plain subprocess instead of an IPython shell escape, so the function is
        # valid Python everywhere; skipped when the tool is not installed
        smi = shutil.which("nvidia-smi")
        if smi is not None:
            report = subprocess.run([smi], capture_output=True, text=True, check=False)
            print(report.stdout or report.stderr)

    # return
    return (accelerator, dtype)

In [None]:
# load parameters from config file
import yaml
# bind the name up front: when parsing fails, later cells then see None
# instead of hitting an undefined name
parms = None
try:
    parms = loadConfig(CONFIG_FILE)
except yaml.YAMLError as e:
    print(f"Error while loading config parameters {e}")

# analyze detected features
def detectedObjects(inferenceOutput):
    """
        Print "<class name> - <confidence>" for every detected object.

        inferenceOutput: iterable of result objects, each exposing `names` (class id ->
                         class name lookup) and `boxes` (iterable of detections with
                         `xyxy`, `cls` and `conf`)

        Detections whose `xyxy` is not a torch tensor are skipped. Returns nothing.
    """
    for result in inferenceOutput:
        object_classes = result.names
        for obj in result.boxes:
            # only detections backed by torch tensors are reported
            if not isinstance(obj.xyxy, torch.Tensor):
                continue

            # determine object classification label and confidence score
            class_label = object_classes[int(obj.cls)]
            confidence = float(obj.conf)
            # print detected object class
            print(f"{class_label} - {confidence:.2f}")

# Download the pretrained YOLO checkpoint we want to finetune

If the checkpoint is not already present on the filesystem, pull it from the Hugging Face Hub.
Then load the pretrained weights and move the model to the selected accelerator.

In [None]:
# Training Parameters
# JOB and RUN_NAME select the run folder: paths are built as runs/{JOB}/{RUN_NAME}/...
JOB: str = "detect"
RUN_NAME: str = "train"
# checkpoint file looked up under runs/{JOB}/{RUN_NAME}/weights/
CHECKPOINT: str = "last.pt"
# passed to train() as epochs=
EPOCHS: int = 20
# passed to train() as lr0=
LR: float = 1e-4
# passed to train() as imgsz=
IMG_SIZE: int = 640
# passed to train() as batch=
BATCH: int = 2
# passed to train() as optimizer=
OPTIMIZER: str = "AdamW"
# passed to train() as augment=
AUGMENT: bool = True

In [None]:
# pick the device to run on
accelerator, dtype = detectAccelerator()

# a checkpoint left by a previous run takes precedence over the pretrained weights
latest_checkpoint: str = f"runs/{JOB}/{RUN_NAME}/weights/{CHECKPOINT}"
resume = os.path.exists(latest_checkpoint)

if resume:
    weights_file = latest_checkpoint
else:
    # start from scratch: fetch the pretrained weights first when they are not on disk
    if not os.path.exists(YOLO_ORIGINAL_MODEL):
        print(f"Downloading model from huggingface... {YOLO_ORIGINAL_MODEL}")
        pullFromHuggingfaceHub(parms)
    weights_file = YOLO_ORIGINAL_MODEL

# build the model from its definition file, load the chosen weights, move it to the device
# NOTE(review): Ultralytics documents resuming as YOLO("path/to/last.pt").train(resume=True);
# confirm that weights loaded into a YAML-built model still carry the resume state.
print(f"Loading checkpoint {weights_file}...")
yolo_model = ultralytics.YOLO(YOLO_MODEL_FILE).load(weights_file)
yolo_model.to(accelerator)

In [None]:
# start training!
# Pass the detected accelerator explicitly so it is the device training runs on.
# CUDA is given as device index 0; "mps" and "cpu" are passed by name.
yolo_model.train(data=ds_descriptor_file,
                 epochs=EPOCHS, lr0=LR, imgsz=IMG_SIZE, batch=BATCH,
                 resume=resume, optimizer=OPTIMIZER, augment=AUGMENT,
                 device=0 if accelerator == "cuda" else accelerator
                )

# Training Review

At this step, the model has been finetuned and we can measure the outcomes: look into the training run folder for detailed graphs and information

In [None]:
# load results
# NOTE(review): assumes the run was written to runs/{JOB}/{RUN_NAME}/ - confirm that
# Ultralytics did not pick a different run folder for this training session.
results_path: str = f"runs/{JOB}/{RUN_NAME}/"

# print graphs
plt.figure(figsize=(20,10))
# figure-level title: plt.title() at this point would create an extra axes of its own
plt.suptitle("Training Results")
# left panel: labels.jpg, right panel: results.png, both permuted (C,H,W) -> (H,W,C)
plt.subplot(121)
plt.imshow(read_image(results_path + "labels.jpg").permute(1,2,0))
plt.subplot(122)
plt.imshow(read_image(results_path + "results.png").permute(1,2,0))

In [None]:
# Test Inference
# Run the model on the test image stored under the dataset's test/ folder.
prediction = yolo_model(dataset_path + "/test/mario_test.png")

In [None]:
# Print the class name and confidence of every object found in the prediction.
detectedObjects(prediction)

# Save finetuned model and cleanup

In [None]:
# convert checkpoint?
CONVERT_ONNX = True
if CONVERT_ONNX:
    yolo_model.export(format="onnx")

# export model to data path
import os
import shutil
# the destination directory is not created anywhere when training started from a
# previous run's checkpoint, so make sure it exists before copying
os.makedirs(YOLO_MODEL_PATH, exist_ok=True)
shutil.copy(latest_checkpoint, YOLO_MODEL_PATH)