In [1]:
import kagglehub

# Fetch the dog-and-cat detection dataset through kagglehub and keep the
# returned location in `data_dir` for later cells.
data_dir = kagglehub.dataset_download(
    "andrewmvd/dog-and-cat-detection"
)

# Show where the files ended up.
print("Path to downloaded dataset:", data_dir)

  from .autonotebook import tqdm as notebook_tqdm


Path to downloaded dataset: /home/hongong/.cache/kagglehub/datasets/andrewmvd/dog-and-cat-detection/versions/1


In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import patches
from collections import Counter
import cv2
from glob import glob
from tqdm import tqdm
from termcolor import colored

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torchvision
from torchvision import transforms

import albumentations as A
from albumentations.pytorch import ToTensorV2

# Data preparation

In [4]:
class CustomVOCDataset(torchvision.datasets.VOCDetection):
    """VOC detection dataset that returns YOLO-style grid targets.

    Each item is ``(image, label_matrix)``. ``label_matrix`` has shape
    ``(S, S, C + 5 * B)``; for a grid cell that holds an object centre the
    last axis is filled as follows:

    * ``[0:C]``      one-hot class vector
    * ``[C]``        objectness flag (1 when an object is assigned to the cell)
    * ``[C+1:C+5]``  box as ``(x_cell, y_cell, width_cell, height_cell)``

    All other entries stay zero. Only the first object that falls into a cell
    is recorded; later objects landing in the same cell are ignored.
    """

    def __init__(
        self, class_mapping, S=7, B=2, C=20, custom_transforms=None, **voc_kwargs
    ):
        """
        Parameters:
            class_mapping (dict): mapping of class names to integer class indices.
            S (int): grid size (the target has S x S cells).
            B (int): number of bounding boxes per cell.
            C (int): number of classes.
            custom_transforms (callable, optional): called as
                ``custom_transforms(image=..., bboxes=..., labels=...)`` and
                expected to return a mapping with the same three keys.
            **voc_kwargs: keyword arguments forwarded to
                ``torchvision.datasets.VOCDetection`` (e.g. ``root``,
                ``year``, ``image_set``, ``download``).
        """
        # The base class has to be initialised before the dataset can be
        # indexed. It is only done when VOC arguments are supplied so that
        # existing calls without them keep constructing as before (such an
        # instance still cannot load samples).
        if voc_kwargs:
            super().__init__(**voc_kwargs)

        self.S = S  # Grid size S x S
        self.B = B  # Number of bounding boxes
        self.C = C  # Number of classes
        self.class_mapping = class_mapping  # Mapping of class names to class indices
        self.custom_transforms = custom_transforms

    def __getitem__(self, index):
        """Return ``(image, label_matrix)`` for the sample at ``index``."""
        # get an image and its target (annotations) from the VOC dataset
        image, target = super().__getitem__(index)
        img_width, img_height = image.size

        # convert target annotations to YOLO format bounding boxes;
        # convert_to_yolo_format is the module-level helper of this notebook,
        # not a method of the class
        yolo_boxes = convert_to_yolo_format(
            target, img_width, img_height, self.class_mapping
        )
        # split the class column from the box columns up front so both code
        # paths (with and without transforms) see 4-value boxes
        boxes = yolo_boxes[:, 1:]
        labels = yolo_boxes[:, 0]
        image = np.array(image)

        # transform
        if self.custom_transforms:
            sample = self.custom_transforms(image=image, bboxes=boxes, labels=labels)
            image = sample["image"]
            boxes = sample["bboxes"]
            labels = sample["labels"]

        # create an empty label matrix for YOLO ground truth
        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))
        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.float32)
        image = torch.as_tensor(image, dtype=torch.float32)

        # channel layout derived from C instead of being fixed to 20 classes
        objectness = self.C
        box_start, box_end = self.C + 1, self.C + 5

        # iterate through each bounding box in YOLO format
        for box, class_label in zip(boxes, labels):
            x, y, width, height = box.tolist()
            class_label = int(class_label)

            # row i / column j of the cell holding the box centre; clamp so a
            # centre lying exactly on the bottom/right edge stays inside the grid
            i = min(int(self.S * y), self.S - 1)
            j = min(int(self.S * x), self.S - 1)
            x_cell, y_cell = self.S * x - j, self.S * y - i

            # calculate the width and height of the box relative to the grid cell
            width_cell, height_cell = width * self.S, height * self.S

            # if no object has been found in this specific cell (i, j) before
            if label_matrix[i, j, objectness] == 0:
                # mark that an object exists in this cell
                label_matrix[i, j, objectness] = 1

                # store the box coordinates as an offset from the cell boundaries
                label_matrix[i, j, box_start:box_end] = torch.tensor(
                    [x_cell, y_cell, width_cell, height_cell]
                )

                # set the one-hot encoding for the class label
                label_matrix[i, j, class_label] = 1

        return image, label_matrix

In [6]:
def convert_to_yolo_format(target, img_width, img_height, class_mapping):
    """
    Convert annotation data from VOC format to YOLO format.

    Parameters:
        target (dict): annotation data from VOCDetection dataset.
        img_width (int): width of the loaded image; used only as a fallback
            when the annotation reports a width of 0.
        img_height (int): height of the loaded image; used only as a fallback
            when the annotation reports a height of 0.
        class_mapping (dict): mapping of class names to integer IDs. Names
            missing from the mapping are assigned class 0.

    Returns:
        np.ndarray: float array of shape [N, 5] for N bounding boxes, each row
            being [class_id, x_center, y_center, width, height] with the box
            values divided by the image size. The shape is [0, 5] when the
            annotation contains no objects.
    """
    annotation = target["annotation"]

    # Extract the list of annotations from the target dictionary; a missing
    # "object" entry is treated as "no objects".
    annotations = annotation.get("object", [])

    # ensure that annotations is a list, even if there's only one object
    if not isinstance(annotations, list):
        annotations = [annotations]

    # get the real width and height of the image from the annotation, falling
    # back to the size of the loaded image when the annotation says 0
    # (avoids a division by zero below)
    real_width = int(annotation["size"]["width"]) or img_width
    real_height = int(annotation["size"]["height"]) or img_height

    # initialize an empty list to store the converted bounding boxes
    boxes = []

    # loop through each annotation and convert it to YOLO format
    for anno in annotations:
        bndbox = anno["bndbox"]
        xmin = int(bndbox["xmin"]) / real_width
        xmax = int(bndbox["xmax"]) / real_width
        ymin = int(bndbox["ymin"]) / real_height
        ymax = int(bndbox["ymax"]) / real_height

        # calculate the center coordinates, width, and height of the bounding box
        x_center = (xmin + xmax) / 2
        y_center = (ymin + ymax) / 2
        width = xmax - xmin
        height = ymax - ymin

        # retrieve the class name from the annotation and map it to an integer ID
        class_id = class_mapping.get(anno["name"], 0)

        # append the YOLO formatted bbox to the list.
        boxes.append([class_id, x_center, y_center, width, height])

    # convert the list of boxes to a NumPy array; the reshape keeps the result
    # two-dimensional even when there are no boxes
    return np.array(boxes, dtype=np.float64).reshape(-1, 5)