# Gate Detection

[1] TorchVision Object Detection Finetuning Tutorial: <https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html>


## Defining the Dataset

In [15]:
from pathlib import Path
from PIL import Image
import torch
import numpy as np
import xml.etree.ElementTree as ET


class TelloDataset(torch.utils.data.Dataset):
    """Object-detection dataset backed by a single ``annotations.xml`` file.

    The directory ``base`` must contain ``annotations.xml`` and an ``images/``
    folder. Every ``<image name=...>`` element directly below the XML root is
    one sample. Its first ``<box>`` child supplies the bounding box (``xtl``,
    ``ytl``, ``xbr``, ``ybr`` attributes) and the class (``label``
    attribute); any further boxes on the same image are ignored.

    Args:
        base: Dataset directory (anything accepted by ``pathlib.Path``).
        transforms: Optional callable ``(image, target) -> (image, target)``
            applied to every sample returned by ``__getitem__``.
    """

    def __init__(self, base, transforms=None):
        self.base = Path(base)
        tree = ET.parse(self.base / "annotations.xml")
        self.annotations = tree.getroot()
        # Resolve the sample elements once so that indexing and len() do not
        # rescan the XML tree on every call, and so that both agree on what a
        # sample is (direct <image> children of the root).
        self._images = self.annotations.findall("image")
        # Class ids start at 1; 0 is left for the background class.
        self.labels = {
            "front": 1,
            "back": 2,
        }
        self.transforms = transforms

    def _element(self, index):
        """Return the ``<image>`` element of sample ``index``.

        Raises:
            KeyError: If ``index`` is outside ``range(len(self))``.
        """
        if not 0 <= index < len(self._images):
            raise KeyError(index)
        return self._images[index]

    def _box_element(self, index):
        """Return the first ``<box>`` child of sample ``index``.

        Raises:
            KeyError: If ``index`` is outside ``range(len(self))``.
            ValueError: If the image carries no ``<box>`` annotation.
        """
        box = self._element(index).find("box")
        # Compare against None explicitly: an Element without children is
        # falsy, so a plain truth test would reject valid elements.
        if box is None:
            raise ValueError(f"image {index} has no <box> annotation")
        return box

    def _image(self, index):
        """Load sample ``index`` from ``images/`` as an RGB ``PIL.Image``.

        Raises:
            KeyError: If ``index`` is outside ``range(len(self))``.
        """
        element = self._element(index)
        # convert() returns an independent in-memory image, so the underlying
        # file can be closed as soon as the conversion is done.
        with Image.open(self.base / "images" / element.get("name")) as source:
            return source.convert("RGB")

    def _box(self, index):
        """Return ``[xtl, ytl, xbr, ybr]`` of sample ``index`` as floats."""
        element = self._box_element(index)
        return [
            float(element.get(attribute)) for attribute in ["xtl", "ytl", "xbr", "ybr"]
        ]

    def _label(self, index):
        """Return the integer class id of sample ``index``.

        Raises:
            KeyError: If the box label is not a key of ``self.labels``.
        """
        label = self._box_element(index).get("label")
        return self.labels[label]

    def _target(self, index):
        """Build the target dictionary (all values are tensors) for ``index``."""
        boxes = torch.as_tensor([self._box(index)], dtype=torch.float32)
        labels = torch.as_tensor([self._label(index)], dtype=torch.int64)
        return {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([index]),
            # height * width of every box
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
            "iscrowd": torch.zeros((1,), dtype=torch.int64),
        }

    def __getitem__(self, index):
        """Return ``(image, target)`` for sample ``index``.

        ``target`` holds the keys ``boxes``, ``labels``, ``image_id``,
        ``area`` and ``iscrowd``. When ``transforms`` was given, the pair is
        passed through it before being returned.

        Raises:
            KeyError: If ``index`` is outside ``range(len(self))``.
        """
        image = self._image(index)
        target = self._target(index)

        if self.transforms is not None:
            image, target = self.transforms(image, target)

        return image, target

    def __len__(self):
        """Return the number of ``<image>`` samples in the annotation file."""
        return len(self._images)


# Load both splits without transforms so the raw samples can be inspected.
dataset = TelloDataset("data/train")
dataset_test = TelloDataset("data/test")

# Sanity check: report the size of each split ...
for split in (dataset, dataset_test):
    print(len(split))
# ... and show a few raw (image, target) pairs from the training split.
for sample_index in (0, 1, 14):
    print(dataset[sample_index])


94
38
(<PIL.Image.Image image mode=RGB size=960x720 at 0x7F3A19348670>, {'boxes': tensor([[251.7500, 154.1200, 561.7500, 385.0600]]), 'labels': tensor([1]), 'image_id': tensor([0]), 'area': tensor([71591.3984]), 'iscrowd': tensor([0])})
(<PIL.Image.Image image mode=RGB size=960x720 at 0x7F3A19348700>, {'boxes': tensor([[178.8000, 140.4200, 729.9100, 602.6400]]), 'labels': tensor([1]), 'image_id': tensor([1]), 'area': tensor([254734.0781]), 'iscrowd': tensor([0])})
(<PIL.Image.Image image mode=RGB size=960x720 at 0x7F3A19348250>, {'boxes': tensor([[369.7500, 272.5600, 784.0500, 690.6600]]), 'labels': tensor([2]), 'image_id': tensor([14]), 'area': tensor([173218.8125]), 'iscrowd': tensor([0])})


## Training and evaluation

In [16]:
from engine import train_one_epoch, evaluate
import utils

import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

import transforms as T


def get_transform(train):
    """Compose the preprocessing pipeline for one dataset split.

    The image is always turned into a tensor and converted to
    ``torch.float``. When ``train`` is truthy a random horizontal flip
    (constructed with ``0.5``) is appended.
    """
    steps = [T.PILToTensor(), T.ConvertImageDtype(torch.float)]
    if train:
        # augmentation is only wanted while training
        steps.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(steps)


def get_model(num_classes):
    """Return a Faster R-CNN (ResNet-50 FPN) whose box head predicts ``num_classes``.

    The network is created with ``weights="DEFAULT"`` and its box predictor
    is swapped for a fresh ``FastRCNNPredictor`` of the requested size.
    """
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

    # the new head must accept the same feature width as the one it replaces
    pretrained_head = detector.roi_heads.box_predictor
    feature_width = pretrained_head.cls_score.in_features
    detector.roi_heads.box_predictor = FastRCNNPredictor(feature_width, num_classes)

    return detector


def main():
    """Fine-tune the detector on ``data/train`` and evaluate it on ``data/test``.

    Runs 10 epochs of SGD; after every epoch the learning-rate scheduler is
    stepped and the model is evaluated on the test split.
    """
    # prefer CUDA when it is available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # three classes: background, front and back
    num_classes = 3

    # training loader: shuffled, with the training-time transforms
    train_set = TelloDataset("data/train", get_transform(train=True))
    data_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=2,
        shuffle=True,
        num_workers=4,
        collate_fn=utils.collate_fn,
    )

    # test loader: fixed order, one image per batch
    test_set = TelloDataset("data/test", get_transform(train=False))
    data_loader_test = torch.utils.data.DataLoader(
        test_set,
        batch_size=1,
        shuffle=False,
        num_workers=4,
        collate_fn=utils.collate_fn,
    )

    # build the network and place it on the selected device
    model = get_model(num_classes)
    model.to(device)

    # optimise only the parameters that require gradients
    trainable = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = torch.optim.SGD(trainable, lr=0.005, momentum=0.9, weight_decay=0.0005)
    # multiply the learning rate by 0.1 after every 3 scheduler steps
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    num_epochs = 10
    for epoch in range(num_epochs):
        # one pass over the training data, logging every 10 iterations
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
        # advance the learning-rate schedule once per epoch
        lr_scheduler.step()
        # measure performance on the held-out split
        evaluate(model, data_loader_test, device=device)

    print("That's it!")


# Run the complete training and evaluation loop when this cell is executed.
main()


Epoch: [0]  [ 0/47]  eta: 0:00:48  lr: 0.000114  loss: 1.6247 (1.6247)  loss_classifier: 1.3354 (1.3354)  loss_box_reg: 0.1590 (0.1590)  loss_objectness: 0.1275 (0.1275)  loss_rpn_box_reg: 0.0028 (0.0028)  time: 1.0302  data: 0.3732  max mem: 6749
