In [36]:
import cv2
import numpy as np
import os
import os.path as osp
import sys
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Subset, DataLoader
from torch.utils.data import Dataset

import torchvision
from torchvision import tv_tensors
from torchvision.io import read_image
from torchvision.ops.boxes import masks_to_boxes
from torchvision.transforms import v2 as T
from torchvision.transforms.v2 import functional as F
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks

import matplotlib.pyplot as plt

import utils
from engine import train_one_epoch, evaluate

# Data preparation

In [3]:
# Create the download directory; -p makes this a no-op when it already exists.
!mkdir -p ./datasets/

In [4]:
!wget https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip -P ./datasets/

--2024-04-28 13:00:51--  https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip
Resolving www.cis.upenn.edu (www.cis.upenn.edu)... 158.130.69.163, 2607:f470:8:64:5ea5::d
Connecting to www.cis.upenn.edu (www.cis.upenn.edu)|158.130.69.163|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53723336 (51M) [application/zip]
Saving to: ‘PennFudanPed.zip’


2024-04-28 13:00:51 (101 MB/s) - ‘PennFudanPed.zip’ saved [53723336/53723336]

--2024-04-28 13:00:51--  http://./
Resolving . (.)... failed: No address associated with hostname.
wget: unable to resolve host address ‘.’
FINISHED --2024-04-28 13:00:51--
Total wall clock time: 0.6s
Downloaded: 1 files, 51M in 0.5s (101 MB/s)


In [None]:
# NOTE(review): a `!` command runs in its own subshell, so this `cd` does not
# change the notebook's working directory for any later cell.
!cd ./datasets

In [5]:
!unzip PennFudanPed.zip

Archive:  ./PennFudanPed.zip
   creating: ./datasets/PennFudanPed/
  inflating: ./datasets/PennFudanPed/added-object-list.txt  
   creating: ./datasets/PennFudanPed/Annotation/
  inflating: ./datasets/PennFudanPed/Annotation/FudanPed00001.txt  
  inflating: ./datasets/PennFudanPed/Annotation/FudanPed00002.txt  
  inflating: ./datasets/PennFudanPed/Annotation/FudanPed00003.txt  
  inflating: ./datasets/PennFudanPed/Annotation/FudanPed00004.txt  
  inflating: ./datasets/PennFudanPed/Annotation/FudanPed00005.txt  
  inflating: ./datasets/PennFudanPed/Annotation/FudanPed00006.txt  
  inflating: ./datasets/PennFudanPed/Annotation/FudanPed00007.txt  
  inflating: ./datasets/PennFudanPed/Annotation/FudanPed00008.txt  
  inflating: ./datasets/PennFudanPed/Annotation/FudanPed00009.txt  
  inflating: ./datasets/PennFudanPed/Annotation/FudanPed00010.txt  
  inflating: ./datasets/PennFudanPed/Annotation/FudanPed00011.txt  
  inflating: ./datasets/PennFudanPed/Annotation/FudanPed00012.txt  
  infla

# Dataset

In [30]:
class PennFudanDataset(Dataset):
    """Penn-Fudan pedestrian images paired with instance-segmentation targets.

    Each item is an ``(image, target)`` tuple. ``target`` is a dict with the
    keys ``boxes``, ``labels``, ``masks``, ``image_id``, ``area`` and
    ``iscrowd``, all derived from the colour-encoded mask file that matches
    the image.

    Args:
        root: dataset directory containing ``PNGImages`` and ``PedMasks``.
        transforms: callable applied as ``transforms(img, target)``, or
            ``None`` to return the sample untouched.
    """

    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # Both listings are sorted so that position i in one directory
        # corresponds to position i in the other.
        self.imgs = sorted(os.listdir(osp.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(osp.join(root, "PedMasks")))

    def __len__(self):
        # One sample per image file.
        return len(self.imgs)

    def __getitem__(self, idx):
        image_file = osp.join(self.root, "PNGImages", self.imgs[idx])
        mask_file = osp.join(self.root, "PedMasks", self.masks[idx])

        img = tv_tensors.Image(read_image(image_file))
        # The mask is kept as read (no RGB conversion): every distinct value
        # marks one instance, with 0 standing for background.
        instance_map = read_image(mask_file)

        # Drop the first unique value, which is taken to be the background id.
        instance_ids = torch.unique(instance_map)[1:]
        n_instances = len(instance_ids)

        # One binary mask per instance id, obtained by broadcasting the ids
        # against the colour-encoded mask.
        binary_masks = (instance_map == instance_ids[:, None, None]).to(dtype=torch.uint8)
        boxes = masks_to_boxes(binary_masks)

        # Boxes are XYXY, so columns 3-1 give the height and 2-0 the width.
        heights = boxes[:, 3] - boxes[:, 1]
        widths = boxes[:, 2] - boxes[:, 0]

        target = {
            "boxes": tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=F.get_size(img)),
            # single foreground class, so every instance gets label 1
            "labels": torch.ones((n_instances,), dtype=torch.int64),
            "masks": tv_tensors.Mask(binary_masks),
            "image_id": idx,
            "area": heights * widths,
            # no instance is flagged as a crowd
            "iscrowd": torch.zeros((n_instances,), dtype=torch.int64),
        }

        if self.transforms is None:
            return img, target

        img, target = self.transforms(img, target)
        return img, target

# Model

In [7]:
# Option 1: keep a COCO-pretrained Faster R-CNN and swap only its box head.
# Two output classes: background plus the single "person" class.
num_classes = 2
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
# The new predictor must accept the same feature width the old classifier consumed.
in_features = model.roi_heads.box_predictor.cls_score.in_features
# Install a fresh head sized for our classes in place of the pretrained one.
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:01<00:00, 139MB/s]


In [8]:
# Option 2: assemble a Faster R-CNN around a different backbone.
# Take only the feature extractor of a pre-trained classification network.
backbone = torchvision.models.mobilenet_v2(weights="DEFAULT").features
# FasterRCNN reads `out_channels` from the backbone; for mobilenet_v2 it is 1280.
backbone.out_channels = 1280

# One feature map, with 5 sizes x 3 aspect ratios = 15 anchors per spatial location.
anchor_generator = AnchorGenerator(
    sizes=((32, 64, 128, 256, 512),),
    aspect_ratios=((0.5, 1.0, 2.0),),
)

# Region-of-interest cropping: which feature map(s) to crop from and the
# size of each crop after rescaling.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(
    featmap_names=['0'],
    output_size=7,
    sampling_ratio=2,
)

# Combine backbone, anchor generator and RoI pooler into the detector.
model = FasterRCNN(
    backbone,
    num_classes=num_classes,
    rpn_anchor_generator=anchor_generator,
    box_roi_pool=roi_pooler,
)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-7ebf99e0.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-7ebf99e0.pth
100%|██████████| 13.6M/13.6M [00:00<00:00, 71.5MB/s]


In [9]:
def get_model_instance_segmentation(num_classes, hidden_layer=256):
    """Build a COCO-pretrained Mask R-CNN with fresh box and mask heads.

    Args:
        num_classes: number of output classes for both new heads
            (background counts as one of them).
        hidden_layer: channel width of the hidden layer inside the new mask
            predictor. Defaults to 256, the value that was previously
            hard-coded, so existing calls behave exactly as before.

    Returns:
        The ``maskrcnn_resnet50_fpn`` model with ``roi_heads.box_predictor``
        and ``roi_heads.mask_predictor`` replaced.
    """
    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")

    # The replacement box head must accept the same number of input features
    # as the classifier it replaces.
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # Likewise, the new mask head is sized from the input channels of the
    # existing mask predictor.
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                       hidden_layer,
                                                       num_classes)

    return model

In [28]:
# Fetch the torchvision detection reference helpers that the import cell loads
# via `import utils` and `from engine import train_one_epoch, evaluate`.
# On a fresh kernel this cell has to run before those imports can succeed.
REFERENCE_BASE_URL = "https://raw.githubusercontent.com/pytorch/vision/main/references/detection"
# transforms.py is intentionally not fetched; the notebook uses torchvision.transforms.v2.
REFERENCE_FILES = ("engine.py", "utils.py", "coco_utils.py", "coco_eval.py")

for helper_name in REFERENCE_FILES:
    # Skip files that are already present so a re-run does not create
    # engine.py.1, utils.py.1, ... next to the originals.
    if osp.exists(helper_name):
        continue
    exit_status = os.system(f"wget -q {REFERENCE_BASE_URL}/{helper_name}")
    # Surface download failures here instead of as a confusing ImportError later.
    if exit_status != 0:
        raise RuntimeError(f"failed to download {helper_name} (wget exit status {exit_status})")

0

In [31]:
# data augmentation
def get_transform(train):
    """Compose the preprocessing pipeline for training or evaluation.

    Args:
        train: when truthy, a random horizontal flip (p=0.5) is placed in
            front of the conversion steps.

    Returns:
        A ``T.Compose`` that optionally flips, then converts to scaled float
        and finally to a pure tensor.
    """
    # Augmentation is applied only while training.
    augmentations = [T.RandomHorizontalFlip(0.5)] if train else []
    conversions = [
        T.ToDtype(torch.float, scale=True),
        T.ToPureTensor(),
    ]
    return T.Compose(augmentations + conversions)

# Training

In [35]:
# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# two classes only - background and person
num_classes = 2

# Dataset location and split settings, named once so the train and test views
# cannot drift apart.
DATA_ROOT = '/content/datasets/PennFudanPed'
NUM_TEST_IMAGES = 50
SPLIT_SEED = 0

# Two views over the same files: the training view adds augmentation,
# the test view only converts dtype.
dataset = PennFudanDataset(DATA_ROOT, get_transform(train=True))
dataset_test = PennFudanDataset(DATA_ROOT, get_transform(train=False))

# Split the dataset into train and test sets. The permutation comes from a
# locally seeded generator so the same images are held out on every run;
# only the split is seeded, the rest of training stays stochastic.
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
indices = torch.randperm(len(dataset), generator=split_rng).tolist()
dataset = Subset(dataset, indices[:-NUM_TEST_IMAGES])
dataset_test = Subset(dataset_test, indices[-NUM_TEST_IMAGES:])

# define training data loaders
data_loader = DataLoader(
    dataset, batch_size=2, shuffle=True, num_workers=0,
    collate_fn=utils.collate_fn)

# define testing data loaders
data_loader_test = DataLoader(
    dataset_test, batch_size=1, shuffle=False, num_workers=0,
    collate_fn=utils.collate_fn)

# get the model using our helper function
model = get_model_instance_segmentation(num_classes)

# move model to the right device
model.to(device)

# construct an optimizer over the trainable parameters only
params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler: multiply the lr by 0.1 every 3 epochs
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# let's train it for 10 epochs
num_epochs = 10

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)

print("Finish training...")

Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /root/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
100%|██████████| 170M/170M [00:01<00:00, 142MB/s]


Epoch: [0]  [ 0/60]  eta: 0:03:15  lr: 0.000090  loss: 4.1325 (4.1325)  loss_classifier: 1.0344 (1.0344)  loss_box_reg: 0.2712 (0.2712)  loss_mask: 2.7927 (2.7927)  loss_objectness: 0.0323 (0.0323)  loss_rpn_box_reg: 0.0019 (0.0019)  time: 3.2664  data: 0.0231  max mem: 2107
Epoch: [0]  [10/60]  eta: 0:00:38  lr: 0.000936  loss: 1.9420 (2.3301)  loss_classifier: 0.6118 (0.5739)  loss_box_reg: 0.2401 (0.2562)  loss_mask: 0.9444 (1.4737)  loss_objectness: 0.0222 (0.0228)  loss_rpn_box_reg: 0.0030 (0.0034)  time: 0.7649  data: 0.0211  max mem: 3041
Epoch: [0]  [20/60]  eta: 0:00:25  lr: 0.001783  loss: 0.7643 (1.5267)  loss_classifier: 0.1943 (0.3759)  loss_box_reg: 0.1951 (0.2241)  loss_mask: 0.3628 (0.9009)  loss_objectness: 0.0189 (0.0208)  loss_rpn_box_reg: 0.0030 (0.0050)  time: 0.5008  data: 0.0191  max mem: 3041
Epoch: [0]  [30/60]  eta: 0:00:18  lr: 0.002629  loss: 0.6664 (1.2503)  loss_classifier: 0.1353 (0.2982)  loss_box_reg: 0.2121 (0.2473)  loss_mask: 0.2289 (0.6806)  loss_ob

# Inference

In [37]:
# Read one sample image and build the evaluation-time (no augmentation) pipeline.
sample_image_path = "/content/datasets/PennFudanPed/PNGImages/PennPed00034.png"
image = read_image(sample_image_path)
eval_transform = get_transform(train=False)

In [46]:
# Put the model in evaluation mode and run one forward pass without tracking gradients.
model.eval()
with torch.no_grad():
    # Preprocess, keep only the first three channels (RGBA -> RGB) and move to the device.
    x = eval_transform(image)[:3, ...].to(device)
    # The model takes a list of images and returns one prediction per image.
    predictions = model([x])
    pred = predictions[0]

In [48]:
# Stretch pixel values to the full 0-255 range and store them as uint8 for drawing.
pixel_min = image.min()
pixel_max = image.max()
image = (255.0 * (image - pixel_min) / (pixel_max - pixel_min)).to(torch.uint8)
# Keep only the first three channels.
image = image[:3, ...]

# One caption per detection; the class name is fixed, only the score varies.
pred_labels = [
    f"pedestrian: {score:.3f}"
    for _label, score in zip(pred["labels"], pred["scores"])
]
# Box coordinates are converted to integers before drawing.
pred_boxes = pred["boxes"].long()
output_image = draw_bounding_boxes(image, pred_boxes, pred_labels, colors="red")

# Binarise the predicted masks at 0.7 and drop dimension 1
# (presumably a singleton channel axis - TODO confirm).
masks = (pred["masks"] > 0.7).squeeze(1)
output_image = draw_segmentation_masks(output_image, masks, alpha=0.5, colors="blue")

KeyError: 'mask'

In [50]:
# List the fields available in the prediction dict for the sample image.
# NOTE(review): the saved output below is a tensor of scores, not dict keys,
# so it appears to come from an earlier version of this cell - re-run to refresh.
pred.keys()

tensor([0.5509, 0.5490, 0.5449, 0.5404, 0.5395, 0.5394, 0.5390, 0.5389, 0.5387,
        0.5378, 0.5377, 0.5370, 0.5367, 0.5364, 0.5362, 0.5360, 0.5345, 0.5344,
        0.5340, 0.5340, 0.5337, 0.5336, 0.5335, 0.5323, 0.5321, 0.5320, 0.5319,
        0.5316, 0.5316, 0.5316, 0.5315, 0.5302, 0.5301, 0.5301, 0.5299, 0.5297,
        0.5297, 0.5294, 0.5291, 0.5290, 0.5281, 0.5280, 0.5278, 0.5278, 0.5276,
        0.5275, 0.5273, 0.5269, 0.5267, 0.5266, 0.5263, 0.5260, 0.5259, 0.5255,
        0.5254, 0.5253, 0.5253, 0.5252, 0.5251, 0.5246, 0.5242, 0.5241, 0.5241,
        0.5241, 0.5239, 0.5239, 0.5237, 0.5236, 0.5236, 0.5236, 0.5234, 0.5233,
        0.5232, 0.5232, 0.5232, 0.5230, 0.5230, 0.5229, 0.5228, 0.5227, 0.5224,
        0.5224, 0.5223, 0.5223, 0.5223, 0.5223, 0.5222, 0.5221, 0.5221, 0.5220,
        0.5220, 0.5220, 0.5219, 0.5218, 0.5218, 0.5216, 0.5216, 0.5215, 0.5215,
        0.5215], device='cuda:0')