## Setup

To start, we need to install FiftyOne:

*If you're working in Google Colab, be sure to [enable a GPU runtime](https://colab.research.google.com/drive/1P7okDVh6viCIOkii6UAF2O9sTAcKGNWq) before running any cell*

In [None]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
%pip install git+https://github.com/deepvision-class/starter-code

In [None]:
%pip install fiftyone

We'll also need PyTorch and torchvision, and we need to clone the torchvision GitHub repository to use the training and evaluation utilities provided for the [Torchvision Object Detection Tutorial](https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html#defining-the-dataset) that we are using to train a basic object detection model.

In [None]:
%%sh

# Fetch the torchvision sources; only the detection reference helpers are used.
git clone https://github.com/pytorch/vision.git
cd vision
# Pin the release the helper scripts are copied from.
git checkout v0.3.0

# Copy the helper modules next to the notebook so that `import utils`,
# `import transforms` and `from coco_utils import ...` resolve.
cp references/detection/utils.py ../
cp references/detection/transforms.py ../
# engine.py and coco_eval.py are currently not copied; coco_eval.py would need
# its `torch._six` import replaced by `six` before it can be used.
# cp references/detection/engine.py ../
# cp references/detection/coco_eval.py ../ #change the torch._six to six
cp references/detection/coco_utils.py ../

In [None]:
import os
import shutil

In [None]:
import math
import sys
import time
import torch
import json
import tempfile
import numpy as np
import copy
import torch
import six
import utils
import random
import matplotlib.pyplot as plt
import cv2

from torchvision import models
from torchsummary import summary
import torch.nn as nn
from torchvision import transforms
from PIL import Image
from pycocotools.cocoeval import COCOeval
from pycocotools.coco import COCO
from coco_utils import get_coco_api_from_dataset
from collections import defaultdict
import pycocotools.mask as mask_util
import fiftyone as fo
import fiftyone.zoo as foz
import fiftyone.utils.coco as fouc
import torch.nn.functional as F
from torch import optim
import transforms as T
from fiftyone import ViewField 

In [None]:
torch.manual_seed(1)

# FiftyOne and Load the dataset

In [None]:
# Object categories kept from COCO-2017; the same list is passed to the
# torch dataset wrappers below.
classes = ["bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl"]
#bottle,fork,knife,spoon

# Download (or reuse) a small COCO-2017 subset restricted to `classes`.
# NOTE(review): `dataset_dir` is a machine-specific path -- adjust it before
# running on another machine.
# NOTE(review): `max_samples=100` presumably caps each split separately --
# confirm against the FiftyOne zoo documentation.
dataset = foz.load_zoo_dataset(
    "coco-2017",
    splits=["validation","train"],
    classes=classes,
    max_samples=100,
    only_matching=True,
    dataset_dir="/C/Users/Bliss/Desktop",
    dataset_name="Open-c"
)

We will be needing the height and width of images later in this notebook so we need to compute metadata on our dataset.

In [None]:
# dataset.compute_metadata()

We can create a session and visualize this dataset in the [FiftyOne App](https://voxel51.com/docs/fiftyone/user_guide/app.html).

In [None]:
session = fo.launch_app(dataset)

In [None]:
class FiftyOneTorchDataset(torch.utils.data.Dataset):
    """A class to construct a PyTorch dataset from a FiftyOne dataset.

    Each item is an ``(image, target)`` pair where ``target`` is a dict with the
    keys ``boxes`` (N x 4, ``x1, y1, x2, y2`` in pixels), ``labels``,
    ``image_id``, ``area`` and ``iscrowd``.

    Args:
        fiftyone_dataset: a FiftyOne dataset or view that will be used for training or testing
        transforms (None): a list of PyTorch transforms to apply to images and targets when loading
        gt_field ("ground_truth"): the name of the field in fiftyone_dataset that contains the
            desired labels to load
        classes (None): a list of class strings that are used to define the mapping between
            class names and indices. If None, it will use all classes present in the given fiftyone_dataset.
    """

    def __init__(
        self,
        fiftyone_dataset,
        transforms=None,
        gt_field="ground_truth",
        classes=None,
    ):
        self.samples = fiftyone_dataset
        self.transforms = transforms
        self.gt_field = gt_field

        # Cache the file paths once; they double as the lookup keys in __getitem__.
        self.img_paths = self.samples.values("filepath")

        self.classes = classes
        if not self.classes:
            # Get list of distinct labels that exist in the view
            self.classes = self.samples.distinct(
                "%s.detections.label" % gt_field
            )

        # Index 0 is reserved for the background class. Concatenation builds a
        # new list, so the caller's `classes` list is never mutated.
        if self.classes[0] != "background":
            self.classes = ["background"] + self.classes

        # class name -> integer label
        self.labels_map_rev = {c: i for i, c in enumerate(self.classes)}

    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        sample = self.samples[img_path]
        metadata = sample.metadata
        img = Image.open(img_path).convert("RGB")

        boxes = []
        labels = []
        area = []
        iscrowd = []
        # A sample without labels presumably stores None in the field
        # (TODO confirm); treat it like an image with no objects.
        gt = sample[self.gt_field]
        detections = gt.detections if gt is not None else []
        for det in detections:
            category_id = self.labels_map_rev[det.label]
            coco_obj = fouc.COCOObject.from_label(
                det, metadata, category_id=category_id,
            )
            # COCO boxes are (x, y, w, h); convert to corner format.
            x, y, w, h = coco_obj.bbox
            boxes.append([x, y, x + w, y + h])
            labels.append(coco_obj.category_id)
            area.append(coco_obj.area)
            iscrowd.append(coco_obj.iscrowd)

        target = {}
        # reshape keeps the (N, 4) layout even when N == 0; without it an
        # object-free image would produce a tensor of shape (0,).
        target["boxes"] = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        target["labels"] = torch.as_tensor(labels, dtype=torch.int64)
        target["image_id"] = torch.as_tensor([idx])
        target["area"] = torch.as_tensor(area, dtype=torch.float32)
        target["iscrowd"] = torch.as_tensor(iscrowd, dtype=torch.int64)

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        return len(self.img_paths)

    def get_classes(self):
        """Return the class list, with "background" at index 0."""
        return self.classes

The following code loads Faster-RCNN with a ResNet50 backbone from Torchvision and modifies the classifier for the number of classes we are training on:

In [None]:
# Keep only "busy" samples: those whose ground truth holds more than 10 detections.
busy_view = dataset.match(ViewField("ground_truth.detections").length() > 10)

# Wrap the view as a PyTorch dataset (no transforms; classes inferred from the view).
busy_torch_dataset = FiftyOneTorchDataset(busy_view)

In [None]:
session.view = busy_view

In [None]:
# `T` is the torchvision reference `transforms.py` copied during setup; its
# transforms take and return an (image, target) pair.
# Training adds a horizontal flip (argument 0.5, presumably the flip probability).
train_transforms = T.Compose([T.ToTensor(), T.RandomHorizontalFlip(0.5)])
test_transforms = T.Compose([T.ToTensor()])

In [None]:
# split the dataset in train and test set
# NOTE(review): the dataset is loaded with max_samples=100, so `busy_view` may
# hold fewer than 500 samples; in that case `take(size=500)` presumably returns
# all of them and `test_view` ends up empty -- confirm and lower the size.
train_view = busy_view.take(size=500)
# Everything not drawn for training becomes the test set.
test_view = busy_view.exclude([s.id for s in train_view])

In [None]:
# use our dataset and defined transformations
# Passing the same `classes` list to both wrappers keeps the label indices
# identical between the train and test datasets.
train_dataset = FiftyOneTorchDataset(train_view, train_transforms,
        classes=classes)
test_dataset = FiftyOneTorchDataset(test_view, test_transforms,
        classes=classes)

## Visualization



In this section, we use the functions and datasets we defined above to initialize, train, and evaluate a model

In [None]:
def coord_trans(bbox, w_pixel, h_pixel, w_amap=7, h_amap=7, mode='a2p'):
  """
  Convert box coordinates between pixel space and activation-map space.

  Inputs:
  - bbox: Tensor whose last dimension holds at least 4 values; the first four
    are treated as (x_tl, y_tl, x_br, y_br). The first dimension is normally
    the batch dimension, but any leading dimension works when the image sizes
    are scalars. Entries equal to -1 mark padding and are left at -1.
  - w_pixel, h_pixel: image width/height in pixels; either a tensor with one
    value per element of bbox's first dimension, a 0-d tensor, or a Python
    number.
  - w_amap, h_amap: width/height of the activation map (number of grid cells).
  - mode: 'p2a' (pixel -> activation) or 'a2p' (activation -> pixel).

  Outputs:
  - resized_bbox: A new tensor with the same shape as bbox; the input is not
    modified. An empty bbox (first dimension 0) is returned as is.
  """
  assert mode in ('p2a', 'a2p'), 'invalid coordinate transformation mode!'
  assert bbox.shape[-1] >= 4, 'the transformation is applied to the first 4 values of dim -1'

  if bbox.shape[0] == 0: # corner cases
    return bbox

  # Flatten everything between the first and the last dim so one broadcast
  # against a (B, 1, 1) ratio handles every input layout.
  resized_bbox = bbox.clone().view(bbox.shape[0], -1, bbox.shape[-1])
  invalid_bbox_mask = (resized_bbox == -1) # indicating invalid bbox

  # Pixels per activation cell. as_tensor lets callers pass plain numbers as
  # well as tensors, and keeps the ratio on the same device as the boxes.
  width_ratio = (torch.as_tensor(w_pixel, device=bbox.device) * 1. / w_amap).view(-1, 1, 1)
  height_ratio = (torch.as_tensor(h_pixel, device=bbox.device) * 1. / h_amap).view(-1, 1, 1)

  if mode == 'p2a':
    # pixel to activation
    resized_bbox[:, :, [0, 2]] /= width_ratio
    resized_bbox[:, :, [1, 3]] /= height_ratio
  else:
    # activation to pixel
    resized_bbox[:, :, [0, 2]] *= width_ratio
    resized_bbox[:, :, [1, 3]] *= height_ratio

  # Restore the padding marker that the scaling above overwrote.
  resized_bbox.masked_fill_(invalid_bbox_mask, -1)
  return resized_bbox.reshape(bbox.shape)

In [None]:
# Naeimeh added to debug
def rel_error(x, y):
    """Return the largest element-wise relative difference between x and y."""
    abs_diff = (x - y).abs()
    # The small constant keeps the ratio finite where both inputs are zero.
    scale = torch.maximum(x.abs(), y.abs()) + 1e-8
    return (abs_diff / scale).max()

In [None]:
def fix_random_seed(seed_no=0):
  """Seed the torch CPU, torch CUDA and Python `random` generators with seed_no."""
  for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, random.seed):
    seed_fn(seed_no)

In [None]:
# Matplotlib defaults used by every figure in this notebook.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),      # default size of plots
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

# Keyword bundles for tensor creation/conversion, one per (dtype, device) pair;
# used as `torch.tensor(x, **to_float_cuda)` or `module.to(**to_float_cuda)`.
to_float = dict(dtype=torch.float, device='cpu')
to_float_cuda = dict(dtype=torch.float, device='cuda')
to_double = dict(dtype=torch.double, device='cpu')
to_double_cuda = dict(dtype=torch.double, device='cuda')
to_long = dict(dtype=torch.long, device='cpu')
to_long_cuda = dict(dtype=torch.long, device='cuda')

In [None]:
# Add this code because I want to follow the RCNN tutorial

def img_PIL(img):
  """
  Convert an image tensor to a PIL image.

  Inputs:
  - img: Tensor of shape (C, H, W) with values expected in [0, 1] (the output
    of a ToTensor transform). It may live on any device and may track gradients.

  Outputs:
  - pil_image: PIL image built from the (H, W, C) uint8 array.
  """
  # .numpy() only works on CPU tensors that do not require grad.
  hwc = img.detach().cpu().permute(1, 2, 0)
  # Clamp before scaling so out-of-range values saturate instead of wrapping
  # around in the uint8 cast.
  numpy_array = hwc.clamp(0, 1).mul(255).byte().numpy()
  pil_image = Image.fromarray(numpy_array)
  return pil_image

# Get a sample subset to visualize the code



In [None]:
# default examples for visualization
fix_random_seed(1)
batch_size = 3
# evenly spaced sample indices across the training set
sampled_idx = torch.linspace(0, len(train_dataset)-1, steps=batch_size).long()
h_list = []   # image heights in pixels
w_list = []   # image widths in pixels
img_list = [] # list of PIL images
MAX_NUM_BBOX = 50
# PADDED GT boxes, -1 marks an unused slot. Stored as float so that
# non-integer box coordinates are not truncated.
box_list = torch.full((batch_size, MAX_NUM_BBOX, 4), -1.)

for idx, i in enumerate(sampled_idx):
  # the dataset returns a tensor image; convert it back to PIL for drawing
  img, target = train_dataset[int(i)]
  img = img_PIL(img)
  img_list.append(img)

  # target['boxes'] is an (N, 4) tensor; keep at most MAX_NUM_BBOX rows so the
  # padded buffer cannot overflow
  all_bbox = target['boxes'][:MAX_NUM_BBOX]
  num_bbox = all_bbox.shape[0]
  if num_bbox > 0:
    box_list[idx, :num_bbox] = all_bbox

  # get sizes
  w_list.append(img.width)
  h_list.append(img.height)

w_list = torch.tensor(w_list, **to_float_cuda)
h_list = torch.tensor(h_list, **to_float_cuda)
# box_list is already a tensor: move/cast it instead of re-wrapping it
box_list = box_list.to(**to_float_cuda)
# GT boxes expressed in activation-map (7x7 grid) coordinates
resized_box_list = coord_trans(box_list, w_list, h_list, mode='p2a')

In [None]:
def _draw_boxes(img_copy, boxes, idx_to_class, color, with_score=False):
    """Draw every row of `boxes` on `img_copy` (in place) as a rectangle plus optional caption."""
    has_class = boxes.shape[1] > 4
    # A confidence score can only be shown when a sixth column really exists.
    has_score = with_score and boxes.shape[1] > 5
    for box_idx in range(boxes.shape[0]):
        row = boxes[box_idx]
        x1, y1, x2, y2 = (int(v) for v in row[:4].int())  # Ensure integer type
        cv2.rectangle(img_copy, (x1, y1), (x2, y2), color, 2)
        if not has_class:
            continue
        obj_cls = idx_to_class[row[4].item()]
        if has_score:
            caption = '%s, %.2f' % (obj_cls, row[5].item())
        else:
            caption = '%s' % obj_cls
        cv2.putText(img_copy, caption, (x1, y1 + 15), cv2.FONT_HERSHEY_PLAIN, 1.0, (0, 0, 255), thickness=1)


def data_visualizer(img, idx_to_class, bbox=None, pred=None):
    """
    Show an image with ground-truth and/or predicted boxes drawn on top.

    Inputs:
    - img: Image convertible with np.array (e.g. a PIL image); it is copied,
      not modified.
    - idx_to_class: Mapping from class index to class name.
    - bbox: Optional tensor of shape (N, 4) or (N, 5): x_tl, y_tl, x_br, y_br
      and optionally a class index. Drawn with colour (255, 0, 0).
    - pred: Optional tensor of shape (M, 4), (M, 5) or (M, 6): box, optional
      class index, optional confidence score. Drawn with colour (0, 255, 0).

    Outputs:
    - None; the annotated image is displayed with matplotlib.
    """
    img_copy = np.array(img).astype('uint8')

    if bbox is not None:
        _draw_boxes(img_copy, bbox, idx_to_class, (255, 0, 0))

    if pred is not None:
        _draw_boxes(img_copy, pred, idx_to_class, (0, 255, 0), with_score=True)

    plt.imshow(img_copy)
    plt.axis('off')
    plt.show()

In [None]:
# Show the sampled images with their ground-truth boxes.
class_to_idx = {"bottle":0, "wine glass":1, "cup":2, "fork":3, "knife":4,
                "spoon":5, "bowl":6}
# inverse lookup: class index -> class name
idx_to_class = dict((index, name) for name, index in class_to_idx.items())
i = 0
while i < len(img_list):
  # padded rows carry -1 in every column; count the real boxes via column 0
  valid_box = 0
  for x_tl in box_list[i][:, 0]:
    if x_tl != -1:
      valid_box += 1
  data_visualizer(img_list[i], idx_to_class, box_list[i][:valid_box])
  i += 1

# RPN


In [None]:
# Nine anchor shapes, one (width, height) pair per row, measured in
# activation-map cells (GenerateAnchor adds +/- half of each value to the
# grid centers).
anchor_list = torch.tensor([[1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [2, 3], [3, 2], [3, 5], [5, 3]], **to_float_cuda)
print(anchor_list.shape)

## Activated (positive) and negative anchors
When training the RPN, we compare the anchor boxes with the ground-truth boxes in order to determine a ground-truth label for the anchor boxes -- should each anchor predict object or background?

We assign a positive label to two kinds of anchors:

(i) the anchor/anchors with the highest Intersection-over-Union (IoU) overlap with a ground-truth box, or

(ii) an anchor that has an IoU overlap higher than 0.7 with any ground-truth box. Note that a single ground-truth box may assign positive labels to multiple anchors.

Usually the second condition is sufficient to determine the positive samples; but we still adopt the first condition for the reason that in some rare cases the second condition may find no positive sample.

We assign a negative label to a non-positive anchor if its IoU ratio is lower than 0.3 for all ground-truth boxes. Anchors that are neither positive nor negative do not contribute to the training objective

In [None]:
#helper function
def IoU(proposals, bboxes):
  """
  Pairwise intersection-over-union between proposals and ground-truth boxes.

  Inputs:
  - proposals: Tensor of shape (B, A, H, W, 4) holding x_tl, y_tl, x_br, y_br.
  - bboxes: Tensor of shape (B, N, 4+); only the first four values of each
    box are used.

  Outputs:
  - iou_mat: Tensor of shape (B, A*H*W, N); entry (b, i, n) is the IoU of
    proposal i and box n in image b.
  """
  B, A, H, W, _ = proposals.shape
  flat = proposals.reshape(B, A * H * W, 4)

  # corners of the overlap rectangle for every (proposal, box) pair
  overlap_tl = torch.max(flat[..., :2].unsqueeze(2), bboxes[..., :2].unsqueeze(1))
  overlap_br = torch.min(flat[..., 2:].unsqueeze(2), bboxes[..., 2:4].unsqueeze(1))

  # the overlap only counts when it has positive extent along both axes
  has_overlap = (overlap_tl < overlap_br).all(dim=3)
  overlap_area = (overlap_br - overlap_tl).prod(dim=3) * has_overlap

  box_area = (bboxes[..., 2:4] - bboxes[..., :2]).prod(dim=2)
  proposal_area = (flat[..., 2:] - flat[..., :2]).prod(dim=2)
  union_area = box_area.unsqueeze(1) + proposal_area.unsqueeze(2) - overlap_area

  return overlap_area / union_area



def GenerateGrid(batch_size, w_amap=7, h_amap=7, dtype=torch.float32, device='cuda'):
  """
  Build the cell-center coordinates of the activation map for a whole batch.

  Inputs:
  - batch_size: B, number of copies of the grid to return
  - w_amap: W', number of grid cells along the horizontal axis
  - h_amap: H', number of grid cells along the vertical axis
  - dtype, device: type and placement of the returned tensor

  Outputs:
  - grid: Tensor of shape (B, H', W', 2); grid[b, y, x] = (x + 0.5, y + 0.5),
    i.e. the center of cell (x, y) in activation-map units.
  """
  x_centers = torch.arange(w_amap, dtype=dtype, device=device) + 0.5
  y_centers = torch.arange(h_amap, dtype=dtype, device=device) + 0.5

  # broadcast the 1-D center lists over the full H' x W' lattice
  x_plane = x_centers.view(1, w_amap).expand(h_amap, w_amap)
  y_plane = y_centers.view(h_amap, 1).expand(h_amap, w_amap)

  single_grid = torch.stack((x_plane, y_plane), dim=-1)
  return single_grid.unsqueeze(0).repeat(batch_size, 1, 1, 1)



def GenerateAnchor(anc, grid):
    """
    Place every anchor shape at every grid center.

    Inputs:
    - anc: Tensor of shape (A, 2) giving the (width, height) of each anchor.
    - grid: Tensor of shape (B, H, W, 2) giving the (x, y) center of each cell.

    Outputs:
    - anchors: Tensor of shape (B, A, H, W, 4) with the dtype and device of
      grid; each entry is (x_tl, y_tl, x_br, y_br) of one anchor centered on
      one grid cell.
    """
    B, H, W, _ = grid.shape
    A, _ = anc.shape

    # half width / half height per anchor, shaped to broadcast over (B, A, H, W, 2)
    half_extent = (anc[:, :2] / 2).to(device=grid.device, dtype=grid.dtype)
    half_extent = half_extent.view(1, A, 1, 1, 2)
    centers = grid[..., :2].unsqueeze(1)

    top_left = centers - half_extent
    bottom_right = centers + half_extent
    return torch.cat((top_left, bottom_right), dim=-1)


def ReferenceOnActivatedAnchors(anchors, bboxes, grid, iou_mat, pos_thresh=0.7, neg_thresh=0.3):
    """
    Select positive (activated) and negative anchors and build regression targets.

    Inputs:
    - anchors: Tensor of shape (B, A, H', W', 4), x_tl, y_tl, x_br, y_br.
    - bboxes: Tensor of shape (B, N, 4+); only the first four values are used.
    - grid: Not used by this function (kept in the signature).
    - iou_mat: Tensor of shape (B, A*H'*W', N), IoU of every anchor with every box.
    - pos_thresh: An anchor with IoU above this value for any box is positive.
    - neg_thresh: An anchor whose best IoU is below this value is a negative
      candidate.

    Outputs (M = number of positive anchors):
    - activated_anc_ind: Indices of the positive anchors into the flattened
      (B*A*H'*W') anchor list, shape (M,).
    - negative_anc_ind: M indices drawn at random, with replacement, from the
      negative candidates.
    - GT_conf_scores: Best IoU of each positive anchor, shape (M,).
    - GT_offsets: Targets (t_x, t_y, t_w, t_h) of each positive anchor relative
      to its best-matching box, shape (M, 4).
    - activated_anc_coord, negative_anc_coord: Coordinates of the selected
      anchors, shape (M, 4) each.

    Side effects: prints the number of positive anchors and consumes the torch
    random generator (torch.randint).
    """
    B, A, h_amap, w_amap, _ = anchors.shape
    N = bboxes.shape[1]

    # activated/positive anchors
    # best box (value and index) for every anchor
    max_iou_per_anc, max_iou_per_anc_ind = iou_mat.max(dim=-1)
    # best anchor IoU for every box; anchors that reach it (with IoU > 0) are positive
    max_iou_per_box = iou_mat.max(dim=1, keepdim=True)[0]
    activated_anc_mask = (iou_mat == max_iou_per_box) & (max_iou_per_box > 0)
    activated_anc_mask |= (iou_mat > pos_thresh) # using the pos_thresh condition as well
    # if an anchor matches multiple GT boxes, choose the box with the largest iou
    activated_anc_mask = activated_anc_mask.max(dim=-1)[0] # Bx(AxH’xW’)
    activated_anc_ind = torch.nonzero(activated_anc_mask.view(-1)).squeeze(-1)

    # GT conf scores
    GT_conf_scores = max_iou_per_anc[activated_anc_mask] # M

    # # GT class
    # box_cls = bboxes[:, :, 4].view(B, 1, N).expand((B, A*h_amap*w_amap, N))
    # GT_class = torch.gather(box_cls, -1, max_iou_per_anc_ind.unsqueeze(-1)).squeeze(-1) # M
    # GT_class = GT_class[activated_anc_mask].long()

    # for every anchor pick the coordinates of its best-matching box, then keep
    # only the rows that belong to positive anchors
    bboxes_expand = bboxes[:, :, :4].view(B, 1, N, 4).expand((B, A*h_amap*w_amap, N, 4))
    bboxes = torch.gather(bboxes_expand, -2, max_iou_per_anc_ind.unsqueeze(-1) \
      .unsqueeze(-1).expand(B, A*h_amap*w_amap, 1, 4)).view(-1, 4)
    bboxes = bboxes[activated_anc_ind]

    print('number of pos proposals: ', activated_anc_ind.shape[0])
    activated_anc_coord = anchors.view(-1, 4)[activated_anc_ind]

    # GT offsets
    # bbox and anchor coordinates are x_tl, y_tl, x_br, y_br
    # offsets are t_x, t_y, t_w, t_h
    # t_w, t_h: log of the box size divided by the anchor size
    wh_offsets = torch.log((bboxes[:, 2:4] - bboxes[:, :2]) \
      / (activated_anc_coord[:, 2:4] - activated_anc_coord[:, :2]))

    # t_x, t_y: center displacement ...
    xy_offsets = (bboxes[:, :2] + bboxes[:, 2:4] - \
      activated_anc_coord[:, :2] - activated_anc_coord[:, 2:4]) / 2.

    # ... normalised by the anchor size
    xy_offsets /= (activated_anc_coord[:, 2:4] - activated_anc_coord[:, :2])

    GT_offsets = torch.cat((xy_offsets, wh_offsets), dim=-1)

    # negative anchors
    negative_anc_mask = (max_iou_per_anc < neg_thresh) # Bx(AxH’xW’)
    negative_anc_ind = torch.nonzero(negative_anc_mask.view(-1)).squeeze(-1)
    # sample as many negatives as there are positives (with replacement)
    # NOTE(review): torch.randint raises when there are no negative candidates.
    negative_anc_ind = negative_anc_ind[torch.randint(0, negative_anc_ind.shape[0], (activated_anc_ind.shape[0],))]
    negative_anc_coord = anchors.view(-1, 4)[negative_anc_ind.view(-1)]

    # activated_anc_coord and negative_anc_coord are mainly for visualization purposes
    return activated_anc_ind, negative_anc_ind, GT_conf_scores, GT_offsets, \
          activated_anc_coord, negative_anc_coord

In [None]:
# visualization
# simply create an activation grid where the cells are in green and the centers in red
# you should see the entire image divided by a 7x7 grid, with no gaps on the edges

# one 7x7 grid of cell centers per sampled image
grid_list = GenerateGrid(w_list.shape[0])

# duplicate (x, y) into (x, y, x, y): each center becomes a zero-size box
center = torch.cat((grid_list, grid_list), dim=-1)
# grow the zero-size boxes by half a cell on every side to get the cell outlines
grid_cell = center.clone()
grid_cell[:, :, :, [0, 1]] -= 1. / 2.
grid_cell[:, :, :, [2, 3]] += 1. / 2.
# default mode 'a2p': activation-map coordinates -> pixel coordinates
center = coord_trans(center, w_list, h_list)
grid_cell = coord_trans(grid_cell, w_list, h_list)

for img, anc, grid in zip(img_list, center, grid_cell):
  # centers go in as `bbox`, cell outlines as `pred`
  data_visualizer(img, idx_to_class, anc.reshape(-1, 4), grid.reshape(-1, 4))

In [None]:
# Fix the seed: ReferenceOnActivatedAnchors samples the negative anchors at random.
fix_random_seed(0)

# grid centers -> anchors -> IoU with the GT boxes (all in activation-map units)
grid_list = GenerateGrid(w_list.shape[0])
anc_list = GenerateAnchor(anchor_list, grid_list)
iou_mat = IoU(anc_list, resized_box_list)
# positive/negative anchor indices, their targets and their coordinates
activated_anc_ind, negative_anc_ind, GT_conf_scores, GT_offsets, \
  activated_anc_coord, negative_anc_coord = ReferenceOnActivatedAnchors(anc_list, resized_box_list, grid_list, iou_mat)

In [None]:
# Draw the positive and the negative anchors of every sampled image.
# Number of anchors per image (A * H' * W'); anchor indices are global over the
# batch, so image `idx` owns the range [idx * anc_per_img, (idx+1) * anc_per_img).
anc_per_img = torch.prod(torch.tensor(anc_list.shape[1:-1]))

anchor_groups = (
  ('Activated (positive) anchors:', '{} activated anchors!', activated_anc_ind, activated_anc_coord),
  ('Negative anchors:', '{} negative anchors!', negative_anc_ind, negative_anc_coord),
)
for heading, count_msg, group_ind, group_coord in anchor_groups:
  print('*'*80)
  print(heading)
  for idx in torch.arange(box_list.shape[0]):
    img, bbox = img_list[idx], box_list[idx]
    first, last = idx * anc_per_img, (idx+1) * anc_per_img
    anc_ind_in_img = (group_ind >= first) & (group_ind < last)
    print(count_msg.format(torch.sum(anc_ind_in_img)))
    # anchors live in activation-map units; convert them to pixels for drawing
    anc_in_pixels = coord_trans(group_coord[anc_ind_in_img], w_list[idx], h_list[idx])
    data_visualizer(img, idx_to_class, bbox[:, :4], anc_in_pixels)

In [None]:
# Draw the anchors twice: first only those of the central grid cell, then all
# of them. `anc_list` ends up holding the full anchor set, as before.
anchor_views = (
  ('All nine anchors should be exactly centered:', grid_list[:, 3:4, 3:4]),
  ('All anchors of the image (cluttered):', grid_list),
)
for heading, grid_subset in anchor_views:
  print('*'*80)
  print(heading)
  anc_list = GenerateAnchor(anchor_list, grid_subset)
  # default mode 'a2p': activation-map coordinates -> pixel coordinates
  anc_in_pixels = coord_trans(anc_list, w_list, h_list)
  for img, anc in zip(img_list, anc_in_pixels):
    print(anc.shape)
    data_visualizer(img, idx_to_class, anc.reshape(-1, 4))

### TRAINING RPN

In [None]:
def GenerateProposal(anchors, offsets):
  """
  Apply predicted offsets to anchors to obtain proposal boxes.

  Inputs:
  - anchors: Tensor of shape (B, A, H, W, 4), x_tl, y_tl, x_br, y_br.
  - offsets: Tensor of the same shape holding (t_x, t_y, t_w, t_h).

  Outputs:
  - proposals: Tensor shaped like anchors, x_tl, y_tl, x_br, y_br. The center
    moves by (t_x, t_y) times the anchor size and the size is scaled by
    exp(t_w), exp(t_h).
  """
  # corner form -> center/size form
  anchor_size = anchors[..., 2:] - anchors[..., :2]           # w, h = br - tl
  anchor_center = (anchors[..., 2:] + anchors[..., :2]) / 2   # (br + tl) / 2

  # shift the center and rescale the size (no in-place writes)
  shifted_center = anchor_center + offsets[..., :2] * anchor_size
  scaled_size = torch.mul(anchor_size, torch.exp(offsets[..., 2:]))

  # center/size form -> corner form
  half_size = scaled_size / 2
  corners = torch.cat((shifted_center - half_size, shifted_center + half_size), dim=-1)
  return corners.to(anchors.dtype)

In [None]:
class ProposalModule(nn.Module):
  """
  Prediction head of the RPN.

  For each of the A anchors at every position of the feature map it predicts
  6 numbers: 2 confidence logits followed by 4 box offsets.
  """
  def __init__(self, in_dim, hidden_dim=256, num_anchors=9, drop_ratio=0.3):
    super().__init__()

    assert(num_anchors != 0)
    self.num_anchors = num_anchors

    head_layers = [
      nn.Conv2d(in_dim, hidden_dim, 3, padding=1),
      nn.Dropout(drop_ratio),
      nn.LeakyReLU(),
      nn.Conv2d(hidden_dim, 6 * self.num_anchors, 1),
    ]
    self.predictHead = nn.Sequential(*head_layers)

  def _extract_anchor_data(self, anchor_data, anchor_idx):
    """
    Gather the per-anchor vectors selected by flat anchor indices.

    Inputs:
    - anchor_data: Tensor of shape (B, A, D, H, W), a length-D vector for each
      of A anchors at each point of an H x W grid.
    - anchor_idx: int64 Tensor of shape (M,), indices into the flattened
      (B, A, H, W) ordering.

    Returns:
    - extracted_anchors: Tensor of shape (M, D).
    """
    _, _, D, _, _ = anchor_data.shape
    # move D last so that every row of the flattened tensor is one anchor
    per_anchor_rows = anchor_data.permute(0, 1, 3, 4, 2).contiguous().view(-1, D)
    return per_anchor_rows[anchor_idx]

  def forward(self, features, pos_anchor_coord=None, \
              pos_anchor_idx=None, neg_anchor_idx=None):
    """
    Run the head on backbone features.

    Inputs:
    - features: Tensor of shape (B, in_dim, H, W).
    - pos_anchor_coord: (M, 4) coordinates of the positive anchors.
    - pos_anchor_idx, neg_anchor_idx: flat indices of positive/negative anchors.

    Returns:
    - When all three anchor arguments are given (training):
      conf_scores (2M, 2) with positives first, offsets (M, 4), proposals (M, 4).
    - Otherwise (inference): conf_scores (B, A, 2, H, W) and
      offsets (B, A, 4, H, W) for every anchor.
    """
    anchor_args = (pos_anchor_coord, pos_anchor_idx, neg_anchor_idx)
    training_call = all(arg is not None for arg in anchor_args)

    head_out = self.predictHead(features) #Bx(Ax6)x7x7
    B, _, H, W = head_out.shape
    head_out = head_out.reshape(B, self.num_anchors, 6, H, W)
    # first 2 channels: confidence logits, last 4: box offsets
    conf_package = head_out[:, :, :2, :, :]
    offsets_package = head_out[:, :, 2:, :, :]

    if not training_call:
      return conf_package, offsets_package

    pos_conf = self._extract_anchor_data(conf_package, pos_anchor_idx)
    neg_conf = self._extract_anchor_data(conf_package, neg_anchor_idx)
    conf_scores = torch.cat((pos_conf, neg_conf), dim=0)[:, 0:2]

    offsets = self._extract_anchor_data(offsets_package, pos_anchor_idx)

    M, _ = pos_anchor_coord.shape
    # GenerateProposal expects 5-D input, so wrap the M anchors in dummy dims
    proposals = GenerateProposal(pos_anchor_coord.reshape(1, 1, 1, M, 4),
                                 offsets.reshape(1, 1, 1, M, 4)).reshape(M, 4)

    return conf_scores, offsets, proposals

# Loss Function

In [None]:

def ConfScoreRegression(conf_scores, batch_size):
  """
  Binary cross-entropy loss on the anchor confidence logits.

  Inputs:
  - conf_scores: Predicted confidence logits, of shape (2M, 2). The first M
    rows belong to positive anchors, the last M rows to negative anchors.
  - batch_size: Number of images; the summed loss is divided by it.

  Outputs:
  - conf_score_loss: Torch scalar
  """
  num_rows = conf_scores.shape[0]
  num_pos = num_rows // 2

  # target is (1, 0) for the positive rows and (0, 1) for the negative rows
  is_positive = torch.arange(num_rows, device=conf_scores.device) < num_pos
  targets = torch.stack((is_positive, ~is_positive), dim=1).to(conf_scores.dtype)

  summed_loss = F.binary_cross_entropy_with_logits(conf_scores, targets,
                                                   reduction='sum')
  return summed_loss * 1. / batch_size
def BboxRegression(offsets, GT_offsets, batch_size):
  """
  Smooth-L1 box regression loss, as in Faster R-CNN.

  Inputs:
  - offsets: Predicted box offsets, of shape (M, 4)
  - GT_offsets: GT box offsets, of shape (M, 4)
  - batch_size: Number of images; the summed loss is divided by it.

  Outputs:
  - bbox_reg_loss: Torch scalar
  """
  summed_loss = F.smooth_l1_loss(offsets, GT_offsets, reduction='sum')
  return summed_loss * 1. / batch_size

This is the feature extractor for mobilenet v2, not for AlexNet

In [None]:
class FeatureExtractor(nn.Module):
  """
  Image feature extraction with a pretrained MobileNet v2 backbone.
  """
  def __init__(self, reshape_size=224, pooling=False, verbose=False):
    super().__init__()

    backbone = models.mobilenet_v2(pretrained=True)
    # keep every child module except the final classifier
    backbone_children = list(backbone.children())[:-1]
    self.mobilenet = nn.Sequential(*backbone_children)

    # optional average pooling over the whole feature map
    if pooling:
      pool_size = math.ceil(reshape_size/32.) # input: N x 1280 x 7 x 7
      self.mobilenet.add_module('LastAvgPool', nn.AvgPool2d(pool_size))

    # fine-tune all weights
    for _, weight in self.mobilenet.named_parameters():
      weight.requires_grad = True

    if verbose:
      summary(self.mobilenet.cuda(), (3, reshape_size, reshape_size))

  def forward(self, img, verbose=False):
    """
    Inputs:
    - img: Batch of resized images, of shape Nx3x224x224

    Outputs:
    - feat: Image feature, of shape Nx1280 (pooled) or Nx1280x7x7
    """
    num_img = img.shape[0]
    process_batch = 500

    # run the backbone on chunks of at most `process_batch` images
    feat = []
    for start in range(0, num_img, process_batch):
      chunk_feat = self.mobilenet(img[start:start + process_batch])
      # drop the trailing 1x1 spatial dims left by the pooled variant
      feat.append(chunk_feat.squeeze(-1).squeeze(-1))
    feat = torch.cat(feat)

    if verbose:
      print('Output feature shape: ', feat.shape)

    return feat

In [None]:
FeatureExtractor(verbose=True)

This is the feature extractor for AlexNet

In [None]:
#It is the code to extract feature from transferlearning Alexnet
class AlexFeatureExtractor(nn.Module):
  """
  Image feature extraction with a pretrained AlexNet backbone.
  """
  def __init__(self, reshape_size=224, pooling=False, verbose=False):
    """
    Inputs:
    - reshape_size: Side length of the (square) input images; only used for
      the verbose summary.
    - pooling: If True, average the feature map down to one vector per image.
    - verbose: If True, move the network to the GPU and print a layer summary.
    """
    super().__init__()

    self.alexnet = models.alexnet(pretrained=True)
    # Remove the last classifier. The remaining children are the convolutional
    # features and torchvision's adaptive average pool, so the output is
    # N x 256 x 6 x 6.
    self.alexnet = nn.Sequential(*list(self.alexnet.children())[:-1])

    # average pooling
    if pooling:
      # Global average pool. A fixed AvgPool2d(ceil(reshape_size/32)) = 7 kernel
      # is larger than the 6 x 6 map and fails at run time.
      self.alexnet.add_module('LastAvgPool', nn.AdaptiveAvgPool2d(1))

    for _, param in self.alexnet.named_parameters():
      param.requires_grad = True # fine-tune all

    if verbose:
      summary(self.alexnet.cuda(), (3, reshape_size, reshape_size))

  def forward(self, img, verbose=False):
    """
    Inputs:
    - img: Batch of resized images, of shape Nx3x224x224

    Outputs:
    - feat: Image feature, of shape Nx256 (pooled) or Nx256x6x6
    """
    num_img = img.shape[0]

    img_prepro = img

    # run the backbone on chunks of at most `process_batch` images
    feat = []
    process_batch = 500
    for b in range(math.ceil(num_img/process_batch)):
      feat.append(self.alexnet(img_prepro[b*process_batch:(b+1)*process_batch]
                              ).squeeze(-1).squeeze(-1)) # forward and squeeze
    feat = torch.cat(feat)
    if verbose:
      print('Output feature shape: ', feat.shape)
    return feat

In [None]:
class RPN(nn.Module):
  """
  Region Proposal Network: MobileNet feature extractor plus proposal head.

  `forward` computes the training loss; `inference` returns the proposals
  that survive confidence thresholding and non-maximum suppression.
  """
  def __init__(self):
    super().__init__()

    # READ ONLY
    # anchor (width, height) pairs in activation-map cells
    self.anchor_list = torch.tensor([[1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [2, 3], [3, 2], [3, 5], [5, 3]])
    self.feat_extractor = FeatureExtractor()
    self.prop_module = ProposalModule(1280, num_anchors=self.anchor_list.shape[0]) #input size for mobile net 1280

  def _make_anchors(self, batch_size, device):
    """Build the grid and the anchors for a batch on the given device."""
    # Build the grid where the features live instead of always on the default
    # 'cuda' device, so the model also works on CPU or a non-default GPU.
    grid = GenerateGrid(batch_size, device=device)
    anchors = GenerateAnchor(self.anchor_list.to(grid.device, grid.dtype), grid)
    return grid, anchors

  def forward(self, images, bboxes):
    """
    Training-time forward pass.

    Inputs:
    - images: Batch of images, passed to the feature extractor.
    - bboxes: Padded GT boxes of shape (B, N, 4+), presumably already in
      activation-map coordinates (they are compared with the grid anchors
      directly) -- TODO confirm against the caller.

    Outputs:
    - total_loss: Torch scalar, w_conf * conf_loss + w_reg * reg_loss.
    """
    # weights to multiply to each loss term
    w_conf = 1 # for conf_scores
    w_reg = 5 # for offsets

    # i) Image feature extraction
    features = self.feat_extractor(images)

    # ii) Grid and anchor generation
    batch_size = images.shape[0]
    grid, anchors = self._make_anchors(batch_size, features.device)

    # iii-1) Compute IoU between anchors and GT boxes
    iou_mat = IoU(anchors, bboxes)
    # iii-2) determine activated & negative anchors, and GT_offsets
    pos_anchor_idx, negative_anc_ind, _, GT_offsets, activated_anc_coord, _ = \
      ReferenceOnActivatedAnchors(anchors, bboxes, grid, iou_mat)

    # iv) Compute conf_scores, offsets, proposals through the prediction network
    conf_scores, offsets, _ = self.prop_module(features, activated_anc_coord,
                                               pos_anchor_idx, negative_anc_ind)

    # v) Compute total loss
    conf_loss = ConfScoreRegression(conf_scores, features.shape[0]) # conf_loss
    reg_loss = BboxRegression(offsets, GT_offsets, features.shape[0]) # reg_loss
    total_loss = w_conf * conf_loss + w_reg * reg_loss

    return total_loss


  def inference(self, images, thresh=0.5, nms_thresh=0.7, mode='RPN'):
    """
    Predict proposals for a batch of images.

    Inputs:
    - images: Batch of images, passed to the feature extractor.
    - thresh: Proposals whose object confidence is not above this are dropped.
    - nms_thresh: IoU threshold passed to non-maximum suppression.
    - mode: 'RPN' or 'FasterRCNN'; only changes the third return value.

    Outputs:
    - final_proposals: List of B tensors of shape (K_i, 4), in activation-map
      coordinates.
    - final_conf_probs: List of B tensors of shape (K_i, 1).
    - features: In 'RPN' mode a list of zero tensors shaped like the entries of
      final_conf_probs (dummy class); in 'FasterRCNN' mode the backbone features.
    """
    assert mode in ('RPN', 'FasterRCNN'), 'invalid inference mode!'

    # The steps mirror the forward pass, except that no activated/negative
    # anchors are chosen: every anchor is scored, thresholded by `thresh`,
    # and the survivors are filtered with NMS at `nms_thresh`.
    final_conf_probs, final_proposals = [], []

    # i) Image feature extraction
    features = self.feat_extractor(images)

    # ii) Grid and anchor generation
    batch_size = images.shape[0]
    _, anchors = self._make_anchors(batch_size, features.device)

    # iii) Compute conf_scores and offsets through the prediction network
    conf_scores, offsets = self.prop_module(features)
    #offsets: (B, A, 4, H', W')
    #conf_scores: (B, A, 2, H', W')

    offsets = offsets.permute((0,1,3,4,2))
    proposals = GenerateProposal(anchors, offsets) #proposals:B,A,H,W,4
    # transform
    conf_scores = torch.sigmoid(conf_scores[:,:,0,:,:]) # only look at the 1st confidence score which represent obj_conf
    # flatten both tensors with the same (H, W, A) ordering so rows stay aligned
    conf_scores = conf_scores.permute((0,2,3,1)).reshape(batch_size,-1)
    proposals = proposals.permute((0,2,3,1,4)).reshape(batch_size,-1,4)

    for i in range(batch_size):
      # get proposals, confidence scores for i-th image
      sub_conf_scores = conf_scores[i]
      sub_proposals = proposals[i]

      # filter by conf_scores
      mask1 = sub_conf_scores > thresh
      sub_conf_scores = sub_conf_scores[mask1]
      sub_proposals = sub_proposals[mask1,:]

      # filter by nms
      # NOTE(review): `nms` is not imported in the visible part of this file;
      # presumably it is torchvision.ops.nms or defined in a later cell -- verify.
      mask2 = nms(sub_proposals, sub_conf_scores, iou_threshold=nms_thresh)
      # append result
      final_proposals.append(sub_proposals[mask2,:])
      final_conf_probs.append(sub_conf_scores[mask2].unsqueeze(1))


    if mode == 'RPN':
      features = [torch.zeros_like(i) for i in final_conf_probs] # dummy class
    return final_proposals, final_conf_probs, features

# COCO dataset collate function

In [None]:
def coco_collate_fn(batch_lst, reshape_size=224):
  """
  Collate a list of (image, annotation) samples into fixed-size batch tensors.

  Inputs:
  - batch_lst: list of (img, ann) pairs. `img` must expose `.size` as
    (width, height) and be accepted by the torchvision transforms below
    (presumably a PIL image -- confirm against the dataset). `ann` is a mapping
    with the keys 'boxes', 'labels' and 'image_id'.
  - reshape_size: side length every image is resized to before batching.

  Outputs:
  - img_batch: float tensor of shape (B, 3, reshape_size, reshape_size) holding
    the resized, normalized images.
  - box_batch: float tensor of shape (B, N, 5), where N is the largest number
    of labels of any sample in the batch. Each row is the four box coordinates
    followed by a class index; unused rows are filled with -1.
  - w_batch, h_batch: tensors with the original width / height of each image.
  - img_id_list: list with the `image_id` of each sample as a Python number.
  """
  to_model_input = transforms.Compose([
    transforms.Resize((reshape_size, reshape_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
  ])

  num_samples = len(batch_lst)
  most_boxes = max(len(sample[1]['labels']) for sample in batch_lst)

  img_batch = torch.zeros(num_samples, 3, reshape_size, reshape_size)
  # Samples carry different numbers of boxes, so pad every sample with -1 rows.
  box_batch = torch.full((num_samples, most_boxes, 5), -1.)

  widths, heights, img_id_list = [], [], []

  for row, (img, ann) in enumerate(batch_lst):
    widths.append(img.size[0])   # image width
    heights.append(img.size[1])  # image height
    img_id_list.append(ann['image_id'].item())
    img_batch[row] = to_model_input(img)

    labels = ann['labels']
    for col, bbox in enumerate(ann['boxes']):
      # Dataset labels are shifted down by one before the name lookup; the
      # name is then mapped back to the index used by the detector.
      cls_name = idx_to_class[labels[col].item() - 1]
      x1, y1, x2, y2 = (float(bbox[k]) for k in range(4))
      box_batch[row, col] = torch.tensor(
        [x1, y1, x2, y2, class_to_idx[cls_name]], dtype=box_batch.dtype)

  w_batch = torch.tensor(widths)
  h_batch = torch.tensor(heights)

  return img_batch, box_batch, w_batch, h_batch, img_id_list


In [None]:
def DetectionSolver(detector, torch_dataset, learning_rate=3e-3,
                    lr_decay=1, num_epochs=20, checkpoint_path='detector_checkpoint.pth'):
    """
    Train `detector` on `torch_dataset` with Adam and plot the loss curve.

    Inputs:
    - detector: module whose forward pass `detector(images, boxes)` returns a
      scalar training loss.
    - torch_dataset: dataset whose samples are accepted by `coco_collate_fn`;
      it is wrapped in a shuffled DataLoader with batch size 2 here.
    - learning_rate: initial Adam learning rate.
    - lr_decay: per-epoch multiplicative decay; the learning rate at epoch e is
      learning_rate * lr_decay ** e.
    - num_epochs: number of passes over the dataset.
    - checkpoint_path: file overwritten at the end of every epoch with the
      epoch index, model/optimizer state dicts and the last batch loss
      (None if no batch has been processed).

    Returns nothing; progress is printed and the loss history is plotted.
    """
    # The dataset is wrapped in a DataLoader here, so callers pass the raw
    # dataset rather than a loader.
    train_loader = torch.utils.data.DataLoader(
        torch_dataset, batch_size=2, shuffle=True, num_workers=0,
        collate_fn=coco_collate_fn)
    num_iters = len(train_loader)

    # ship model to the dtype/device described by the notebook-level kwargs
    detector.to(**to_float_cuda)

    # optimizer setup: only parameters that require gradients are optimized
    optimizer = optim.Adam(
        filter(lambda p: p.requires_grad, detector.parameters()),
        learning_rate)  # leave betas and eps by default
    lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                               lambda epoch: lr_decay ** epoch)

    loss_history = []

    detector.train()

    for epoch in range(num_epochs):
        start_time = time.time()

        for iter_num, data_batch in enumerate(train_loader):
            images, boxes, w_batch, h_batch, _ = data_batch
            # presumably converts pixel coordinates to activation-map
            # coordinates ('p2a') -- confirm against coord_trans
            resized_boxes = coord_trans(boxes, w_batch, h_batch, mode='p2a')
            images = images.to(**to_float_cuda)
            resized_boxes = resized_boxes.to(**to_float_cuda)

            # Forward pass
            loss = detector(images, resized_boxes)

            # Backward pass and optimization step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Read the scalar once and reuse it for the history and the log.
            loss_value = loss.item()
            loss_history.append(loss_value)

            # Print progress
            print('(Epoch {}/{} Iter {}/{}) Loss: {:.4f}'.format(
                epoch + 1, num_epochs, iter_num + 1, num_iters, loss_value))

        # Save checkpoint. The loss comes from the history so that an empty
        # loader does not fail on an unbound `loss` variable.
        torch.save({
            'epoch': epoch,
            'model_state_dict': detector.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss_history[-1] if loss_history else None,
        }, checkpoint_path)

        end_time = time.time()
        print('(Epoch {}/{}): Time per epoch: {:.2f}s'.format(
            epoch + 1, num_epochs, end_time - start_time))

        # Learning rate scheduling
        lr_scheduler.step()

    # plot the training losses
    plt.plot(loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training Loss History')
    plt.show()

--------------------------------------------------------------------

# later

In [None]:

# Train an `RPN` instance with the generic detection training loop.
RPNSolver = DetectionSolver
num_sample = 10  # not referenced within this cell
for lr in [1e-3]:  # single learning rate; add entries to sweep several
  print('lr: ', lr)
  rpn = RPN()  # a fresh model per learning rate; the last one stays bound to `rpn`
  RPNSolver(rpn, busy_torch_dataset, learning_rate=lr, num_epochs=50)

# Overfitting to small dataset

In [None]:
def nms(boxes, scores, iou_threshold=0.5, topk=None):
  """
  Non-maximum suppression removes overlapping bounding boxes.

  Boxes are visited in decreasing order of score. Each visited box is kept and
  every remaining box whose IoU with it exceeds `iou_threshold` is discarded.

  Inputs:
  - boxes: top-left and bottom-right coordinate values of the bounding boxes
    to perform NMS on, of shape Nx4
  - scores: scores for each one of the boxes, of shape N
  - iou_threshold: discards all overlapping boxes with IoU > iou_threshold; float
  - topk: If this is not None, then return only the topk highest-scoring boxes.
    Otherwise if this is None, then return all boxes that pass NMS.

  Outputs:
  - keep: torch.long tensor with the indices of the elements that have been
    kept by NMS, sorted in decreasing order of scores; of shape [num_kept_boxes].
    It lives on the same device as `scores`.
  """
  if (not boxes.numel()) or (not scores.numel()):
    return torch.zeros(0, dtype=torch.long, device=scores.device)

  areas = torch.prod(boxes[:, 2:] - boxes[:, :2], dim=1)
  remaining = torch.argsort(scores, descending=True)

  keep = []
  while remaining.numel() > 0:
    best = remaining[0]
    keep.append(int(best))

    # Drop the selected box explicitly. Relying on its self-IoU exceeding the
    # threshold never terminates when iou_threshold >= 1 or when a malformed
    # box has its self-intersection masked to zero.
    others = remaining[1:]
    other_boxes = boxes[others]

    # Intersection rectangle of the selected box with every other candidate
    # (the selected box's corners broadcast over the candidates).
    tl = torch.max(boxes[best, :2], other_boxes[:, :2])
    br = torch.min(boxes[best, 2:], other_boxes[:, 2:])
    # The mask zeroes the product for pairs that do not actually overlap.
    intersect = torch.prod(br - tl, dim=1) * (tl < br).all(dim=1)

    iou = intersect / (areas[best] + areas[others] - intersect)
    remaining = others[iou <= iou_threshold]

  if topk is not None:
    keep = keep[:topk]
  return torch.tensor(keep, dtype=torch.long, device=scores.device)

In [None]:

def DetectionInference(detector, torch_dataset, idx_to_class, thresh=0.8, nms_thresh=0.3, output_dir=None):
  """
  Run `detector.inference` over `torch_dataset` and either visualize the
  results or dump them to text files for mAP evaluation.

  Inputs:
  - detector: model exposing `inference(images, thresh=..., nms_thresh=...)`
    that returns per-image lists of proposals, confidence scores and classes.
  - torch_dataset: dataset whose samples are accepted by `coco_collate_fn` and
    that supports integer indexing (used to re-fetch the original image).
  - idx_to_class: lookup from class index to class name.
  - thresh: confidence threshold forwarded to `detector.inference`.
  - nms_thresh: NMS IoU threshold forwarded to `detector.inference`.
  - output_dir: when None, every image is passed to `data_visualizer`.
    When not None, ground truth and detections are written as one text file
    per image. NOTE: the value is only used as a switch; files always go to
    the hard-coded 'mAP/input/ground-truth' and 'mAP/input/detection-results'
    directories, which are wiped first.

  Returns nothing; the total inference time is printed.
  """
  loader_batch_size = 2
  # shuffle must stay off: the original image is re-fetched from the dataset by
  # position below, which only matches the batch contents for sequential order.
  data_loader = torch.utils.data.DataLoader(
    torch_dataset, batch_size=loader_batch_size, shuffle=False, num_workers=0,
    collate_fn=coco_collate_fn)
  # ship model to GPU
  detector.to(**to_float_cuda)

  detector.eval()
  start_t = time.time()

  if output_dir is not None:
    det_dir = 'mAP/input/detection-results'
    gt_dir = 'mAP/input/ground-truth'
    for result_dir in (det_dir, gt_dir):
      # start from an empty directory; makedirs also creates missing parents
      if os.path.exists(result_dir):
        shutil.rmtree(result_dir)
      os.makedirs(result_dir)

  for iter_num, data_batch in enumerate(data_loader):
    images, boxes, w_batch, h_batch, img_ids = data_batch
    images = images.to(**to_float_cuda)

    final_proposals, final_conf_scores, final_class = detector.inference(images, thresh=thresh, nms_thresh=nms_thresh)

    for idx in range(len(images)):
      # clamp on the proposal coordinates (x columns by width, y columns by height)
      torch.clamp_(final_proposals[idx][:, 0::2], min=0, max=w_batch[idx])
      torch.clamp_(final_proposals[idx][:, 1::2], min=0, max=h_batch[idx])

      # padded ground-truth rows are marked with -1 in the first column
      valid_box = int((boxes[idx][:, 0] != -1).sum())
      final_all = torch.cat((final_proposals[idx], \
        final_class[idx].float(), final_conf_scores[idx]), dim=-1).cpu()
      resized_proposals = coord_trans(final_all, w_batch[idx], h_batch[idx])

      # write results to file for evaluation (use mAP API https://github.com/Cartucho/mAP for now...)
      if output_dir is not None:
        img_id = img_ids[idx]
        # coco_collate_fn yields numeric ids; string file names are still accepted
        if isinstance(img_id, str):
          file_name = img_id.replace('.jpg', '.txt')
        else:
          file_name = '{}.txt'.format(img_id)
        with open(os.path.join(det_dir, file_name), 'w') as f_det, \
          open(os.path.join(gt_dir, file_name), 'w') as f_gt:
          print('{}: {} GT bboxes and {} proposals'.format(img_ids[idx], valid_box, resized_proposals.shape[0]))
          for b in boxes[idx][:valid_box]:
            f_gt.write('{} {:.2f} {:.2f} {:.2f} {:.2f}\n'.format(idx_to_class[int(b[4].item())], b[0], b[1], b[2], b[3]))
          for b in resized_proposals:
            f_det.write('{} {:.6f} {:.2f} {:.2f} {:.2f} {:.2f}\n'.format(idx_to_class[int(b[4].item())], b[5], b[0], b[1], b[2], b[3]))
      else:
        # hack to get the original image so we don't have to load from local again...
        # The position uses the loader's batch size, not the current batch
        # length, so a shorter final batch still maps to the right samples.
        img, _ = torch_dataset[loader_batch_size * iter_num + idx]
        data_visualizer(img, idx_to_class, boxes[idx][:valid_box], resized_proposals)

  end_t = time.time()
  print('Total inference time: {:.1f}s'.format(end_t-start_t))

In [None]:
# RPN inference reuses the generic detection inference routine under an alias.
RPNInference = DetectionInference

In [None]:
# Gradients are not needed for inference, so autograd tracking is switched off.
# With no output_dir given, DetectionInference visualizes each image instead of
# writing result files.
with torch.no_grad():
  RPNInference(rpn, busy_torch_dataset, idx_to_class, thresh=0.8, nms_thresh=0.3)