In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import model
import torch
import numpy as np
import functools
import utils
from voc2012_dataset.dataset import VOC2012ClassSegmentation

# Make float64 the default floating-point dtype for every tensor and module
# parameter created below. torch.set_default_dtype is the supported replacement
# for the deprecated torch.set_default_tensor_type('torch.DoubleTensor').
torch.set_default_dtype(torch.float64)
# Prefer the GPU, but fall back to the CPU so the notebook still runs end to end
# on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Get A Test Image

Get a test image from voc2012_dataset, by using the **VOC2012ClassSegmentation** class, which is a subclass of **torch.utils.data.Dataset**.

In [2]:
# Open the VOC2012 class-segmentation dataset and take one sample to walk
# through the pipeline.
voc_dataset = VOC2012ClassSegmentation('/home/louis/datasets/VOCdevkit/VOC2012')
image, gt_bboxes, gt_labels, gt_masks = voc_dataset[5]

# Convert every numpy array of the sample into a float64 tensor on `device`.
image, gt_bboxes, gt_labels, gt_masks = (
    torch.tensor(array, device=device, dtype=torch.float64)
    for array in (image, gt_bboxes, gt_labels, gt_masks)
)
# Move the last image axis to the front and add a leading axis of size 1
# (presumably HWC -> NCHW with a batch of one; confirm against the dataset).
image = image.permute(2, 0, 1).unsqueeze(0)

### Get Shared Feature Maps

Get feature maps by feeding the image to **FPN**. The output is a sequence consisting of the outputs of the different FPN levels, from p2 to p5, bottom-up (if the input image is 224x224, then p2 is 56x56 and p5 is 7x7).

The feature maps will be used as input for **RPN**, **ClassificationNetwork** and **MaskNetwork**; this is why they are called **shared**.

In [3]:
# Backbone network; its layers are handed to the FPN constructor below.
resnet = model.ResNet50().to(device)
backbone_layers = resnet.layers()
fpn = model.FPN(*backbone_layers, out_planes=256).to(device)

# Shared feature maps: these are reused by most of the cells that follow.
fpn_feature_maps = fpn(image)

### Generate Regions by RPN

First, get regions by feeding the feature maps to the **RPN** network. The **RPN** produces 3 results for each feature map:

1. rpn_bboxes
2. rpn_class_logits
3. rpn_class_softmax

In [4]:
# RPN head; the first argument (256) matches the `out_planes` the FPN was built with.
rpn = model.RPN(256, 512).to(device)

# Run the RPN on every pyramid level, in the order the FPN returned them.
# The shared `fpn_feature_maps` computed in the FPN cell is reused here rather
# than running a second, redundant forward pass of the FPN over the same image.
rpn_regions = [rpn(feature_map) for feature_map in fpn_feature_maps]

In [5]:
# Regroup the per-level RPN results: one tuple per output kind, each holding
# one entry per pyramid level.
(bbox_deltas, class_logits, softmax) = tuple(zip(*rpn_regions))

# flatten the outputs
def merge(x, y):
    """Concatenate tensors `x` and `y` along dimension 1, `x` first.

    Used as the folding step when joining per-level RPN outputs into a
    single tensor.
    """
    pair = [x, y]
    return torch.cat(pair, 1)

# Concatenate each RPN output across pyramid levels along dim 1, keeping the
# level order produced by the FPN.
bbox_deltas = functools.reduce(merge, bbox_deltas[1:], bbox_deltas[0])
# Proposal scores are taken from the RPN's softmax output. This line used to
# re-concatenate `class_logits`, which made `scores` an exact duplicate of the
# logits below and left the unpacked `softmax` unused.
scores = functools.reduce(merge, softmax[1:], softmax[0])
# The raw logits are kept separately; they are passed to the RPN class loss later.
class_logits = functools.reduce(merge, class_logits[1:], class_logits[0])

### Get default anchors

Now that we have the bbox_deltas and the corresponding scores from the RPN, we generate the anchors and apply the RPN results to them.

**Important:** make sure everything is generated bottom-up through the FPN levels, and that the number of outputs generated by the RPN is the same as the number of anchors.

In [6]:
# Anchor configuration. `scales` and `feature_strides` each have four entries
# (presumably one per pyramid level, listed bottom-up — confirm in utils).
input_image_shape = 512
feature_strides = [4, 8, 16, 32]
scales = [4, 8, 16, 32]
ratios = [0.5, 1, 2]

# The number of anchors must equal the number of RPN outputs, and both must be
# generated bottom-up through the pyramid.
pyramid_anchors = utils.generate_pyramid_anchors(
    scales, ratios, input_image_shape, feature_strides
)
anchors = torch.tensor(pyramid_anchors, dtype=torch.float64, device=device)

### Generate RoIs.

Generate Regions of Interest. The process is: combine the output of the `RPN` with the anchors we got from `generate_pyramid_anchors` to get the final `fixed-size`, `refined` anchors. The anchor-filtering process involves `nms`.

In [7]:
# Turn the RPN deltas and scores into RoIs on top of the default anchors.
# Output shape (per the original note): [1, size_of_anchors, 4]
rp = model.RegionProposal()

# From here on the RoIs are treated as fixed "ground truth" inputs, so they are
# detached from the autograd graph and no gradient flows back through them.
# Per the original author's note, dropping the detach also causes an error
# during backpropagation.
rois = rp.forward(bbox_deltas, scores, anchors).detach()

### Generate Target
Use the RoIs together with the ground-truth bboxes, classes and masks to generate the targets, which will be used to compute the loss.

In [8]:
# Target generator for the second stage; its forward() is called in a later cell
# with the RoIs and the ground-truth boxes, masks and labels.
gen_targets = model.GenerateTarget()

In [9]:
# Build the RPN training targets from the default anchors and the ground-truth
# boxes. `rpn_class` and `rpn_bounding_delta` are consumed by the RPN class and
# bbox losses further down.
generate_rpn_targets = model.GenerateRPNTargets()
rpn_class, rpn_bounding_delta = generate_rpn_targets.forward(anchors, gt_bboxes)

In [10]:
# Generate the per-RoI targets (bbox, mask, classes) used by the head losses.
# NOTE(review): `rois` is both an input and the first output, so this cell is
# not idempotent; re-run the RoI cell before re-running this one.
rois, bbox, mask, classes = gen_targets.forward(rois, gt_bboxes, gt_masks, gt_labels)

### Predict!!

Now we have all the rois and corresponding ground truth, it's time to do predictions.

We'll reuse the feature maps from the FPN to predict the bbox deltas, the corresponding class, and the mask for each RoI, using 2 different parallel networks.

In [11]:
# Classification / box-regression head, constructed with 21
# (presumably the number of classes, e.g. 20 VOC classes + background — TODO confirm).
cls_and_reg = model.Classifier(21).to(device)

# Run the head on the shared FPN feature maps and the RoIs; it returns class
# logits, class probabilities and bbox outputs (names taken from the unpacking).
mrcnn_class_logits, mrcnn_probs, mrcnn_bbox = cls_and_reg(fpn_feature_maps, rois)

In [12]:
# Mask head, constructed with the same 21 that is passed to the Classifier
# (presumably the number of classes — TODO confirm).
# NOTE(review): the instance is bound to the capitalised name `Mask`, which
# reads like a class name but is an instance moved to `device`.
Mask = model.Mask(21).to(device)
# Mask predictions computed from the shared FPN feature maps and the RoIs.
preds = Mask(fpn_feature_maps, rois)

### Now it's time to compute loss!

In [14]:
# RPN losses: compare the RPN targets (rpn_class, rpn_bounding_delta) with the
# concatenated RPN outputs (class_logits, bbox_deltas).
rpn_class_loss = model.compute_rpn_class_loss(rpn_class, class_logits)
rpn_bbox_loss = model.compute_rpn_bbox_loss(rpn_bounding_delta, bbox_deltas, rpn_class)
# Cast the target classes to int64 before passing them to the head losses
# (presumably they arrive as float64 because gt_labels was loaded as float64 — confirm).
classes = classes.long()
# Head losses: class, bbox and mask predictions against their per-RoI targets.
mrcnn_class_loss = model.compute_mrcnn_class_loss(  classes, mrcnn_class_logits )
mrcnn_bbox_loss = model.compute_mrcnn_bbox_loss(bbox, classes, mrcnn_bbox)
mrcnn_mask_loss = model.compute_mrcnn_mask_loss(mask, classes, preds)

torch.Size([2000, 4])
torch.Size([65280, 4])
torch.Size([65280])
torch.float64
tensor([ 1.,  1.,  1.,  ...,  0.,  0.,  0.], device='cuda:0')


In [None]:
# Total loss: the unweighted sum of the two RPN terms and the three head terms.
loss = (
    rpn_class_loss
    + rpn_bbox_loss
    + mrcnn_class_loss
    + mrcnn_bbox_loss
    + mrcnn_mask_loss
)