In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload all imported modules before each
# cell executes, so edits to imported .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

import functools
import os

import numpy as np
import torch

import model
import utils
from voc2012_dataset.dataset import VOC2012ClassSegmentation

# Global numeric / device configuration for the whole notebook.

# Seed both RNGs so the random input image and the randomly initialised
# network weights are reproducible across "Restart & Run All".
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use float64 as the default floating-point dtype. `set_default_dtype` is the
# supported replacement for the deprecated
# `torch.set_default_tensor_type('torch.DoubleTensor')`.
torch.set_default_dtype(torch.float64)

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing at the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Dummy network input: one random 3x512x512 image with a leading batch
# dimension added by `unsqueeze(0)`. Built in a single expression so `_input`
# is only ever bound to the final tensor.
_input = torch.tensor(
    np.random.rand(3, 512, 512).astype(np.float64),
    dtype=torch.float64,
    device=device,
).unsqueeze(0)

In [3]:
# Backbone, feature pyramid and region-proposal network, all moved to `device`.
resnet = model.ResNet50().to(device)
fpn = model.FPN(*resnet.layers(), out_planes=256).to(device)
rpn = model.RPN(256, 512).to(device)

# Run the FPN once on the dummy input, then apply the same RPN to every
# feature map it returns, keeping the results in the FPN's output order.
fpn_feature_maps = fpn(_input)
rpn_regions = [rpn(feature_map) for feature_map in fpn_feature_maps]

In [4]:
# Transpose `rpn_regions` (one 3-item result per feature map) into three
# tuples with one entry per feature map. Per the names used here the three
# items are the box deltas, the class logits and the softmax output.
# NOTE(review): the item order is taken from these names — confirm against RPN.
bbox_delta, class_logits, softmax = zip(*rpn_regions)

def merge(x, y):
    """Join two tensors along dimension 1, with `x` first and `y` after it."""
    pair = [x, y]
    return torch.cat(pair, 1)

# Concatenate the per-feature-map RPN outputs along dim 1 so each becomes one
# tensor covering every pyramid level, in the order the levels were produced.
# `torch.cat` takes the whole tuple, so no pairwise fold is needed.
bbox_delta = torch.cat(bbox_delta, dim=1)
# Proposal scores are taken from the softmax output. Previously `score` was
# built from `class_logits` (the same expression as the line below), which
# left the unpacked `softmax` unused.
# NOTE(review): assumes RegionProposal expects probabilities — confirm.
score = torch.cat(softmax, dim=1)
class_logits = torch.cat(class_logits, dim=1)

### Get default anchors

Now that we have `bbox_delta` and the corresponding scores from the RPN, we generate the anchors and apply the RPN outputs to them.

**Important:** make sure everything is generated bottom-up through the FPN levels, and that the number of outputs produced by the RPN matches the number of anchors.

In [5]:
# Anchor configuration passed to `utils.generate_pyramid_anchors`.
# NOTE(review): presumably one scale and one stride per pyramid level — confirm.
scales = [4, 8, 16, 32]
ratios = [0.5, 1, 2]
feature_strides = [4, 8, 16, 32]
input_image_shape = 512

# make sure the size is the same! and generate everything from bottom to up
anchors = torch.tensor(
    utils.generate_pyramid_anchors(scales, ratios, input_image_shape, feature_strides),
    dtype=torch.float64,
    device=device,
)

### Generate RoIs.

Generate the Regions of Interest (RoIs). The process is: combine the output of the `RPN` with the anchors produced by `generate_pyramid_anchors` to obtain the final `fixed-size`, `refined` anchors. Filtering the anchors involves non-maximum suppression (`nms`).

In [6]:
# Build region proposals from the merged RPN box deltas, the merged scores
# and the pyramid anchors.
# output: [1, size_of_anchors, 4]
# NOTE(review): `forward` is called directly instead of `rp(...)`; if
# RegionProposal is an nn.Module this bypasses its hooks — confirm intent.
rp = model.RegionProposal()
rois = rp.forward(bbox_delta, score, anchors)

### Generate Target
Use the RoIs together with the ground-truth bounding boxes, classes, and masks to generate the targets, which will be used to compute the loss.

In [7]:
# Target generator used later to pair proposals with ground truth.
gen_targets = model.GenerateTarget()

# Dataset root. Set the VOC2012_ROOT environment variable to point at your own
# copy of VOC2012; the default keeps the path this notebook was written with.
VOC_ROOT = os.environ.get("VOC2012_ROOT", "/home/louis/datasets/VOCdevkit/VOC2012")
voc_dataset = VOC2012ClassSegmentation(VOC_ROOT)

# Take one fixed sample (index 5) as the ground truth for the walkthrough.
image, bboxes, labels, bbox_masks = voc_dataset[5]

# Move every ground-truth item onto `device` as float64, matching the dtype
# used for the network input and the anchors.
image = torch.tensor(image, device=device, dtype=torch.float64)
bboxes = torch.tensor(bboxes, device=device, dtype=torch.float64)
labels = torch.tensor(labels, device=device, dtype=torch.float64)
bbox_masks = torch.tensor(bbox_masks, device=device, dtype=torch.float64)

In [8]:
# Build the RPN training targets from the anchors and the ground-truth boxes.
# Per the names they are unpacked into, the two results are a class target and
# a box-delta target; both are consumed by the RPN loss cells below.
generate_rpn_targets = model.GenerateRPNTargets()
rpn_class, rpn_bounding_delta = generate_rpn_targets.forward(anchors, bboxes)

In [9]:
# Pair the proposals with the ground truth to get the training targets: the
# returned RoIs plus a box, mask and class target (per the output names).
# NOTE: this rebinds `rois` to the returned RoIs, so the cell is not
# idempotent — re-run the region-proposal cell before re-running this one.
rois, bbox, mask, classes = gen_targets.forward(rois, bboxes, bbox_masks, labels) 

In [10]:
# Sanity check: show the size of each target tensor, one per line, in the
# order rois, bbox, mask, classes.
print(*(tensor.size() for tensor in (rois, bbox, mask, classes)), sep="\n")

torch.Size([48, 4])
torch.Size([48, 4])
torch.Size([48, 28, 28])
torch.Size([48])


### Predict!!

Now that we have all the RoIs and their corresponding ground truth, it's time to make predictions.

We'll reuse the feature maps from the FPN to predict, for each RoI, the bbox delta, the corresponding class, and the mask, using two parallel networks.

In [11]:
# Classification / box-regression head, moved to `device`.
# NOTE(review): 21 is presumably the number of classes (20 VOC classes plus
# background) — confirm against model.ClsAndReg.
cls_and_reg = model.ClsAndReg(21).to(device)

# Run the head on the FPN feature maps at the target RoIs. Per the names they
# are unpacked into, the results are class logits, class probabilities and
# box outputs.
mrcnn_class_logits, mrcnn_probs, mrcnn_bbox = cls_and_reg(fpn_feature_maps, rois)

In [13]:
# Mask head, moved to `device`, run on the same FPN feature maps and RoIs as
# the classification / box head.
# NOTE(review): 21 is presumably the number of classes (20 VOC classes plus
# background) — confirm against model.MaskPredict.
Mask = model.MaskPredict(21).to(device)
preds = Mask(fpn_feature_maps, rois)

### Now it's time to compute loss!

In [19]:
# RPN classification loss: the RPN class targets (`rpn_class`) against the
# merged RPN class logits (`class_logits`).
rpn_class_loss = model.compute_rpn_class_loss(rpn_class, class_logits)

In [20]:
# RPN box loss: the RPN box-delta targets against the merged RPN box deltas.
# `rpn_class` is passed as well.
# NOTE(review): presumably used to select which anchors contribute — confirm.
rpn_bbox_loss = model.compute_rpn_bbox_loss(rpn_bounding_delta, bbox_delta, rpn_class)

In [21]:
# Cast the class targets to int64 (`.long()`) before computing the head's
# classification loss against its class logits. The cast also affects the
# later loss cells, which reuse `classes`.
classes = classes.long()
mrcnn_class_loss = model.compute_mrcnn_class_loss(
    classes,
    mrcnn_class_logits,
)

In [22]:
# Box loss for the detection head: the box targets (`bbox`) and class targets
# against the head's box outputs. When cells run in order, `classes` has
# already been cast to int64 by the class-loss cell.
mrcnn_bbox_loss = model.compute_mrcnn_bbox_loss(bbox, classes, mrcnn_bbox)

In [23]:
# Mask loss: the mask targets (`mask`) and class targets against the mask
# head's predictions (`preds`).
mrcnn_mask_loss = model.compute_mrcnn_mask_loss(mask, classes, preds)