# Faster RCNN


The faster Regional CNN is a developemet on the fast RCNN. It introduces the region proposal network that predicts the bounding boxes. This is opposed to the expensive selective search that Fast RCNN uses. The region proposal network uses the features extracted by the convolutional backbone to preict the probability that an object exists within a given anchor. It provides the input for the ROI pool layer. 

Read more from the paper: https://arxiv.org/abs/1506.01497


# Utility Functions

In [4]:
import numpy as np
import torch 
import torchvision
from torchvision import transforms
from torch import nn
import torch_snippets
from torch_snippets import *
from torchvision.datasets import VOCDetection
from torchvision.transforms import ToTensor
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from torch import optim
import selectivesearch
from IPython.display import clear_output
from torch_snippets import Report
from torchvision.ops import nms
import os
import pickle
import glob
import cv2
import pandas as pd
import xmltodict

# As the RoIPool layer not implemented on mps 
# Must use either cuda or cpu
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    
device

device(type='cuda')

# Hyperparameters

In [5]:
# Optimation
LEARNING_RATE = 0.005
WEIGHT_DECAY = 0.0005
MOMENTUM = 0.9

# Training params 
EPOCHS = 5
BATCH_SIZE = 4



# Utilities

In [32]:
def get_bounding_boxes(y):
    objects = y['annotation']['object']
    bboxs = []
    for obj in objects:
        bbox = list(obj['bndbox'].values())
        bbox = [int(i) for i in bbox]
        bboxs.append(bbox)
    return torch.tensor(bboxs, dtype=torch.float32)

In [33]:
def get_class_labels(y, encode_labels=None):
    objects = y['annotation']['object']
    class_labels = []
    
    if encode_labels:
        for obj in objects:
            label_name = obj['name']
            label = encode_labels.index(label_name)
            class_labels.append(label)
        return torch.tensor(class_labels)
    else: 
        for obj in objects:
            label = obj['name']
            class_labels.append(str(label))
        return class_labels

# Build the Dataset

In [34]:
class FRCNNDataset(Dataset):
    def __init__(self, root_dir='VOC_data/', set_type='train'):
        self.root = root_dir
        self.ds = pd.read_csv(root_dir + "df_" + set_type + ".csv")
        self.set_type = set_type
        self.image_ids = self.ds["image_idx"].unique()
        self.resize = torchvision.transforms.Resize((224,224))
        
        if set_type=='train':
            self.raw_ds = VOCDetection(root='data/',year ='2012', 
                                       image_set="train", download=True, 
                                       transform=ToTensor())
        elif set_type=='test':
            self.raw_ds = VOCDetection(root='data/',year ='2012', 
                                       image_set="val", download=True, 
                                       transform=ToTensor())
        elif set_type=='val':
            self.raw_ds = VOCDetection(root='data/',year ='2012', 
                                       image_set="trainval", download=True, 
                                       transform=ToTensor())
        else:
            print("set_type must be train, test or trainval")
        
            
        self.labels = ["background", "person", "bird", "cat", "cow", "dog", "horse", "sheep", 
                       "aeroplane", "bicycle", "boat", "bus", "car", "motorbike", "train", 
                       "bottle", "chair", "diningtable", "pottedplant", "sofa", "tvmonitor"]
        
    def __len__(self):
        return len(self.image_ids)
        
    def __getitem__(self, idx):
        image_id = self.image_ids[idx]
        image, target = self.raw_ds.__getitem__(image_id)
        image = self.resize(image)
        gtbbs = get_bounding_boxes(target)
        labels = get_class_labels(target,self.labels)
        return image, {"boxes":gtbbs, "labels": labels}      
    
    def collate_fn(self, batch):
        return tuple(zip(*batch))         

In [35]:
train_ds = FRCNNDataset(set_type='train')
test_ds = FRCNNDataset(set_type='test')

Using downloaded and verified file: data/VOCtrainval_11-May-2012.tar
Extracting data/VOCtrainval_11-May-2012.tar to data/
Using downloaded and verified file: data/VOCtrainval_11-May-2012.tar
Extracting data/VOCtrainval_11-May-2012.tar to data/


In [36]:
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, collate_fn=train_ds.collate_fn, drop_last=True)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, collate_fn=train_ds.collate_fn, drop_last=True)

In [46]:
for img, targ in train_loader:
    break
    


# Model Building 

In [47]:
def get_model():
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, len(train_ds.labels))
    return model


# Model Training

In [48]:
def train_batch(inputs, model, optimizer):
    model.train()
    input, targets = inputs
    input = list(image.to(device) for image in input)
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
    optimizer.zero_grad()
    losses = model(input, targets)
    loss = sum(loss for loss in losses.values())
    loss.backward()
    optimizer.step()
    return loss, losses 

In [49]:
@torch.no_grad()
def validate_batch(inputs, model):
    model.train()
    input, targets = inputs 
    input = list(image.to(device) for image in input)
    targets = [{k: v.to(device) for k,v in t.items} for t in targets]
    optimizer.zero_grad()
    losses = model(input, targets)
    loss = sum(loss for loss in losses.values())
    return loss, losses

In [52]:
model = get_model().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE,
                            momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)

log = Report(EPOCHS)

In [53]:
for epoch in range(EPOCHS):
    _n = len(train_loader)
    for ix, inputs in enumerate(train_loader):
        loss, losses = train_batch(inputs, model, optimizer)
        loc_loss, regr_loss, loss_objectness, loss_rpn_box_reg = [losses[k] for k in ['loss_classifier',
                                                                                      'loss_box_reg', 
                                                                                      'loss_objectness',
                                                                                      'loss_rpn_box_reg']]
        pos = (epoch + (ix+1)/_n)
        log.record(pos, trn_loss=loss.item(),
                  trn_loc_loss=loc_loss.item(),
                  trn_regr_loss=regr_loss.item(),
                  trn_objectness_loss=loss_objectness.item(),
                  trn_rpn_box_reg_loss=loss_rpn_box_reg.item(), end='\r')
        
    _n = len(test_loader)
    PATH = "saved_models/FasterRCNN_EPOCH_" + str(epoch) + "_accuracy_" + "{0:.4g}".format(accs.mean())
    torch.save(frcnn.state_dict(), PATH)
    for ix, inputs in enumerate(test_loader):
        loss, losses = train_batch(inputs, model, optimizer)
        loc_loss, regr_loss, loss_objectness, loss_rpn_box_reg = [losses[k] for k in ['loss_classifier',
                                                                                      'loss_box_reg', 
                                                                                      'loss_objectness',
                                                                                      'loss_rpn_box_reg']]
        pos = (epoch + (ix+1)/_n)
        log.record(pos, val_loss=loss.item(),
                  val_loc_loss=loc_loss.item(),
                  val_regr_loss=regr_loss.item(),
                  val_objectness_loss=loss_objectness.item(),
                  val_rpn_box_reg_loss=loss_rpn_box_reg.item(), end='\r')
                

EPOCH: 0.054  trn_loss: 8.706  trn_loc_loss: 0.130  trn_regr_loss: 0.033  trn_objectness_loss: 0.894  trn_rpn_box_reg_loss: 7.649  (20.38s - 1853.40s remaining)))

KeyboardInterrupt: 

# Evaluate 

In [None]:
def decode_output(output):
    labels = np.array([train_ds.labels[i] for i in output['labels'].cpu().detach().numpy()])
    bbs = output['boxes'].cpu().detach().numpy().astype(np.uint16)
    confs = output['scores'].cpu().detach().numpy()
    ixs = nms(torch.tensor(bbs.astype(np.float32)), torch.tensor(confs), 0.05)
    bbs, confs, labels = [tensor[ixs] for tensor in [bbs, confs, labels]]
    if len(ixs) == 1:
        bbs, confs, labels = [np.array([tensor]) for tensor in [bbs, confs, labels]]
    return bbs.tolist(), confs.tolist(), labels.tolist()
        
                    

In [None]:
model.eval()
for ix, (images, targets) in enumerate(train_loader):
    if ix==3: break
    images = [im for im in images]
    outputs = model(images)
    print(outputs[0])
    for ix, output in enumerate(outputs):
        bbs, confs, labels = decode_output(output)
        info = [f'{l}@{c:.2f}' for l,c in zip(labels, confs)]
        show(images[ix].cpu().permute(1,2,0), bbs=bbs, texts=labels, sz=5, text_sz=12)