<a href="https://colab.research.google.com/github/murthy95/colab/blob/master/YOLO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## You Only Look Once: Unified, Real-Time Object Detection

[here](https://arxiv.org/pdf/1506.02640.pdf)

### Summary


*   The network outputs a tensor of shape (7, 7, 10+n_classes). For each cell of the 7x7 grid the network predicts two bounding boxes; each box consists of the normalized offset of the bounding-box coordinates from the grid cell, the normalized width and height of the detection box, and an objectivity score. The remaining n_classes channels hold the class logits.
*   The loss function has four terms: classification loss, localization loss, object loss and no-object loss.
*   All the boxes with class objectivity score  < 0.25 are ignored 
*   The prediction matching each box in the label is identified using non-maximal suppression: predictions of the same class whose IoU is greater than a threshold are suppressed, and only the box with maximum objectivity is taken as the prediction and matched to the label to evaluate the loss.




In [0]:
#dataset loading
# number of object categories; matches the 20 entries of rescale_bbox.class_dict
n_classes = 20
# number of bounding boxes predicted per grid cell (each box takes 5 channels)
n_boxes = 2

#transform bounding boxes when resizing images

class rescale_bbox(object):
  '''Joint image/target transform for VOC-style detection samples.

  Applies ``transform`` to the image and converts every annotated object into
  a ``(center_x, center_y, width, height, class_id)`` tuple. Box values are
  scaled as if the image were resized to ``image_shape`` x ``image_shape``.

  Args:
      image_shape (int): side length, in pixels, that both image axes are
          scaled to when rescaling the boxes.
      transform (callable): transform applied to the image only.
  '''
  def __init__(self, image_shape, transform):
    self.resize = image_shape
    # (x, y) scale factors of the most recently processed sample
    self.factor = (1, 1)
    self.transform = transform
    self.class_dict = {'aeroplane' : 0,
                    'bicycle' : 1,
                    'bird' : 2,
                    'boat' : 3,
                    'bottle' : 4,
                    'bus' : 5,
                    'car' : 6,
                    'cat' : 7,
                    'chair' : 8,
                    'cow' : 9,
                    'diningtable' : 10,
                    'dog' : 11,
                    'horse' : 12,
                    'motorbike' : 13,
                    'person' : 14,
                    'pottedplant' : 15,
                    'sheep' : 16,
                    'sofa' : 17,
                    'train' : 18,
                    'tvmonitor' : 19}

  def __call__(self, img, target):
    '''Transform one sample.

    Args:
        img: image handed to ``self.transform``.
        target (dict): annotation with ``['annotation']['size']`` (width and
            height) and ``['annotation']['object']`` (one object dict or a
            list of object dicts).

    Returns:
        tuple: ``(transformed image, list of box tuples)``.

    Raises:
        KeyError: if an object has a class name missing from ``class_dict``
            or the annotation lacks an expected key.
    '''
    annotation = target['annotation']
    width = int(annotation['size']['width'])
    height = int(annotation['size']['height'])
    self.factor = (self.resize / width, self.resize / height)

    objects = annotation['object']
    # A single object may arrive as a bare dict instead of a list. Check the
    # type explicitly: a catch-all ``except`` would also hide genuine errors
    # (e.g. an unknown class name) behind an unrelated TypeError.
    if isinstance(objects, dict):
      objects = [objects]
    transformed_boxes = [self.return_transformed_box(obj) for obj in objects]
    return self.transform(img), transformed_boxes

  def return_transformed_box(self, obj):
    '''Convert one annotated object to a scaled center/size box.

    Args:
        obj (dict): object with ``'name'`` and a ``'bndbox'`` holding
            ``xmin``, ``xmax``, ``ymin`` and ``ymax``.

    Returns:
        tuple: ``(center_x, center_y, width, height, class_id)`` scaled by
        ``self.factor``.
    '''
    class_id = self.class_dict[obj['name']]
    box = obj['bndbox']
    xmin, xmax = int(box['xmin']), int(box['xmax'])
    ymin, ymax = int(box['ymin']), int(box['ymax'])
    scale_x, scale_y = self.factor

    return ((xmin + xmax) / 2 * scale_x,
            (ymin + ymax) / 2 * scale_y,
            (xmax - xmin) * scale_x,
            (ymax - ymin) * scale_y, class_id)
       
    
#loading data
import torchvision
import torch
# Resize is given an explicit (height, width): a bare int only rescales the
# shorter edge and keeps the aspect ratio, whereas rescale_bbox scales the
# boxes on both axes to 448, so boxes and pixels would no longer line up.
transform = torchvision.transforms.Compose(
    [torchvision.transforms.Resize((448, 448)),
     torchvision.transforms.ToTensor(),
     torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])


def detection_collate(batch):
  '''Collate (image, boxes) samples into a batch.

  Images are stacked into one tensor; the per-image box lists have different
  lengths, so they are returned as a plain list instead of being stacked.
  '''
  images, targets = zip(*batch)
  return torch.stack(images, dim=0), list(targets)


trainset = torchvision.datasets.VOCDetection(root='./data', year='2007',
                                             image_set='train',
                                             download=True,
                                             transforms=rescale_bbox(448, transform))
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,
                                          shuffle=True, num_workers=2,
                                          collate_fn=detection_collate)

# the VOC2007 'val' split is used as the held-out set
testset = torchvision.datasets.VOCDetection(root='./data', year='2007',
                                            image_set='val',
                                            download=True,
                                            transforms=rescale_bbox(448, transform))
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
                                         shuffle=False, num_workers=2,
                                         collate_fn=detection_collate)

In [0]:
#network definition 
import torch
import torch.nn as nn

class conv_pool_block(nn.Module):
  '''Stack of stride-1 convolutions, optionally followed by 2x2 max pooling.

  The sequence of convolutions described by ``kernel_sizes``/``out_maps`` is
  repeated ``n_repeat`` times. Every convolution pads by ``kernel_size // 2``.

  Args:
      kernel_sizes (list of int): kernel size of each convolution in one repeat.
      in_maps (int): number of channels of the block input.
      out_maps (list of int): output channels of each convolution in one
          repeat; must have the same length as ``kernel_sizes``.
      n_repeat (int): how many times the convolution sequence is repeated.
      pooling (bool): append ``nn.MaxPool2d(2, 2)`` after the convolutions.

  NOTE(review): no non-linearity is inserted between the convolutions;
  confirm whether that is intended.
  '''
  def __init__(self, kernel_sizes, in_maps, out_maps, n_repeat, pooling=False):
    assert len(kernel_sizes) == len(out_maps), 'Inconsistent kernel and filter lengths'

    super(conv_pool_block, self).__init__()
    self.layers = []
    # Track the channel count flowing through the stack: from the second
    # repeat on, the first convolution receives out_maps[-1] channels, not
    # in_maps.
    channels = in_maps
    for _ in range(n_repeat):
      for kernel_size, maps in zip(kernel_sizes, out_maps):
        self.layers.append(nn.Conv2d(channels, maps,
                                     kernel_size, stride=1,
                                     padding=kernel_size // 2))
        channels = maps
    if pooling:
      self.layers.append(nn.MaxPool2d(2, 2))

    self.model = nn.Sequential(*self.layers)

  def forward(self, x):
    '''Run ``x`` through the convolution stack (and pooling, if enabled).'''
    return self.model(x)
    

class yolonet(nn.Module):
  '''YOLO-style detector: convolutional feature extractor plus two linear layers.

  For a (N, 3, 448, 448) input the convolutional stack halves the resolution
  six times (448 -> 7) and ends with 1024 channels; the linear head maps the
  flattened features to ``n_boxes * 5 + n_classes`` values per cell of the
  7x7 grid.

  NOTE(review): there is no non-linearity between the layers; confirm
  whether that is intended.
  '''
  def __init__(self):
    super(yolonet, self).__init__()
    self.model = nn.Sequential(nn.Sequential(nn.Conv2d(3, 64, 7,stride=2, padding=3),
                              nn.MaxPool2d(2, stride=2)), 
                              conv_pool_block([3], 64, [192], 1, pooling=True), 
                              conv_pool_block([1, 3, 1, 3], 192, 
                                  [128, 256, 256, 512],
                                  1, pooling=True),
                              conv_pool_block([1, 3], 512, 
                                  [256, 512], 4), 
                              conv_pool_block([1, 3], 512, 
                                  [512, 1024],
                                  1, pooling=True), 
                              conv_pool_block([1, 3], 1024, 
                                  [512, 1024], 2),
                              conv_pool_block([3], 1024, 
                                  [1024], 1), 
                              nn.Conv2d(1024, 1024, 3, stride=2, padding=1),
                              conv_pool_block([3, 3], 1024, 
                                  [1024, 1024], 1))
    # channels predicted per grid cell: 5 values per box plus the class scores
    self.out_channels = n_boxes*5 + n_classes
    # The first linear layer consumes the flattened conv features
    # (1024 x 7 x 7 for a 448x448 input), not the size of the final output.
    self.Linear1 = nn.Linear(1024*7*7, 4096)
    self.Linear2 = nn.Linear(4096, 49*self.out_channels)
    
  def forward(self, x):
    '''Return predictions of shape (N, n_boxes*5 + n_classes, 7, 7).'''
    x = self.model(x)
    x = x.view(x.size(0), -1)
    x = self.Linear1(x)
    x = self.Linear2(x)
    return x.view((x.size(0), self.out_channels, 7, 7))
  

In [0]:
#build the network and place it on the gpu when one is available
from torchsummary import summary

if torch.cuda.is_available():
  device = 'cuda:0'
else:
  device = 'cpu'

# Module.to returns the module itself, so construction and placement chain
net = yolonet().to(device)

#print the layer-by-layer summary for a 3x448x448 input
summary(net, (3, 448, 448))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 224, 224]           9,472
         MaxPool2d-2         [-1, 64, 112, 112]               0
            Conv2d-3        [-1, 192, 112, 112]         110,784
         MaxPool2d-4          [-1, 192, 56, 56]               0
   conv_pool_block-5          [-1, 192, 56, 56]               0
            Conv2d-6          [-1, 128, 56, 56]          24,704
            Conv2d-7          [-1, 256, 56, 56]         295,168
            Conv2d-8          [-1, 256, 56, 56]          65,792
            Conv2d-9          [-1, 512, 56, 56]       1,180,160
        MaxPool2d-10          [-1, 512, 28, 28]               0
  conv_pool_block-11          [-1, 512, 28, 28]               0
           Conv2d-12          [-1, 256, 28, 28]         131,328
           Conv2d-13          [-1, 512, 28, 28]       1,180,160
           Conv2d-14          [-1, 256,

In [0]:
#defining loss function
lambda_coord = 0.1
lambda_noobj = 0.5
# classification loss
# assuming we get labels and predictions after performing
# non maximal suppression using iou evaluation
# preds : torch tensor consisting of center x, center y, w, h (normalized),
#         objectivity, 20 class probabilities
#
# Column layout of such a tensor, shown as a comment because no `prediction`
# tensor is defined in the visible part of the notebook:
#   center_x_preds, center_y_preds, w_preds, \
#   h_preds, objectivity, class_logits = torch.split(prediction, [1,1,1,1,1,20], dim=1)

# The right-hand sides were missing; these are the criteria yolo_loss uses.
classification_criterion = nn.CrossEntropyLoss()
localization_criterion = nn.MSELoss()

class yolo_loss(nn.Module):
  '''Loss for predictions that are already matched to ground-truth boxes.

  Sums the classification loss, the weighted localization losses for
  center x, center y, sqrt(width) and sqrt(height), and the objectivity loss
  against a target of 1.

  NOTE(review): ``lambda_noobj`` and ``noobjloss`` are defined but not used
  here, because only matched (object) predictions are passed in.
  '''
  lambda_coord = 0.5
  lambda_noobj = 0.5
  def __init__(self):
    super(yolo_loss, self).__init__()
    self.classification_loss = nn.CrossEntropyLoss()
    self.bbloss_x = nn.MSELoss()
    self.bbloss_y = nn.MSELoss()
    self.bbloss_w = nn.MSELoss()
    self.bbloss_h = nn.MSELoss()
    self.noobjloss = nn.MSELoss()
    self.objloss = nn.MSELoss()

  def forward(self, x, target):
    '''Compute the loss.

    Args:
        x: tensor of shape (N, 5 + n_classes) with columns center x,
            center y, width, height, objectivity, then the class logits.
        target: tensor of shape (N, 5) with columns center x, center y,
            width, height, class index.

    Returns:
        Scalar tensor holding the summed loss.
    '''
    # objectivity is the single 5th column; the logits are everything after it
    bbx, bby, bbw, bbh, obj, logits = torch.split(
        x, [1, 1, 1, 1, 1, x.size(1) - 5], dim=1)
    _bbx, _bby, _bbw, _bbh, label = torch.split(target, 1, dim=1)
    # CrossEntropyLoss expects integer class indices of shape (N,)
    label = label.squeeze(1).long()

    def _sqrt(size):
      # raw network outputs can be negative; clamp so sqrt and its gradient
      # stay finite
      return torch.sqrt(size.clamp(min=1e-6))

    return self.classification_loss(logits, label) + \
              self.lambda_coord * self.bbloss_x(bbx, _bbx) + \
              self.lambda_coord * self.bbloss_y(bby, _bby) + \
              self.lambda_coord * self.bbloss_w(_sqrt(bbw), _sqrt(_bbw)) + \
              self.lambda_coord * self.bbloss_h(_sqrt(bbh), _sqrt(_bbh)) + \
              self.objloss(obj, torch.ones_like(obj))
  

In [0]:
#preparing the labels using non-maximum suppression
#input assumed to be a pytorch tensor
# image dimensions in pixels; the same 448 is passed to rescale_bbox and to
# summary() elsewhere in this notebook
image_width = 448
image_height = 448
def IoU(bb1, bb2):
  '''Intersection over union of two axis-aligned boxes.

  bb1 : input bounding box 1; the coordinates correspond to
        center_x, center_y, width, height
  bb2 : bounding box 2, coordinates in the same order as bb1

  Returns the ratio of the intersection area to the union area, or 0.0 when
  the boxes do not overlap (boxes that only touch count as non-overlapping).
  '''
  # True division for the half extents: floor division drops half a pixel for
  # odd sizes and collapses normalized (< 2) sizes to zero.
  half_w1, half_h1 = bb1[2] / 2, bb1[3] / 2
  half_w2, half_h2 = bb2[2] / 2, bb2[3] / 2

  overlap_x = min(bb1[0] + half_w1, bb2[0] + half_w2) - \
              max(bb1[0] - half_w1, bb2[0] - half_w2)
  overlap_y = min(bb1[1] + half_h1, bb2[1] + half_h2) - \
              max(bb1[1] - half_h1, bb2[1] - half_h2)
  if overlap_x > 0 and overlap_y > 0:
      intersection = overlap_x * overlap_y
      return intersection / (bb1[2] * bb1[3] + bb2[2] * bb2[3] - intersection)
  return 0.0
  
def get_labels_with_predicitons(ground_truth, pred_tensor):
  '''
  ground truth : a tensor with shape length of boxes and columns 
                equal to 5 corresponding to bounding box params and class label
  pred_tensor  : a tensor with shape 30, 7, 7
  '''
  #need to make a grid of values corresponding to coordinates
  x = torch.tensor(torch.arange(7))
  y = torch.tensor(torch.arange(7))
  grid_x, grid_y = torch.meshgrid(x, y)
  pred_tensor = pred_tensor.view((-1, 30))
  preds1, preds2, _ = torch.split(pred_tensor, [5, 5, 20], dim=1)
  
  x_min_hat = pred_tensor[:, 0]