In [1]:
from loss import *
from darknet import *
from yolo import *

In [133]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [162]:
class Loss(nn.Module):
    '''
    Loss function as described in https://arxiv.org/pdf/1506.02640.pdf
    Inputs: Tensors of size [b, S, S, B * 5 + C]
    Output: scalar loss (summed over the batch, not averaged)

    Channel layout of the last dimension: B groups of [x, y, w, h, confidence]
    followed by C class probabilities. x, y are offsets inside the grid cell,
    w, h are relative to the whole image.

    Args:
        S: grid size (default 7).
        B: number of box predictors per cell (default 2).
        C: number of classes (default 20).
        lambda_coord: weight of the localisation terms (default 5).
        lambda_noobj: weight of the confidence term for empty cells (default 0.5).

    More details regarding the loss function in the forward function
    '''
    def __init__(self, S=7, B=2, C=20, lambda_coord=5, lambda_noobj=0.5):
        super().__init__()
        self.S = S
        self.B = B
        self.C = C
        self.N = self.B * 5 + self.C

        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj

    @staticmethod
    def _paired_iou(boxes1, boxes2, eps=1e-9):
        # IoU between broadcast-compatible corner boxes [..., 4] in [x1, y1, x2, y2] format.
        lt = torch.max(boxes1[..., :2], boxes2[..., :2])    # left-top of the intersection
        rb = torch.min(boxes1[..., 2:4], boxes2[..., 2:4])  # right-bottom of the intersection
        wh = (rb - lt).clamp(min=0)                         # disjoint boxes -> zero area (no in-place write)
        inter = wh[..., 0] * wh[..., 1]

        area1 = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
        area2 = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])
        union = area1 + area2 - inter
        # clamp the denominator so two degenerate (zero-area) boxes give 0 instead of NaN
        return inter / union.clamp(min=eps)

    def _to_corners(self, boxes):
        # Convert [..., (x, y, w, h, ...)] to [..., (x1, y1, x2, y2)].
        # x, y are divided by S to put them on the same scale as w, h. The cell's own
        # offset is omitted because boxes are only compared within the same cell.
        center = boxes[..., :2] / float(self.S)
        half = 0.5 * boxes[..., 2:4]
        return torch.cat((center - half, center + half), dim=-1)

    def compute_iou(self, bbox1, bbox2):
        """ Compute the IoU (Intersection over Union) of two set of bboxes, each bbox format: [x1, y1, x2, y2].
        Args:
            bbox1: (Tensor) bounding bboxes, sized [N, 4] (a single box sized [4] is also accepted).
            bbox2: (Tensor) bounding bboxes, sized [M, 4] (a single box sized [4] is also accepted).
        Returns:
            (Tensor) IoU, sized [N, M].
        """
        # adapted from motokimura's github; a lone box is promoted to a batch of one
        if bbox1.dim() == 1:
            bbox1 = bbox1.unsqueeze(0)
        if bbox2.dim() == 1:
            bbox2 = bbox2.unsqueeze(0)

        # [N, 1, 4] against [1, M, 4] broadcasts to every pair -> [N, M]
        return self._paired_iou(bbox1[:, None, :4], bbox2[None, :, :4])

    def _responsible_box_loss(self, box_pred, box_target):
        # Lines 1 - 3 of the loss for cells that contain an object.
        # box_pred / box_target: [n_cells, B, 5]; returns a scalar tensor.
        num_cells = box_pred.size(0)
        if num_cells == 0:
            return box_pred.new_zeros(())

        # The responsible predictor is chosen by IoU; the choice itself is not differentiated.
        with torch.no_grad():
            iou = self._paired_iou(self._to_corners(box_pred), self._to_corners(box_target))  # [n_cells, B]
            responsible = iou.argmax(dim=1)                                                   # [n_cells]

        rows = torch.arange(num_cells, device=box_pred.device)
        resp_pred = box_pred[rows, responsible]      # [n_cells, 5]
        resp_target = box_target[rows, responsible]  # [n_cells, 5]

        xy_loss = F.mse_loss(resp_pred[:, :2], resp_target[:, :2], reduction='sum')

        # sqrt is undefined for negatives and has an infinite gradient at 0, hence the small floor
        eps = 1e-6
        wh_loss = F.mse_loss(
            torch.sqrt(resp_pred[:, 2:4].clamp(min=eps)),
            torch.sqrt(resp_target[:, 2:4].clamp(min=eps)),
            reduction='sum'
        )

        conf_loss = F.mse_loss(resp_pred[:, 4], resp_target[:, 4], reduction='sum')

        # lambda_coord weights both coordinate terms in full; IoU is not part of the loss
        return self.lambda_coord * (xy_loss + wh_loss) + conf_loss

    def forward(self, pred, target):
        '''
        Note: Loss in each cell is only computed for the bounding box predictor that is 'responsible', i.e has the highest IOU
              The Loss function only penalizes classification error if an object is present in that grid cell
              Predictor j is compared with target box slot j; the confidence target is the value stored in `target`.

        λ_coord * Summation [0, S^2) Summation [0, B): [(x - x_hat)^2 + (y - y_hat)^2]
            + λ_coord * Summation [0, S^2) Summation [0, B): [(sqrt(w) - sqrt(w_hat))^2 + (sqrt(h) - sqrt(h_hat))^2]
                + Summation [0, S^2) Summation [0, B): [(C - C_hat)^2]
                    + λ_noobj * Summation [0, S^2) Summation [0, B): [(C - C_hat)^2]
                        + Summation [0, S^2) Summation [0, num_classes): [(p(c) - p(c_hat))^2]

        Args:
            pred: (Tensor) predictions, sized [b, S, S, B * 5 + C].
            target: (Tensor) ground truth, same size; a cell contains an object when target[..., 4] > 0.
        Returns:
            (Tensor) scalar loss.
        Raises:
            ValueError: if the shapes differ or the last dimension is not B * 5 + C.
        '''
        if pred.shape != target.shape or pred.size(-1) != self.N:
            raise ValueError(
                'pred and target must share a shape ending in {}, got {} and {}'.format(
                    self.N, tuple(pred.shape), tuple(target.shape)
                )
            )

        box_len = self.B * 5
        obj_mask = target[..., 4] > 0     # [b, S, S] cells that contain an object
        noobj_mask = target[..., 4] == 0  # [b, S, S] empty cells

        obj_pred, obj_target = pred[obj_mask], target[obj_mask]          # [n_obj, N]
        noobj_pred, noobj_target = pred[noobj_mask], target[noobj_mask]  # [n_noobj, N]

        # loss for lines 1 - 3 (every object cell in the batch, not just the first S^2)
        bbox_loss = self._responsible_box_loss(
            obj_pred[:, :box_len].reshape(-1, self.B, 5),
            obj_target[:, :box_len].reshape(-1, self.B, 5)
        )

        # loss for line 4: confidence of all B predictors in empty cells
        noobj_loss = self.lambda_noobj * F.mse_loss(
            noobj_pred[:, :box_len].reshape(-1, self.B, 5)[:, :, 4],
            noobj_target[:, :box_len].reshape(-1, self.B, 5)[:, :, 4],
            reduction='sum'
        )

        # loss for line 5: class probabilities are sliced from the full-width rows
        class_prob_loss = F.mse_loss(obj_pred[:, box_len:], obj_target[:, box_len:], reduction='sum')

        total_loss = bbox_loss + noobj_loss + class_prob_loss
        return total_loss

In [163]:
# Instantiate the loss module with its default configuration.
loss = Loss()

In [167]:
# Smoke test on a single 7x7 grid with 30 channels: the target is all zeros, so every
# cell satisfies target[..., 4] == 0. torch.rand is unseeded, so the value varies per run.
loss(torch.rand(1, 7, 7, 30), torch.zeros(1, 7, 7, 30))

tensor(19.0468)

In [179]:
# 2x2 max pooling; stride is not passed explicitly.
mp = nn.MaxPool2d(2)

In [183]:
# Random 4-D probe tensor with 54x54 trailing dimensions.
layer = torch.rand(1, 3, 54, 54)

In [184]:
# Inspect the pooled shape (recorded output: 54x54 becomes 27x27).
mp(layer).shape

torch.Size([1, 3, 27, 27])

In [261]:
class Darknet(nn.Module):
    '''
    Convolutional backbone assembled in the same fashion as Figure 3 of
    https://arxiv.org/pdf/1506.02640.pdf (the last 4 conv layers of that figure
    belong to YOLO and are not part of this module), followed by an
    average-pool + linear head with 1000 outputs.
    '''

    @staticmethod
    def _conv_unit(c_in, c_out, k, **conv_kwargs):
        # One Conv2d -> BatchNorm2d -> LeakyReLU(0.1) triple, returned as a list so it
        # can be unpacked directly into an nn.Sequential.
        return [
            nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=k, **conv_kwargs),
            nn.BatchNorm2d(c_out),
            nn.LeakyReLU(0.1, inplace=True),
        ]

    def __init__(self):
        super().__init__()
        unit = self._conv_unit

        # 7x7 stride-2 stem, then 2x2 pooling
        self.block1 = nn.Sequential(
            *unit(3, 64, 7, stride=2, padding=3),
            nn.MaxPool2d(2)
        )

        self.block2 = nn.Sequential(
            *unit(64, 192, 3, padding=1),
            nn.MaxPool2d(2)
        )

        self.block3 = nn.Sequential(
            *unit(192, 128, 1),
            *unit(128, 256, 3, padding=1),
            *unit(256, 256, 1),
            *unit(256, 512, 3, padding=1),
            nn.MaxPool2d(2)
        )

        # the 1x1 (512 -> 256) / 3x3 (256 -> 512) pair is repeated 4 times
        bottlenecks4 = []
        for _ in range(4):
            bottlenecks4 += unit(512, 256, 1)
            bottlenecks4 += unit(256, 512, 3, padding=1)
        self.block4 = nn.Sequential(
            *bottlenecks4,
            *unit(512, 512, 1),
            *unit(512, 1024, 3, padding=1),
            nn.MaxPool2d(2)
        )

        # the 1x1 (1024 -> 512) / 3x3 (512 -> 1024) pair is repeated 2 times
        bottlenecks5 = []
        for _ in range(2):
            bottlenecks5 += unit(1024, 512, 1)
            bottlenecks5 += unit(512, 1024, 3, padding=1)
        self.block5 = nn.Sequential(*bottlenecks5)

        # classification head: 7x7 average pool, flatten, 1000-way linear layer
        self.fc = nn.Sequential(
            nn.AvgPool2d(7),
            nn.Flatten(),
            nn.Linear(1024, 1000)
        )

    def forward(self, x):
        '''
        Inputs: Tensor of shape [b, 3, 224, 224]
        Outputs: Tensor of shape [b, 1000]
        '''
        # stages are looked up at call time, so a replaced `fc` is honoured
        for stage in (self.block1, self.block2, self.block3, self.block4, self.block5, self.fc):
            x = stage(x)
        return x

In [262]:
class YOLOv1(nn.Module):
    '''
    YOLOv1 detector: the Darknet backbone with its classification head removed,
    four additional 3x3 convolutional layers and two fully connected layers.

    Args:
        grid_size: S, number of grid cells per side of the output.
        num_bboxes: B, number of box predictors per cell.
        num_classes: C, number of classes.

    Input: Tensor of shape [b, 3, 448, 448]. The first Linear layer is sized for a
           7x7x1024 feature map: the backbone yields 14x14 for a 448x448 image and
           the stride-2 conv2 halves it.
    Output: Tensor of shape [b, S, S, B * 5 + C] with values in (0, 1) (final Sigmoid).
    '''
    def __init__(self, grid_size, num_bboxes, num_classes):
        super().__init__()
        self.S = grid_size
        self.B = num_bboxes
        self.C = num_classes

        self.darknet = Darknet()
        self.darknet.fc = nn.Identity() # remove fc layer

        # YOLO has 4 additional convolutional layers
        self.relu = nn.LeakyReLU(0.1, inplace=True) # stateless, safe to reuse

        self.conv1 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, stride=2)
        self.conv3 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1)

        # BatchNorm holds learnable scale/shift and running statistics, so each conv
        # layer needs its own instance instead of one module shared by all four.
        self.bn1 = nn.BatchNorm2d(1024)
        self.bn2 = nn.BatchNorm2d(1024)
        self.bn3 = nn.BatchNorm2d(1024)
        self.bn4 = nn.BatchNorm2d(1024)

        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(7 * 7 * 1024, 4096),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Dropout(0.5),
            nn.Linear(4096, self.S * self.S * (self.B * 5 + self.C)),
            nn.Sigmoid()
        )

    def forward(self, x):
        '''
        Run the backbone, the four extra conv layers and the FC head, then reshape
        the flat prediction vector into the [b, S, S, B * 5 + C] grid layout.
        '''
        x = self.darknet(x)
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.relu(self.bn3(self.conv3(x)))
        x = self.relu(self.bn4(self.conv4(x)))
        x = self.fc(x)

        x = x.view(-1, self.S, self.S, self.B * 5 + self.C)
        return x

In [269]:
import torchvision.models as models

In [270]:
# Builds a torchvision ResNet-50 with default arguments.
# NOTE(review): `resnet` is not referenced again in the visible cells.
resnet = models.resnet50()

In [263]:
# Backbone instance used to inspect the feature-map shape.
dn = Darknet()

In [264]:
# Swap the classification head for a pass-through so forward() returns the block5 output.
dn.fc = nn.Identity()

In [265]:
# Batch of 10 random 3x448x448 inputs.
lol = torch.rand(10, 3, 448, 448)

In [266]:
# Shape of the backbone features for the 448x448 batch.
dn(lol).shape

torch.Size([10, 1024, 14, 14])

In [267]:
# Full detector with grid_size=7, num_bboxes=2, num_classes=20.
yolo = YOLOv1(7, 2, 20)

In [268]:
# Shape of the detector output for the same 448x448 batch.
yolo(lol).shape

torch.Size([10, 7, 7, 30])

In [37]:
# np.empty does not initialise its contents, so the index returned here is arbitrary.
np.argmax(np.empty((2, 4)))

5

In [39]:
# Column 3 of a fresh uninitialised (2, 4) array; the values are whatever was in memory.
np.empty((2, 4))[:, 3]

array([4.67492429e-310, 0.00000000e+000])

In [42]:
# Uninitialised (2, 4) tensor used to try out row selection by argmax.
yes = torch.empty((2, 4))

In [44]:
# Fill row 0 only; row 1 keeps its uninitialised contents.
yes[0] = torch.tensor([1, 2, 3, 4])

In [45]:
# Display the tensor to see the filled row next to the uninitialised one.
yes

tensor([[1.0000e+00, 2.0000e+00, 3.0000e+00, 4.0000e+00],
        [1.1210e-43, 0.0000e+00, 1.1210e-43, 0.0000e+00]])

In [47]:
# Pick the row whose column-3 entry is largest and add up its elements with Python's
# built-in sum (recorded output 10 = 1 + 2 + 3 + 4, i.e. row 0 was selected).
sum(yes[yes[:, 3].argmax()])

tensor(10.)

In [2]:
# Random stand-in for a [b, S, S, N] tensor: batch 3, 7x7 grid, 30 channels.
ex = torch.rand(3, 7, 7, 30)

In [4]:
# Boolean mask over cells whose channel 4 is positive.
coord_mask = ex[:, :, :, 4] > 0

In [7]:
# NOTE(review): this rebinds coord_mask, so re-running this cell on its own is not idempotent.
coord_mask = coord_mask.unsqueeze(-1).expand_as(ex) # [b, s, s, n], expands True/False along the last dimension

In [14]:
# Recorded output is [147, 30], i.e. all 3 * 49 cells were selected for this random input.
ex[coord_mask].view(-1, 30).shape # pred tensor on which cells contain objects (cannot exceed batch_size * 49)

torch.Size([147, 30])

In [33]:
# NOTE(review): this cell raised the RuntimeError recorded below: the selection holds
# 4410 elements, which is not a multiple of 2 * 30, so the view is impossible.
coord_tensor = ex[coord_mask].view(-1, 2, 30)

RuntimeError: shape '[-1, 2, 30]' is invalid for input of size 4410

In [34]:
# NOTE(review): the recorded shape [147, 30] cannot come from the failing view(-1, 2, 30)
# cell; coord_tensor presumably was bound by an earlier run that is no longer shown.
coord_tensor.shape

torch.Size([147, 30])

In [24]:
# First 10 channels per cell (2 boxes x 5 values), flattened to one box per row.
bbox_pred = coord_tensor[:, :10].contiguous().view(-1, 5) # rearrange predictions where each row is [x, y, w, h, c]
# cannot exceed 2 * batch_size * 49

In [23]:
# Remaining 20 channels per cell, one row per cell.
coord_tensor[:, 10:].contiguous().view(-1, 20) # rearrange predictions where each row is the 20 class-prediction probabilities [p1, ..., p20]

tensor([[0.3103, 0.9990, 0.1813,  ..., 0.7750, 0.3578, 0.6521],
        [0.9908, 0.9281, 0.1904,  ..., 0.9622, 0.6246, 0.7205],
        [0.1489, 0.4374, 0.6778,  ..., 0.8772, 0.5053, 0.9776],
        ...,
        [0.0501, 0.0574, 0.5785,  ..., 0.8874, 0.0271, 0.4839],
        [0.6005, 0.1547, 0.7346,  ..., 0.3301, 0.9680, 0.7063],
        [0.1751, 0.0064, 0.5625,  ..., 0.8553, 0.5265, 0.6781]])

In [29]:
# Same 10 box channels, but kept grouped per cell: [cells, 2 boxes, 5 values].
bbox_pred2 = coord_tensor[:, :10].view(-1, 2, 5)

In [30]:
# Display the per-cell grouping.
bbox_pred2

tensor([[[0.6695, 0.1184, 0.5340, 0.7720, 0.6951],
         [0.6203, 0.8119, 0.5678, 0.2015, 0.2547]],

        [[0.1451, 0.3023, 0.4500, 0.5360, 0.1922],
         [0.0606, 0.0996, 0.5777, 0.0309, 0.5509]],

        [[0.1015, 0.2533, 0.8717, 0.7552, 0.0876],
         [0.8835, 0.9294, 0.2872, 0.3540, 0.7213]],

        ...,

        [[0.0428, 0.8953, 0.1482, 0.1101, 0.5496],
         [0.6678, 0.4397, 0.7340, 0.3082, 0.7076]],

        [[0.9469, 0.4161, 0.4693, 0.8817, 0.6018],
         [0.5183, 0.3621, 0.7801, 0.5793, 0.9798]],

        [[0.1479, 0.5408, 0.7987, 0.7272, 0.4864],
         [0.2645, 0.1258, 0.9490, 0.3677, 0.2526]]])

In [31]:
# Display the flat one-box-per-row layout for comparison with the grouped layout.
bbox_pred

tensor([[0.6695, 0.1184, 0.5340, 0.7720, 0.6951],
        [0.6203, 0.8119, 0.5678, 0.2015, 0.2547],
        [0.1451, 0.3023, 0.4500, 0.5360, 0.1922],
        ...,
        [0.5183, 0.3621, 0.7801, 0.5793, 0.9798],
        [0.1479, 0.5408, 0.7987, 0.7272, 0.4864],
        [0.2645, 0.1258, 0.9490, 0.3677, 0.2526]])

In [27]:
# Recorded output is [294, 5]: 147 selected cells x 2 boxes each.
bbox_pred.shape

torch.Size([294, 5])

In [25]:
# First two rows of the flat layout; in the recorded outputs they equal the two boxes of bbox_pred2[0].
bbox_pred[0:2]

tensor([[0.6695, 0.1184, 0.5340, 0.7720, 0.6951],
        [0.6203, 0.8119, 0.5678, 0.2015, 0.2547]])

In [None]:
# Planning note kept as a bare string literal: it lists tensor names and has no effect
# beyond being the cell's value.
'''
bbox_pred
class_pred

bbox_target
class_target
'''

In [None]:
# Empty placeholder cell: a string literal that contains only line breaks.
'''

'''