<a href="https://colab.research.google.com/github/mohripan/Object-Detection/blob/main/YOLOV1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/mohripan/Object-Detection.git
!pip install --upgrade --force-reinstall --no-deps -q -U albumentations
!pip install --upgrade --force-reinstall --no-deps -q -U opencv-python
!pip install qudida

Cloning into 'Object-Detection'...
remote: Enumerating objects: 24, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 24 (delta 13), reused 9 (delta 3), pack-reused 0[K
Unpacking objects: 100% (24/24), done.
[K     |████████████████████████████████| 116 kB 5.0 MB/s 
[K     |████████████████████████████████| 60.9 MB 1.1 MB/s 
[?25hLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
%cd Object-Detection

/content/Object-Detection


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
import random
from torch.utils.data import DataLoader, Dataset
from utils import intersection_over_union, iou_width_height, non_max_suppression, mean_average_precision
import numpy as np
import matplotlib.pyplot as plt
import torchvision

In [4]:
# Layer specification for the convolutional backbone. Entries are consumed in
# order by Yolov1._create_conv_layers, which dispatches on the entry's type:
#   tuple -> one conv block: (kernel_size, out_channels, stride, padding)
#   str   -> a 2x2 max-pool with stride 2 (any string is handled this way)
#   list  -> [first_conv_tuple, second_conv_tuple, num_repeats]; the pair of
#            conv blocks is appended num_repeats times, back to back
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]

In [5]:
class CNNBlock(nn.Module):
  """Bias-free Conv2d followed by BatchNorm2d and LeakyReLU(0.1).

  Args:
    in_channels: number of channels of the incoming feature map.
    out_channels: number of channels produced by the convolution.
    **kwargs: forwarded unchanged to nn.Conv2d (e.g. kernel_size, stride,
      padding).
  """

  def __init__(self, in_channels, out_channels, **kwargs):
    super().__init__()
    # The attribute names are also the state_dict prefixes, so they stay as-is.
    self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
    self.batchnorm = nn.BatchNorm2d(out_channels)
    self.leakyrelu = nn.LeakyReLU(negative_slope=0.1)

  def forward(self, x):
    """Apply convolution, batch normalisation and the activation in turn."""
    out = self.conv(x)
    out = self.batchnorm(out)
    return self.leakyrelu(out)

class Yolov1(nn.Module):
  """Convolutional backbone built from `architecture_config` plus a
  two-layer fully connected head.

  Args:
    in_channels: channel count of the input images (default 3).
    **kwargs: must supply `split_size`, `num_boxes` and `num_classes`; they
      are forwarded to `_create_fcs` to size the head.
  """

  def __init__(self, in_channels=3, **kwargs):
    super().__init__()
    self.architecture = architecture_config
    self.in_channels = in_channels
    # Backbone is built before the head so parameter creation order is stable.
    self.darknet = self._create_conv_layers(self.architecture)
    self.fcs = self._create_fcs(**kwargs)

  def forward(self, x):
    """Run the backbone, flatten everything but the batch axis, run the head."""
    features = self.darknet(x)
    flattened = torch.flatten(features, start_dim=1)
    return self.fcs(flattened)

  def _create_conv_layers(self, architecture):
    """Turn the layer specification into an nn.Sequential.

    Tuples become a single CNNBlock, strings become a 2x2/stride-2 max-pool,
    and lists `[conv_a, conv_b, repeats]` become `repeats` consecutive
    (conv_a, conv_b) pairs. Entries of any other type are skipped.
    """
    modules = []
    channels = self.in_channels

    def conv_from(spec, incoming):
      # spec is (kernel_size, out_channels, stride, padding)
      return CNNBlock(incoming, spec[1], kernel_size=spec[0], stride=spec[2], padding=spec[3])

    for entry in architecture:
      kind = type(entry)

      if kind is tuple:
        modules.append(conv_from(entry, channels))
        channels = entry[1]

      elif kind is str:
        modules.append(nn.MaxPool2d(kernel_size=2, stride=2))

      elif kind is list:
        first, second, repeats = entry[0], entry[1], entry[2]

        for _ in range(repeats):
          modules.append(conv_from(first, channels))
          modules.append(conv_from(second, first[1]))
          channels = second[1]

    return nn.Sequential(*modules)

  def _create_fcs(self, split_size, num_boxes, num_classes):
    """Build the head mapping 1024*S*S features to S*S*(C + 5*B) outputs."""
    grid, boxes, classes = split_size, num_boxes, num_classes
    head = [
        nn.Flatten(),
        nn.Linear(1024 * grid * grid, 512),
        nn.Dropout(0.0),
        nn.LeakyReLU(0.1),
        nn.Linear(512, grid * grid * (classes + boxes * 5)),
    ]
    return nn.Sequential(*head)

In [6]:
def test(S=7, B=2, C=20):
  """Smoke test: build a Yolov1 and print its output shape.

  The network is fed a random batch of two 3x448x448 inputs; `S`, `B` and
  `C` are passed on as `split_size`, `num_boxes` and `num_classes`.
  """
  net = Yolov1(split_size=S, num_boxes=B, num_classes=C)
  batch = torch.randn((2, 3, 448, 448))
  output = net(batch)
  print(output.shape)

In [9]:
class YoloLoss(nn.Module):
  """YOLOv1-style sum-of-squared-errors loss over an S x S grid.

  Last-axis layout used by `forward`, with C = number of classes:
    [0:C]       class scores
    [C]         confidence of predicted box 1; in `target` this entry is used
                as the object-present mask
    [C+1:C+5]   coordinates of predicted box 1; in `target`, the ground-truth
                box
    [C+5]       confidence of predicted box 2
    [C+6:C+10]  coordinates of predicted box 2

  Only two predicted boxes per cell are scored, whatever value `B` has; `B`
  is used solely to size the reshape of `predictions`.

  Args:
    S: grid size along each spatial axis.
    B: number of boxes per cell encoded in the prediction vector.
    C: number of classes.
  """

  def __init__(self, S=7, B=2, C=20):
    super(YoloLoss, self).__init__()
    self.mse = nn.MSELoss(reduction='sum')
    self.S = S
    self.B = B
    self.C = C
    # Weights applied in the final sum: no-object confidence term and box term.
    self.lambda_noobj = 0.5
    self.lambda_coord = 5

  def forward(self, predictions, target):
    """Compute the scalar loss.

    Args:
      predictions: tensor that can be reshaped to (-1, S, S, C + B*5).
      target: tensor indexed with the layout described on the class;
        assumed to be (N, S, S, >= C+5) -- TODO confirm against the dataset.

    Returns:
      A scalar tensor:
      lambda_coord * box + object + lambda_noobj * no_object + class.
    """
    C = self.C
    # Offsets derived from C; identical to 20 / 21:25 / 25 / 26:30 when C == 20.
    conf1 = slice(C, C + 1)
    box1 = slice(C + 1, C + 5)
    conf2 = slice(C + 5, C + 6)
    box2 = slice(C + 6, C + 10)

    predictions = predictions.reshape(-1, self.S, self.S, self.C + self.B*5)

    # Both predicted boxes are scored against the SAME ground-truth box. The
    # ground truth lives in the first box slot of `target` (that is also what
    # the regression target below uses); comparing box 2 against the target's
    # second slot would stop it from ever being picked when that slot is empty.
    # NOTE(review): intersection_over_union is assumed to keep a trailing
    # singleton axis so `best_box` broadcasts over the 4 coordinates -- confirm.
    target_box = target[..., box1]
    iou_b1 = intersection_over_union(predictions[..., box1], target_box)
    iou_b2 = intersection_over_union(predictions[..., box2], target_box)
    ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)
    # best_box is 0 where box 1 has the higher IoU and 1 where box 2 does.
    _, best_box = torch.max(ious, dim=0)
    exists_box = target[..., C].unsqueeze(3)

    # ---- box coordinates (only cells flagged by exists_box contribute) ----
    box_predictions = exists_box * (best_box*predictions[..., box2] + (1-best_box)*predictions[..., box1])
    box_targets = exists_box * target_box

    # Signed square root on entries 2:4 (presumably width/height); the 1e-6
    # keeps the sqrt away from exactly zero for masked-out cells.
    box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(torch.abs(box_predictions[..., 2:4] + 1e-6))
    box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

    box_loss = self.mse(torch.flatten(box_predictions, end_dim=-2), torch.flatten(box_targets, end_dim=-2))

    # ---- confidence of the selected box where an object exists ----
    pred_box = (best_box*predictions[..., conf2] + (1-best_box) * predictions[..., conf1])
    object_loss = self.mse(torch.flatten(exists_box*pred_box), torch.flatten(exists_box*target[..., conf1]))

    # ---- confidence of both boxes where no object exists ----
    no_object_mask = 1 - exists_box
    no_object_target = torch.flatten(no_object_mask*target[..., conf1], start_dim=1)
    no_object_loss = (
        self.mse(torch.flatten(no_object_mask*predictions[..., conf1], start_dim=1), no_object_target)
        + self.mse(torch.flatten(no_object_mask*predictions[..., conf2], start_dim=1), no_object_target)
    )

    # ---- class scores where an object exists ----
    class_loss = self.mse(torch.flatten(exists_box*predictions[..., :C], end_dim=-2), torch.flatten(exists_box*target[..., :C], end_dim=-2))

    loss = (self.lambda_coord * box_loss + object_loss + self.lambda_noobj * no_object_loss + class_loss)

    return loss