# Construction of YOLO v3 Model

In [69]:
import torch
import torch.nn as nn

In [68]:
# Shape walk-through: push a 416x416 dummy image through one stride-1 conv and
# five stride-2 convs (3x3 kernel, padding 1), printing the last three shapes.
_x = torch.randn(1, 3, 416, 416)
for _idx, _stride in enumerate((1, 2, 2, 2, 2, 2)):
    _x = nn.Conv2d(3, 3, 3, _stride, padding=1)(_x)
    # Only the final three feature maps are reported.
    if _idx >= 3:
        print(_x.shape)


torch.Size([1, 3, 52, 52])
torch.Size([1, 3, 26, 26])
torch.Size([1, 3, 13, 13])


In [59]:
""" 
Information about architecture config:
Tuple is structured by (filters, kernel_size, stride) 
Every conv is a same convolution. 
List is structured by "B" indicating a residual block followed by the number of repeats
"S" is for scale prediction block and computing the yolo loss
"U" is for upsampling the feature map and concatenating with a previous layer
"""
# NOTE: "same convolution" describes the stride-1 entries; stride-2 entries halve
# the spatial size. The sizes in the comments below assume a 416x416 input.
config = [
    (32, 3, 1),     # 416x416, stride 1 keeps the spatial size
    (64, 3, 2),     # 208x208, stride 2 halves the spatial size
    ["B", 1],       # 208x208
    (128, 3, 2),    # 104x104
    ["B", 2],       # 104x104
    (256, 3, 2),    # 52x52
    ["B", 8],       # 52x52, output is stored by YOLOv3.forward for a later "U"
    (512, 3, 2),    # 26x26
    ["B", 8],       # 26x26, output is stored by YOLOv3.forward for a later "U"
    (1024, 3, 2),   # (batch, 1024, 13, 13)
    ["B", 4],       # 13x13; up to this point is Darknet-53
    (512, 1, 1),    # (batch, 512, 13, 13)
    (1024, 3, 1),   # (batch, 1024, 13, 13)
    "S",            # prediction head; the builder halves the tracked channel count afterwards
    (256, 1, 1),
    "U",            # upsample x2 and concat with a stored "B" output: 256 + 512 = 3*256 channels
    (256, 1, 1),
    (512, 3, 1),
    "S",
    (128, 1, 1),
    "U",            # upsample x2 and concat with a stored "B" output: 128 + 256 = 3*128 channels
    (128, 1, 1),
    (256, 3, 1),
    "S",
]

In [60]:
class CNNBlock(nn.Module):
    """Conv2d layer optionally followed by BatchNorm2d and LeakyReLU(0.1).

    Args:
        in_channels: Number of channels in the input tensor.
        out_channels: Number of channels produced by the convolution.
        bn_act: When True, apply batch norm and leaky ReLU after the
            convolution and build the convolution without a bias term.
            When False, return the raw convolution output (bias enabled).
        **kwargs: Forwarded unchanged to ``nn.Conv2d`` (kernel_size,
            stride, padding, ...).
    """

    def __init__(self, in_channels, out_channels, bn_act=True, **kwargs):
        super().__init__()
        self.use_bn_act = bn_act
        # The conv bias is switched off exactly when batch norm follows it.
        self.conv = nn.Conv2d(in_channels, out_channels, bias=not bn_act, **kwargs)
        # bn and leaky are always created, even if forward() never uses them.
        self.bn = nn.BatchNorm2d(out_channels)
        self.leaky = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Apply the convolution, then batch norm + leaky ReLU if enabled."""
        features = self.conv(x)
        if self.use_bn_act:
            return self.leaky(self.bn(features))
        return features

In [61]:
class ResidualBlock(nn.Module):
    """Stack of ``num_repeats`` (1x1 squeeze -> 3x3 expand) CNNBlock pairs.

    Args:
        channels: Channel count of both the input and the output.
        use_skip_connection: When True, each pair's output is added to
            its own input; when False the pairs are simply chained.
        num_repeats: How many pairs to stack.
    """

    def __init__(self, channels, use_skip_connection=True, num_repeats=1):
        super().__init__()
        self.use_skip_connection = use_skip_connection
        self.num_repeats = num_repeats

        squeezed = channels // 2
        # The 1x1 conv and the padded 3x3 conv both keep the spatial size.
        self.layers = nn.ModuleList([
            nn.Sequential(
                CNNBlock(channels, squeezed, kernel_size=1),
                CNNBlock(squeezed, channels, kernel_size=3, padding=1),
            )
            for _ in range(num_repeats)
        ])

    def forward(self, x):
        """Feed ``x`` through every pair, adding the skip path when enabled."""
        for stage in self.layers:
            out = stage(x)
            x = out + x if self.use_skip_connection else out
        return x

In [62]:
# Scales mean 13x13, 26x26, 52x52
class ScalePrediction(nn.Module):
    """Prediction head applied to one feature map.

    Args:
        in_channels: Channel count of the incoming feature map.
        num_classes: Number of object classes; each anchor predicts
            ``num_classes + 5`` values.
        num_anchors: Anchors predicted per grid cell. Defaults to 3, the
            value that was previously hard-coded in both the head width
            and the output reshape.
    """

    def __init__(self, in_channels, num_classes, num_anchors=3):
        super(ScalePrediction, self).__init__()
        self.num_classes = num_classes
        self.num_anchors = num_anchors
        # Both convs preserve the spatial size (padded 3x3, then 1x1).
        # The last conv has no batch norm / activation, so raw values come out.
        self.pred = nn.Sequential(
            CNNBlock(in_channels, 2 * in_channels, kernel_size=3, padding=1),
            CNNBlock(
                2 * in_channels,
                num_anchors * (num_classes + 5),
                bn_act=False,
                kernel_size=1,
            ),
        )

    def forward(self, x):
        """
        Run the head, then reshape and permute the result into:
            [
                batch_size,
                num_anchors,
                num_of_vertical_cells,
                num_of_horizontal_cells,
                num_classes + 5,
            ]
        """
        batch_size, _, grid_h, grid_w = x.shape
        return (
            self
            .pred(x)
            .reshape(batch_size, self.num_anchors, self.num_classes + 5, grid_h, grid_w)
            .permute(0, 1, 3, 4, 2)
        )

In [63]:
class YOLOv3(nn.Module):
    """YOLOv3 network assembled from the module-level ``config`` list.

    Args:
        in_channels: Channel count of the input image tensor.
        num_classes: Number of object classes passed to each
            ``ScalePrediction`` head.

    ``forward`` returns a list with one tensor per ``ScalePrediction``
    layer, in the order those layers occur in ``config``.
    """

    def __init__(self, in_channels=3, num_classes=20):
        super().__init__()
        self.num_classes = num_classes
        self.in_channels = in_channels
        self.layers = self._create_conv_layers()

    def forward(self, x):
        """Run ``x`` through every layer and collect the per-scale predictions."""
        predictions = []
        saved_routes = []

        for block in self.layers:
            if isinstance(block, ScalePrediction):
                # Heads branch off the trunk: keep their output, but leave
                # ``x`` untouched for the layers that follow.
                predictions.append(block(x))
                continue

            # Every non-head layer is part of the main chain.
            x = block(x)

            if isinstance(block, nn.Upsample):
                # Join the upsampled map with the most recently stored route
                # along the channel axis; the route is consumed by this.
                x = torch.cat([x, saved_routes.pop()], dim=1)
            elif isinstance(block, ResidualBlock) and block.num_repeats == 8:
                # Outputs of the 8-repeat residual blocks are kept for later
                # concatenation.
                saved_routes.append(x)

        return predictions

    def _create_conv_layers(self):
        """Translate ``config`` into an ``nn.ModuleList``.

        Entry kinds:
            tuple ``(filters, kernel_size, stride)`` -> one ``CNNBlock``
                (padding 1 for 3x3 kernels, 0 otherwise).
            list ``["B", repeats]`` -> one ``ResidualBlock``.
            ``"S"`` -> ``ResidualBlock`` without skip connection, a 1x1
                ``CNNBlock`` halving the channels, and a ``ScalePrediction``.
            ``"U"`` -> ``nn.Upsample(scale_factor=2)``.
        """
        modules = nn.ModuleList()
        # Channel count produced by the most recently added trunk layer.
        channels = self.in_channels

        for spec in config:
            if isinstance(spec, tuple):
                # A tuple always describes a plain CNNBlock.
                filters, ksize, stride = spec
                conv = CNNBlock(
                    in_channels=channels,
                    out_channels=filters,
                    kernel_size=ksize,
                    stride=stride,
                    padding=1 if ksize == 3 else 0,
                )
                modules.append(conv)
                channels = filters

            elif isinstance(spec, list):
                # A list always describes a residual block; channels unchanged.
                modules.append(ResidualBlock(channels, num_repeats=spec[1]))

            elif isinstance(spec, str):
                if spec == "S":
                    halved = channels // 2
                    modules.extend([
                        ResidualBlock(
                            channels,
                            use_skip_connection=False,
                            num_repeats=1,
                        ),
                        CNNBlock(channels, halved, kernel_size=1),
                        ScalePrediction(halved, num_classes=self.num_classes),
                    ])
                    # forward() skips the head when chaining, so the trunk
                    # continues from the 1x1 CNNBlock's output channels.
                    channels = halved
                elif spec == "U":
                    modules.append(nn.Upsample(scale_factor=2))
                    # The upsampled map is concatenated with a stored
                    # ["B", 8] output that has twice as many channels,
                    # hence the factor of 3.
                    channels *= 3

        return modules

In [64]:
def test():
    """Smoke-test the output shapes of YOLOv3 for a 416x416 input.

    Builds a model with 20 classes, feeds a random batch of two images and
    checks that the three outputs have grid sizes of IMAGE_SIZE // 32,
    // 16 and // 8 respectively. Prints "success" when every check passes.

    Raises:
        AssertionError: If the number of outputs or any output shape differs.
    """
    num_classes = 20
    IMAGE_SIZE = 416
    batch_size = 2

    model = YOLOv3(num_classes=num_classes)
    x = torch.randn((batch_size, 3, IMAGE_SIZE, IMAGE_SIZE))

    # A single forward pass yields all scales; gradients are not needed for
    # a shape check, so skip building the autograd graph.
    with torch.no_grad():
        outputs = model(x)

    assert len(outputs) == 3
    for out, stride in zip(outputs, (32, 16, 8)):
        grid = IMAGE_SIZE // stride
        assert out.shape == (batch_size, 3, grid, grid, num_classes + 5)
    print("success")

In [65]:
# Run the shape smoke test; it prints "success" when every assertion passes.
test()

success


# Dataset

In [39]:
import numpy as np
import os
import pandas as pd
import torch
import sys

from PIL import Image, ImageFile
sys.path.append(os.getcwd())

from _utils import ( 
  iou_width_height as iou,
  non_max_suppression as nms
)

ImportError: cannot import name 'file_hash' from 'pooch.utils' (C:\Users\user\anaconda3\envs\pytorch\lib\site-packages\pooch\utils.py)