# YOLO v1 

# Setup

In [1]:
import os
import sys
import pathlib
from typing import (
    List,
    Dict,
    Tuple
)

import numpy as np
import tensorflow as tf
from tensorflow import keras   # MUST to make sure of using tensorflow.keras, not keras
from tensorflow.keras.models import (
    Model,
)
from tensorflow.keras.layers import (
    Layer,
    Normalization,
)

## PYTHONPATH

In [2]:
path_to_lib: str = str(pathlib.Path(os.path.join(os.getcwd(), "../../../../../lib")).resolve())
assert isinstance(path_to_lib, str)
sys.path.append(path_to_lib)

In [3]:
%load_ext autoreload
%autoreload 2

from util_tf.nn import (
    LAYER_NAME_NORM,
    LAYER_NAME_CONV2D,
    LAYER_NAME_ACTIVATION,
    LAYER_NAME_MAXPOOL2D,
    LAYER_NAME_DENSE,
    LAYER_NAME_FLAT,
    LAYER_NAME_BN,
    LAYER_NAME_DROP,
    LAYER_NAME_RESHAPE,
    build_nn_model,
    train,
    get_early_stopping_callback,
    get_tensorboard_callback,
)
from constant import (
    DEBUG_LEVEL,
    TYPE_FLOAT,
    YOLO_V1_IMAGE_WIDTH,
    YOLO_V1_IMAGE_HEIGHT,
    YOLO_V1_IMAGE_CHANNELS,
    YOLO_GRID_SIZE,
    YOLO_PREDICTION_NUM_CLASSES,
    YOLO_PREDICTION_NUM_BBOX,
    YOLO_PREDICTION_NUM_PRED,
    YOLO_LEAKY_RELU_SLOPE,
)

# Model

<img src="../image/architecture.png" align="left"/>


* yolo.cfg

```
[net]
batch=64
subdivisions=64
height=448
width=448
channels=3
momentum=0.9
decay=0.0005

learning_rate=0.001
policy=steps
steps=200,400,600,20000,30000
scales=2.5,2,2,.1,.1
max_batches = 40000

[crop]
crop_width=448
crop_height=448
flip=0
angle=0
saturation = 1.5
exposure = 1.5

[convolutional]
filters=64
size=7
stride=2
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
filters=192
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=256
size=3
stride=1
pad=1
activation=leaky

[convolutional]
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=512
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=1024
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=1024
size=3
stride=1
pad=1
activation=leaky

[convolutional]
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
filters=1024
size=3
stride=1
pad=1
activation=leaky

#######

[convolutional]
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
size=3
stride=2
pad=1
filters=1024
activation=leaky

[convolutional]
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
size=3
stride=1
pad=1
filters=1024
activation=leaky

[connected]
output=4096
activation=leaky

[dropout]
probability=.5

[connected]
output= 1470
activation=linear

[detection]
classes=20
coords=4
rescore=1
side=7
num=2
softmax=0
sqrt=1
jitter=.2

object_scale=1
noobject_scale=.5
class_scale=1
coord_scale=5
```

In [4]:
S: int = 7     # Number of Grid per axis
B: int = 2     # Number of bounding boxes
P: int = 5     # Elements of prediction: (x, y, w, h, c)
C: int = 20    # Number of classes

In [5]:
input_shape: Tuple[int, int, int] = (
    YOLO_V1_IMAGE_WIDTH, YOLO_V1_IMAGE_HEIGHT, YOLO_V1_IMAGE_CHANNELS
)
layers_config = {
    # --------------------------------------------------------------------------------
    # 1st
    # --------------------------------------------------------------------------------
    "conv01": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (7, 7), "filters": 64, "strides": (2, 2), "padding": "same"
    },
    "act01": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "maxpool01": {
        "kind": LAYER_NAME_MAXPOOL2D, "pool_size": (2, 2), "strides": (2, 2), "padding": "valid"
    },
    # --------------------------------------------------------------------------------
    # 2nd
    # --------------------------------------------------------------------------------
    "conv02": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (3, 3), "filters": 192, "strides": (1, 1), "padding": "same"
    },
    "act02": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "maxpool02": {
        "kind": LAYER_NAME_MAXPOOL2D, "pool_size": (2, 2), "strides": (2, 2), "padding": "valid"
    },
    # --------------------------------------------------------------------------------
    # 3rd
    # --------------------------------------------------------------------------------
    "conv03_1": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (1, 1), "filters": 192, "strides": (1, 1), "padding": "same"
    },
    "act03_1": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "conv03_2": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (3, 3), "filters": 256, "strides": (1, 1), "padding": "same"
    },
    "act03_2": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "conv03_3": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (1, 1), "filters": 256, "strides": (1, 1), "padding": "same"
    },
    "act03_3": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "conv03_4": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (3, 3), "filters":512, "strides": (1, 1), "padding": "same"
    },
    "act03_4": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "maxpool03": {
        "kind": LAYER_NAME_MAXPOOL2D, "pool_size": (2, 2), "strides": (2, 2), "padding": "valid"
    },
    # --------------------------------------------------------------------------------
    # 4th
    # --------------------------------------------------------------------------------
    # Repeat 1
    "conv04_1_1": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (1, 1), "filters": 256, "strides": (1, 1), "padding": "same"
    },
    "act04_1_1": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "conv04_1_2": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (3, 3), "filters": 512, "strides": (1, 1), "padding": "same"
    },
    "act04_1_2": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    # Repeat 2
    "conv04_2_1": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (1, 1), "filters": 256, "strides": (1, 1), "padding": "same"
    },
    "act04_2_1": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "conv04_2_2": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (3, 3), "filters":512, "strides": (1, 1), "padding": "same"
    },
    "act04_2_2": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    # Repeat 3
    "conv04_3_1": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (1, 1), "filters": 256, "strides": (1, 1), "padding": "same"
    },
    "act04_3_1": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "conv04_3_2": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (3, 3), "filters":512, "strides": (1, 1), "padding": "same"
    },
    "act04_3_2": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    # Repeat 4
    "conv04_4_1": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (1, 1), "filters": 256, "strides": (1, 1), "padding": "same"
    },
    "act04_4_1": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "conv04_4_2": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (3, 3), "filters": 512, "strides": (1, 1), "padding": "same"
    },
    "act04_4_2": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    # rest
    "conv04_5": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (1, 1), "filters": 512, "strides": (1, 1), "padding": "same"
    },
    "act04_5": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "conv04_6": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (3, 3), "filters": 1024, "strides": (1, 1), "padding": "same"
    },
    "act04_6": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "maxpool04": {
        "kind": LAYER_NAME_MAXPOOL2D, "pool_size": (2, 2), "strides": (2, 2), "padding": "valid"
    },
    # --------------------------------------------------------------------------------
    # 5th
    # --------------------------------------------------------------------------------
    # Repeat 1
    "conv05_1_1": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (1, 1), "filters":512, "strides": (1, 1), "padding": "same"
    },
    "act05_1_1": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "conv05_1_2": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (3, 3), "filters":1025, "strides": (1, 1), "padding": "same"
    },
    "act05_1_2": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    # Repeat 2
    "conv05_2_1": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (1, 1), "filters":512, "strides": (1, 1), "padding": "same"
    },
    "act05_2_1": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "conv05_2_2": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (3, 3), "filters":1024, "strides": (1, 1), "padding": "same"
    },
    "act05_2_2": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    # rest
    "conv05_3": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (3, 3), "filters":1024, "strides": (1, 1), "padding": "same"
    },
    "act05_3": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "conv05_4": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (3, 3), "filters":1024, "strides": (2, 2), "padding": "same"
    },
    "act05_4": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    # --------------------------------------------------------------------------------
    # 6th
    # --------------------------------------------------------------------------------
    "conv06_1": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (3, 3), "filters":1024, "strides": (1, 1), "padding": "same"
    },
    "act06_1": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    "conv06_2": {
        "kind": LAYER_NAME_CONV2D, "kernel_size": (3, 3), "filters":1024, "strides": (1, 1), "padding": "same"
    },
    "act06_2": {
        "kind": LAYER_NAME_ACTIVATION, "activation": "leaky_relu", "slope": YOLO_LEAKY_RELU_SLOPE
    },
    # --------------------------------------------------------------------------------
    # Fully Connected
    # --------------------------------------------------------------------------------
    "flat": {
        "kind": LAYER_NAME_FLAT, "data_format": "channels_last"
    },
    "full01": {
        "kind": LAYER_NAME_DENSE, "units": 4096, "activation": "relu", "l2": 1e-2
    },
    "drop01": {
        "kind": LAYER_NAME_DROP, "rate": TYPE_FLOAT(0.5),
    },
    # To be able to reshape into (S, S, (C + B * P))
    "full02": {
        "kind": LAYER_NAME_DENSE, "units": (S * S * (C + B * P)), "activation": "relu", "l2": 1e-2
    },
    # --------------------------------------------------------------------------------
    # Rehape into (S, S, (C + B * P))
    # --------------------------------------------------------------------------------
    "reshape": {
        "kind": LAYER_NAME_RESHAPE, "target_shape": (S, S, (C + B * P))
    }
}


In [6]:
model: Model = build_nn_model(model_name="yolo_v1", input_shape=input_shape, layers_config=layers_config)

In [7]:
model.summary()

Model: "yolo_v1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 448, 448, 3)]     0         
                                                                 
 conv01 (Conv2D)             (None, 224, 224, 64)      9472      
                                                                 
 maxpool01 (MaxPooling2D)    (None, 112, 112, 64)      0         
                                                                 
 conv02 (Conv2D)             (None, 112, 112, 192)     110784    
                                                                 
 maxpool02 (MaxPooling2D)    (None, 56, 56, 192)       0         
                                                                 
 conv03_1 (Conv2D)           (None, 56, 56, 192)       37056     
                                                                 
 conv03_2 (Conv2D)           (None, 56, 56, 256)       4426