<a href="https://colab.research.google.com/github/mralamdari/CV-Yolo/blob/main/YOLO_v3_ObjectDetection_TensorFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import PIL
import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
from seaborn import color_palette
from IPython.display import display

In [2]:
os.environ['KAGGLE_CONFIG_DIR'] = '/content/drive/MyDrive'
!kaggle datasets download -d aruchomu/data-for-yolo-v3-kernel
!unzip \*.zip && rm *.zip

Downloading data-for-yolo-v3-kernel.zip to /content
 92% 246M/267M [00:01<00:00, 228MB/s]
100% 267M/267M [00:01<00:00, 214MB/s]
Archive:  data-for-yolo-v3-kernel.zip
replace coco.names? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [3]:
_BATCH_NORM_DECAY = 0.9
_BATCH_NORM_EPSILON = 1e-5
_LEAKY_RELU = 0.1
_ANCHORS = [(10, 13), (16, 30), (33, 23),
            (30, 61), (62, 45), (59, 119),
            (116, 90), (156, 198), (373, 326)]

_MODEL_SIZE = (416, 416)

In [87]:
def batch_norm(inputs, training, data_format):
    # return tf.compat.v1.layers.batch_normalization(
    #                                    axis=1 if data_format == 'channels_first' else 3,
    #                                    momentum=_BATCH_NORM_DECAY,
    #                                    epsilon=_BATCH_NORM_EPSILON,
    #                                    scale=True,
    #                                    training=training)(inputs)


    # return  tf.keras.layers.BatchNormalization(
    #                                    axis=1 if data_format == 'channels_first' else 3,
    #                                    momentum=_BATCH_NORM_DECAY,
    #                                    epsilon=_BATCH_NORM_EPSILON,
    #                                    scale=True,
    #                                    trainable=training)(inputs)

 
    return tf.nn.batch_normalization(x=inputs, axis=1 if data_format == 'channels_first' else 3,
                                    momentum=_BATCH_NORM_DECAY,
                                    epsilon=_BATCH_NORM_EPSILON,
                                    scale=True,
                                    trainable=training
                                     )

In [85]:
def fixed_padding(inputs, kernel_size, data_format):
    pad_total = kernel_size - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg

    if data_format == 'channels_first':
      padded_inputs = tf.pad(inputs, [[0, 0], [0, 0],
                                      [pad_beg, pad_end],
                                       [pad_beg, pad_end]])  
    else:
      padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
                                       [pad_beg, pad_end], [0, 0]])
    return padded_inputs

In [6]:
def conv2d_fixed_padding(inputs, filters, kernel_size, data_format, strides=1):
    if strides > 1:
      inputs = fixed_padding(inputs, kernel_size, data_format)
      return tf.layers.conv2d(inputs=inputs, 
                              filters=filters,
                              kernel_size=kernel_size,
                              strides=strides,
                              padding=('SAME' if strides == 1 else 'VALID'),
                              use_bias=False,
                              data_format=data_format)

# Feature extraction: Darknet-53


In [7]:
def darknet53_residual_block(inputs, filters, training, data_format, strides=1):
  shortcut = inputs
  inputs = conv2d_fixed_padding(inputs,
                               filters=filters,
                               kernel_size=1,
                               strides=strides,
                               data_format=data_format)
  
  inputs = batch_norm(inputs,
                      training=training, 
                      data_format=data_format)
  
  inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

  inputs = conv2d_fixed_padding(inputs,
                                filters=2 * filters,
                                kernel_size=3,
                                strids=strides,
                                data_format=data_format)
  
  inputs = batch_norm(inputs, 
                      training=training, 
                      data_format=data_format)
  
  inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

  inputs += shortcut

  return inputs

In [120]:
def darknet53(inputs, training, data_format):
  print(inputs)
  inputs = conv2d_fixed_padding(inputs,
                                filters=32,
                                kernel_size=3,
                                data_format=data_format)
  
  
  # inputs = batch_norm(inputs,
  #                     training=training,
  #                     data_format=data_format)


  print(22222222222222222222222222222222)
  print(inputs)
  inputs = tf.keras.layers.BatchNormalization(axis=1 if data_format == 'channels_first' else 3,
                                              momentum=_BATCH_NORM_DECAY,
                                              epsilon=_BATCH_NORM_EPSILON,
                                              scale=True,
                                              trainable=training)(inputs)
  print(888888888888888888888888888888)
  
  inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

  inputs = conv2d_fixed_padding(inputs,
                                filters=64, 
                                kernel_size=3,
                                strides=2,
                                data_format=data_format)
  
  inputs = batch_norm(inputs, 
                      training=training,
                      data_format=data_format)
  
  inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

  inputs = darknet53_residual_block(inputs, 
                                    filters=32,
                                    training=training,
                                    data_format=data_format)
  
  inputs = conv2d_fixed_padding(inputs,
                                filters=128,
                                kernel_size=3, 
                                strides=2,
                                data_format=data_format)
  
  inputs = batch_norm(inputs, 
                      training=training,
                      data_format=data_format)

  inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

  for i in range(2):
    inputs = darknet53_residual_block(inputs,
                                      filters=64,
                                      training=training,
                                      data_format=data_format)
    
    inputs = conv2d_fixed_padding(inputs,
                                  filters=256,
                                  kernel_size=3,
                                  strides=2,
                                  data_format=data_format)
    
    inputs = batch_norm(inputs,
                        training=training,
                        data_format=data_format)
    
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

  for i in range(8):
    inputs = darknet53_residual_block(inputs, 
                                      frilters=128,
                                      training=training,
                                      data_format=data_format)
    route1 = inputs

    inputs = conv2d_fixed_padding(inputs,
                                  filters=512,
                                  kernel_size=3,
                                  strides=2,
                                  data_format=data_format)
    
    inputs = batch_norm(inputs,
                        training=training,
                        data_format=data_format)
    
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

  for i in range(8):
    inputs = darknet53_residual_block(inputs,
                                      filters=256,
                                      training=training,
                                      pad_formet=data_format)
    
    route2=inputs

    inputs = conv2d_fixed_padding(inputs,
                                  filters=1024,
                                  kernel_size=3, 
                                  strides=2,
                                  data_format=data_format)
    
    inputs = batch_norm(inputs,
                        training=training,
                        data_format=data_format)
    
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

  for i in range(4):
    inputs = darknet53_residual_block(inputs, 
                                      filters=512,
                                      training=training,
                                      data_format=data_format)
 
  return route1, route2, inputs

In [121]:
model(inputs, training=False)

Tensor("yolo_v3_model_16/truediv:0", shape=(2, 3, 416, 416), dtype=float32)
22222222222222222222222222222222
None


ValueError: ignored

# Convolution layers


In [102]:
def yolo_convolution_block(inputs,  filters, training, data_format):
  inputs = conv2d_fixed_padding(inputs,
                                filters=filters,
                                kernel_size=1,
                                data_format=data_format)
  
  inputs = batch_norm(inputs,
                      training=training,
                      data_format=data_format)
  
  inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

  inputs = conv2d_fixed_padding(inputs,
                                filters=2*filters,
                                kernel_size=3,
                                data_format=data_format)
  
  inputs = batch_norm(inputs, 
                      training=training,
                      data_format=data_format)
  
  inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

  inputs = conv2d_fixed_padding(inputs,
                                filters=filters,
                                kernel_size=1, 
                                data_format=data_format)
  
  inputs = batch_norm(inputs, 
                      training=training,
                      data_format=data_format)
  
  inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

  inputs = conv2d_fixed_padding(inputs,
                                filters=2 * filters,
                                kernel_size=3,
                                data_format=data_format)
  
  inputs = batch_norm(inputs, 
                      training=training,
                      data_format=data_format)
  
  inputs = tf.nn.leakt_relu(inputs, alpha=_LEAKY_RELU)

  inputs = batch_norm(inputs,
                      training=training,
                      data_format=data_format)
  
  ipnuts = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)
  
  route = inputs

  inputs = conv2d_fixed_padding(inputs,
                                filters=2 * filters,
                                kernel_size=3,
                                data_format=data_format)
  inputs = batch_norm(inputs,
                      training=training,
                      data_format=data_format)
  
  inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

  return route, inputs

# Detection layers


In [103]:
def yolo_layer(inputs, n_classes, anchors, img_size, data_format):
  n_anchors = len(anchors)
  inputs = tf.layers.conv2d(inputs,
                            filters=n_anchors * (5 + n_classes),
                            kernel_size=1,
                            strides=1,
                            use_bias=True,
                            data_format=data_format)
  
  shape = inputs.get_shape().as_list()
  grid_shape = shape[2: 4] if data_format == 'channels_fist' else shape[1: 3]
  
  if data_format == 'channels_first':
    inputs = tf.transpose(inputs, [0, 2, 3, 1])
  inputs = tf.reshape(inputs, [-1, n_anchors*grid_shape[0]*grid_shape[1], 5+n_classes])

  strides = (img_size[0] // grid_shape[0], img_size[1]//grid_shape[1])

  box_centers, box_shapes, confidence, classes = tf.split(inputs, [2,2,1,n_classes], axis=-1)

  x = tf.range(grid_shape[0], dtype=tf.float32)
  y = tf.range(grid_shape[1], dtype=tf.float32)
  x_offset, y_offset = tf.meshgrid(x, y)
  x_offset = tf.reshape(x_offset, (-1, 1))
  y_offset = tf.reshape(y_offset, (-1, 1))
  x_y_offset = tf.concat([x_offset, y_offset], axis=-1)
  x_y_offset = tf.tile(x_y_offset, [1, n_anchors])
  x_y_offset = tf.reshape(x_y_offset, [1, -1, 2])
  box_centers = tf.nn.sigmoid(box_centers)
  box_centers = (box_centers + x_y_offset) * strides

  anchors = tf.tile(anchors, [grid_shape[0]*grid_shape[1], 1])
  box_shapes = tf.exp(box_shapes) * tf.to_float(anchors)
  confidence = tf.nn.sigmoid(classes)
  inputs = tf.concat([box_centers, box_shapes, confidence, classes], axis=-1)
  return inputs

# Upsample layer

In [104]:
def upsample(inputs, out_shape, data_format):
  if data_format == 'channels_first':
    inputs = tf.transpose(inputs, [0, 2, 3, 1])
    new_height = out_shape[3]
    new_width = out_shape[2]
  else:
    new_height = out_shape[2]
    new_width = out_shape[1]
  
  inputs = tf.image.resize_nearest_neighbor(inputs, (new_height, new_width))

  if data_format == 'channels_first':
    inputs = tf.transpose(inputs, [0, 3, 1, 2])
  
  return inputs

# Non-max suppression

In [105]:
def build_boxes(inputs):
  center_x, center_y, width, height, confidence, classes = tf.split(inputs, [1,1,1,1,1,-1], axis=-1)
  top_left_x = center_x - width / 2
  top_left_y = center_y - height / 2
  bottom_right_x = center_x + width / 2
  bottom_right_y = center_y + height / 2

  boxes = tf.concat([top_left_x, top_left_y, bottom_right_x, bottom_right_y, confidence, classes], axis=-1)

In [106]:
def non_max_suppression(inputs, n_classes, max_output_size, iou_threshold, confidence_threshold):
  batch = tf.unsrack(inputs)
  boxes_dicts = []
  for boxes in batch:
    boxes = tf.boolean_mask(boxes, boxes[:, 4]>confidence_threshold)
    classes = tf.argmax(boxes[:, 5:], axis=-1)
    classes = tf.expand_dims(tf.to_float(classes), axis=-1)
    boxes = tf.concat([boxes[:, :5], classes], axis=-1)

    boxes_dict = dict()
    for cls in range(n_classes):
      mask = tf.equal(boxes[:, 5], cls)
      mask_shape = mask.get_shape()
      if mask_shape.ndims != 0:
        class_boxes = tf.boolean_mask(boxes, mask)
        boxes_coords, boxes_conf_scores, _ = tf.split(class_boxes, [4, 1, -1], axis=-1)
        boxes_conf_scores = tf.reshape(boxes_conf_scores, [-1])
        indices = tf.image.non_max_supperssion(boxes_coords, boxes_conf_scores, max_output_size, iou_threshold)
        class_boxes = tf.gather(class_boxes, indices)
        boxes_dict[cls] = class_boxes[:, :5]
        boxes_dicts.append(boxes_dict)
        return boxes_dicts

# Final model class


In [107]:
class Yolo_v3:
  def __init__(self, n_classes, model_size, max_output_size, iou_threshold, confidence_threshold, data_format=None):
    if not data_format:
      data_format = 'channels_first'
    else:
      data_format = 'channles_last'
    
    self.n_classes = n_classes
    self.model_size = model_size
    self.max_output_size = max_output_size
    self.iou_threshold = iou_threshold
    self.confidence_threshold = confidence_threshold
    self.data_format = data_format

  def __call__(self, inputs, training):


    with tf.compat.v1.variable_scope('yolo_v3_model'):
      if self.data_format == 'channels_first':
        inputs = tf.transpose(inputs, [0, 3, 1, 2])
      inputs = inputs / 255
      
      route1, route2, inputs = darknet53(inputs, 
                                         training=training, 
                                         data_format=self.data_format)
      
      print(self.data_format == 'channels_first', inputs)
      route, inputs = yolo_convolution_block(inputs,
                                             filters=512, 
                                             training=training, 
                                             data_format=self.data_format)
      
      detect1 = yolo_layer(inputs, 
                           n_classes=self.n_classes,
                           img_size=self.model_size,
                           anchors=_ANCHORS[6: 9],
                           data_format=self.data_format)
      
      inputs = conv2d_fixed_padding(route, 
                                    filters=256, 
                                    kernel_size=1,
                                    data_format=self.data_format)
      
      inputs = batch_norm(inputs,
                          training=training,
                          data_format=self.data_format)
      
      inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

      upsample_size = route2.get_shape().as_list()
      inputs = upsample(inputs,
                        out_shape=upsample_size,
                        data_format=self.data_format)
      
      axis= 1 if self.data_format == 'channels_first' else 3

      inputs = tf.concat([inputs, route2], axis=axis)

      rout, inputs = yolo_convolution_block(inputs,
                                            filters=256,
                                            training=training,
                                            data_format=self.data_format)
      
      print(2222222222222222222222222222222)
      detect2 = yolo_layer(inputs, 
                           n_classes=self.n_classes,
                           img_size=self.model_size,
                           anchors=_ANCHORS[3: 6],
                           data_format=self.data_format)
      
      inputs = conv2d_fixed_padding(route, 
                                    filters=128, 
                                    kernel_size=1,
                                    data_format=self.data_format)
      
      inputs = batch_norm(inputs,
                          training=training,
                          data_format=self.data_format)
      
      inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

      upsample_size = route1.get_shape().as_list()

      inputs = upsample(inputs,
                        out_shape=upsample_size,
                        data_format=self.data_format)
      

      inputs = tf.concat([inputs, route1], axis=axis)

      route, inputs = yolo_convolution_block(inputs,
                                            filters=128,
                                            training=training,
                                            data_format=self.data_format)
      
      detect3 = yolo_layer(inputs, 
                           n_classes=self.n_classes,
                           img_size=self.model_size,
                           anchors=_ANCHORS[0: 3],
                           data_format=self.data_format)
      
      inputs = tf.concat([detect1, detect2, detect3], axis=1)
      
      inputs = build_boxes(inputs)    
      
      boxes_dicts = non_max_supperssion(inputs,
                                        n_classess=self.n_classes,
                                        max_output_size=self.max_output_size,
                                        iou_threshold=self.iou_threshold,
                                        confidence_threshold=self.confidence_threshold)

# 5. Utility functions


In [108]:
def load_images(img_names, model_size):
  imgs = []
  for img_name in img_names:
    img = PIL.Image.open(img_name)
    img = img.resize(size=model_size)
    img = np.array(img, dtype=np.float32)
    img = np.expand_dims(img, axis=0)
    imgs.append(img)

  imgs = np.concatenate(imgs)

  return imgs

In [109]:
def load_class_names(file_name):
  with open(file_name, 'r') as f:
    class_names = f.read().splitlines()
  return class_names

In [110]:
def draw_boxes(img_names, box_dicts, class_names, model_size):
  colors = ((np.array(color_palette('hls', 80)) * 255)).astype(np.uint8)
  for num ,img_name, boxes_dict in zip(range(len(img_names)), img_names, boxes_dicts):
    img = PIL.Image.open(img_name)
    draw = PIL.ImageDraw.Draw(img)
    font = PIL.ImageFont.truetype(font='/content/futur.ttf', size=(img.size[0]+img.size[1])//100)
    # font = PIL.ImageFont.truetype(font='../input/futur.ttf', size=(img.size[0]+img.size[1])//100)
    resize_factor = (img.size[0]/model_size[0], img.size[1]/model_size[1])

    for cls in range(len(class_names)):
      boxes = boxes_dict[cls]    
      if np.size(boxes) != 0:
        color = colors[cls]
        for box in boxes:
          xy, confidence = box[:4], box[4]
          xy = [xy[i] * resize_factor[i%2] for i in range(4)] 
          x0, y0 = xy[0], xy[1]
          thickness = (img.size[0]+img.size[1]) // 200
          for t in np.linespace(0, 1, thickness):
            xy[0], xy[1] = xy[0] + t, xy[1] + t
            xy[2], xy[3] = xy[2] - y, xy[3] - t
            draw.rectangle(xy, outline=tuple(color))
          text = f'{class_names[cls]} {confidence*100:.1f}%'
          text_size = draw.textsize(text, font=font)
          draw.rectangle([x0, y0-text_size[1], x0+text_size[0], y0], fill=tuple(color))
          draw.text((x0, y0-text_size[1]), text, fill='black', fint=font)
    display(img)

# 6. Converting weights to Tensorflow format

In [111]:
def load_weights(variables, file_name):
  with open(file_name, 'rb') as f:
    np.fromfile(f, dtype=np.int32, count=5)
    weights = np.fromfile(f, dtype=np.floate32)

    assign_ops = []
    ptr = 0

    for i in range(52):
      conv_var = variables[5 * i]
      gamma, beta, mean, variance = variables[5 * i + 1: 5 * i + 5]
      batch_norm_vars = [beta, gamma, mean, variance]

      for var in batch_norm_vars:
        shape = var.shape.as_list()
        num_params = np.prod(shape)
        var_weights = weights[ptr: ptr+num_params].reshape(shape)
        ptr += num_params
        assign_ops.append(tf.assign(var, var_weights))
      
      shape = conv_var.shape.as_list()
      num_params = np.prod(shape)
      var_weights = weights[ptr: ptr+num_params].reshape(shape[3], shape[2], shape[0], shape[1])
      var_weights = np.transpose(var_weights, (2, 3, 1, 0))
      ptr += num_params
      assign_ops.append(tf.assign(conv_var, var_weights))

    ranges = [range(0, 6), range(6, 13), range(13, 20)]
    unnormalized = [6, 13, 20]  
    for j in range(3):
      for i in ranges[j]:
        current = 52*5 + 5*i + 2*j
        conv_car = variables[current]
        gamma, beata, mean, variance = variables[current+1: current+5]
        batch_norm_vars = [beta, gamma, mean, variance]

        for var in batch_norm_vars:
          shape = var.shape.as_list()
          num_params = np.prod(shape)
          var_weights = weights[ptr: ptr + num_params].reshape(shape)
          ptr += num_params
          assign_ops.append(tf.assign(var, var_weights))
        
        shape = conv_var.shape.as_list()
        num_params = np.prod(shape)
        var_weights = weights[ptr: ptr+num_params].reshape(shape[3], shape[2], shape[0], shape[1])
        var_weights = np.transpose(var_weights, (2, 3, 1, 0))
        ptr += num_params
        assign_ops.append(tf.assign(conv_var, var_weights))
      
      bias = variables[52*5 + unnormalized[j]*5 + 2*j]
      shape = bias.shape.as_list()
      num_params = np.prod(shape)
      var_weights = weights[ptr: ptr+num_params].reshape(shape)
      ptr += num_params
      assign_ops.append(tf.assign(bias, var_weights))

      conv_var = variables[52*5 + unnormalized[j]*5 + j*2]
      shape = conv_var.shape.as_list()
      num_params = np.prod(shape)
      var_weights = weights[ptr: ptr + num_params].reshape(shape[3], shape[2], shape[0], shape[1])
      var_weights = np.transpose(var_weights, (2, 3, 1, 0))
      ptr += num_params
      assign_ops.append(tf.assign(conv_var, var_weights))
  return assign_ops

# Sample images


In [112]:
# img_names = ['../input/dog.jpg', '../input/office.jpg']
img_names = ['/content/dog.jpg', '/content/office.jpg']

# for img in img_names: display(PIL.Image.open(img))

# Detections

In [119]:
batch_size = len(img_names)
batch = load_images(img_names, model_size=_MODEL_SIZE)
# class_names = load_class_names('../input/coco.names')
class_names = load_class_names('/content/coco.names')
n_classes = len(class_names)
max_output_size = 10
iou_threshold = 0.5
confidence_threshold = 0.5
model = Yolo_v3(n_classes=n_classes,
                model_size=_MODEL_SIZE,
                max_output_size = max_output_size,
                iou_threshold = iou_threshold,
                confidence_threshold = confidence_threshold)

tf.compat.v1.disable_eager_execution()
inputs = tf.compat.v1.placeholder(tf.float32, shape=[batch_size, 416, 416, 3])


detections = model(inputs, training=False)

model_vars = tf.global_variables(scope='yolo_v3_model')
# assign_ops = load_weights(model_vars, '../input/yolov3.weights')
assign_ops = load_weights(model_vars, 'yolov3.weights')

with tf.Session() as sess:
  sess.run(assign_ops)
  detection_results = sess.run(detections, feed_dic={input: batch})

draw_boxes(img_names, detection_results, class_names, _MODEL_SIZE)

22222222222222222222222222222222
None


ValueError: ignored