In [2]:
import tensorflow as tf
import cv2
import numpy as np
from moviepy.editor import VideoFileClip

In [3]:
import sys
sys.path.append("../")

In [4]:
from preprocessing import ssd_vgg_preprocessing
from nets import ssd_vgg_300, np_methods
from notebooks import visualization

In [6]:
# Short alias for TF-Slim; used further down for `slim.arg_scope` when the
# SSD network is built.
slim = tf.contrib.slim

In [7]:
# Human-readable label for each class id (the list index is the class id).
# Index 0 is the background class; "car" sits at index 7.
class_names = [
    "background",
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow",
    "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "monitor",
]

In [5]:
# Interactive TensorFlow session shared by the rest of the notebook: it is used
# to initialise and restore the model variables and to run the graph on every
# video frame.
isess = tf.InteractiveSession()

In [8]:
# Graph entry point: a single uint8 image with 3 channels whose height and
# width are left unspecified, so frames of any size can be fed.
img_input = tf.placeholder(tf.uint8, shape=(None, None, 3))

# Tensor layout passed to the preprocessing and to the network arg_scope.
data_format = 'NHWC'

# Spatial size every frame is resized to before entering the SSD network.
net_shape = (300, 300)

In [9]:
# Evaluation pre-processing: the fed image is resized to `net_shape` using the
# WARP_RESIZE mode. Four tensors come back; only `image_pre` and `bbox_img`
# are used later in this notebook.
(image_pre, labels_pre, bboxes_pre, bbox_img) = ssd_vgg_preprocessing.preprocess_for_eval(
    img_input,    # raw frame placeholder
    None,         # presumably labels: none are supplied at inference time
    None,         # presumably ground-truth boxes: none are supplied either
    net_shape,
    data_format,
    resize=ssd_vgg_preprocessing.Resize.WARP_RESIZE,
)

# Insert a leading axis (axis 0) so the network is fed a batch of one image.
image_4d = tf.expand_dims(image_pre, 0)

In [10]:
# Define the SSD model.
# The network is built on the preprocessed, batched image (`image_4d`) with
# `is_training=False`, inside the net's own slim arg_scope configured with the
# chosen `data_format`. Of the four values returned by `ssd_net.net`, only the
# first two (kept as `predictions` and `localisations`) are used; the other
# two are discarded.
ssd_net = ssd_vgg_300.SSDNet()
with slim.arg_scope(ssd_net.arg_scope(data_format=data_format)):
    predictions, localisations, _, _ = ssd_net.net(image_4d, is_training=False)

In [11]:
# Restore SSD model.
# The checkpoint path is relative; the commented-out line is an alternative
# checkpoint that can be swapped in.
ckpt_filename = '../checkpoints/ssd_300_vgg.ckpt'
#ckpt_filename = '../checkpoints/VGG_VOC0712_SSD_300x300_ft_iter_120000.ckpt'
# Run the global initializer first, then load the checkpoint into the session.
# NOTE(review): the restore presumably overwrites the freshly initialised
# values, so the initializer run may be redundant -- confirm before removing.
isess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(isess, ckpt_filename)

In [12]:
# SSD default anchor boxes.
# Computed once for the fixed network input shape and reused for every frame
# when `ssd_process_frame` decodes the raw network outputs.
ssd_anchors = ssd_net.anchors(net_shape)

In [14]:
# Create unique and somewhat visually distinguishable bright
# colors for the different classes: one evenly spaced hue per class, with a
# fixed saturation and value.
num_classes = len(class_names)
class_colors = []
for i in range(num_classes):
    # OpenCV stores the hue of 8-bit HSV images as H/2, i.e. in 0..179.
    # Spreading the classes over 0..255 would push the upper classes outside
    # that range, so their colours would no longer be distinct. Integer
    # division keeps the result identical under Python 2 and Python 3.
    hue = 180 * i // num_classes
    # 1x1 HSV image: (hue, saturation=128, value=255).
    hsv_pixel = np.array([[[hue, 128, 255]]], dtype=np.uint8)
    bgr_pixel = cv2.cvtColor(hsv_pixel, cv2.COLOR_HSV2BGR)[0, 0]
    # Plain Python ints, so the tuple can be passed directly as an OpenCV colour.
    # NOTE(review): the tuple is in BGR order; if the frames being drawn on are
    # RGB (as moviepy presumably delivers them), red and blue appear swapped --
    # confirm whether COLOR_HSV2RGB is wanted instead.
    class_colors.append(tuple(int(channel) for channel in bgr_pixel))

In [25]:
def _draw_detections(canvas, classes, scores, bboxes):
    """Draw one labelled rectangle per detection onto ``canvas``, in place.

    ``bboxes`` rows are read as (ymin, xmin, ymax, xmax) in coordinates
    relative to the image size and are scaled by the canvas height/width.
    """
    height, width = canvas.shape[0], canvas.shape[1]

    for cls, score, bbox in zip(classes, scores, bboxes):
        cls_id = int(cls)
        if cls_id < 0:
            continue

        ymin = int(bbox[0] * height)
        xmin = int(bbox[1] * width)
        ymax = int(bbox[2] * height)
        xmax = int(bbox[3] * width)
        color = class_colors[cls_id]

        # Outline of the detected object.
        cv2.rectangle(canvas, (xmin, ymin), (xmax, ymax), color, 2)

        # Filled 80x15 px tag anchored at the box's top-left corner, with the
        # class name and score written on top of it in black.
        cv2.rectangle(canvas, (xmin, ymin), (xmin + 80, ymin + 15), color, -1)
        text = class_names[cls_id] + " " + ('%.2f' % score)
        cv2.putText(canvas, text, (xmin + 5, ymin + 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0, 0, 0), 1)


def ssd_process_frame(img, select_threshold=0.5, nms_threshold=0.2):
    """Process a video frame through SSD network, apply NMS algorithm and draw bounding boxes.

    The frame is run through the SSD graph using the notebook-level session
    ``isess``. The raw outputs are decoded against ``ssd_anchors``, reduced to
    the "car" class, clipped, sorted, de-duplicated by non-maximum
    suppression and finally drawn as labelled boxes.

    Arguments:
      img: Numpy array containing an image. It is fed to the ``img_input``
        placeholder and is left unmodified.
      select_threshold: Classification threshold (i.e. probability threshold for car detection).
      nms_threshold: NMS threshold.
    Return:
      A copy of ``img`` with bounding boxes and score labels drawn on it.
    """
    # Draw on a private copy: the caller's frame may be cached, reused or
    # read-only, and must not be altered as a side effect.
    to_draw = img.copy()

    # Run SSD network. Only the tensors needed below are fetched.
    rpredictions, rlocalisations, rbbox_img = isess.run(
        [predictions, localisations, bbox_img],
        feed_dict={img_input: img})

    # Get classes and bboxes from the net outputs.
    rclasses, rscores, rbboxes = np_methods.ssd_bboxes_select(
        rpredictions, rlocalisations, ssd_anchors,
        select_threshold=select_threshold, img_shape=net_shape,
        num_classes=len(class_names), decode=True)

    # Remove other classes than cars.
    car_class_id = class_names.index("car")
    is_car = (rclasses == car_class_id)
    rclasses = rclasses[is_car]
    rscores = rscores[is_car]
    rbboxes = rbboxes[is_car]

    rbboxes = np_methods.bboxes_clip(rbbox_img, rbboxes)
    rclasses, rscores, rbboxes = np_methods.bboxes_sort(rclasses, rscores, rbboxes, top_k=400)
    rclasses, rscores, rbboxes = np_methods.bboxes_nms(rclasses, rscores, rbboxes, nms_threshold=nms_threshold)

    # Resize bboxes to original image shape. Note: useless for Resize.WARP!
    rbboxes = np_methods.bboxes_resize(rbbox_img, rbboxes)

    _draw_detections(to_draw, rclasses, rscores, rbboxes)
    return to_draw

In [28]:
# Detection thresholds used for the video run below; they are passed to
# `ssd_process_frame` in place of its defaults.
select_threshold = 0.17  # minimum class score for a detection to be kept
nms_threshold = 0.45     # threshold handed to non-maximum suppression

In [29]:
clip1 = VideoFileClip("challenge_video.mp4")
white_clip = clip1.fl_image(lambda x: ssd_process_frame(x, select_threshold, nms_threshold))
%time white_clip.write_videofile('challenge_video_cars.mp4', audio=False)

[MoviePy] >>>> Building video challenge_video_cars.mp4
[MoviePy] Writing video challenge_video_cars.mp4


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:50<00:00,  9.69it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: challenge_video_cars.mp4 

Wall time: 50.9 s
