# Video Object Detection Project
by Amanda Ma

This project uses [OpenCV](https://opencv.org/) for video and image processing, together with [SSD MobileNet V1 COCO](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md), a pre-trained TensorFlow model from the [TensorFlow Model Zoo](https://github.com/tensorflow/models), to identify objects in videos. Pictures and videos are obtained from [Pixabay](https://pixabay.com/).


In [0]:
import matplotlib.pyplot as plt
import tensorflow as tf
import cv2 as cv

#### Load the Labels
The object labels are stored in a text file with the following format: <br>
0: unlabeled, 1: person, 2: bicycle ...... 75: remote, 76: keyboard ...... 180: window-blind, 181: window-other, 182: wood <br> <br>
Objects and their associated label numbers are stored in the dictionary _labels_ for future lookup. (key = label number, value = label)

In [0]:
# Build the lookup table {label number (int): label name (str)} from
# labels.txt, where every non-empty line looks like "<number>: <name>".
labels = {}
with open('labels.txt', 'r') as labelsFile:
  for line in labelsFile:
    line = line.strip()
    if not line:
      # Tolerate blank lines (e.g. a trailing newline at the end of the file).
      continue
    # Split on the first separator only, so a name that itself contains
    # ': ' is kept whole instead of being truncated.
    label_id, label_name = line.split(': ', 1)
    labels[int(label_id)] = label_name
print(labels)

#### Obtain the Model File

In [0]:
import urllib.request
import os

In [0]:
# Where the model archive lives, and the name it is saved under locally.
base_url = 'http://download.tensorflow.org/models/object_detection/'
file_name = 'ssd_mobilenet_v1_coco_2018_01_28.tar.gz'
url = ''.join((base_url, file_name))

# Fetch the archive into the current working directory ...
urllib.request.urlretrieve(url=url, filename=file_name)
# ... and list the directory so the downloaded file can be seen.
os.listdir('.')

#### Extract the Model Data

In [0]:
import tarfile
import shutil

In [0]:
# The archive unpacks into a directory named after the file, minus '.tar.gz'.
dir_name = file_name[:-len('.tar.gz')]

# Start from a clean slate if a previous run already extracted the model.
if os.path.exists(dir_name):
  shutil.rmtree(dir_name)

# Open the archive in a context manager so the file handle is always closed.
# NOTE(review): the archive is extracted without member filtering; confirm
# the download source is trusted.
with tarfile.open(file_name, 'r:gz') as archive:
  archive.extractall('./')

# Show the extracted model files.
os.listdir(dir_name)

#### Load the Model
The frozen model file is read in binary mode and parsed into a `GraphDef` object.

In [0]:
# Path of the frozen inference graph inside the extracted model directory.
frozen_graph = os.path.join(dir_name, 'frozen_inference_graph.pb')

# Read the serialized graph in binary mode and parse it into a GraphDef.
# tf.gfile.GFile replaces the deprecated tf.gfile.FastGFile.
with tf.gfile.GFile(frozen_graph, 'rb') as f:
  graph_def = tf.GraphDef()
  graph_def.ParseFromString(f.read())

#### Video Processing and Frame Labeling

In [0]:
from PIL import Image

In [0]:
def process_video(input_filename, output_filename, frames):
  """Write a copy of a video with detected objects boxed and labeled.

  Every `frames`-th frame of the input video is read, annotated by
  `labelFrame`, and appended to the output video. The output keeps the
  input's frame size and frame rate and is encoded with the 'MP4V' fourcc.

  Args:
    input_filename: Path of the video to read (str).
    output_filename: Path of the labeled video to write (str).
    frames: Sampling step (int); 1 processes every frame, 2 every other
      frame, and so on.

  Raises:
    AssertionError: If an argument does not have the expected type.
    ValueError: If `frames` is less than 1.
    IOError: If the input video cannot be opened.
    RuntimeError: If a frame cannot be read from the input video.
  """
  assert type(input_filename) is str
  assert type(output_filename) is str
  assert type(frames) is int
  if frames < 1:
    raise ValueError("frames must be a positive integer, got %d" % frames)

  input_video = cv.VideoCapture(input_filename)
  output = None
  try:
    if not input_video.isOpened():
      raise IOError("Could not open video: " + input_filename)

    height = int(input_video.get(cv.CAP_PROP_FRAME_HEIGHT))
    width = int(input_video.get(cv.CAP_PROP_FRAME_WIDTH))
    # Keep the frame rate as a float so rates such as 29.97 are not truncated.
    fps = input_video.get(cv.CAP_PROP_FPS)
    total_frames = int(input_video.get(cv.CAP_PROP_FRAME_COUNT))

    fourcc = cv.VideoWriter_fourcc(*'MP4V')
    output = cv.VideoWriter(output_filename, fourcc, fps, (width, height))

    # Import the model into its own graph so that repeated calls do not keep
    # adding copies of the model to the global default graph.
    graph = tf.Graph()
    with graph.as_default():
      tf.import_graph_def(graph_def, name='')

    with tf.Session(graph=graph) as sess:
      for i in range(0, total_frames, frames):
        # Seek explicitly so that a step larger than 1 skips frames.
        input_video.set(cv.CAP_PROP_POS_FRAMES, i)
        ret, frame = input_video.read()
        if not ret:
          raise RuntimeError("Problem reading frame %d from video" % i)
        output.write(labelFrame(frame, sess))
  finally:
    # Release the capture and the writer even if processing failed, so the
    # files are closed and whatever was written so far is flushed.
    input_video.release()
    if output is not None:
      output.release()

In [0]:
def labelFrame(frame, sess):
  """Draw labeled bounding boxes around the objects detected in a frame.

  The frame is resized to 300x300, converted from OpenCV's BGR channel
  order to RGB, and fed to the detection graph held by `sess`. Each
  detection whose score exceeds the threshold is drawn onto `frame` as a
  box with a filled caption strip containing the label text.

  Args:
    frame: Image as returned by `cv.VideoCapture.read()`; it is drawn on
      in place. Assumed to have 3 color channels.
    sess: TensorFlow session whose graph contains the tensors
      'image_tensor:0', 'num_detections:0', 'detection_scores:0',
      'detection_boxes:0' and 'detection_classes:0'.

  Returns:
    The same `frame` object, with the annotations drawn on it.
  """
  frame_scaled = cv.resize(frame, (300, 300))
  # OpenCV stores pixels as BGR, whereas the detection model takes RGB input.
  frame_rgb = cv.cvtColor(frame_scaled, cv.COLOR_BGR2RGB)

  graph = sess.graph
  num, scores, boxes, classes = sess.run(
      [
          graph.get_tensor_by_name('num_detections:0'),
          graph.get_tensor_by_name('detection_scores:0'),
          graph.get_tensor_by_name('detection_boxes:0'),
          graph.get_tensor_by_name('detection_classes:0'),
      ],
      feed_dict={
          # Add the leading batch dimension: (1, height, width, channels).
          'image_tensor:0': frame_rgb.reshape(
              1, frame_rgb.shape[0], frame_rgb.shape[1], 3),
      },
  )

  num_detections = int(num[0])
  height, width = frame.shape[0], frame.shape[1]

  # Colors are given in OpenCV's BGR order.
  box_color = (0, 0, 255)       # red
  text_color = (255, 255, 255)  # white
  scale = 0.8      # font scale of the caption
  threshold = 0.4  # minimum score for a detection to be drawn

  for i in range(num_detections):
    if scores[0][i] <= threshold:
      # Skip weak detections instead of stopping, so the result does not
      # depend on the scores being sorted.
      continue

    # Box values are fractions of the frame size, used here in the order
    # (top, left, bottom, right); scale them to pixel coordinates.
    box = boxes[0][i]
    top, left = int(box[0] * height), int(box[1] * width)
    bottom, right = int(box[2] * height), int(box[3] * width)

    # Outline of the object, plus a filled strip above it for the caption.
    cv.rectangle(frame, (left, top), (right, bottom), box_color, 2)
    cv.rectangle(frame, (left, top), (right, top - 35), box_color, cv.FILLED)

    # Fall back to the numeric id when it is missing from the labels table.
    label_id = int(classes[0][i])
    label = labels.get(label_id, str(label_id))

    cv.putText(frame, label, (left, top - 3), cv.FONT_HERSHEY_SIMPLEX,
               scale, text_color, 2)

  return frame

#### Video Samples
Here are a few sample videos, uploaded to Vimeo.

In [0]:
from IPython.display import IFrame

In [0]:
# Label every frame of the cars clip.
process_video(
    input_filename='cars.mp4',
    output_filename='cars_labeled.mp4',
    frames=1,
)

In [0]:
# Embed the labeled cars video hosted on Vimeo.
IFrame(src='https://player.vimeo.com/video/357028716', width='660', height='515')

Here, the labels are harder to see. Each dancer is classified as a 'person'.

In [0]:
# Label every frame of the dance clip.
process_video(
    input_filename='dance.mp4',
    output_filename='dance_labeled.mp4',
    frames=1,
)

In [0]:
# Embed the labeled dance video hosted on Vimeo.
IFrame(src='https://player.vimeo.com/video/357039770', width='660', height='515')

Not all labels are perfect! In the labeled video below, some animals were mistakenly labeled as dogs or sports balls.

In [0]:
# Label every frame of the rooster clip. The output name follows the same
# '<input>_labeled.mp4' pattern as the other samples.
process_video('rooster.mp4', 'rooster_labeled.mp4', 1)

In [0]:
# Embed the labeled rooster video hosted on Vimeo.
IFrame(src='https://player.vimeo.com/video/357029971', width='660', height='515')