# Object Detection Demo
Welcome to the object detection inference walkthrough!  This notebook will walk you step by step through the process of using a pre-trained model to detect objects in an image. Make sure to follow the [installation instructions](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/installation.md) before you start.

# Imports

In [1]:
import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile
import time 

import sys

# ROS Kinetic adds its Python 2.7 dist-packages to sys.path, and that entry is
# taken off the path while cv2 is imported so the import resolves elsewhere.
# The entry is only removed (and later restored) when it is actually present,
# so this cell also runs on machines without ROS instead of raising ValueError.
ROS_DIST_PACKAGES = '/opt/ros/kinetic/lib/python2.7/dist-packages'
_ros_path_was_present = ROS_DIST_PACKAGES in sys.path
if _ros_path_was_present:
    sys.path.remove(ROS_DIST_PACKAGES)
import cv2
if _ros_path_was_present:
    sys.path.append(ROS_DIST_PACKAGES)

from distutils.version import StrictVersion
from collections import defaultdict
from io import StringIO
from matplotlib import pyplot as plt
from PIL import Image

# This is needed since the notebook is stored in the object_detection folder.
sys.path.append("..")
from object_detection.utils import ops as utils_ops

if StrictVersion(tf.__version__) < StrictVersion('1.12.0'):
    raise ImportError('Please upgrade your TensorFlow installation to v1.12.*.')

## added by me
%matplotlib inline
%pylab inline 

#Object detection imports
from utils import label_map_util
from utils import visualization_utils as vis_util

Populating the interactive namespace from numpy and matplotlib


# Model preparation 

## Variables

Any model exported using the `export_inference_graph.py` tool can be loaded here simply by changing `PATH_TO_FROZEN_GRAPH` to point to a new .pb file.  

By default we use an "SSD with Mobilenet" model here. See the [detection model zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md) for a list of other models that can be run out-of-the-box with varying speeds and accuracies.

In [2]:
# Which exported model to load.
#
# To switch models, change MODEL_NAME and place the matching
# <MODEL_NAME>.tar.gz archive under Iterations/. The frozen graph (.pb) is
# looked up under Iterations/<MODEL_NAME>/ (see PATH_TO_FROZEN_GRAPH below).
MODEL_NAME = 'ssd_mobilenet_v1_coco_2018_01_28'

# Local archive containing the exported model. Downloading is disabled, so the
# archive has to be present on disk already.
#DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'
MODEL_FILE = ''.join(['Iterations/', MODEL_NAME, '.tar.gz'])

# Frozen detection graph: the actual model used for object detection.
PATH_TO_FROZEN_GRAPH = ''.join(
    ['Iterations/', MODEL_NAME, '/frozen_inference_graph.pb'])

# Label map file used to attach the correct label string to each box.
PATH_TO_LABELS = os.path.join(
    'data',
    'mscoco_label_map.pbtxt')

## Download Model

In [3]:
# Downloading is disabled; the archive is expected to exist locally already.
#opener = urllib.request.URLopener()
#opener.retrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)
print(MODEL_FILE)

# Extract only the frozen inference graph from the model archive into
# Iterations/. The `with` block guarantees the archive handle is closed even
# if extraction fails.
# NOTE(review): tarfile extraction trusts member paths inside the archive;
# only use archives from a trusted source.
with tarfile.open(MODEL_FILE) as tar_file:
    for member in tar_file.getmembers():
        member_basename = os.path.basename(member.name)
        if 'frozen_inference_graph.pb' in member_basename:
            tar_file.extract(member, 'Iterations/')

Iterations/ssd_mobilenet_v1_coco_2018_01_28.tar.gz


## Load a (frozen) Tensorflow model into memory.

In [4]:
# Build a dedicated graph and populate it from the serialized frozen model.
detection_graph = tf.Graph()
od_graph_def = tf.GraphDef()

# Read the raw protobuf bytes first; the file is only needed for this step.
with tf.gfile.GFile(PATH_TO_FROZEN_GRAPH, 'rb') as fid:
    serialized_graph = fid.read()

# Parse the bytes into the GraphDef, then import its nodes into
# detection_graph. name='' imports the nodes without a name prefix.
od_graph_def.ParseFromString(serialized_graph)
with detection_graph.as_default():
    tf.import_graph_def(od_graph_def, name='')

## Loading label map
Label maps map indices to category names, so that when our convolutional network predicts `5`, we know that this corresponds to `airplane`. Here we use internal utility functions, but anything that returns a dictionary mapping integers to appropriate string labels would be fine.

In [None]:
# Build the category index used when drawing detections, from the label map
# file, using display names.
category_index = label_map_util.create_category_index_from_labelmap(
    PATH_TO_LABELS,
    use_display_name=True)

# Detection

In [6]:
def run_inference_for_single_image(image, graph, counter):
    """Run the detection model on a single frame.

    Relies on two notebook-level names that must exist when this is called:
    ``sess`` (an open ``tf.Session``) and ``tensor_dict`` (a dict mapping
    output names such as ``'detection_boxes'`` to graph tensors).

    Args:
        image: The frame to run detection on. A batch axis is prepended before
            it is fed to ``image_tensor:0``; ``image.shape[0]`` and
            ``image.shape[1]`` are used as the target size when reframing masks.
        graph: Unused; kept so existing callers keep working.
        counter: Unused; kept so existing callers keep working.

    Returns:
        dict with the batch dimension stripped:
            ``num_detections`` (int), ``detection_classes`` (uint8 array),
            ``detection_boxes`` (rows of ymin, xmin, ymax, xmax),
            ``detection_scores`` and, if the model provides it,
            ``detection_masks``.
    """
    # Work on a per-call copy of the fetches. Writing the reframed mask tensor
    # back into the shared tensor_dict would make the next call squeeze and
    # reframe an already-reframed tensor.
    fetches = dict(tensor_dict)

    if 'detection_masks' in fetches:
        # The following processing is only for a single image.
        detection_boxes = tf.squeeze(fetches['detection_boxes'], [0])
        detection_masks = tf.squeeze(fetches['detection_masks'], [0])
        # Reframing translates masks from box coordinates to image coordinates
        # and fits them to the image size.
        real_num_detection = tf.cast(fetches['num_detections'][0], tf.int32)
        detection_boxes = tf.slice(detection_boxes, [0, 0], [real_num_detection, -1])
        detection_masks = tf.slice(detection_masks, [0, 0, 0], [real_num_detection, -1, -1])
        detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
            detection_masks, detection_boxes, image.shape[0], image.shape[1])
        detection_masks_reframed = tf.cast(
            tf.greater(detection_masks_reframed, 0.5), tf.uint8)
        # Follow the convention by adding back the batch dimension.
        fetches['detection_masks'] = tf.expand_dims(
            detection_masks_reframed, 0)
    image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')

    # Run inference on a batch of one image.
    output_dict = sess.run(fetches,
                           feed_dict={image_tensor: np.expand_dims(image, 0)})

    # All outputs are float32 numpy arrays, so convert types as appropriate
    # and drop the batch dimension.
    output_dict['num_detections'] = int(output_dict['num_detections'][0])
    output_dict['detection_classes'] = output_dict[
        'detection_classes'][0].astype(np.uint8)
    # Boxes are in the form ymin, xmin, ymax, xmax in the TensorFlow API.
    output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
    output_dict['detection_scores'] = output_dict['detection_scores'][0]
    if 'detection_masks' in output_dict:
        output_dict['detection_masks'] = output_dict['detection_masks'][0]

    return output_dict

In [7]:
#taken from stuttgart tracking website 

from pykalman import KalmanFilter

# Kalman filter model. The state is [x, y, vx, vy]: each step adds the
# velocity components to the position components.
Transition_Matrix = [
    [1, 0, 1, 0],
    [0, 1, 0, 1],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
]

# Only the position (x, y) part of the state is observed.
Observation_Matrix = [
    [1, 0, 0, 0],
    [0, 1, 0, 0],
]

# Diagonal covariances: initial state, transition noise, observation noise.
initcovariance = np.eye(4) * 1.0e-3
transistionCov = np.eye(4) * 1.0e-4
observationCov = np.eye(2) * 1.0e-1


def Kalman(x_mid_inference,y_mid_inference,counter,bounding_box_length,bounding_box_height):
    """Filter the centre-point history and return the latest filtered position.

    Uses the notebook-level Kalman model (``Transition_Matrix``,
    ``Observation_Matrix``, ``initcovariance``, ``transistionCov``,
    ``observationCov``) and the notebook-level video capture ``cap``.

    Args:
        x_mid_inference: Sequence of x centre coordinates, oldest first.
        y_mid_inference: Sequence of y centre coordinates, same length.
        counter: Frame index that ``cap`` is positioned to before a frame is
            read for drawing.
        bounding_box_length: Box width used to draw the tracked box.
        bounding_box_height: Box height used to draw the tracked box.

    Returns:
        (x, y): floor of the last filtered state's x and y position.

    Raises:
        ValueError: If no measurements are supplied.
    """
    Measured = np.asarray(
        [np.asarray(x_mid_inference), np.asarray(y_mid_inference)]).T
    if Measured.ndim != 2 or Measured.shape[0] == 0:
        raise ValueError('Kalman() needs at least one (x, y) measurement')

    # Negative coordinates are masked out of the measurement array
    # (pykalman documents masked observations as missing -- see its docs).
    MarkedMeasure = np.ma.masked_less(Measured, 0)

    xinit = MarkedMeasure[0,0]
    yinit = MarkedMeasure[0,1]

    # Initial velocity is the difference between the first two measurements;
    # with a single measurement there is nothing to difference, so start at rest.
    if MarkedMeasure.shape[0] > 1:
        vxinit = (MarkedMeasure[1,0] - MarkedMeasure[0,0])
        vyinit = (MarkedMeasure[1,1] - MarkedMeasure[0,1])
    else:
        vxinit = 0.0
        vyinit = 0.0

    initstate = [xinit,yinit,vxinit,vyinit]

    # See https://pykalman.github.io/class_docs.html for more documentation.
    kf = KalmanFilter(transition_matrices = Transition_Matrix,
            observation_matrices = Observation_Matrix,
            initial_state_mean = initstate,
            initial_state_covariance = initcovariance,
            transition_covariance = transistionCov,
            observation_covariance = observationCov)
    (filtered_state_means, filtered_state_covariances) = kf.filter(MarkedMeasure)

    # Property id 1 is the frame position in OpenCV's VideoCapture.
    # NOTE(review): this seeks and reads on the shared capture, which moves the
    # read position seen by the caller's own cap.read() loop -- confirm that
    # this is intended.
    cap.set(1, counter)
    ret, img = cap.read()

    # Latest filtered position, floored.
    x = np.floor(filtered_state_means[-1][0])
    y = np.floor(filtered_state_means[-1][1])

    bounding_box_x_min = int(x - bounding_box_length/2)
    bounding_box_y_min = int(y - bounding_box_height/2)
    bounding_box_x_max = int(x + bounding_box_length/2)
    bounding_box_y_max = int(y + bounding_box_height/2)

    # Green bounding box. Only draw when a frame was actually read; a failed
    # read yields no image and cv2.rectangle would raise.
    if ret and img is not None:
        cv2.rectangle(img, (bounding_box_x_min ,bounding_box_y_min),
                       (bounding_box_x_max, bounding_box_y_max),
                       (0,255,0), 2 )

    return x,y


In [8]:
# Detection + tracking loop over a video file.
#
# Detection runs on the first few frames and then on every Nth frame; on the
# frames in between, the last detected centre is repeated as the measurement
# and the Kalman filter provides the tracked position.

#provide path to the video file
#CHANGE HERE
# NOTE: `filename` (path to the input video) must be defined before this cell runs.
cap = cv2.VideoCapture(filename)
counter = 1

# Tunables.
TARGET_CLASS_IDS = (87, 47)    # 87 for scissor, 47 for cup
DETECT_EVERY_FRAME_UNTIL = 5   # detect on every frame up to this counter value
DETECTION_INTERVAL = 10        # afterwards detect on every 10th frame
MAX_FRAMES = 100               # 100 for time measurements and 50 for iou measurements
DRIFT_TOLERANCE = 5            # max |tracker - last detection| before falling back

# History of detected centre points (pixel coordinates).
x_mid_inference = []
y_mid_inference = []
bounding_box_length = 0
bounding_box_height = 0

frames_processed = 0
absolute_start = None
absolute_end = None

try:
    with detection_graph.as_default():
        with tf.Session() as sess:
            # Get handles to input and output tensors.
            ops = tf.get_default_graph().get_operations()
            all_tensor_names = {output.name for op in ops for output in op.outputs}
            tensor_dict = {}
            for key in [
                'num_detections', 'detection_boxes', 'detection_scores',
                'detection_classes', 'detection_masks'
            ]:
                tensor_name = key + ':0'
                if tensor_name in all_tensor_names:
                    tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(
                        tensor_name)

            absolute_start = time.time()
            while True:
                ret, image_np = cap.read()
                if not ret or image_np is None:
                    # End of the video (or it could not be opened/read).
                    absolute_end = time.time()
                    cv2.destroyAllWindows()
                    break

                # Boxes are normalized, so scale by the size of the frame that
                # was actually fed to the model.
                frame_height, frame_width = image_np.shape[:2]

                if counter <= DETECT_EVERY_FRAME_UNTIL or counter % DETECTION_INTERVAL == 0:
                    output_dict = run_inference_for_single_image(image_np, detection_graph, counter)
                    # Visualization of the results of a detection.
                    vis_util.visualize_boxes_and_labels_on_image_array(
                        image_np,
                        output_dict['detection_boxes'],
                        output_dict['detection_classes'],
                        output_dict['detection_scores'],
                        category_index,
                        instance_masks=output_dict.get('detection_masks'),
                        use_normalized_coordinates=True,
                        line_thickness=1)

                    # Only the first detection is considered, and only if it
                    # is one of the target classes.
                    if output_dict['detection_classes'][0] in TARGET_CLASS_IDS:
                        # Bounding box is of the form ymin, xmin, ymax, xmax
                        # in the TensorFlow API; convert to pixel coordinates.
                        top_box = output_dict['detection_boxes'][0]
                        y_min = int(top_box[0]*frame_height)
                        x_min = int(top_box[1]*frame_width)
                        y_max = int(top_box[2]*frame_height)
                        x_max = int(top_box[3]*frame_width)

                        x_mid_inference.append((x_min + x_max)/2)
                        y_mid_inference.append((y_min + y_max)/2)

                        bounding_box_length = x_max-x_min
                        bounding_box_height = y_max-y_min

                elif x_mid_inference:
                    # Tracking frame: repeat the last detected centre as the
                    # measurement (occlusion values) and filter the history.
                    x_mid_inference.append(x_mid_inference[-1])
                    y_mid_inference.append(y_mid_inference[-1])
                    x_mid, y_mid = Kalman(x_mid_inference,y_mid_inference,
                                          counter,bounding_box_length,bounding_box_height)

                    # If the tracker drifts beyond the tolerance in either
                    # axis, replace it with the coordinates of the last detection.
                    tracker_drifted = (
                        abs(x_mid_inference[-1] - x_mid) > DRIFT_TOLERANCE or
                        abs(y_mid_inference[-1] - y_mid) > DRIFT_TOLERANCE)
                    if tracker_drifted:
                        x_mid = x_mid_inference[-1]
                        y_mid = y_mid_inference[-1]

                    bounding_box_x_min = int(x_mid - bounding_box_length/2)
                    bounding_box_y_min = int(y_mid - bounding_box_height/2)
                    bounding_box_x_max = int(x_mid + bounding_box_length/2)
                    bounding_box_y_max = int(y_mid + bounding_box_height/2)

                    if tracker_drifted:
                        # Property id 1 is the frame position in OpenCV.
                        # NOTE(review): seeking here moves the position used by
                        # the cap.read() at the top of the loop -- confirm intent.
                        cap.set(1, counter)
                        frame_ok, img = cap.read()
                        # Blue bounding box, only if a frame was actually read.
                        if frame_ok and img is not None:
                            cv2.rectangle(img, (bounding_box_x_min ,bounding_box_y_min),
                                          (bounding_box_x_max, bounding_box_y_max),
                                          (255,0,0), 2 )
                # else: tracking frame, but nothing has been detected yet, so
                # there is no measurement to track from; skip this frame.

                frames_processed += 1
                counter += 1
                if counter > MAX_FRAMES:
                    absolute_end = time.time()
                    cv2.destroyAllWindows()
                    break

                if cv2.waitKey(1000) & 0xFF == ord('q'):
                    absolute_end = time.time()
                    cv2.destroyAllWindows()
                    break

except Exception as e:
    print(e)
finally:
    # Always release the capture, whichever way the loop ended.
    cap.release()

# Report total time and frames per second over the frames actually processed.
if absolute_start is not None and absolute_end is not None and frames_processed > 0:
    elapsed_seconds = absolute_end - absolute_start
    print(elapsed_seconds)
    if elapsed_seconds > 0:
        print(frames_processed/elapsed_seconds)
else:
    print('No timing available: no frames were processed.')
