# SSD300 Inference Tutorial

This is a brief tutorial that shows how to use a trained SSD300 for inference on the Pascal VOC datasets. If you'd like more detailed explanations, please refer to [`ssd300_training.ipynb`](https://github.com/pierluigiferrari/ssd_keras/blob/master/ssd300_training.ipynb)

In [1]:
from keras import backend as K
from keras.models import load_model
from keras.preprocessing import image
from keras.optimizers import Adam
from scipy.stats import entropy
from imageio import imread
import numpy as np
from matplotlib import pyplot as plt
import os
import os.path as p
import glob
from PIL import Image

from models.keras_ssd300 import ssd_300
from keras_loss_function.keras_ssd_loss import SSDLoss
from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes
from keras_layers.keras_layer_DecodeDetections import DecodeDetections
from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast
from keras_layers.keras_layer_L2Normalization import L2Normalization
from keras.layers import TimeDistributed
from keras.models import Model
from ssd_encoder_decoder.ssd_output_decoder_dropout import decode_detections
from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms
from data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter

from data_generator.object_detection_2d_data_generator import DataGenerator
from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels

from bounding_box_utils.bounding_box_utils import iou

from tqdm import tqdm_notebook as tqdm

import cv2

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Set the image size.
img_height = 300
img_width = 300

## 1. Load a trained SSD

Either load a trained model or build a model and load trained weights into it. Since the HDF5 files I'm providing contain only the weights for the various SSD versions, not the complete models, you'll have to go with the latter option when using this implementation for the first time. You can then of course save the model and next time load the full model directly, without having to build it.

You can find the download links to all the trained model weights in the README.

### 1.1. Build the model and load trained weights into it

In [3]:
# Define parameters
img_height = 300 # Height of the model input images
img_width = 300 # Width of the model input images
img_channels = 3 # Number of color channels of the model input images
mean_color = [123, 117, 104] # The per-channel mean of the images in the dataset. Do not change this value if you're using any of the pre-trained weights.
swap_channels = [2, 1, 0] # The color channel order in the original SSD is BGR, so we'll have the model reverse the color channel order of the input images.
n_classes = 20 # Number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO
scales_pascal = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05] # The anchor box scaling factors used in the original SSD300 for the Pascal VOC datasets
scales_coco = [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05] # The anchor box scaling factors used in the original SSD300 for the MS COCO datasets
scales = scales_pascal
aspect_ratios = [[1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5]] # The anchor box aspect ratios used in the original SSD300; the order matters
two_boxes_for_ar1 = True
steps = [8, 16, 32, 64, 100, 300] # The space between two adjacent anchor box center points for each predictor layer.
offsets = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5] # The offsets of the first anchor box center points from the top and left borders of the image as a fraction of the step size for each predictor layer.
clip_boxes = False # Whether or not to clip the anchor boxes to lie entirely within the image boundaries
variances = [0.1, 0.1, 0.2, 0.2] # The variances by which the encoded target coordinates are divided as in the original implementation
normalize_coords = True
n_boxes = 8732
N = 20 # Number of passes through the network

In [4]:
# 1: Build the Keras model

K.clear_session() # Clear previous models from memory.

model = ssd_300(image_size=(img_height, img_width, img_channels),
                n_classes=n_classes,
                mode='training',
                l2_regularization=0.0005,
                scales=scales,
                aspect_ratios_per_layer=aspect_ratios,
                two_boxes_for_ar1=two_boxes_for_ar1,
                steps=steps,
                offsets=offsets,
                clip_boxes=clip_boxes,
                variances=variances,
                normalize_coords=normalize_coords,
                subtract_mean=mean_color,
                swap_channels=swap_channels)


# 2: Load the trained weights into the model.

# TODO: Set the path of the trained weights.
weights_path = 'good_dropout_model/ssd300_dropout_PASCAL2012_train_+12_epoch-58_loss-3.8960_val_loss-5.0832.h5'

model.load_weights(weights_path, by_name=True)

# 3: Compile the model so that Keras won't complain the next time you load it.

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)

model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

W1122 21:05:34.424905 140538163132160 deprecation_wrapper.py:119] From /home/vision/msmith/localDrive/msmith/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:95: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.

W1122 21:05:34.425534 140538163132160 deprecation_wrapper.py:119] From /home/vision/msmith/localDrive/msmith/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:98: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W1122 21:05:34.437798 140538163132160 deprecation_wrapper.py:119] From /home/vision/msmith/localDrive/msmith/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:102: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1122 21:05:34.438585 140538163132160 deprecation_wrapper.py:119] From /home/vision/msmith/localDrive/msmith/anaconda3/lib/python

In [5]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 300, 300, 3)  0                                            
__________________________________________________________________________________________________
identity_layer (Lambda)         (None, 300, 300, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
input_mean_normalization (Lambd (None, 300, 300, 3)  0           identity_layer[0][0]             
__________________________________________________________________________________________________
input_channel_swap (Lambda)     (None, 300, 300, 3)  0           input_mean_normalization[0][0]   
__________________________________________________________________________________________________
conv1_1 (C

Or

### 1.2. Load a trained model

In [None]:
# # TODO: Set the path to the `.h5` file of the model to be loaded.
# model_path = 'good_dropout_model/ssd300_dropout_pascal_07+12_epoch-114_loss-4.3685_val_loss-4.5034.h5'

# # We need to create an SSDLoss object in order to pass that to the model loader.
# ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)

# K.clear_session() # Clear previous models from memory.

# model = load_model(model_path, custom_objects={'AnchorBoxes': AnchorBoxes,
#                                                'L2Normalization': L2Normalization,
#                                                'DecodeDetections': DecodeDetections,
#                                                'compute_loss': ssd_loss.compute_loss})

## 2. Load some images

Load some images for which you'd like the model to make predictions.

In [6]:
ROOT_PATH = '/usr/local/data/msmith/APL/Datasets/PASCAL/'
# The directories that contain the images.
VOC_2007_images_dir      = p.join(ROOT_PATH,'VOCdevkit/VOC2007/JPEGImages/')
VOC_2012_images_dir      = p.join(ROOT_PATH,'VOCdevkit/VOC2012/JPEGImages/')

# The directories that contain the annotations.
VOC_2007_annotations_dir      = p.join(ROOT_PATH,'VOCdevkit/VOC2007/Annotations/')
VOC_2012_annotations_dir      = p.join(ROOT_PATH,'VOCdevkit/VOC2012/Annotations/')

# The paths to the image sets.
VOC_2007_train_image_set_filename    = p.join(ROOT_PATH,'VOCdevkit/VOC2007/ImageSets/Main/train.txt')
VOC_2012_train_image_set_filename    = p.join(ROOT_PATH,'VOCdevkit/VOC2012/ImageSets/Main/train.txt')
VOC_2007_val_image_set_filename      = p.join(ROOT_PATH,'VOCdevkit/VOC2007/ImageSets/Main/val.txt')
VOC_2012_val_image_set_filename      = p.join(ROOT_PATH,'VOCdevkit/VOC2012/ImageSets/Main/val.txt')
VOC_2007_trainval_image_set_filename = p.join(ROOT_PATH,'VOCdevkit/VOC2007/ImageSets/Main/trainval.txt')
VOC_2012_trainval_image_set_filename = p.join(ROOT_PATH,'VOCdevkit/VOC2012/ImageSets/Main/trainval.txt')
VOC_2007_test_image_set_filename     = p.join(ROOT_PATH,'VOCdevkit/VOC2007/ImageSets/Main/test.txt')

classes = ['background',
           'aeroplane', 'bicycle', 'bird', 'boat',
           'bottle', 'bus', 'car', 'cat',
           'chair', 'cow', 'diningtable', 'dog',
           'horse', 'motorbike', 'person', 'pottedplant',
           'sheep', 'sofa', 'train', 'tvmonitor']

dataset = DataGenerator(load_images_into_memory=False)
dataset.parse_xml(images_dirs=[VOC_2012_images_dir],
                  image_set_filenames=[VOC_2012_val_image_set_filename],
                  annotations_dirs=[VOC_2012_annotations_dir],
                  classes=classes,
                  include_classes='all',
                  exclude_truncated=False,
                  exclude_difficult=False,
                  ret=False,
                  )

Processing image set 'val.txt': 100%|██████████| 5823/5823 [00:42<00:00, 135.54it/s]


In [7]:
# Redefine resize because default version has weird behaviour
class Resize:
    '''
    Resizes images to a specified height and width in pixels.
    '''

    def __init__(self,
                 height,
                 width,
                 interpolation_mode=cv2.INTER_LINEAR,
                 box_filter=None,
                 labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
        '''
        Arguments:
            height (int): The desired height of the output images in pixels.
            width (int): The desired width of the output images in pixels.
            interpolation_mode (int, optional): An integer that denotes a valid
                OpenCV interpolation mode. For example, integers 0 through 5 are
                valid interpolation modes.
            box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given.
                A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria
                after the transformation. Refer to the `BoxFilter` documentation for details. If `None`,
                the validity of the bounding boxes is not checked.
            labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
                of an image contains which bounding box coordinate. The dictionary maps at least the keywords
                'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
        '''
        if not (isinstance(box_filter, BoxFilter) or box_filter is None):
            raise ValueError("`box_filter` must be either `None` or a `BoxFilter` object.")
        self.out_height = height
        self.out_width = width
        self.interpolation_mode = interpolation_mode
        self.box_filter = box_filter
        self.labels_format = labels_format

    def __call__(self, image, labels=None, return_inverter=False):

        img_height, img_width = image.shape[:2]

        xmin = self.labels_format['xmin']
        ymin = self.labels_format['ymin']
        xmax = self.labels_format['xmax']
        ymax = self.labels_format['ymax']

        image = cv2.resize(image,
                           dsize=(self.out_width, self.out_height),
                           interpolation=self.interpolation_mode)

        if return_inverter:
            def inverter(labels):
                labels = np.copy(labels)
                labels[:, [ymin, ymax]] = np.round(labels[:, [ymin, ymax]] * (img_height / self.out_height), decimals=0)
                labels[:, [xmin, xmax]] = np.round(labels[:, [xmin, xmax]] * (img_width / self.out_width), decimals=0)
                return labels

        if labels is None:
            if return_inverter:
                return image, inverter
            else:
                return image
        else:
            labels = np.copy(labels)
            labels[:, [ymin, ymax]] = np.round(labels[:, [ymin, ymax]] * (self.out_height / img_height), decimals=0)
            labels[:, [xmin, xmax]] = np.round(labels[:, [xmin, xmax]] * (self.out_width / img_width), decimals=0)

            if not (self.box_filter is None):
                self.box_filter.labels_format = self.labels_format
                labels = self.box_filter(labels=labels,
                                         image_height=self.out_height,
                                         image_width=self.out_width)

            if return_inverter:
                return image, labels, inverter
            else:
                return image, labels

In [8]:
convert_to_3_channels = ConvertTo3Channels()
resize = Resize(height=img_height,width=img_width, labels_format={'class_id': 0, 'xmin': -4, 'ymin': -3, 'xmax': -2, 'ymax': -1})

transformations = [convert_to_3_channels, resize]

batch_size = 128
generator = dataset.generate(batch_size=batch_size,
                                         shuffle=False,
                                         transformations=transformations,
                                         label_encoder=None,
                                         returns={'original_images',
                                                  'processed_images',
                                                  'image_ids',
                                                  'evaluation-neutral',
                                                  'inverse_transform',
                                                  'original_labels'},
                                         keep_images_without_gt=True,
                                         degenerate_box_handling='remove')

## 3. Make predictions

In [9]:
n_images = dataset.get_dataset_size()
#n_images = 4*batch_size
n_batches = int(np.ceil(n_images / batch_size))

In [10]:
save_dir = '/usr/local/data/msmith/APL/dropout_object_detect/'
os.makedirs(save_dir, exist_ok=True)

# Set the colors for the bounding boxes
colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist()
classes = ['background',
           'aeroplane', 'bicycle', 'bird', 'boat',
           'bottle', 'bus', 'car', 'cat',
           'chair', 'cow', 'diningtable', 'dog',
           'horse', 'motorbike', 'person', 'pottedplant',
           'sheep', 'sofa', 'train', 'tvmonitor']

In [11]:
for _ in tqdm(range(n_batches)):
    batch_X, batch_image_ids, batch_eval_neutral, batch_inverse_transforms, original_X, batch_orig_labels = next(generator)
    bs = len(batch_X)
    
    # After below for loop: nested list (Num passes, batch size)
    # Each element : np array (num_detections, num_class+6)
    model_out = []
    for n in range(N):
        model_out.append(decode_detections(model.predict(batch_X), confidence_thresh=0.1, 
                                      iou_threshold=0.1, top_k=200, normalize_coords=normalize_coords, 
                                      img_height=img_height, img_width=img_width))
        
    # Now list is an array (Num passes, batch size)
    model_out = np.array(model_out)
        
    # Indexing : [image #][observation #]
    # Note that each image will have different numbers of observations
    # Each observation is an array with shape (num_detections, num_class+6)
    observations_per_img = []
    
    # Iterate over each batch image
    for i in range(bs):
        concated =  np.concatenate([c for c in model_out[:,i] if 0 not in c.shape])
        # Concated is now an array of shape (total num detections, num_class + 6)
        # Note that there will be many overlaps; we are now trying to remove them
        
        observations = []
        while concated.shape[0] > 0:
            # Get first bounding box
            box = concated[0, :]
            # Calculate IoU for between said first box an all others
            ious = iou(concated[:, -4:], box[-4:], coords='corners', border_pixels='half', mode='element-wise')
            # Get matches, guaranteed one match min. to itself
            keep_idx = ious >= 0.65
            # Add box to list of boxes
            # This way we are grouping all overlapping boxes
            observations.append(concated[keep_idx, :])
            # Remove the boxes we just found were overlapping (min. itself) and keep going
            concated = concated[np.invert(keep_idx),:]
        observations_per_img.append(observations)
        
    # Get label probabilities
    observations_decoded_per_img = []
    for i in range(bs):
        observations_decoded = []
        for obs in observations_per_img[i]:
            # in observation: index 0-class id of max, 1-confidence of said class, 2-23 inclusive: softmax values, 24-27: box coordinates
            means = np.mean(obs[:,2:], axis=0) # Get mean values of softmax and boxes per paper
            ent = entropy(means[:-4]) # Get uncertainty estimate using entropy of mean (total uncertainty)
            # Max entropy of 21 values slightly greater than 3
            # Min is 0 obviously
            new_class = np.argmax(means[1:-4]) + 1 # Get max class (ignoring background)
            new_obs = np.empty((7))
            new_obs[0:3] = [new_class, means[new_class] , ent]
            new_obs[3:7] = means[-4:]
            # New format: class id, associated class softmax confidence, entropy value, bounding box values
            observations_decoded.append(new_obs)
        observations_decoded = np.array(observations_decoded)
        observations_decoded = observations_decoded[observations_decoded[:,1].argsort()]
        observations_decoded_per_img.append(observations_decoded)
        # Index format: [img #][observation #]

    # Now transformed to original bounding box coord for display
    observations_decoded_per_img = apply_inverse_transforms(observations_decoded_per_img, batch_inverse_transforms)
        
    
    # Disable showing plot
    plt.ioff()
    #plt.ion()

    # Display the image and draw the predicted boxes onto it.
    for i in range(bs):
        observations_decoded = observations_decoded_per_img[i]

        plt.figure(figsize=(20,12))
        plt.imshow(original_X[i])
        plt.axis('off')

        current_axis = plt.gca()

        for observation in observations_decoded:
            # Transform the predicted bounding boxes for the 300x300 image to the original image dimensions.
            xmin = observation[-4]
            ymin = observation[-3]
            xmax = observation[-2]
            ymax = observation[-1]
            color = colors[int(observation[0])]
            label = '{}: {:.2f} | {:.2f}'.format(classes[int(observation[0])], observation[1], observation[2])
            current_axis.add_patch(plt.Rectangle((xmin, ymin), xmax-xmin, ymax-ymin, color=color, fill=False, linewidth=2))  
            current_axis.text(xmin, ymin, label, size='medium', color='white', bbox={'facecolor':color, 'alpha':1.0})

        plt.savefig(os.path.join(save_dir, batch_image_ids[i] + '.pdf'), bbox_inches='tight', pad_inches=0)

        plt.close()

HBox(children=(IntProgress(value=0, max=46), HTML(value='')))

ValueError: need at least one array to concatenate

## 5. Make predictions on Pascal VOC 2007 Test

Let's use a `DataGenerator` to make predictions on the Pascal VOC 2007 test dataset and visualize the predicted boxes alongside the ground truth boxes for comparison. Everything here is preset already, but if you'd like to learn more about the data generator and its capabilities, take a look at the detailed tutorial in [this](https://github.com/pierluigiferrari/data_generator_object_detection_2d) repository.

In [None]:
# Create a `BatchGenerator` instance and parse the Pascal VOC labels.

dataset = DataGenerator()

# TODO: Set the paths to the datasets here.

VOC_2007_images_dir         = '../../datasets/VOCdevkit/VOC2007/JPEGImages/'
VOC_2007_annotations_dir    = '../../datasets/VOCdevkit/VOC2007/Annotations/'
VOC_2007_test_image_set_filename = '../../datasets/VOCdevkit/VOC2007/ImageSets/Main/test.txt'

# The XML parser needs to now what object class names to look for and in which order to map them to integers.
classes = ['background',
           'aeroplane', 'bicycle', 'bird', 'boat',
           'bottle', 'bus', 'car', 'cat',
           'chair', 'cow', 'diningtable', 'dog',
           'horse', 'motorbike', 'person', 'pottedplant',
           'sheep', 'sofa', 'train', 'tvmonitor']

dataset.parse_xml(images_dirs=[VOC_2007_images_dir],
                  image_set_filenames=[VOC_2007_test_image_set_filename],
                  annotations_dirs=[VOC_2007_annotations_dir],
                  classes=classes,
                  include_classes='all',
                  exclude_truncated=False,
                  exclude_difficult=True,
                  ret=False)

convert_to_3_channels = ConvertTo3Channels()
resize = Resize(height=img_height, width=img_width)

generator = dataset.generate(batch_size=1,
                             shuffle=True,
                             transformations=[convert_to_3_channels,
                                              resize],
                             returns={'processed_images',
                                      'filenames',
                                      'inverse_transform',
                                      'original_images',
                                      'original_labels'},
                             keep_images_without_gt=False)

In [None]:
# Generate a batch and make predictions.

batch_images, batch_filenames, batch_inverse_transforms, batch_original_images, batch_original_labels = next(generator)

i = 0 # Which batch item to look at

print("Image:", batch_filenames[i])
print()
print("Ground truth boxes:\n")
print(np.array(batch_original_labels[i]))

In [None]:
# Predict.

y_pred = model.predict(batch_images)

In [None]:
confidence_threshold = 0.5

# Perform confidence thresholding.
y_pred_thresh = [y_pred[k][y_pred[k,:,1] > confidence_threshold] for k in range(y_pred.shape[0])]

# Convert the predictions for the original image.
y_pred_thresh_inv = apply_inverse_transforms(y_pred_thresh, batch_inverse_transforms)

np.set_printoptions(precision=2, suppress=True, linewidth=90)
print("Predicted boxes:\n")
print('   class   conf xmin   ymin   xmax   ymax')
print(y_pred_thresh_inv[i])

In [None]:
# Display the image and draw the predicted boxes onto it.

# Set the colors for the bounding boxes
colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist()

plt.figure(figsize=(20,12))
plt.imshow(batch_original_images[i])

current_axis = plt.gca()

for box in batch_original_labels[i]:
    xmin = box[1]
    ymin = box[2]
    xmax = box[3]
    ymax = box[4]
    label = '{}'.format(classes[int(box[0])])
    current_axis.add_patch(plt.Rectangle((xmin, ymin), xmax-xmin, ymax-ymin, color='green', fill=False, linewidth=2))  
    current_axis.text(xmin, ymin, label, size='x-large', color='white', bbox={'facecolor':'green', 'alpha':1.0})

for box in y_pred_thresh_inv[i]:
    xmin = box[2]
    ymin = box[3]
    xmax = box[4]
    ymax = box[5]
    color = colors[int(box[0])]
    label = '{}: {:.2f}'.format(classes[int(box[0])], box[1])
    current_axis.add_patch(plt.Rectangle((xmin, ymin), xmax-xmin, ymax-ymin, color=color, fill=False, linewidth=2))  
    current_axis.text(xmin, ymin, label, size='x-large', color='white', bbox={'facecolor':color, 'alpha':1.0})