# SSD

## Build model

In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
from config import *

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
import os

print(tf.__version__)
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="5, 6"
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

1.4.0
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17329061195881314518
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11332668621
locality {
  bus_id: 2
}
incarnation: 13782030490837513851
physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:85:00.0, compute capability: 3.7"
, name: "/device:GPU:1"
device_type: "GPU"
memory_limit: 11330676327
locality {
  bus_id: 2
}
incarnation: 7995909363795232043
physical_device_desc: "device: 1, name: Tesla K80, pci bus id: 0000:88:00.0, compute capability: 3.7"
]


In [3]:
def jaccard(box_a, box_b):
    x_overlap = max(0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    y_overlap = max(0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    intersection = x_overlap * y_overlap

    area_box_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_box_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_box_a + area_box_b - intersection

    iou = intersection / union
    return iou

In [4]:
def ssd_hook(feature_map, hook_id):
    with tf.variable_scope("ssd_hook" + hook_id):
        confidence = slim.conv2d(feature_map, NUM_PRED_CONF, [3, 3], activation_fn=None, scope="conv_conf")
        confidence = tf.contrib.layers.flatten(confidence)
        
        location = slim.conv2d(feature_map, NUM_PRED_LOC, [3, 3], activation_fn=None, scope="conv_loc")
        location = tf.contrib.layers.flatten(location)
        
    return confidence, location

In [5]:
def ssd_backbone():
    """
    AlexNet
    """
    x = tf.placeholder(tf.float32, [None, IMG_H, IMG_W, NUM_CHANNELS], name="x")
    is_training = tf.placeholder(tf.bool, name="is_training")

    preds_conf = [] 
    preds_loc = []

    with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm, normalizer_params={'is_training': True}, weights_regularizer=slim.l2_regularizer(scale=REG_SCALE)):
        net = slim.conv2d(x, 64, [11, 11], 4, padding='VALID', scope='conv1')
        net = slim.max_pool2d(net, [3, 3], 2, scope='pool1')
        net = slim.conv2d(net, 192, [5, 5], scope='conv2')

        net_conf, net_loc = ssd_hook(net, 'conv2')
        preds_conf.append(net_conf)
        preds_loc.append(net_loc)
    
        net = slim.max_pool2d(net, [3, 3], 2, scope='pool2')
        net = slim.conv2d(net, 384, [3, 3], scope='conv3')
        net = slim.conv2d(net, 384, [3, 3], scope='conv4')
        net = slim.conv2d(net, 256, [3, 3], scope='conv5')

        
        net = slim.conv2d(net, 1024, [3, 3], scope='conv6')
        net = slim.conv2d(net, 1024, [1, 1], scope='conv7')
        net_conf, net_loc = ssd_hook(net, 'conv7')
        
        
        preds_conf.append(net_conf)
        preds_loc.append(net_loc)

        
        net = slim.conv2d(net, 256, [1, 1], scope='conv8')
        net = slim.conv2d(net, 512, [3, 3], 2, scope='conv8_2')
        net_conf, net_loc = ssd_hook(net, 'conv8_2')
        
        
        preds_conf.append(net_conf)
        preds_loc.append(net_loc)

        
        net = slim.conv2d(net, 128, [1, 1], scope='conv9')
        net = slim.conv2d(net, 256, [3, 3], 2, scope='conv9_2')
        net_conf, net_loc = ssd_hook(net, 'conv9_2')
        
        
        preds_conf.append(net_conf)
        preds_loc.append(net_loc)

        
        
    final_pred_conf = tf.concat(preds_conf, axis=1)
    final_pred_loc = tf.concat(preds_loc, axis=1)

    # Return a dictionary of {tensor_name: tensor_reference}
    return_dict = {'x': x,
                   'y_pred_conf': final_pred_conf,
                   'y_pred_loc': final_pred_loc,
                   'is_training': is_training,
    }
    return return_dict

In [6]:
def ssd_loss(y_pred_conf, y_pred_loc):
    """
    Inputs:
    y_pred_conf: [batch_size, num_feature_map_cells * num_defaul_boxes * num_classes]
    y_pred_loc: [batch_size, num_feature_map_cells * num_defaul_boxes * 4]
    """
    num_total_preds = 0
    for fm_size in FM_SIZES:
        num_total_preds += fm_size[0] * fm_size[1] * NUM_DEFAULT_BOXES
        
    num_total_preds_conf = num_total_preds * NUM_CLASSES
    num_total_preds_loc = num_total_preds * 4
    
    
    y_true_conf = tf.placeholder(tf.int32, [None, num_total_preds], name='y_true_conf')  
    y_true_loc  = tf.placeholder(tf.float32, [None, num_total_preds_loc], name='y_true_loc')
    conf_loss_mask = tf.placeholder(tf.float32, [None, num_total_preds], name="conf_loss_mask")
    
    
    # confidence loss
    logits = tf.reshape(y_pred_conf, [-1, num_total_preds, NUM_CLASSES])
    conf_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y_true_conf)

    conf_loss = conf_loss_mask * conf_loss
    conf_loss = tf.reduce_sum(conf_loss)
    
    # location loss
    diff = y_true_loc - y_pred_loc
    
    loc_loss_l2 = 0.5 * (diff ** 2)
    loc_loss_l1 = tf.abs(diff) - 0.5
    
    smooth_l1_condition = tf.less(tf.abs(diff), 1.0)
    loc_loss = tf.where(smooth_l1_condition, loc_loss_l2, loc_loss_l1)
    
    loc_loss_mask = tf.minimum(y_true_conf, 1)
    loc_loss_mask = tf.to_float(loc_loss_mask)
    loc_loss_mask = tf.stack([loc_loss_mask] * 4, axis=2) 
    loc_loss_mask = tf.reshape(loc_loss_mask, [-1, num_total_preds_loc])
    loc_loss = loc_loss_mask * loc_loss
    loc_loss = tf.reduce_sum(loc_loss)
    
    loss = conf_loss + LOC_LOSS_WEIGHT * loc_loss + tf.reduce_sum(slim.losses.get_regularization_losses())
    
    optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.005).minimize(loss)    
    
    probs_all = tf.nn.softmax(logits)
    probs, preds_conf = tf.nn.top_k(probs_all) 
    probs = tf.reshape(probs, [-1, num_total_preds])
    preds_conf = tf.reshape(preds_conf, [-1, num_total_preds])
    
    
    return_dict = {'y_true_conf': y_true_conf,
                   'y_true_loc': y_true_loc,
                   'conf_loss_mask': conf_loss_mask,
                   'optimizer': optimizer,
                   'conf_loss': conf_loss,
                   'loc_loss': loc_loss,
                   'loss': loss,
                   'probs': probs,
                   'preds_conf': preds_conf,
                   'preds_loc': y_pred_loc
    }
    
    return return_dict

In [7]:
def ssd_model():
#     if MODEL == "AlexNet":
#         model = 
    model = ssd_backbone()
    loss = ssd_loss(model["y_pred_conf"], model["y_pred_loc"])
    ssd_model = {}
    for k in model.keys():
        ssd_model[k] = model[k]
    
    for k in loss.keys():
        ssd_model[k] = loss[k]
        
    return ssd_model

In [8]:
def nms(y_pred_conf, y_pred_loc, prob):
    class_boxes = {} 
    with open('signnames.csv', 'r') as f:
        for line in f:
            cls, _ = line.split(',')
            class_boxes[float(cls)] = []

    y_idx = 0
    for fm_size in FM_SIZES:
        fm_h, fm_w = fm_size
        for row in range(fm_h):
            for col in range(fm_w):
                for db in DEFAULT_BOXES:
                    # if class confidence > CONF_THRESH
                    # and not background class
                    if prob[y_idx] > CONF_THRESH and y_pred_conf[y_idx] > 0.:
                        # absolute coordinates
                        xc, yc = col + 0.5, row + 0.5
                        center_coords = np.array([xc, yc, xc, yc])
                        abs_box_coords = center_coords + y_pred_loc[y_idx*4 : y_idx*4 + 4] 

                        # box coordinates in actual image
                        scale = np.array([IMG_W/fm_w, IMG_H/fm_h, IMG_W/fm_w, IMG_H/fm_h])
                        box_coords = abs_box_coords * scale
                        box_coords = [int(round(x)) for x in box_coords]

                        # compare this box to all previous boxes of this class
                        cls = y_pred_conf[y_idx]
                        cls_prob = prob[y_idx]
                        box = (box_coords, cls, cls_prob)
                        if len(class_boxes[cls]) == 0:
                            class_boxes[cls].append(box)
                        else:
                            suppressed = False 
                            overlapped = False 
                            for other_box in class_boxes[cls]:
                                iou = calc_iou(box[:4], other_box[:4])
                                if iou > NMS_IOU_THRESH:
                                    overlapped = True
                                    # if current box has higher confidence than other box
                                    if box[5] > other_box[5]:
                                        class_boxes[cls].remove(other_box)
                                        suppressed = True
                            
                            if suppressed or not overlapped:
                                class_boxes[cls].append(box)

                    y_idx += 1

    boxes = []
    for cls in class_boxes.keys():
        for class_box in class_boxes[cls]:
            boxes.append(class_box)
    boxes = np.array(boxes)

    return boxes

## Train model

In [9]:
import tensorflow as tf
from config import *
import numpy as np
from sklearn.model_selection import train_test_split
import math
import os
import time
import pickle
from PIL import Image
from tqdm import tqdm

import matplotlib.pyplot as plt

  return f(*args, **kwds)
  return f(*args, **kwds)


In [10]:
def batch_generator(X, y_conf, y_loc, batch_size):
    start_idx = 0
    while True:
        image_files = X[start_idx : start_idx + batch_size]
        y_true_conf = np.array(y_conf[start_idx : start_idx + batch_size])
        y_true_loc  = np.array(y_loc[start_idx : start_idx + batch_size])

        # Read images from image_files
        images = []
        for image_file in image_files:
            image = Image.open('%s' % (image_file))
            image = np.asarray(image)
            image = image[:, :, :3]

            images.append(image)

        
        images = np.array(images)
        images = images/127.5 - 1.
        

        num_pos = np.where(y_true_conf > 0)[0].shape[0]
        num_neg = NEG_POS_RATIO * num_pos
        y_true_conf_size = np.sum(y_true_conf.shape)

        if num_pos + num_neg < y_true_conf_size:
            conf_loss_mask = np.copy(y_true_conf)
            conf_loss_mask[np.where(conf_loss_mask > 0)] = 1.

            zero_indices = np.where(conf_loss_mask == 0.)  
            zero_indices = np.transpose(zero_indices) 

            chosen_zero_indices = zero_indices[np.random.choice(zero_indices.shape[0], int(num_neg), False)]

            for zero_idx in chosen_zero_indices:
                i, j = zero_idx
                conf_loss_mask[i][j] = 1.

        else:
            conf_loss_mask = np.ones_like(y_true_conf)

        yield images, y_true_conf, y_true_loc, conf_loss_mask

        start_idx += batch_size
        if start_idx >= X.shape[0]:
            start_idx = 0


In [None]:
# def train():
with open('data_prep_%sx%s.p' % (IMG_W, IMG_H), mode='rb') as f:
    train = pickle.load(f)

X_train = []    
y_train_conf = []
y_train_loc = []

for image_file in train.keys():
#     print(image_file)
    X_train.append(image_file)
    y_train_conf.append(train[image_file]["y_true_conf"])
    y_train_loc.append(train[image_file]["y_true_loc"])

X_train = np.array(X_train)
y_train_conf = np.array(y_train_conf)
y_train_loc = np.array(y_train_loc)


X_train, X_valid, y_train_conf, y_valid_conf, y_train_loc, y_valid_loc = train_test_split(X_train, y_train_conf, y_train_loc, test_size=VALIDATION_SIZE, random_state=1)


with tf.Graph().as_default(), tf.Session() as sess:
    model = ssd_model()
    x = model['x']
    y_true_conf = model['y_true_conf']
    y_true_loc = model['y_true_loc']
    conf_loss_mask = model['conf_loss_mask']
    is_training = model['is_training']
    optimizer = model['optimizer']
    reported_loss = model['loss']

    # Training process
    saver = tf.train.Saver()

    if RESUME:
        print('Restoring previously trained model at %s' % MODEL_SAVE_PATH)
        saver.restore(sess, MODEL_SAVE_PATH)

        with open('loss_history.p', 'rb') as f:
            loss_history = pickle.load(f)
    else:
        print('Training model from scratch')
        sess.run(tf.global_variables_initializer())

        loss_history = []

    last_time = time.time()
    train_start_time = time.time()

    for epoch in range(NUM_EPOCH):
        
        train_gen = batch_generator(X_train, y_train_conf, y_train_loc, BATCH_SIZE)
        num_batches_train = math.ceil(X_train.shape[0] / BATCH_SIZE)
        losses = []  

        for i in tqdm(range(num_batches_train)):
            
            images, y_true_conf_gen, y_true_loc_gen, conf_loss_mask_gen = next(train_gen)
            
            if images.shape != (BATCH_SIZE, 300, 400, 3):
                continue
#             print(images.shape, y_true_conf_gen.shape, y_true_loc_gen.shape, conf_loss_mask_gen.shape)
            _, loss = sess.run([optimizer, reported_loss], feed_dict={
                x: images,
                y_true_conf: y_true_conf_gen,
                y_true_loc: y_true_loc_gen,
                conf_loss_mask: conf_loss_mask_gen,
                is_training: True
            })
            
            losses.append(loss)  

        train_loss = np.mean(losses)

        valid_gen = batch_generator(X_valid, y_valid_conf, y_valid_loc, BATCH_SIZE)
        num_batches_valid = math.ceil(X_valid.shape[0] / BATCH_SIZE)
        losses = []
        
        for _ in range(num_batches_valid):
            images, y_true_conf_gen, y_true_loc_gen, conf_loss_mask_gen = next(valid_gen)

            loss = sess.run(reported_loss, feed_dict={
                x: images,
                y_true_conf: y_true_conf_gen,
                y_true_loc: y_true_loc_gen,
                conf_loss_mask: conf_loss_mask_gen,
                is_training: False
            })
            losses.append(loss)
        valid_loss = np.mean(losses)

        loss_history.append((train_loss, valid_loss))

        print('Epoch %d -- Train loss: %.4f, Validation loss: %.4f, Elapsed time: %.2f sec' % (epoch+1, train_loss, valid_loss, time.time() - last_time))
        last_time = time.time()
        
        
        
        if epoch % 100 == 0:
            save_path = saver.save(sess, MODEL_SAVE_PATH, global_step=epoch+1)


    total_time = time.time() - train_start_time
    print('Total elapsed time: %d min %d sec' % (total_time/60, total_time%60))

    if SAVE_MODEL:
        save_path = saver.save(sess, MODEL_SAVE_PATH)
        with open('loss_history.p', 'wb') as f:
            pickle.dump(loss_history, f)

Instructions for updating:
Use tf.losses.get_regularization_losses instead.
Training model from scratch


100%|██████████| 71/71 [02:01<00:00,  1.71s/it]


Epoch 1 -- Train loss: 18.9276, Validation loss: 18.8708, Elapsed time: 127.19 sec


100%|██████████| 71/71 [01:55<00:00,  1.63s/it]


Epoch 2 -- Train loss: 18.8147, Validation loss: 18.7565, Elapsed time: 120.65 sec


 20%|█▉        | 14/71 [00:23<01:35,  1.67s/it]

## Inference

In [None]:
import tensorflow as tf
from config import *
import numpy as np
from sklearn.model_selection import train_test_split
import cv2
import math
import os
import time
import pickle
from PIL import Image
import matplotlib.pyplot as plt
from optparse import OptionParser
import glob

In [None]:
def inference(image, model, sess, mode, sign_map):

    image = np.array(image)
    image_orig = np.copy(image)

    x = model['x']
    is_training = model['is_training']
    preds_conf = model['preds_conf']
    preds_loc = model['preds_loc']
    probs = model['probs']

    image = Image.fromarray(image)
    orig_w, orig_h = image.size

    image = image.resize((IMG_W, IMG_H), Image.LANCZOS)  
    image = np.asarray(image)

    images = np.array([image])  

    t0 = time.time()  
    preds_conf_val, preds_loc_val, probs_val = sess.run([preds_conf, preds_loc, probs], feed_dict={x: images, is_training: False})

    print('Inference took %.1f ms (%.2f fps)' % ((time.time() - t0)*1000, 1/(time.time() - t0)))

    y_pred_conf = preds_conf_val[0]  
    y_pred_conf = y_pred_conf.astype('float32')
    prob = probs_val[0]

    y_pred_loc = preds_loc_val[0]

    boxes = nms(y_pred_conf, y_pred_loc, prob)
    print('Inference + NMS took %.1f ms (%.2f fps)' % ((time.time() - t0)*1000, 1/(time.time() - t0)))

    # rescale box back to image
    scale = np.array([orig_w/IMG_W, orig_h/IMG_H, orig_w/IMG_W, orig_h/IMG_H])
    if len(boxes) > 0:
        boxes[:, :4] = boxes[:, :4] * scale

    # draw box
    image = image_orig
    for box in boxes:
        box_coords = [int(round(x)) for x in box[:4]]
        cls = int(box[4])
        cls_prob = box[5]

        image = cv2.rectangle(image, tuple(box_coords[:2]), tuple(box_coords[2:]), (0,255,0))
        label_str = '%s %.2f' % (sign_map[cls], cls_prob)
        image = cv2.putText(image, label_str, (box_coords[0], box_coords[1]), 0, 0.5, (0,255,0), 1, cv2.LINE_AA)

    return image


In [1]:
def generate_output(input_files):
    """
    Generate annotated images, videos, or sample images, based on mode
    """
    # First, load mapping from integer class ID to sign name string
    sign_map = {}
    
    sign_map[0] = "background"  # class ID 0 reserved for background class
    sign_map[1] = "pedestrian"

    with tf.Graph().as_default(), tf.Session() as sess:
        # "Instantiate" neural network, get relevant tensors
        model = ssd_model()

        saver.restore(sess, MODEL_SAVE_PATH)

        for image_file in input_files:
            print('Running inference on %s' % image_file)
            image_orig = np.asarray(Image.open(image_file))
            image = inference(image_orig, model, sess, mode, sign_map)

            head, tail = os.path.split(image_file)
            
            plt.imsave('./inference/%s' % tail, image)
            plt.imshow(image)
            plt.show()


In [2]:
# generate_output()