### Data Load

In [1]:
!cp drive/MyDrive/KSA_AI/DeepLearning/Object_Detection/train.csv ./
!cp drive/MyDrive/KSA_AI/DeepLearning/Object_Detection/validation.csv ./
!cp drive/MyDrive/KSA_AI/DeepLearning/Object_Detection/test.csv ./
!cp drive/MyDrive/KSA_AI/DeepLearning/Object_Detection/SSD.zip ./

In [None]:
!unzip SSD.zip

In [3]:
!mv '모듈8데이터(SSD_앵무새)' dataset

In [4]:
import os
dataset_dir = os.path.join(os.getcwd(), "dataset")
dataset_classes = os.listdir(dataset_dir)

In [5]:
import shutil
for cls in dataset_classes:
    images = os.listdir(os.path.join(dataset_dir, cls, 'img'))
    for img in images:
        path = os.path.join(dataset_dir, cls, 'img', img)
        shutil.move(path, os.path.join(dataset_dir, cls))
    shutil.rmtree(os.path.join(dataset_dir, cls, 'img'))

In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import tqdm
from PIL import Image

train_df = pd.read_csv('train.csv')
val_df = pd.read_csv('validation.csv')
test_df = pd.read_csv('test.csv')
train_df

Unnamed: 0,image_path,xmin,xmax,ymin,ymax,class_id
0,dataset/0001고핀/0001_00000116.jpg,145,214,61,154,1
1,dataset/0001고핀/0001_00000133.jpg,199,261,87,174,1
2,dataset/0001고핀/0001_00000073.jpg,30,97,69,158,1
3,dataset/0001고핀/0001_00000018.jpg,131,234,14,141,1
4,dataset/0001고핀/0001_00000099.jpg,112,202,46,178,1
...,...,...,...,...,...,...
3035,dataset/0011회색앵무/0011_00000252.jpg,116,199,42,120,11
3036,dataset/0011회색앵무/0011_00000320.jpg,108,265,8,120,11
3037,dataset/0011회색앵무/0011_00000199.jpg,60,208,50,211,11
3038,dataset/0011회색앵무/0011_00000235.jpg,14,112,14,117,11


### TF Record

In [7]:
def _bytes_feature(value, is_list=False):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
    
    if not is_list:
        value = [value]
    
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))

def _float_feature(value, is_list=False):
    """Returns a float_list from a float / double."""
        
    if not is_list:
        value = [value]
        
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

def _int64_feature(value, is_list=False):
    """Returns an int64_list from a bool / enum / int / uint."""
        
    if not is_list:
        value = [value]
        
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


def serialize(sensor_features):
    # Create a dictionary mapping the feature name to the 
    # tf.Example-compatible data type.
    feature = {
        # 'image': _bytes_feature(tf.io.encode_jpeg(image), is_list=False)
        'image': _float_feature(sensor_features['image'], is_list=False),
        'class_id': _bytes_feature(sensor_features['class_id'], is_list=False),
        'xmin': _int64_feature(sensor_features['xmin'], is_list=False),
        'xmax': _int64_feature(sensor_features['xmax'], is_list=False),
        'ymin': _int64_feature(sensor_features['ymin'], is_list=False),
        'ymax': _int64_feature(sensor_features['ymax'], is_list=False)
        }

    # Create a Features message using tf.train.Example.
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [8]:
def serialize_example(feature0, feature1, feature2, feature3, feature4, feature5, feature6, feature7):
  feature = {
      'image': _bytes_feature(feature0),
      'image_name': _bytes_feature(feature1),
      'sex': _int64_feature(feature3),
      'age_approx': _int64_feature(feature4),
      'anatom_site_general_challenge': _int64_feature(feature5),
      'source': _int64_feature(feature6),
      'target': _int64_feature(feature7)
  }
  example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
  return example_proto.SerializeToString()

### Preprocessing

In [9]:
import os
import tensorflow as tf
import pandas as pd

def create_tf_dataset(df, is_test=False):
    ds = tf.data.Dataset.from_tensor_slices(df["image_path"].values)

    if is_test:
        return ds
    else:
        ds_bbox = tf.data.Dataset.from_tensor_slices(
            [tf.constant(x) for x in df[['xmin', 'xmax', 'ymin', 'ymax']].values])
        target_ds = tf.data.Dataset.from_tensor_slices(df['class_id'].values)

        ds = tf.data.Dataset.zip((ds, ds_bbox, target_ds))
        return ds


def parse_image(filename, image_size, augmentation=False):
    # parts = tf.strings.split(filename, '/')
    # image_id = parts[-1]
    image = tf.io.read_file(filename)
    image = tf.image.decode_jpeg(image)
    # image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.resize(image, image_size,  method=tf.image.ResizeMethod.LANCZOS5, antialias=True)
    # image = tf.keras.preprocessing.image.img_to_array(image)
    return image


def prep_tf_dataset(img_path, coords, class_id, image_size, augmentation=False):
    img_tensor = parse_image(img_path, image_size=image_size)
    coords = tf.cast(coords, tf.float32)
    coords = coords/image_size[0]
    return img_tensor, coords, class_id

def ds_generator(df_path, image_size=(300,300), augmentation=False):
    '''
    input : pd.DataFrame(columns=['image_name', 'xmin', 'xmax', 'ymin', 'ymax', 'class_id'])
    output : <BatchDataset shapes: ((batch, 300, 300, 3), (batch, 4), (batch,)), types: (tf.float32, tf.float32, tf.int64)>
    '''
    df = pd.read_csv(df_path)
    data_len = len(df)
    ds = create_tf_dataset(df)
    ds = ds.map(lambda x, y, z: (prep_tf_dataset(x, y, z, image_size=image_size, augmentation=augmentation)),
                num_parallel_calls=tf.data.experimental.AUTOTUNE).prefetch(tf.data.experimental.AUTOTUNE)
    # ds = ds.shuffle(3000).batch(batch_size)
    return ds, data_len

### dataset generate

In [10]:
ds_generator('train.csv', image_size=(300, 300))

(<PrefetchDataset shapes: ((300, 300, None), (4,), ()), types: (tf.float32, tf.float32, tf.int64)>,
 3040)

In [11]:
def get_data_shapes():
    """Generating data shapes for tensorflow datasets.
    outputs:
        data shapes = output data shapes for (images, ground truth boxes, ground truth labels)
    """
    return ([None, None, None], [None,], [])   ###
 
def get_padding_values():
    """Generating padding values for missing values in batch for tensorflow datasets.
    outputs:
        padding values = padding values with dtypes for (images, ground truth boxes, ground truth labels)
    """
    return (tf.constant(0, tf.float32), tf.constant(0, tf.float32), tf.constant(-1, tf.int64))

### bounding box calcuate func

In [12]:
import tensorflow as tf

def non_max_suppression(pred_bboxes, pred_labels, **kwargs):
    """Applying non maximum suppression.
    Details could be found on tensorflow documentation.
    https://www.tensorflow.org/api_docs/python/tf/image/combined_non_max_suppression
    inputs:
        pred_bboxes = (batch_size, total_bboxes, total_labels, [x1, x2, y1, y2])
            total_labels should be 1 for binary operations like in rpn
        pred_labels = (batch_size, total_bboxes, total_labels)
        **kwargs = other parameters
    outputs:
        nms_boxes = (batch_size, max_detections, [x1, x2, y1, y2])
        nmsed_scores = (batch_size, max_detections)
        nmsed_classes = (batch_size, max_detections)
        valid_detections = (batch_size)
            Only the top valid_detections[i] entries in nms_boxes[i], nms_scores[i] and nms_class[i] are valid.
            The rest of the entries are zero paddings.
    """
    return tf.image.combined_non_max_suppression(
        pred_bboxes,
        pred_labels,
        **kwargs
    )

def generate_iou_map(bboxes, gt_boxes, transpose_perm=[0, 2, 1]):
    """Calculating intersection over union values for each ground truth boxes in a dynamic manner.
    It is supported from 1d to 3d dimensions for bounding boxes.
    Even if bboxes have different rank from gt_boxes it should be work.
    inputs:
        bboxes = (dynamic_dimension, [x1, x2, y1, y2])
        gt_boxes = (dynamic_dimension, [x1, x2, y1, y2])
        transpose_perm = (transpose_perm_order)
            for 3d gt_boxes => [0, 2, 1]
    outputs:
        iou_map = (dynamic_dimension, total_gt_boxes)
            same rank with the gt_boxes
    """

    gt_rank = tf.rank(gt_boxes)
    gt_expand_axis = gt_rank - 2
    #
    bbox_x1, bbox_x2, bbox_y1, bbox_y2 = tf.split(bboxes, 4, axis=-1)
    gt_x1, gt_x2, gt_y1, gt_y2 = tf.split(gt_boxes, 4, axis=-1)
    # Calculate bbox and ground truth boxes areas
    gt_area = tf.squeeze((gt_y2 - gt_y1) * (gt_x2 - gt_x1), axis=-1)
    bbox_area = tf.squeeze((bbox_y2 - bbox_y1) * (bbox_x2 - bbox_x1), axis=-1)
    #
    # x_top = tf.maximum(bbox_x1, tf.transpose(gt_x1, transpose_perm))
    # y_top = tf.maximum(bbox_y1, tf.transpose(gt_y1, transpose_perm))
    # x_bottom = tf.minimum(bbox_x2, tf.transpose(gt_x2, transpose_perm))
    # y_bottom = tf.minimum(bbox_y2, tf.transpose(gt_y2, transpose_perm))
    x_top = tf.maximum(bbox_x1, tf.transpose(gt_x1))
    y_top = tf.maximum(bbox_y1, tf.transpose(gt_y1))
    x_bottom = tf.minimum(bbox_x2, tf.transpose(gt_x2))
    y_bottom = tf.minimum(bbox_y2, tf.transpose(gt_y2))
    ### Calculate intersection area
    intersection_area = tf.maximum(x_bottom - x_top, 0) * tf.maximum(y_bottom - y_top, 0)
    ### Calculate union area
    union_area = (tf.expand_dims(bbox_area, -1) + tf.expand_dims(gt_area, gt_expand_axis) - intersection_area)
    # Intersection over Union
    return intersection_area / union_area

def get_bboxes_from_deltas(prior_boxes, deltas):
    """Calculating bounding boxes for given bounding box and delta values.
    inputs:
        prior_boxes = (total_bboxes, [x1, x2, y1, y2])
        deltas = (batch_size, total_bboxes, [delta_y, delta_x, delta_h, delta_w])
    outputs:
        final_boxes = (batch_size, total_bboxes, [x1, x2, y1, y2])
    """
    all_pbox_width = prior_boxes[..., 1] - prior_boxes[..., 0]
    all_pbox_height = prior_boxes[..., 3] - prior_boxes[..., 2]
    all_pbox_ctr_x = prior_boxes[..., 0] + 0.5 * all_pbox_width
    all_pbox_ctr_y = prior_boxes[..., 2] + 0.5 * all_pbox_height
    #
    all_bbox_width = tf.exp(deltas[..., 3]) * all_pbox_width
    all_bbox_height = tf.exp(deltas[..., 2]) * all_pbox_height
    all_bbox_ctr_x = (deltas[..., 1] * all_pbox_width) + all_pbox_ctr_x
    all_bbox_ctr_y = (deltas[..., 0] * all_pbox_height) + all_pbox_ctr_y
    #
    x1 = all_bbox_ctr_x - (0.5 * all_bbox_width)
    x2 = all_bbox_width + x1
    y1 = all_bbox_ctr_y - (0.5 * all_bbox_height)
    y2 = all_bbox_height + y1
    #
    return tf.stack([x1, x2, y1, y2], axis=-1)

def get_deltas_from_bboxes(bboxes, gt_boxes):
    """Calculating bounding box deltas for given bounding box and ground truth boxes.
    inputs:
        bboxes = (total_bboxes, [x1, x2, y1, y2])
        gt_boxes = (batch_size, total_bboxes, [x1, x2, y1, y2])
    outputs:
        final_deltas = (batch_size, total_bboxes, [delta_x, delta_y, delta_w, delta_h])
    """
    bbox_width = bboxes[..., 1] - bboxes[..., 0]
    bbox_height = bboxes[..., 3] - bboxes[..., 2]
    bbox_ctr_x = bboxes[..., 0] + 0.5 * bbox_width
    bbox_ctr_y = bboxes[..., 2] + 0.5 * bbox_height
    #
    gt_width = gt_boxes[..., 1] - gt_boxes[..., 0]
    gt_height = gt_boxes[..., 3] - gt_boxes[..., 2]
    gt_ctr_x = gt_boxes[..., 0] + 0.5 * gt_width
    gt_ctr_y = gt_boxes[..., 2] + 0.5 * gt_height
    #
    bbox_width = tf.where(tf.equal(bbox_width, 0), 1e-3, bbox_width)
    bbox_height = tf.where(tf.equal(bbox_height, 0), 1e-3, bbox_height)
    delta_x = tf.where(tf.equal(gt_width, 0), tf.zeros_like(gt_width), tf.truediv((gt_ctr_x - bbox_ctr_x), bbox_width))
    delta_y = tf.where(tf.equal(gt_height, 0), tf.zeros_like(gt_height), tf.truediv((gt_ctr_y - bbox_ctr_y), bbox_height))
    delta_w = tf.where(tf.equal(gt_width, 0), tf.zeros_like(gt_width), tf.math.log(gt_width / bbox_width))
    delta_h = tf.where(tf.equal(gt_height, 0), tf.zeros_like(gt_height), tf.math.log(gt_height / bbox_height))
    #
    return tf.stack([delta_x, delta_y, delta_w, delta_h], axis=-1)

def get_scale_for_nth_feature_map(k, m=6, scale_min=0.2, scale_max=0.9):
    """Calculating scale value for nth feature map using the given method in the paper.
    inputs:
        k = nth feature map for scale calculation
        m = length of all using feature maps for detections, 6 for ssd300
    outputs:
        scale = calculated scale value for given index
    """
    return scale_min + ((scale_max - scale_min) / (m - 1)) * (k - 1)

def generate_base_prior_boxes(aspect_ratios, feature_map_index, total_feature_map):
    """Generating top left prior boxes for given stride, height and width pairs of different aspect ratios.
    These prior boxes same with the anchors in Faster-RCNN.
    inputs:
        aspect_ratios = for all feature map shapes + 1 for ratio 1
        feature_map_index = nth feature maps for scale calculation
        total_feature_map = length of all using feature map for detections, 6 for ssd300
    outputs:
        base_prior_boxes = (prior_box_count, [x1, x2, y1, y2])
    """
    current_scale = get_scale_for_nth_feature_map(feature_map_index, m=total_feature_map)
    next_scale = get_scale_for_nth_feature_map(feature_map_index + 1, m=total_feature_map)
    base_prior_boxes = []
    for aspect_ratio in aspect_ratios:
        height = current_scale / tf.sqrt(aspect_ratio)
        width = current_scale * tf.sqrt(aspect_ratio)
        base_prior_boxes.append([-width/2, width/2, -height/2, height/2])
    # 1 extra pair for ratio 1
    height = width = tf.sqrt(current_scale * next_scale)
    base_prior_boxes.append([-width/2, width/2, -height/2, height/2])
    return tf.cast(base_prior_boxes, dtype=tf.float32)

def generate_prior_boxes(feature_map_shapes, aspect_ratios):
    """Generating top left prior boxes for given stride, height and width pairs of different aspect ratios.
    These prior boxes same with the anchors in Faster-RCNN.
    inputs:
        feature_map_shapes = for all feature map output size
        aspect_ratios = for all feature map shapes + 1 for ratio 1
    outputs:
        prior_boxes = (total_prior_boxes, [y1, x1, y2, x2])
        -->prior_boxes = (total_prior_boxes, [x1, x2, y1, y2])
            these values in normalized format between [0, 1]
    """
    prior_boxes = []
    for i, feature_map_shape in enumerate(feature_map_shapes):
        base_prior_boxes = generate_base_prior_boxes(aspect_ratios[i], i+1, len(feature_map_shapes))
        #
        stride = 1 / feature_map_shape
        grid_coords = tf.cast(tf.range(0, feature_map_shape) / feature_map_shape + stride / 2, dtype=tf.float32)
        grid_x, grid_y = tf.meshgrid(grid_coords, grid_coords)
        flat_grid_x, flat_grid_y = tf.reshape(grid_x, (-1, )), tf.reshape(grid_y, (-1, ))
        #
        grid_map = tf.stack([flat_grid_x, flat_grid_x, flat_grid_y, flat_grid_y], -1)
        #
        prior_boxes_for_feature_map = tf.reshape(base_prior_boxes, (1, -1, 4)) + tf.reshape(grid_map, (-1, 1, 4))
        prior_boxes_for_feature_map = tf.reshape(prior_boxes_for_feature_map, (-1, 4))
        #
        prior_boxes.append(prior_boxes_for_feature_map)
    prior_boxes = tf.concat(prior_boxes, axis=0)
    return tf.clip_by_value(prior_boxes, 0, 1)

def renormalize_bboxes_with_min_max(bboxes, min_max):
    """Renormalizing given bounding boxes to the new boundaries.
    r = (x - min) / (max - min)
    outputs:
        bboxes = (total_bboxes, [x1, x2, y1, y2])
        min_max = ([x_min, x_max, y_min, y_max])
    """
    x_min, x_max, y_min, y_max = tf.split(min_max, 4)
    renomalized_bboxes = bboxes - tf.concat([x_min, x_max, y_min, y_max], -1)
    renomalized_bboxes /= tf.concat([x_max-x_min, x_max-x_min, y_max-y_min, y_max-y_min], -1)
    return tf.clip_by_value(renomalized_bboxes, 0, 1)

def normalize_bboxes(bboxes, height, width):
    """Normalizing bounding boxes.
    inputs:
        bboxes = (batch_size, total_bboxes, [x1, x2, y1, y2])
        height = image height
        width = image width
    outputs:
        normalized_bboxes = (batch_size, total_bboxes, [x1, x2, y1, y2])
            in normalized form [0, 1]
    """
    x1 = bboxes[..., 0] / width
    x2 = bboxes[..., 1] / width
    y1 = bboxes[..., 2] / height
    y2 = bboxes[..., 2] / height
    return tf.stack([x1, x2, y1, y2], axis=-1)

def denormalize_bboxes(bboxes, height, width):
    """Denormalizing bounding boxes.
    inputs:
        bboxes = (batch_size, total_bboxes, [x1, x2, y1, y2])
            in normalized form [0, 1]
        height = image height
        width = image width
    outputs:
        denormalized_bboxes = (batch_size, total_bboxes, [x1, x2, y1, y2])
    """
    x1 = bboxes[..., 0] / width
    x2 = bboxes[..., 1] / width
    y1 = bboxes[..., 2] / height
    y2 = bboxes[..., 2] / height
    return tf.round(tf.stack([x1, x2, y1, y2], axis=-1))

In [13]:
def generator(dataset, prior_boxes, hyper_params):
    """Tensorflow data generator for fit method, yielding inputs and outputs.
    inputs:
        dataset = tf.data.Dataset, PaddedBatchDataset
        prior_boxes = (total_prior_boxes, [x1, x2, y1, y2])
            these values in normalized format between [0, 1]
        hyper_params = dictionary
    outputs:
        yield inputs, outputs
    """
    while True:
        for image_data in dataset:
            img, gt_boxes, gt_labels = image_data
            actual_deltas, actual_labels = calculate_actual_outputs(prior_boxes, gt_boxes, gt_labels, hyper_params)
            yield img, (actual_deltas, actual_labels)

# def calculate_actual_outputs(prior_boxes, gt_boxes, gt_labels, hyper_params):
#     """Calculate ssd actual output values.
#     Batch operations supported.
#     inputs:
#         prior_boxes = (total_prior_boxes, [x1, x2, y1, y2])
#             these values in normalized format between [0, 1]
#         gt_boxes (batch_size, gt_box_size, [x1, x2, y1, y2])
#             these values in normalized format between [0, 1]
#         gt_labels (batch_size, gt_box_size)
#         hyper_params = dictionary
#     outputs:
#         bbox_deltas = (batch_size, total_bboxes, [delta_y, delta_x, delta_h, delta_w])
#         bbox_labels = (batch_size, total_bboxes, [0,0,...,0])
#     """
#     batch_size = tf.shape(gt_boxes)[0]
#     total_labels = hyper_params["n_classes"]
#     iou_threshold = hyper_params["iou_threshold"]
#     variances = hyper_params["variances"]
#     total_prior_boxes = prior_boxes.shape[0]
#     # Calculate iou values between each bboxes and ground truth boxes
#     iou_map = generate_iou_map(prior_boxes, gt_boxes)
#     # Get max index value for each row
#     max_indices_each_gt_box = tf.argmax(iou_map, axis=1, output_type=tf.int32)
#     # IoU map has iou values for every gt boxes and we merge these values column wise
#     merged_iou_map = tf.reduce_max(iou_map, axis=1)
#     #
#     pos_cond = tf.greater(merged_iou_map, iou_threshold)
#     #
#     gt_boxes_map = tf.gather(gt_boxes, max_indices_each_gt_box, batch_dims=0)
#     expanded_gt_boxes = tf.where(tf.expand_dims(pos_cond, -1), gt_boxes_map, tf.zeros_like(gt_boxes_map))
#     bbox_deltas = get_deltas_from_bboxes(prior_boxes, expanded_gt_boxes) / variances
#     #
#     gt_labels_map = tf.gather(gt_labels, max_indices_each_gt_box, batch_dims=0)
#     expanded_gt_labels = tf.where(pos_cond, gt_labels_map, tf.zeros_like(gt_labels_map))
#     bbox_labels = tf.one_hot(expanded_gt_labels, total_labels)
#     #
#     return bbox_deltas, bbox_labels

In [14]:
def calculate_actual_outputs(prior_boxes, gt_boxes, gt_labels, hyper_params):
    """Calculate ssd actual output values.
    Batch operations supported.
    inputs:
        prior_boxes = (total_prior_boxes, [x1, x2, y1, y2])
            these values in normalized format between [0, 1]
        gt_boxes (batch_size, gt_box_size, [x1, x2, y1, y2])
            these values in normalized format between [0, 1]
        gt_labels (batch_size, gt_box_size)
        hyper_params = dictionary
    outputs:
        bbox_deltas = (batch_size, total_bboxes, [delta_y, delta_x, delta_h, delta_w])
        bbox_labels = (batch_size, total_bboxes, [0,0,...,0])
    """
    batch_size = tf.shape(gt_boxes)[0]
    total_labels = hyper_params["n_classes"]
    iou_threshold = hyper_params["iou_threshold"]
    variances = hyper_params["variances"]
    total_prior_boxes = prior_boxes.shape[0]
    # Calculate iou values between each bboxes and ground truth boxes
    iou_map = generate_iou_map(prior_boxes, gt_boxes)  # iou_map.shape : (8732,32)
    # Get max index value for each row
    max_indices_each_gt_box = tf.argmax(iou_map, axis=1, output_type=tf.int32)
    # IoU map has iou values for every gt boxes and we merge these values column wise
    merged_iou_map = tf.reduce_max(iou_map, axis=1)
    #

    # pos_cond = tf.greater(merged_iou_map, iou_threshold)
    pos_cond = tf.greater(iou_map, iou_threshold)

    #
    # gt_boxes_map = tf.gather(gt_boxes, max_indices_each_gt_box, batch_dims=1)
    # expanded_gt_boxes = tf.where(tf.expand_dims(pos_cond, -1), gt_boxes_map, tf.zeros_like(gt_boxes_map))
    # bbox_deltas = bbox_utils.get_deltas_from_bboxes(prior_boxes, expanded_gt_boxes) / variances

    # gt_boxes_map = tf.gather(gt_boxes, max_indices_each_gt_box, batch_dims=1)
    expanded_gt_boxes = tf.where(tf.expand_dims(pos_cond, -1), gt_boxes, tf.zeros_like(gt_boxes))
    bbox_deltas = get_deltas_from_bboxes(prior_boxes, tf.transpose(expanded_gt_boxes, perm=[1,0,2]) ) / variances


    #
    # gt_labels_map = tf.gather(gt_labels, max_indices_each_gt_box, batch_dims=1)
    # expanded_gt_labels = tf.where(pos_cond, gt_labels_map, tf.zeros_like(gt_labels_map))
    # bbox_labels = tf.one_hot(expanded_gt_labels, total_labels)
    #
    #
    # gt_labels_map = tf.gather(gt_labels, max_indices_each_gt_box, batch_dims=1)
    expanded_gt_labels = tf.where(pos_cond, gt_labels, tf.zeros_like(gt_labels))
    bbox_labels = tf.one_hot(tf.transpose(expanded_gt_labels), total_labels)
    #

    return bbox_deltas, bbox_labels

### Modeling

In [15]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Input, Conv2D, MaxPool2D, Activation

class HeadWrapper(Layer):
    """Merging all feature maps for detections.
    inputs:
        conv4_3 = (batch_size, (layer_shape x aspect_ratios), last_dimension)
            ssd300 conv4_3 shape => (38 x 38 x 4) = 5776
        conv7 = (batch_size, (layer_shape x aspect_ratios), last_dimension)
            ssd300 conv7 shape => (19 x 19 x 6) = 2166
        conv8_2 = (batch_size, (layer_shape x aspect_ratios), last_dimension)
            ssd300 conv8_2 shape => (10 x 10 x 6) = 600
        conv9_2 = (batch_size, (layer_shape x aspect_ratios), last_dimension)
            ssd300 conv9_2 shape => (5 x 5 x 6) = 150
        conv10_2 = (batch_size, (layer_shape x aspect_ratios), last_dimension)
            ssd300 conv10_2 shape => (3 x 3 x 4) = 36
        conv11_2 = (batch_size, (layer_shape x aspect_ratios), last_dimension)
            ssd300 conv11_2 shape => (1 x 1 x 4) = 4
                                           Total = 8732 default box
    outputs:
        merged_head = (batch_size, total_prior_boxes, last_dimension)
    """

    def __init__(self, last_dimension, **kwargs):
        super(HeadWrapper, self).__init__(**kwargs)
        self.last_dimension = last_dimension

    def get_config(self):
        config = super(HeadWrapper, self).get_config()
        config.update({"last_dimension": self.last_dimension})
        return config

    def call(self, inputs):
        last_dimension = self.last_dimension
        batch_size = tf.shape(inputs[0])[0]
        outputs = []
        for conv_layer in inputs:
            outputs.append(tf.reshape(conv_layer, (batch_size, -1, last_dimension)))
        #
        return tf.concat(outputs, axis=1)

In [16]:
def get_head_from_outputs(hyper_params, outputs):
    """Generating ssd bbox delta and label heads.
    inputs:
        hyper_params = dictionary
        outputs = list of ssd layers output to be used for prediction
    outputs:
        pred_deltas = merged outputs for bbox delta head
        pred_labels = merged outputs for bbox label head
    """
    total_labels = hyper_params["n_classes"]
    # +1 for ratio 1
    len_aspect_ratios = [len(x) + 1 for x in hyper_params["aspect_ratios"]]
    labels_head = []
    boxes_head = []
    for i, output in enumerate(outputs):
        aspect_ratio = len_aspect_ratios[i]
        labels_head.append(Conv2D(aspect_ratio * total_labels, (3, 3), padding="same", name="{}_conv_label_output".format(i+1))(output))
        boxes_head.append(Conv2D(aspect_ratio * 4, (3, 3), padding="same", name="{}_conv_boxes_output".format(i+1))(output))
    # Classification
    pred_labels = HeadWrapper(total_labels, name="labels_head")(labels_head)
    pred_labels = Activation("softmax", name="conf")(pred_labels)
    # Regression
    pred_deltas = HeadWrapper(4, name="loc")(boxes_head)
    return pred_deltas, pred_labels

In [17]:
from tensorflow.keras.layers import Layer
from tensorflow.keras.regularizers import l2

class L2Normalization(Layer):
    """Normalizing different scale features for fusion.
    paper: https://arxiv.org/abs/1506.04579
    inputs:
        feature_map = (batch_size, feature_map_height, feature_map_width, depth)
    outputs:
        normalized_feature_map = (batch_size, feature_map_height, feature_map_width, depth)
    """
    def __init__(self, scale_factor, **kwargs):
        super(L2Normalization, self).__init__(**kwargs)
        self.scale_factor = scale_factor

    def get_config(self):
        config = super(L2Normalization, self).get_config()
        config.update({"scale_factor": self.scale_factor})
        return config

    def build(self, input_shape):
        # Network need to learn scale factor for each channel
        init_scale_factor = tf.fill((input_shape[-1],), float(self.scale_factor))
        self.scale = tf.Variable(init_scale_factor, trainable=True)

    def call(self, inputs):
        return tf.nn.l2_normalize(inputs, axis=-1) * self.scale

In [18]:
## backbone : VGG
import tensorflow as tf
from tensorflow.keras.layers import Layer, Input, Conv2D, MaxPool2D
from tensorflow.keras.models import Model

def get_backbone_vgg(hyper_params):
    """Generating ssd model for hyper params.
    inputs:
        hyper_params = dictionary
    outputs:
        ssd_model = tf.keras.model
    """
    # Initial scale factor 20 in the paper.
    # Even if this scale factor could cause loss value to be NaN in some of the cases,
    # it was decided to remain the same after some tests.
    scale_factor = 20.0 
    reg_factor = 5e-4
    n_classes = hyper_params["n_classes"]
    # +1 for ratio 1
    len_aspect_ratios = [len(x) + 1 for x in hyper_params["aspect_ratios"]]
    #
    input = Input(shape=(None, None, 3), name="input")
    # conv1 block
    conv1_1 = Conv2D(64, (3, 3), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv1_1")(input)
    conv1_2 = Conv2D(64, (3, 3), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv1_2")(conv1_1)
    pool1 = MaxPool2D((2, 2), strides=(2, 2), padding="same", name="pool1")(conv1_2)
    # conv2 block
    conv2_1 = Conv2D(128, (3, 3), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv2_1")(pool1)
    conv2_2 = Conv2D(128, (3, 3), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv2_2")(conv2_1)
    pool2 = MaxPool2D((2, 2), strides=(2, 2), padding="same", name="pool2")(conv2_2)
    # conv3 block
    conv3_1 = Conv2D(256, (3, 3), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv3_1")(pool2)
    conv3_2 = Conv2D(256, (3, 3), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv3_2")(conv3_1)
    conv3_3 = Conv2D(256, (3, 3), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv3_3")(conv3_2)
    pool3 = MaxPool2D((2, 2), strides=(2, 2), padding="same", name="pool3")(conv3_3)
    # conv4 block
    conv4_1 = Conv2D(512, (3, 3), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv4_1")(pool3)
    conv4_2 = Conv2D(512, (3, 3), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv4_2")(conv4_1)
    conv4_3 = Conv2D(512, (3, 3), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv4_3")(conv4_2)
    pool4 = MaxPool2D((2, 2), strides=(2, 2), padding="same", name="pool4")(conv4_3)
    # conv5 block
    conv5_1 = Conv2D(512, (3, 3), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv5_1")(pool4)
    conv5_2 = Conv2D(512, (3, 3), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv5_2")(conv5_1)
    conv5_3 = Conv2D(512, (3, 3), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv5_3")(conv5_2)
    pool5 = MaxPool2D((3, 3), strides=(1, 1), padding="same", name="pool5")(conv5_3)
    # conv6 and conv7 converted from fc6 and fc7 and remove dropouts
    # These layers coming from modified vgg16 model
    # https://gist.github.com/weiliu89/2ed6e13bfd5b57cf81d6
    conv6 = Conv2D(1024, (3, 3), dilation_rate=6, padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv6")(pool5)
    conv7 = Conv2D(1024, (1, 1), strides=(1, 1), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv7")(conv6)
    ############################ Extra Feature Layers Start ############################
    # conv8 block <=> conv6 block in paper caffe implementation
    conv8_1 = Conv2D(256, (1, 1), strides=(1, 1), padding="valid", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv8_1")(conv7)
    conv8_2 = Conv2D(512, (3, 3), strides=(2, 2), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv8_2")(conv8_1)
    # conv9 block <=> conv7 block in paper caffe implementation
    conv9_1 = Conv2D(128, (1, 1), strides=(1, 1), padding="valid", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv9_1")(conv8_2)
    conv9_2 = Conv2D(256, (3, 3), strides=(2, 2), padding="same", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv9_2")(conv9_1)
    # conv10 block <=> conv8 block in paper caffe implementation
    conv10_1 = Conv2D(128, (1, 1), strides=(1, 1), padding="valid", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv10_1")(conv9_2)
    conv10_2 = Conv2D(256, (3, 3), strides=(1, 1), padding="valid", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv10_2")(conv10_1)
    # conv11 block <=> conv9 block in paper caffe implementation
    conv11_1 = Conv2D(128, (1, 1), strides=(1, 1), padding="valid", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv11_1")(conv10_2)
    conv11_2 = Conv2D(256, (3, 3), strides=(1, 1), padding="valid", activation="relu", kernel_initializer="glorot_normal", kernel_regularizer=l2(reg_factor), name="conv11_2")(conv11_1)
    ############################ Extra Feature Layers End ############################
    # l2 normalization for each location in the feature map
    conv4_3_norm = L2Normalization(scale_factor)(conv4_3)
    #
    pred_deltas, pred_labels = get_head_from_outputs(hyper_params, [conv4_3_norm, conv7, conv8_2, conv9_2, conv10_2, conv11_2])
    return Model(inputs=input, outputs=[pred_deltas, pred_labels])

def init_model(model):
    """Initializing model with dummy data for load weights with optimizer state and also graph construction.
    inputs:
        model = tf.keras.model
    """
    model(tf.random.uniform((1, 512, 512, 3)))

### Loss Function

In [40]:
import tensorflow as tf

class CustomLoss(object):
    def __init__(self, neg_pos_ratio, loc_loss_alpha):
        self.neg_pos_ratio = tf.constant(neg_pos_ratio, dtype=tf.float32)
        self.loc_loss_alpha = tf.constant(loc_loss_alpha, dtype=tf.float32)

    def loc_loss_fn(self, actual_deltas, pred_deltas):
        """Calculating SSD localization loss value for only positive samples.
        inputs:
            actual_deltas = (batch_size, total_prior_boxes, [delta_y, delta_x, delta_h, delta_w])
            pred_deltas = (batch_size, total_prior_boxes, [delta_y, delta_x, delta_h, delta_w])
        outputs:
            loc_loss = localization / regression / bounding box loss value
        """
        # Localization / bbox / regression loss calculation for all bboxes
        loc_loss_fn = tf.losses.Huber(reduction=tf.losses.Reduction.NONE)
        loc_loss_for_all = loc_loss_fn(actual_deltas, pred_deltas)
        # After tf 2.2.0 version, the huber calculates mean over the last axis
        loc_loss_for_all = tf.cond(tf.greater(tf.rank(loc_loss_for_all), tf.constant(2)),
                                   lambda: tf.reduce_sum(loc_loss_for_all, axis=-1),
                                   lambda: loc_loss_for_all * tf.cast(tf.shape(pred_deltas)[-1], dtype=tf.float32))
        #
        pos_cond = tf.reduce_any(tf.not_equal(actual_deltas, tf.constant(0.0)), axis=2)
        pos_mask = tf.cast(pos_cond, dtype=tf.float32)
        total_pos_bboxes = tf.reduce_sum(pos_mask, axis=1)
        #
        loc_loss = tf.reduce_sum(pos_mask * loc_loss_for_all, axis=-1)
        total_pos_bboxes = tf.where(tf.equal(total_pos_bboxes, tf.constant(0.0)), tf.constant(1.0), total_pos_bboxes)
        loc_loss = loc_loss / total_pos_bboxes
        #
        return loc_loss * self.loc_loss_alpha

    def conf_loss_fn(self, actual_labels, pred_labels):
        """Calculating SSD confidence loss value by performing hard negative mining as mentioned in the paper.
        inputs:
            actual_labels = (batch_size, total_prior_boxes, total_labels)
            pred_labels = (batch_size, total_prior_boxes, total_labels)
        outputs:
            conf_loss = confidence / class / label loss value
        """
        # Confidence / Label loss calculation for all labels
        conf_loss_fn = tf.losses.CategoricalCrossentropy(reduction=tf.losses.Reduction.NONE)
        conf_loss_for_all = conf_loss_fn(actual_labels, pred_labels)
        #
        pos_cond = tf.reduce_any(tf.not_equal(actual_labels[..., 1:], tf.constant(0.0)), axis=2)
        pos_mask = tf.cast(pos_cond, dtype=tf.float32)
        total_pos_bboxes = tf.reduce_sum(pos_mask, axis=1)
        # Hard negative mining
        total_neg_bboxes = tf.cast(total_pos_bboxes * self.neg_pos_ratio, tf.int32)
        #
        masked_loss = conf_loss_for_all * actual_labels[..., 0]
        sorted_loss = tf.argsort(masked_loss, direction="DESCENDING")
        sorted_loss = tf.argsort(sorted_loss)
        neg_cond = tf.less(sorted_loss, tf.expand_dims(total_neg_bboxes, axis=1))
        neg_mask = tf.cast(neg_cond, dtype=tf.float32)
        #
        final_mask = pos_mask + neg_mask
        conf_loss = tf.reduce_sum(final_mask * conf_loss_for_all, axis=-1)
        total_pos_bboxes = tf.where(tf.equal(total_pos_bboxes, tf.constant(0.0)), tf.constant(1.0), total_pos_bboxes)
        conf_loss = conf_loss / total_pos_bboxes
        #
        return conf_loss

### Training

In [41]:
import math
def get_step_size(data_length, batch_size):
    """Get step size for given total item size and batch size.
    inputs:
        total_items = number of total items
        batch_size = number of batch size during training or validation
    outputs:
        step_size = number of step size for model training
    """
    return math.ceil(data_length / batch_size)

In [42]:
import os
import time
import tensorflow as tf

# hyper_params config
hyper_params = {}
hyper_params["image_size"] = (300, 300)
hyper_params["aspect_ratios"] = [[1.0, 2.0, 0.5],
                                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                 [1.0, 2.0, 0.5],
                                 [1.0, 2.0, 0.5]]
hyper_params["feature_map_shapes"] = [38, 19, 10, 5, 3, 1]
hyper_params["n_classes"] = 11+1
hyper_params["iou_threshold"] = 0.5
hyper_params["neg_pos_ratio"] = 3
hyper_params["loc_loss_alpha"] = 1
hyper_params["variances"] = [0.1, 0.1, 0.2, 0.2]
# hyper_params["scales"] = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]
# hyper_params["mean_color"] = [123, 117, 104]

In [43]:
data_dir = os.path.join(os.getcwd(), 'dataset')
IMAGE_SIZE = (300, 300)
BATCH_SIZE = 32
EPOCHS = 2

In [44]:
data_shapes = get_data_shapes()
padding_values = get_padding_values()

train_ds, train_len = ds_generator('train.csv', image_size=IMAGE_SIZE)
val_ds, val_len = ds_generator('validation.csv', image_size=IMAGE_SIZE)
train_ds = train_ds.shuffle(100).padded_batch(BATCH_SIZE, padded_shapes=data_shapes, padding_values=padding_values)
val_ds = val_ds.padded_batch(BATCH_SIZE, padded_shapes=data_shapes, padding_values=padding_values)

In [45]:
from tensorflow.keras.optimizers import SGD, Adam

ssd_model = get_backbone_vgg(hyper_params)
ssd_custom_losses = CustomLoss(hyper_params["neg_pos_ratio"], hyper_params["loc_loss_alpha"])
ssd_model.compile(optimizer=Adam(learning_rate=1e-3),
                  loss=[ssd_custom_losses.loc_loss_fn, ssd_custom_losses.conf_loss_fn])
init_model(ssd_model)

In [46]:
# We calculate prior boxes for one time and use it for all operations because of the all images are the same sizes
prior_boxes = generate_prior_boxes(hyper_params["feature_map_shapes"], hyper_params["aspect_ratios"])
ssd_train_feed = generator(train_ds, prior_boxes, hyper_params)
ssd_val_feed = generator(val_ds, prior_boxes, hyper_params)

In [47]:
def scheduler(epoch):
    """Generating learning rate value for a given epoch.
    inputs:
        epoch = number of current epoch
    outputs:
        learning_rate = float learning rate value
    """
    if epoch < 100:
        return 1e-3
    elif epoch < 125:
        return 1e-4
    else:
        return 1e-5

In [48]:
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, LearningRateScheduler
from datetime import datetime
ssd_log_path = os.path.join("logs", "VGG", datetime.now().strftime("%Y%m%d-%H%M%S"))
# ssd_log_path = "logs/{}{}/{}".format(BACKBONE, datetime.now().strftime("%Y%m%d-%H%M%S")
ssd_model_path = os.path.join("trained", datetime.now().strftime("%Y%m%d-%H%M%S"))
model_name = os.path.join(ssd_model_path, "ssd300_vgg_weights.h5")

checkpoint_callback = ModelCheckpoint(model_name, monitor="val_loss", save_best_only=True, save_weights_only=True)
tensorboard_callback = TensorBoard(log_dir=ssd_log_path)
learning_rate_callback = LearningRateScheduler(scheduler, verbose=0)

In [49]:
step_size_train = get_step_size(train_len, BATCH_SIZE)
step_size_val = get_step_size(val_len, BATCH_SIZE)

os.makedirs(ssd_model_path, exist_ok=True)
os.makedirs(ssd_log_path, exist_ok=True)

ssd_model.fit(ssd_train_feed,
              steps_per_epoch=step_size_train,
              validation_data=ssd_val_feed,
              validation_steps=step_size_val,
              epochs=EPOCHS,
              callbacks=[checkpoint_callback, tensorboard_callback, learning_rate_callback]
              )

Epoch 1/2

KeyboardInterrupt: ignored