In [1]:
import tensorflow as tf
slim = tf.contrib.slim

import time
import os
import numpy as np
import collections
import re
import functools

from tensorflow.python import pywrap_tensorflow
from tensorflow.python.framework import constant_op
from tensorflow.python.ops import array_ops, image_ops

from tensorflow.contrib.cluster_resolver import TPUClusterResolver
from tensorflow.contrib import tpu


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [2]:
def get_hparams():
    """Assemble the hyper-parameters used by the model and the input pipeline.

    Returns:
        tf.contrib.training.HParams: dataset/checkpoint locations, input
        geometry, optimisation schedule and TPU connection settings.
    """
    bucket = 'neuron'
    images_per_step = 32 * 8

    # Nine anchor boxes; compute_loss hands the last three to the first
    # feature map, the middle three to the second and the first three to the
    # third.
    anchor_boxes = np.array(
        [[10, 13], [16, 30], [33, 23],
         [30, 61], [62, 45], [59, 119],
         [116, 90], [156, 198], [373, 326]],
        dtype=np.float32)

    settings = {
        # --- run mode -----------------------------------------------------
        'fine_tune': True,
        'use_tpu': True,
        'train': tf.estimator.ModeKeys.TRAIN,
        # --- data ---------------------------------------------------------
        'channels': 3,
        'input_shape': (416, 416),
        'width': 416,
        'height': 416,
        'num_class': 1,
        'num_train_images': 6944,
        'num_examples': 6944,
        'train_batch_size': images_per_step,
        'shuffle': images_per_step,
        'train_tfrecord': 'gs://{}/{}/{}'.format(bucket, 'data', 'OID_train.tfrecords'),
        # --- model / loss ---------------------------------------------------
        'anchors': anchor_boxes,
        'ignore_thresh': 0.7,
        # --- checkpoints ----------------------------------------------------
        'init_checkpoint': 'gs://{}/{}/{}'.format(bucket, 'data', 'yolo_checkpoint.ckpt'),
        'model_dir': 'gs://{}/{}'.format(bucket, 'checkpoints'),
        'restore_part': ['yolov3/darknet53_body'],
        'update_part': ['yolov3/yolov3_head'],
        'save_vars': ['yolov3'],
        # --- optimisation ---------------------------------------------------
        'iteration_per_loop': 100,
        'base_learning_rate': 1e-1,
        'decay_rate': 0.5,
        'decay_steps': 250,
        'num_epochs': 50,
        # --- TPU ------------------------------------------------------------
        'num_cores': 8,
        'num_cores_per_replica': 8,
        'tpu_zone': 'us-central1-c',
        'tpu': 'grpc://10.0.101.2:8470',
        'gcp_project': 'fluted-visitor-233103',
    }

    return tf.contrib.training.HParams(**settings)



In [3]:
def conv2d(inputs, filters, kernel_size, strides=1):
    """Apply a slim convolution with explicit padding for strided layers.

    For ``strides > 1`` the input is zero-padded by ``kernel_size - 1`` pixels
    in total along each spatial axis before the convolution, so the padding
    depends only on the kernel size. The convolution itself uses 'SAME'
    padding when ``strides == 1`` and 'VALID' otherwise.

    Args:
        inputs: NHWC tensor (axes 1 and 2 are the ones padded).
        filters: number of output channels.
        kernel_size: integer kernel size.
        strides: integer stride, defaults to 1.

    Returns:
        The output tensor of ``slim.conv2d``.
    """
    if strides > 1:
        total = kernel_size - 1
        before = total // 2
        after = total - before
        inputs = tf.pad(inputs,
                        [[0, 0], [before, after], [before, after], [0, 0]],
                        mode='CONSTANT')

    padding = 'SAME' if strides == 1 else 'VALID'
    return slim.conv2d(inputs, filters, kernel_size, stride=strides, padding=padding)

def darknet53_body(inputs):
    """Build the Darknet-53 convolutional backbone.

    The network is one stride-1 convolution followed by five stages. Each
    stage is a stride-2 convolution followed by a number of residual blocks.
    Layers are created in the same order as the unrolled definition so that
    automatically numbered variable names stay stable.

    Args:
        inputs: input image tensor.

    Returns:
        Tuple ``(route_1, route_2, route_3)``: the outputs of the third,
        fourth and fifth stages.
    """
    def _residual(block_in, filters):
        # 1x1 bottleneck, 3x3 expansion, then the skip connection.
        out = conv2d(block_in, filters * 1, 1)
        out = conv2d(out, filters * 2, 3)
        return out + block_in

    # (output channels of the stride-2 conv, number of residual blocks)
    stages = ((64, 1), (128, 2), (256, 8), (512, 8), (1024, 4))

    net = conv2d(inputs, 32, 3, strides=1)

    stage_outputs = []
    for channels, num_blocks in stages:
        net = conv2d(net, channels, 3, strides=2)
        for _ in range(num_blocks):
            net = _residual(net, channels // 2)
        stage_outputs.append(net)

    route_1, route_2, route_3 = stage_outputs[-3:]
    return route_1, route_2, route_3


def yolo_block(inputs, filters):
    """Stack of alternating 1x1 / 3x3 convolutions used by each detection scale.

    Args:
        inputs: input feature map.
        filters: channel count of the 1x1 layers; the 3x3 layers use twice that.

    Returns:
        Tuple ``(route, net)``: ``route`` is the output of the fifth
        convolution (a 1x1 layer) and ``net`` is the output of the final 3x3
        convolution applied on top of it.
    """
    net = inputs
    # Two (1x1, 3x3) pairs ...
    for _ in range(2):
        net = conv2d(net, filters * 1, 1)
        net = conv2d(net, filters * 2, 3)
    # ... then one more 1x1 whose output is also returned as the route.
    route = conv2d(net, filters * 1, 1)
    net = conv2d(route, filters * 2, 3)
    return route, net


def upsample_layer(inputs, out_shape):
    """Nearest-neighbour resize of ``inputs`` to a target spatial size.

    Args:
        inputs: tensor to resize.
        out_shape: indexable shape; entries 1 and 2 are used as the target
            height and width respectively.

    Returns:
        The resized tensor (op name 'upsampled').
    """
    # Height comes first in the size argument.
    target_hw = (out_shape[1], out_shape[2])
    return tf.image.resize_nearest_neighbor(
        inputs, target_hw, align_corners=True, name='upsampled')

In [4]:
def int_shape(x):
    """Return the static shape of ``x`` as a tuple.

    Args:
        x: an object exposing either a ``_keras_shape`` attribute or a
            ``get_shape()`` method whose result has ``as_list()``.

    Returns:
        ``x._keras_shape`` when that attribute exists; otherwise the static
        shape as a tuple, or ``None`` if ``as_list()`` raises ``ValueError``.
    """
    _missing = object()
    keras_shape = getattr(x, '_keras_shape', _missing)
    if keras_shape is not _missing:
        return keras_shape

    try:
        dims = x.get_shape().as_list()
    except ValueError:
        return None
    return tuple(dims)
  
def resize_images(x):
    """Bilinearly upsample a channels-last feature map by a factor of two.

    The target size is computed from the dynamic shape of ``x`` so the op
    works with unknown spatial dimensions; the static height and width are
    then restored on the result where they were known on the input.

    Args:
        x: 4-D tensor whose axes 1 and 2 are height and width.

    Returns:
        The resized tensor (op name 'upsample').
    """
    height_factor = 2
    width_factor = 2
    # Only the channels-last layout is supported: rows on axis 1, cols on 2.
    rows, cols = 1, 2

    original_shape = int_shape(x)
    new_shape = array_ops.shape(x)[rows:cols + 1]
    new_shape *= constant_op.constant(
        np.array([height_factor, width_factor], dtype='int32'))

    x = image_ops.resize_bilinear(x, new_shape, align_corners=True, name='upsample')

    # int_shape returns None when the static shape cannot be listed; in that
    # case there is nothing to restore.
    if original_shape is None:
        return x

    if original_shape[rows] is None:
        new_height = None
    else:
        new_height = original_shape[rows] * height_factor

    if original_shape[cols] is None:
        new_width = None
    else:
        new_width = original_shape[cols] * width_factor

    # The resize op was given a dynamic size, so re-attach the static
    # height/width; None entries leave batch and channel dims as they are.
    x.set_shape((None, new_height, new_width, None))
    return x


In [15]:
def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Match graph variables against the variables stored in a checkpoint.

    Variables are matched by name with the trailing ``:<index>`` output
    suffix stripped from the graph variable name. The lookup is keyed by
    those name strings; keying it by the variable objects themselves would
    never match the string names read from the checkpoint and would leave
    the assignment map empty.

    Args:
        tvars: iterable of variables (objects with a ``name`` attribute).
        init_checkpoint: checkpoint path passed to ``tf.train.list_variables``.

    Returns:
        Tuple ``(assignment_map, initialized_variable_names)``:
        ``assignment_map`` is an OrderedDict mapping each matched checkpoint
        variable name to itself, in checkpoint order;
        ``initialized_variable_names`` is a dict whose keys are the matched
        names both with and without the ``:0`` suffix (values are 1).
    """
    graph_var_names = set()
    for var in tvars:
        name = var.name
        m = re.match('^(.*):\\d+$', name)
        if m is not None:
            name = m.group(1)
        graph_var_names.add(name)

    assignment_map = collections.OrderedDict()
    initialized_variable_names = {}

    for ckpt_entry in tf.train.list_variables(init_checkpoint):
        name = ckpt_entry[0]
        if name not in graph_var_names:
            continue

        assignment_map[name] = name
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ":0"] = 1

    return assignment_map, initialized_variable_names

In [6]:
class yolov3(object):
    """YOLOv3 network: Darknet-53 backbone plus a three-scale detection head."""

    def __init__(self, params, batch_norm_decay=0.9):
        """Store the model configuration.

        Args:
            params: mapping providing 'anchors' and 'num_class'.
            batch_norm_decay: decay passed to the batch-norm layers.
        """
        self.anchors = params['anchors']
        self.class_num = params['num_class']
        self.batch_norm_decay = batch_norm_decay

    def _detection_output(self, net, name):
        """Final 1x1 convolution of a detection scale (no batch norm, no
        activation, zero-initialised bias), wrapped in a named identity op."""
        logits = slim.conv2d(net, 3 * (5 + self.class_num), 1,
                             stride=1, normalizer_fn=None,
                             activation_fn=None,
                             biases_initializer=tf.zeros_initializer())
        return tf.identity(logits, name=name)

    def forward(self, inputs, is_training=False, reuse=False):
        """Build the network graph.

        Args:
            inputs: input image tensor; axes 1 and 2 are recorded as the
                image size.
            is_training: forwarded to batch norm.
            reuse: forwarded to the slim conv / batch-norm variable scopes.

        Returns:
            Tuple ``(feature_map_1, feature_map_2, feature_map_3)``, each with
            ``3 * (5 + class_num)`` channels. The first is built on the last
            backbone output; the other two on progressively upsampled
            features concatenated with earlier backbone outputs.
        """
        # Input image size, form: [height, width]; kept for later use.
        self.img_size = tf.shape(inputs)[1:3]

        bn_settings = {
            'decay': self.batch_norm_decay,
            'epsilon': 1e-05,
            'scale': True,
            'is_training': is_training,
            'fused': None,  # Use fused batch norm if possible.
        }

        def _leaky_relu(x):
            return tf.nn.leaky_relu(x, alpha=0.1)

        with slim.arg_scope([slim.conv2d, slim.batch_norm], reuse=reuse):
            with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm,
                                normalizer_params=bn_settings,
                                biases_initializer=None,
                                activation_fn=_leaky_relu):
                with tf.variable_scope('yolov3/darknet53_body'):
                    route_1, route_2, route_3 = darknet53_body(inputs)

                with tf.variable_scope('yolov3/yolov3_head'):
                    # Scale 1: coarsest backbone output.
                    branch, net = yolo_block(route_3, 512)
                    feature_map_1 = self._detection_output(net, 'feature_map_1')

                    # Scale 2: upsample and merge with route_2.
                    branch = conv2d(branch, 256, 1)
                    branch = resize_images(branch)
                    merged = tf.concat([branch, route_2], axis=3)

                    branch, net = yolo_block(merged, 256)
                    feature_map_2 = self._detection_output(net, 'feature_map_2')

                    # Scale 3: upsample again and merge with route_1.
                    branch = conv2d(branch, 128, 1)
                    branch = resize_images(branch)
                    merged = tf.concat([branch, route_1], axis=3)

                    _, net = yolo_block(merged, 128)
                    feature_map_3 = self._detection_output(net, 'feature_map_3')

        return feature_map_1, feature_map_2, feature_map_3


In [7]:
def broadcast_iou(true_box_xy, true_box_wh, pred_box_xy, pred_box_wh):
    """Pairwise IoU between predicted boxes and a list of true boxes.

    Boxes are given as centre (xy) and size (wh). A broadcast axis is added
    to the predictions (second to last) and to the true boxes (leading), so
    every prediction is compared with every true box.

    Args:
        true_box_xy: true box centres; assumed shape [T, 2] -- TODO confirm
            against the caller.
        true_box_wh: true box sizes, same layout as ``true_box_xy``.
        pred_box_xy: predicted box centres, last axis of size 2.
        pred_box_wh: predicted box sizes, same layout as ``pred_box_xy``.

    Returns:
        IoU tensor with the predictions' leading axes followed by one axis
        over the true boxes.
    """
    pred_box_xy = tf.expand_dims(pred_box_xy, -2)
    pred_box_wh = tf.expand_dims(pred_box_wh, -2)

    true_box_xy = tf.expand_dims(true_box_xy, 0)
    true_box_wh = tf.expand_dims(true_box_wh, 0)

    intersect_mins = tf.maximum(pred_box_xy - pred_box_wh / 2.,
                                true_box_xy - true_box_wh / 2.)
    intersect_maxs = tf.minimum(pred_box_xy + pred_box_wh / 2.,
                                true_box_xy + true_box_wh / 2.)

    # Clamp at zero so disjoint boxes contribute no area.
    intersect_wh = tf.maximum(intersect_maxs - intersect_mins, 0.)

    # Area is width * height of the overlap (both taken from intersect_wh).
    intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
    pred_box_area = pred_box_wh[..., 0] * pred_box_wh[..., 1]
    true_box_area = true_box_wh[..., 0] * true_box_wh[..., 1]

    # Small epsilon guards against division by zero for degenerate boxes.
    iou = intersect_area / (pred_box_area + true_box_area - intersect_area + 1e-10)

    return iou

def _reorg_layer(feature_map, anchors, num_classes, img_size):
    """Decode a raw detection feature map into boxes in image coordinates.

    Args:
        feature_map: raw head output; axes 1 and 2 are the grid (static
            sizes are required) and the last axis holds
            ``len(anchors) * (5 + num_classes)`` values.
        anchors: sequence of anchor pairs for this scale, indexed as
            ``a[0]`` (paired with the x axis) and ``a[1]`` (paired with y).
        num_classes: number of object classes.
        img_size: tensor ``[height, width]`` of the input image.

    Returns:
        Tuple ``(x_y_offset, boxes, conf_logits, prob_logits)``:
        ``x_y_offset`` is the [grid_h, grid_w, 1, 2] cell-index grid in
        (x, y) order, ``boxes`` holds decoded (cx, cy, w, h) scaled by the
        stride, and the two logits tensors are the raw confidence and class
        outputs.
    """
    num_anchors = len(anchors)
    # Static [grid_h, grid_w].
    grid_size = feature_map.shape.as_list()[1:3]

    # stride is [stride_h, stride_w]; box values are stored as (x, y) /
    # (w, h), hence the reversed indexing below.
    stride = tf.cast(img_size // grid_size, tf.float32)
    anchors = [(a[0] / stride[1], a[1] / stride[0]) for a in anchors]

    feature_map = tf.reshape(feature_map, [-1, grid_size[0], grid_size[1], num_anchors, 5 + num_classes])

    box_centers, box_sizes, conf_logits, prob_logits = tf.split(
        feature_map, [2, 2, 1, num_classes], axis=-1)

    box_centers = tf.nn.sigmoid(box_centers)

    # x runs along the width axis (grid_size[1]), y along the height axis
    # (grid_size[0]); meshgrid then yields [grid_h, grid_w] arrays, which is
    # the layout the reshape below expects.
    grid_x = tf.range(grid_size[1], dtype=tf.int32)
    grid_y = tf.range(grid_size[0], dtype=tf.int32)

    mesh_x, mesh_y = tf.meshgrid(grid_x, grid_y)
    x_offset = tf.reshape(mesh_x, (-1, 1))
    y_offset = tf.reshape(mesh_y, (-1, 1))
    x_y_offset = tf.concat([x_offset, y_offset], axis=-1)
    x_y_offset = tf.reshape(x_y_offset, [grid_size[0], grid_size[1], 1, 2])
    x_y_offset = tf.cast(x_y_offset, tf.float32)

    box_centers = box_centers + x_y_offset
    box_centers = box_centers * stride[::-1]

    # exp() is clipped to keep the decoded sizes finite.
    box_sizes = tf.clip_by_value(tf.exp(box_sizes), 1e-9, 50) * anchors
    box_sizes = box_sizes * stride[::-1]

    boxes = tf.concat([box_centers, box_sizes], axis=-1)
    return x_y_offset, boxes, conf_logits, prob_logits
      
def loss_layer(feature_map_i, y_true, anchors, num_classes, img, ignore_thresh):
    """Compute the YOLO loss terms for one detection scale.

    Args:
        feature_map_i: raw head output for this scale (static grid size
            required on axes 1 and 2).
        y_true: labels for this scale; reshaped to
            [-1, grid_h, grid_w, len(anchors), 5 + num_classes] with the last
            axis laid out as (x, y, w, h, objectness, classes...).
        anchors: anchor pairs for this scale.
        num_classes: number of object classes.
        img: the input image batch; only its height/width are read.
        ignore_thresh: cells without an object whose predicted box overlaps
            the label box stored in the same cell with IoU at or above this
            value are excluded from the confidence loss.

    Returns:
        Tuple ``(xy_loss, wh_loss, conf_loss, class_loss)`` of scalar tensors.
    """
    NO_OBJECT_SCALE  = 1.0
    OBJECT_SCALE     = 5.0
    COORD_SCALE      = 1.0
    CLASS_SCALE      = 1.0

    img_size = tf.shape(img)[1:3]
    grid_size = tf.shape(feature_map_i)[1:3]
    grid_size_ = feature_map_i.shape.as_list()[1:3]

    y_true = tf.reshape(y_true, [-1, grid_size_[0], grid_size_[1], len(anchors), 5+num_classes])
    stride = tf.cast(img_size//grid_size, dtype=tf.float32)
    # stride is [stride_h, stride_w] while boxes are stored as (x, y, w, h);
    # _reorg_layer scales its outputs by stride[::-1], so the same reversed
    # order is used to convert back to grid units here.
    stride_xy = stride[::-1]
    N = tf.cast(tf.shape(feature_map_i)[0], tf.float32)

    pred_result = _reorg_layer(feature_map_i, anchors, num_classes, img_size)
    x_y_offset,  pred_boxes, pred_conf_logits, pred_prob_logits = pred_result

    object_mask = y_true[..., 4:5]

    true_box_xy = y_true[...,:2] # absolute coordinate
    true_box_wh = y_true[...,2:4] # absolute size

    pred_box_xy = pred_boxes[...,:2]# absolute coordinate
    pred_box_wh = pred_boxes[...,2:4]# absolute size

    # Calculate the IoU between the true box and the predicted box stored in
    # the same cell/anchor slot (element-wise, not all-pairs).
    intersect_xy1 = tf.maximum(true_box_xy - true_box_wh / 2.0,
                               pred_box_xy - pred_box_wh / 2.0)
    intersect_xy2 = tf.minimum(true_box_xy + true_box_wh / 2.0,
                               pred_box_xy + pred_box_wh / 2.0)
    intersect_wh = tf.maximum(intersect_xy2 - intersect_xy1, 0.)
    intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]

    true_area = true_box_wh[..., 0] * true_box_wh[..., 1]
    pred_area = pred_box_wh[..., 0] * pred_box_wh[..., 1]

    union_area = true_area + pred_area - intersect_area + 1e-10
    iou_scores = tf.truediv(intersect_area, union_area)
    iou_scores = tf.expand_dims(iou_scores, axis=-1)

    true_box_conf = y_true[...,4:5]
    pred_box_conf = tf.sigmoid(pred_conf_logits)

    ### adjust x and y => relative position to the containing cell
    true_box_xy = true_box_xy / stride_xy - x_y_offset
    pred_box_xy = pred_box_xy / stride_xy - x_y_offset

    ### adjust w and h => relative size to the containing cell
    true_box_wh_logit = true_box_wh / (anchors * stride_xy)
    pred_box_wh_logit = pred_box_wh / (anchors * stride_xy)

    # Empty slots have zero size; map them to 1 so the log below is 0
    # instead of -inf (they are masked out of the loss anyway).
    true_box_wh_logit = tf.where(condition=tf.equal(true_box_wh_logit,0),
                                 x=tf.ones_like(true_box_wh_logit), y=true_box_wh_logit)
    pred_box_wh_logit = tf.where(condition=tf.equal(pred_box_wh_logit,0),
                                 x=tf.ones_like(pred_box_wh_logit), y=pred_box_wh_logit)

    true_box_wh = tf.log(tf.clip_by_value(true_box_wh_logit, 1e-9, 1e9))
    pred_box_wh = tf.log(tf.clip_by_value(pred_box_wh_logit, 1e-9, 1e9))

    # Background cells count towards the confidence loss only when their
    # prediction does not already overlap the cell's label box.
    conf_mask = tf.cast(iou_scores < ignore_thresh, dtype=tf.float32) * (1 - object_mask) * NO_OBJECT_SCALE
    # penalize the confidence of the boxes which are responsible for the corresponding ground truth box
    conf_mask = conf_mask + object_mask * OBJECT_SCALE

    ### adjust class probabilities
    class_mask = object_mask * CLASS_SCALE
    ### coordinate mask: simply the position of the ground truth boxes (the predictors)
    coord_mask = object_mask * COORD_SCALE

    nb_coord_box = tf.reduce_sum(tf.cast(coord_mask > 0.0, dtype=tf.float32))
    nb_conf_box  = tf.reduce_sum(tf.cast(conf_mask  > 0.0, dtype=tf.float32))
    nb_class_box = tf.reduce_sum(tf.cast(class_mask > 0.0, dtype=tf.float32))

    xy_loss = tf.reduce_sum(tf.square(true_box_xy-pred_box_xy)     * coord_mask) / (nb_coord_box + 1e-6) / 2.
    wh_loss = tf.reduce_sum(tf.square(true_box_wh-pred_box_wh)     * coord_mask) / (nb_coord_box + 1e-6) / 2.
    conf_loss = tf.reduce_sum(tf.square(true_box_conf-pred_box_conf) * conf_mask)  / (nb_conf_box  + 1e-6) / 2.
    loss_class = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true[...,5:], logits=pred_prob_logits)
    loss_class = tf.clip_by_value(loss_class, 1e-4, 10)
    # NOTE(review): unlike the other terms this one is divided by the batch
    # size N rather than by 2 -- confirm that is intended.
    class_loss = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6) / N

    return xy_loss, wh_loss, conf_loss, class_loss
      
def compute_loss(y_pred, y_true, anchors, num_classes, img, ignore_thresh):
    """Sum the per-scale YOLO loss terms over all detection scales.

    Args:
        y_pred: sequence of raw feature maps, one per scale
            ([feature_map_1, feature_map_2, feature_map_3]).
        y_true: dict with keys 'y_true1', 'y_true2', 'y_true3' holding the
            labels for the matching scale.
        anchors: nine anchor pairs; the first scale uses entries 6-8, the
            second 3-5 and the third 0-2.
        num_classes: number of object classes.
        img: input image batch, forwarded to ``loss_layer``.
        ignore_thresh: IoU threshold forwarded to ``loss_layer``.

    Returns:
        List ``[total_loss, loss_coord, loss_sizes, loss_confs, loss_class]``
        where ``total_loss`` is the sum of the other four.
    """
    anchors_by_scale = [anchors[6:9], anchors[3:6], anchors[0:3]]
    labels_by_scale = [y_true['y_true1'], y_true['y_true2'], y_true['y_true3']]

    loss_coord = 0.
    loss_sizes = 0.
    loss_confs = 0.
    loss_class = 0.

    for scale, feature_map in enumerate(y_pred):
        terms = loss_layer(feature_map, labels_by_scale[scale],
                           anchors_by_scale[scale], num_classes, img, ignore_thresh)
        loss_coord += terms[0]
        loss_sizes += terms[1]
        loss_confs += terms[2]
        loss_class += terms[3]

    total_loss = loss_coord + loss_sizes + loss_confs + loss_class
    return [total_loss, loss_coord, loss_sizes, loss_confs, loss_class]


In [12]:
def yolo_head(feats, anchors, num_classes, input_shape, training=True):
  """Decode a raw detection feature map into normalised box parameters.

  Args:
    feats: raw head output; axes 1 and 2 are the grid.
    anchors: anchor pairs for this scale.
    num_classes: number of object classes.
    input_shape: two-element sequence; it is reversed before dividing the
      box sizes.
    training: selects which tuple is returned.

  Returns:
    When ``training == True``: ``(grid, predictions, box_xy, box_wh)`` where
    ``grid`` is the float cell-index grid in (x, y) order and ``predictions``
    is ``feats`` reshaped to [-1, grid_h, grid_w, num_anchors, num_classes + 5].
    Otherwise: ``(box_xy, box_wh, box_confidence, box_class_probs)``.
  """
  n_anchors = len(anchors)
  anchor_grid = tf.reshape(tf.constant(anchors, dtype=tf.float32), [1, 1, 1, n_anchors, 2])
  grid_hw = tf.shape(feats)[1:3]
  raw = tf.reshape(feats, [-1, grid_hw[0], grid_hw[1], n_anchors, num_classes + 5])

  # Row (y) and column (x) indices, each tiled to [grid_h, grid_w, 1, 1].
  rows = tf.tile(tf.reshape(tf.range(grid_hw[0]), [-1, 1, 1, 1]), [1, grid_hw[1], 1, 1])
  cols = tf.tile(tf.reshape(tf.range(grid_hw[1]), [1, -1, 1, 1]), [grid_hw[0], 1, 1, 1])
  cell_index = tf.cast(tf.concat([cols, rows], axis=-1), dtype=tf.float32)

  # Centres are normalised by the grid size, sizes by the input shape.
  centres = (tf.sigmoid(raw[..., :2]) + cell_index) / tf.cast(grid_hw[::-1], tf.float32)
  sizes = tf.exp(raw[..., 2:4]) * anchor_grid / input_shape[::-1]

  objectness = tf.sigmoid(raw[..., 4:5])
  class_probs = tf.sigmoid(raw[..., 5:])

  if training == True:
    return cell_index, raw, centres, sizes
  return centres, sizes, objectness, class_probs


def box_iou(box1, box2):
  """All-pairs IoU between two sets of (cx, cy, w, h) boxes.

  Args:
    box1: boxes with the last axis holding (cx, cy, w, h); a broadcast axis
      is inserted before the last axis.
    box2: boxes with the last axis holding (cx, cy, w, h); a broadcast axis
      is inserted at the front.

  Returns:
    IoU with the leading axes of ``box1`` followed by one axis over the
    entries of ``box2``.
  """
  first = tf.expand_dims(box1, -2)
  first_xy, first_wh = first[..., :2], first[..., 2:4]
  first_lo = first_xy - first_wh / 2.
  first_hi = first_xy + first_wh / 2.

  second = tf.expand_dims(box2, 0)
  second_xy, second_wh = second[..., :2], second[..., 2:4]
  second_lo = second_xy - second_wh / 2.
  second_hi = second_xy + second_wh / 2.

  # Overlap rectangle, clamped to zero extent for disjoint boxes.
  overlap_lo = tf.maximum(first_lo, second_lo)
  overlap_hi = tf.minimum(first_hi, second_hi)
  overlap_wh = tf.maximum(overlap_hi - overlap_lo, 0.)
  overlap_area = overlap_wh[..., 0] * overlap_wh[..., 1]

  first_area = first_wh[..., 0] * first_wh[..., 1]
  second_area = second_wh[..., 0] * second_wh[..., 1]
  return overlap_area / (first_area + second_area - overlap_area + 1e-10)


def yolo_loss(y_pred, y_true, anchors, num_classes, ignore_thresh):
    """Alternative YOLOv3 loss based on ``yolo_head`` / ``box_iou``.

    Args:
        y_pred: sequence of three raw feature maps, one per scale.
        y_true: dict with keys 'y_true1', 'y_true2', 'y_true3'; the last axis
            of each entry is laid out as (x, y, w, h, objectness, classes...).
        anchors: array of nine anchor pairs; must support indexing with a
            list of indices (e.g. a numpy array).
        num_classes: number of object classes.
        ignore_thresh: background predictions whose best IoU with a true box
            is at or above this value are excluded from the confidence loss.

    Returns:
        List ``[loss, xy_loss, wh_loss, confidence_loss, class_loss]``, each
        summed over the three scales and divided by the batch size.
    """
    loss, xy_los, wh_los, confidence_los, class_los = 0, 0, 0, 0, 0
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    input_shape = [416.0, 416.0]
    grid_shapes = [tf.cast(tf.shape(y_pred[l])[1:3], tf.float32) for l in range(3)]

    y_true = [y_true['y_true1'], y_true['y_true2'], y_true['y_true3']]

    # The batch size is the same for every scale; build the op once.
    batch_size = tf.cast(tf.shape(y_pred[0])[0], tf.float32)

    for i in range(3):
        scale_anchors = anchors[anchor_mask[i]]
        object_mask = y_true[i][..., 4:5]
        class_probs = y_true[i][..., 5:]
        grid, predictions, pred_xy, pred_wh = yolo_head(y_pred[i], scale_anchors, num_classes, input_shape, training=True)

        pred_box = tf.concat([pred_xy, pred_wh], axis=-1)
        raw_true_xy = y_true[i][..., :2] * grid_shapes[i][::-1] - grid
        object_mask_bool = (object_mask > 0)

        # Ratio of the true size to the anchor size; empty slots (ratio 0)
        # are replaced by 1 so that the log is 0 instead of -inf.
        true_wh_ratio = y_true[i][..., 2:4] / scale_anchors * input_shape[::-1]
        raw_true_wh = tf.log(tf.where(tf.equal(true_wh_ratio, 0), tf.ones_like(y_true[i][..., 2:4]), true_wh_ratio))
        box_loss_scale = 2 - y_true[i][..., 2:3] * y_true[i][..., 3:4]

        # NOTE(review): boolean_mask flattens the batch axis, so each
        # prediction is compared against the true boxes of every image in
        # the batch, not only its own image -- confirm this is acceptable.
        true_box = tf.boolean_mask(y_true[i][..., 0:4], object_mask_bool[..., 0])
        iou = box_iou(pred_box, true_box)
        best_iou = tf.reduce_max(iou, axis=-1)
        ignore_mask = tf.cast(best_iou < ignore_thresh, tf.float32)
        ignore_mask = tf.expand_dims(ignore_mask, -1)

        # The objectness cross-entropy is shared by the positive and the
        # negative part of the confidence loss.
        conf_xent = tf.nn.sigmoid_cross_entropy_with_logits(labels = object_mask, logits = predictions[..., 4:5])

        xy_loss = object_mask * box_loss_scale * tf.nn.sigmoid_cross_entropy_with_logits(labels = raw_true_xy, logits = predictions[..., 0:2])
        wh_loss = object_mask * box_loss_scale * 0.5 * tf.square(raw_true_wh - predictions[..., 2:4])
        confidence_loss = object_mask * conf_xent + (1 - object_mask) * conf_xent * ignore_mask
        class_loss = object_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels = class_probs, logits = predictions[..., 5:])

        xy_loss = tf.reduce_sum(xy_loss) / batch_size
        wh_loss = tf.reduce_sum(wh_loss) / batch_size
        confidence_loss = tf.reduce_sum(confidence_loss) / batch_size
        class_loss = tf.reduce_sum(class_loss) / batch_size

        xy_los += xy_loss
        wh_los += wh_loss
        confidence_los += confidence_loss
        class_los += class_loss
        loss += xy_loss + wh_loss + confidence_loss + class_loss

    return [loss, xy_los, wh_los, confidence_los, class_los]

In [8]:
def model_fn(features, labels, mode, params):
    """TPUEstimator model function for YOLOv3 fine-tuning.

    Builds the network, computes the detection loss and returns a training
    spec that updates only the variables selected by ``params['update_part']``.

    Args:
        features: input image batch.
        labels: dict of per-scale labels consumed by ``compute_loss``.
        mode: estimator mode; only ``params['train']`` is handled.
        params: mapping with the hyper-parameters read below ('anchors',
            'num_class', 'restore_part', 'update_part', 'init_checkpoint',
            'ignore_thresh', 'base_learning_rate', 'decay_steps',
            'decay_rate', 'use_tpu', 'fine_tune', 'train').

    Returns:
        A ``tf.contrib.tpu.TPUEstimatorSpec`` when ``mode`` equals
        ``params['train']``; ``None`` for any other mode.
    """
    print('Model fn')
    model = yolov3(params)
    y_pred = model.forward(features, is_training=True)

    tvars = tf.trainable_variables()
    vars_to_init = tf.contrib.framework.get_variables_to_restore(include=params['restore_part'])

    # Defaults so that the code below works when no checkpoint is configured.
    assignment_map = {}
    initialized_variable_names = {}

    if params['init_checkpoint']:
        (assignment_map, initialized_variable_names) = get_assignment_map_from_checkpoint(vars_to_init, params['init_checkpoint'])

    if mode == params['train']:
        global_step = tf.train.get_or_create_global_step()

        update_vars = tf.contrib.framework.get_variables_to_restore(include=params['update_part'])
        loss = compute_loss(y_pred, labels, params['anchors'], params['num_class'], features, params['ignore_thresh'])
        l2_loss = tf.losses.get_regularization_loss()
        total_loss = loss[0] + l2_loss
        learning_rate = tf.train.exponential_decay(params['base_learning_rate'], global_step, params['decay_steps'], params['decay_rate'], staircase=True)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)

        if params['use_tpu']:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        # Batch-norm moving statistics are updated through ops collected in
        # UPDATE_OPS; they only run if the train op depends on them.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_opt = optimizer.minimize(total_loss, global_step=global_step, var_list=update_vars)

        # scaffold_fn stays None (the spec's default) unless fine-tuning from
        # a checkpoint is requested.
        scaffold_fn = None
        if params['fine_tune'] and params['init_checkpoint']:

            def scaffold_fn():
                tf.logging.info('Fine Tuning')
                tf.train.init_from_checkpoint(params['init_checkpoint'], assignment_map)
                return tf.train.Scaffold()

        tf.logging.info('**** Trainable Variables ****')
        for var in tvars:
            init_string = ''
            if var.name in initialized_variable_names:
                init_string = ', *INIT_FROM_CKPT*'
            tf.logging.info(' name = %s, shape = %s%s', var.name, var.shape, init_string)

        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_opt,
            scaffold_fn=scaffold_fn
        )

    # NOTE(review): evaluation and prediction modes are not implemented;
    # the function falls through and returns None for them.
    return None
                

In [9]:
def pad_to_fixed_size(data, pad_value):
    """Pad a variable-length list of 4-value rows to a fixed [35, 4] tensor.

    Args:
        data: tensor whose elements are reshaped into rows of 4 values.
        pad_value: value used to fill the rows appended after the real data.

    Returns:
        Tensor of shape [35, 4]: the input rows followed by padding rows. An
        assertion op checks that there are at most 35 input rows.
    """
    max_num_instances, dimension = 35, 4

    rows = tf.reshape(data, [-1, dimension])
    num_instances = tf.shape(rows)[0]

    # Tie the pad length to the size check so the assertion is evaluated
    # whenever the padding is built.
    size_check = tf.Assert(
        tf.less_equal(num_instances, max_num_instances), [num_instances])
    with tf.control_dependencies([size_check]):
        pad_length = max_num_instances - num_instances

    filler = pad_value * tf.ones([pad_length, dimension])
    padded = tf.concat([rows, filler], axis=0)
    return tf.reshape(padded, [max_num_instances, dimension])


In [20]:
def preprocess(image, true_boxes):
    """Normalise a batch of images and build the per-scale label tensors.

    Args:
        image: image batch; scaled by 1/255 and reshaped to
            [train_batch_size, height, width, channels].
        true_boxes: ground-truth boxes, forwarded to
            ``preprocess_true_boxes`` through ``tf.py_function``.

    Returns:
        Tuple ``(image, labels)`` where ``labels`` is a dict with keys
        'y_true1', 'y_true2', 'y_true3' holding the labels for the grids at
        1/32, 1/16 and 1/8 of the input resolution, each shaped
        [train_batch_size, grid_h, grid_w, 3, 5 + num_class].
    """
    tf.logging.info('preprocess')
    hparams = get_hparams()
    image = image/255.0
    # NOTE(review): the only definition visible in this notebook is spelled
    # `preprocess_true_boxe`; confirm `preprocess_true_boxes` exists.
    y_true_13, y_true_26, y_true_52 = tf.py_function(preprocess_true_boxes, inp=[true_boxes], Tout = [tf.float32, tf.float32, tf.float32])

    batch = hparams.train_batch_size

    # Image tensors are laid out as [batch, height, width, channels].
    image = tf.reshape(image, [batch, hparams.height, hparams.width, hparams.channels])

    # py_function outputs carry no static shape, so restore it explicitly.
    # Grid sizes follow from the input size and the three downsampling
    # factors instead of being hard-coded.
    box_depth = 5 + hparams.num_class

    def _label_shape(downsample):
        return [batch, hparams.height // downsample, hparams.width // downsample, 3, box_depth]

    labels = {}
    labels['y_true1'] = tf.reshape(y_true_13, _label_shape(32))
    labels['y_true2'] = tf.reshape(y_true_26, _label_shape(16))
    labels['y_true3'] = tf.reshape(y_true_52, _label_shape(8))

    return image, labels

def preprocess_true_boxe(gt_boxes):
    """
    Build the three YOLO target grids for the boxes of ONE image.

    NOTE(review): nothing in the visible part of this notebook calls this
    function (`preprocess` uses `preprocess_true_boxes`); it looks like an
    earlier per-image variant kept for reference.

    Parameters:
    -----------
    :param gt_boxes: 2-D array/tensor indexed as [T, 4+]
                        T: the number of boxes in the image.
                        columns 0:4 are read as x_min, y_min, x_max, y_max
                        (presumably in pixels, since they are divided by
                        hparams.width / hparams.height below -- TODO confirm)

    Anchors, input height/width and the class count are not parameters; they
    are read from `get_hparams()`.

    Returns:
    ----------
    (y_true_13, y_true_26, y_true_52): three float32 numpy arrays of shape
        [height//s, width//s, 3, 5 + num_class] for s = 32, 16, 8.
        Last axis: box centre (2), box size (2), objectness (1), class slots.

    NOTE(review): the body mixes `tf.*` ops with `np.*` calls on their
    results; whether that works depends on the execution mode (eager vs.
    graph) and has not been verified here.
    """
    tf.logging.info('preprocess_true_boxes')
    hparams = get_hparams()
    num_layers = len(hparams.anchors) // 3
    # Anchor indices assigned to each output scale, coarsest grid first.
    anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]]
    # [rows, cols] of the grid at strides 32, 16 and 8.
    grid_sizes = [[hparams.height//x, hparams.width//x] for x in (32, 16, 8)]

    box_centers = (gt_boxes[:, 0:2] + gt_boxes[:, 2:4]) / 2 # midpoint of the two corners
    box_sizes =    gt_boxes[:, 2:4] - gt_boxes[:, 0:2] # (x_max - x_min, y_max - y_min)

    # The in-place overwrite below is disabled, so gt_boxes still holds the
    # corner coordinates when it is used for the cell lookup further down.
    #gt_boxes[:, 0:2] = box_centers
    #gt_boxes[:, 2:4] = box_sizes

    y_true_13 = np.zeros(shape=[grid_sizes[0][0], grid_sizes[0][1], 3, 5+hparams.num_class], dtype=np.float32)
    y_true_26 = np.zeros(shape=[grid_sizes[1][0], grid_sizes[1][1], 3, 5+hparams.num_class], dtype=np.float32)
    y_true_52 = np.zeros(shape=[grid_sizes[2][0], grid_sizes[2][1], 3, 5+hparams.num_class], dtype=np.float32)

    # The list aliases the three arrays, so writes through y_true[l] land in
    # the arrays that are returned.
    y_true = [y_true_13, y_true_26, y_true_52]
    anchors_max =  hparams.anchors / 2.
    anchors_min = -anchors_max
    # Rows whose first size component is <= 0 are treated as padding.
    valid_mask = tf.greater(box_sizes[..., 0], 0)
    valid_mask = tf.cast(valid_mask, dtype=tf.bool)

    # Discard zero rows.
    wh = tf.boolean_mask(box_sizes, valid_mask)
    # Centre every box and every anchor on the origin so that the IoU
    # depends on width/height only; the extra axis broadcasts against the
    # anchor list.
    wh = np.expand_dims(wh, -2)
    boxes_max = wh / 2.
    boxes_min = -boxes_max

    intersect_mins = tf.maximum(boxes_min, anchors_min)
    intersect_maxs = tf.minimum(boxes_max, anchors_max)
    intersect_wh   = tf.maximum(intersect_maxs - intersect_mins, 0.)
    intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
    box_area       = wh[..., 0] * wh[..., 1]

    anchor_area = hparams.anchors[:, 0] * hparams.anchors[:, 1]
    # 1e-10 guards against a zero denominator.
    iou = intersect_area / (box_area + anchor_area - intersect_area + 1e-10)
    # Find best anchor for each true box
    best_anchor = np.argmax(iou, axis=-1)

    # NOTE(review): best_anchor is computed on the masked rows, but `t` is
    # used to index the unmasked gt_boxes / box_centers / box_sizes. The two
    # only line up when every valid row precedes every padded row.
    for t, n in enumerate(best_anchor):
        for l in range(num_layers):
            # Only the scale that owns the best anchor receives this box.
            if n not in anchor_mask[l]: continue

            # Grid cell of the box. NOTE(review): this reads columns 0 and 1
            # of gt_boxes (the first corner, given the disabled overwrite
            # above) rather than box_centers -- confirm that is intended.
            i = np.floor(gt_boxes[t,0]/hparams.width*grid_sizes[l][1]).astype('int32')
            j = np.floor(gt_boxes[t,1]/hparams.height*grid_sizes[l][0]).astype('int32')

            k = anchor_mask[l].index(n)
            # Class index is fixed at 0 (no class id is read from the input).
            c = 0

            y_true[l][j, i, k, 0:2] = box_centers[t]
            y_true[l][j, i, k, 2:4] = box_sizes[t]
            y_true[l][j, i, k,   4] = 1.
            y_true[l][j, i, k, 5+c] = 1.

    return y_true_13, y_true_26, y_true_52


def preprocess_true_boxes(true_boxes):
    """Convert a batch of ground-truth boxes into the three YOLO target grids.

    Each valid box is matched to the anchor with the highest width/height IoU
    and written into the grid cell containing the box centre, on the scale
    that owns that anchor.

    Args:
        true_boxes: array-like (ndarray or the EagerTensor passed in by
            `tf.py_function`) of shape `[B, T, 4]`, columns
            `x_min, y_min, x_max, y_max`. Rows with `x_max - x_min <= 0` are
            treated as padding and skipped.
            Assumes coordinates are in pixels of the network input, the same
            units as `hparams.anchors` and the same convention as
            `preprocess_true_boxe` -- TODO confirm against the TFRecord writer.

    Returns:
        Tuple `(y_true_13, y_true_26, y_true_52)` of float32 ndarrays shaped
        `[B, grid_h, grid_w, 3, 5 + num_class]` for strides 32, 16 and 8.
        Last axis: centre x, centre y, width, height, objectness, class slots.

    Fixes relative to the previous version:
        * input is converted to ndarray (boolean-mask indexing is used below);
        * centres use true division instead of floor division;
        * sizes are copied with the full 2-element slice (the old `[..., 2:4]`
          slice of a 2-column array was empty and made the assignment fail);
        * the grid cell is derived from the box centre, normalised by the
          input size and clamped to the grid;
        * targets are indexed `[row, col]` (y first), consistent with
          `preprocess_true_boxe` and NHWC feature maps;
        * anchors chosen for the valid rows are paired with those rows' own
          indices, so padding rows in the middle cannot shift the mapping.
    """
    tf.logging.info('preprocess_true_boxes')
    hparams = get_hparams()

    true_boxes = np.asarray(true_boxes, dtype=np.float32)
    anchors = np.asarray(hparams.anchors, dtype=np.float32)
    # assumes hparams.input_shape is (height, width); both are equal today.
    input_shape = np.array(hparams.input_shape, dtype=np.int32)
    input_h, input_w = float(input_shape[0]), float(input_shape[1])

    num_layers = len(anchors) // 3
    # Anchor indices owned by each output scale, coarsest grid first.
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]
    grid_sizes = [input_shape // stride for stride in (32, 16, 8)]

    box_centers = (true_boxes[..., 0:2] + true_boxes[..., 2:4]) / 2.
    box_sizes = true_boxes[..., 2:4] - true_boxes[..., 0:2]
    mini_batch = true_boxes.shape[0]

    y_true = [
        np.zeros(shape=[mini_batch, grid[0], grid[1], 3, 5 + hparams.num_class],
                 dtype=np.float32)
        for grid in grid_sizes
    ]

    # Loop-invariant anchor geometry, centred on the origin.
    anchors_max = anchors / 2.
    anchors_min = -anchors_max
    anchor_area = anchors[..., 0] * anchors[..., 1]
    valid_mask = box_sizes[..., 0] > 0

    # Only a single class is labelled; no class id is present in the input.
    class_id = 0

    for b in range(mini_batch):
        valid_idx = np.flatnonzero(valid_mask[b])
        if valid_idx.size == 0:
            continue

        # [V, 1, 2] so that it broadcasts against the [A, 2] anchors.
        wh = np.expand_dims(box_sizes[b, valid_idx], -2)
        boxes_max = wh / 2.
        boxes_min = -boxes_max

        intersect_mins = np.maximum(boxes_min, anchors_min)
        intersect_maxs = np.minimum(boxes_max, anchors_max)
        intersect_wh = np.maximum(intersect_maxs - intersect_mins, 0.)
        intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
        box_area = wh[..., 0] * wh[..., 1]
        # 1e-10 guards against a zero denominator.
        iou = intersect_area / (box_area + anchor_area - intersect_area + 1e-10)

        best_anchor = np.argmax(iou, axis=-1)

        for t, n in zip(valid_idx, best_anchor):
            for l in range(num_layers):
                if n not in anchor_mask[l]:
                    continue

                grid_h, grid_w = int(grid_sizes[l][0]), int(grid_sizes[l][1])
                col = int(np.floor(box_centers[b, t, 0] / input_w * grid_w))
                row = int(np.floor(box_centers[b, t, 1] / input_h * grid_h))
                # A centre on the right/bottom border would index one past
                # the grid, and a negative one would wrap around silently.
                col = min(max(col, 0), grid_w - 1)
                row = min(max(row, 0), grid_h - 1)

                k = anchor_mask[l].index(n)
                y_true[l][b, row, col, k, 0:2] = box_centers[b, t]
                y_true[l][b, row, col, k, 2:4] = box_sizes[b, t]
                y_true[l][b, row, col, k, 4] = 1
                y_true[l][b, row, col, k, 5 + class_id] = 1

    return y_true[0], y_true[1], y_true[2]
  
    
    
def parse_data(feature):
    """Turn one serialized `tf.Example` into an `(image, labels)` pair.

    The record is expected to carry a JPEG-encoded `image`, its `height` and
    `width`, the box count `objects`, and a flat float list `bbox` holding
    four values per box (a `filename` entry is parsed but not used).

    Args:
        feature: scalar string tensor with the serialized example.

    Returns:
        Whatever `preprocess` returns for the decoded float32 image and the
        boxes after they have been zero-padded by `pad_to_fixed_size`.
    """
    tf.logging.info('parse_data')

    schema = {
        'image': tf.FixedLenFeature([], tf.string),
        'filename': tf.FixedLenFeature([], tf.string),
        'height': tf.FixedLenFeature([], tf.int64),
        'width': tf.FixedLenFeature([], tf.int64),
        'objects': tf.FixedLenFeature([], tf.int64),
        'bbox': tf.VarLenFeature(tf.float32),
    }
    parsed = tf.parse_single_example(feature, features=schema)

    # Scalars stored as int64 in the record; int32 is enough for shapes.
    num_boxes = tf.cast(parsed['objects'], tf.int32)
    img_width = tf.cast(parsed['width'], tf.int32)
    img_height = tf.cast(parsed['height'], tf.int32)

    # Decode, pin the shape recorded alongside the image, convert to float.
    pixels = tf.image.decode_jpeg(parsed['image'], channels=3)
    pixels = tf.reshape(pixels, tf.stack([img_height, img_width, 3]))
    pixels = tf.cast(pixels, dtype=tf.float32)

    # The sparse box list becomes a dense [num_boxes, 4] matrix, then is
    # padded with zeros to a fixed number of rows.
    boxes = tf.reshape(tf.sparse.to_dense(parsed['bbox']), [num_boxes, 4])
    boxes = pad_to_fixed_size(boxes, 0)

    return preprocess(pixels, boxes)

In [11]:
def input_fn(params):
    """Training input pipeline for `TPUEstimator`.

    Args:
        params: dict of hyper-parameters. Reads `'train_tfrecord'` (path of
            the TFRecord file) and `'shuffle'` (shuffle buffer size, in
            records). The batch size is taken from `'batch_size'` when
            present -- TPUEstimator injects it with the size this particular
            input pipeline must produce -- and falls back to
            `'train_batch_size'` otherwise.

    Returns:
        An endlessly repeating `tf.data.Dataset` of `(image, labels)` batches
        with a static batch dimension.
    """
    tf.logging.info('input_fn')

    # TPUEstimator hands each input pipeline its own batch size; using the
    # global 'train_batch_size' here would over-feed every replica when the
    # two differ.
    batch_size = params.get('batch_size', params['train_batch_size'])

    dataset = tf.data.TFRecordDataset(params['train_tfrecord'])
    # Shuffle serialized records, not decoded batches: the buffer then holds
    # small strings instead of hundreds of full image batches, and examples
    # (rather than whole batches) are what gets mixed.
    dataset = dataset.shuffle(params['shuffle'])
    # Training is bounded by max_steps, which spans many epochs; without
    # repeat() the input would run dry after a single pass over the file.
    dataset = dataset.repeat()
    dataset = dataset.map(parse_data, num_parallel_calls=64)
    # drop_remainder keeps the batch dimension static, as TPUs require.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)

    return dataset

In [None]:
if __name__ == '__main__':
    # Entry point: assemble the TPUEstimator from the hyper-parameters and
    # run training for num_epochs passes over the data.
    hparams = get_hparams()
    # Estimator params: every hparam plus two overrides used only here.
    params = dict(
        hparams.values(),
        num_cores_per_replica = 1,
        num_shards = 8
    )

    # Total optimizer steps = (examples / global batch) * epochs, truncated.
    num_train_steps = int(hparams.num_examples/hparams.train_batch_size * hparams.num_epochs)

    if params['use_tpu']:
        tf.logging.info('Using TPU')
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            params['tpu'], zone=params['tpu_zone'], project=params['gcp_project']
        )
        tpu_grpc_url = tpu_cluster_resolver.get_master()
    else:
        # Both names must exist on the non-TPU path as well: RunConfig below
        # reads tpu_cluster_resolver unconditionally, and leaving it unbound
        # raised NameError whenever use_tpu was False.
        tpu_cluster_resolver = None
        tpu_grpc_url = None

    config_proto = tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=True
    )

    tpu_config = tf.contrib.tpu.TPUConfig(
        params['iteration_per_loop'],
        num_shards = params['num_shards'],
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    )

    run_config = tf.contrib.tpu.RunConfig(
        cluster = tpu_cluster_resolver,
        save_summary_steps = 50,
        # Checkpoint once per on-device training loop.
        save_checkpoints_steps = params['iteration_per_loop'],
        model_dir = params['model_dir'],
        session_config = config_proto,
        tpu_config = tpu_config
    )

    train_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn = model_fn,
        model_dir = params['model_dir'],
        use_tpu = params['use_tpu'],
        train_batch_size = params['train_batch_size'],
        config = run_config,
        params = params
    )

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", hparams.num_examples)
    tf.logging.info("  Batch size = %d", hparams.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)

    train_estimator.train(
        input_fn = input_fn,
        max_steps = num_train_steps
    )
    