Practice the code of https://github.com/matterport/Mask_RCNN/blob/master/mrcnn/model.py

In [0]:
 # Feature Pyramid Network Heads

def fpn_classifier_graph(rois, feature_maps, image_meta, pool_size, num_classes, train_bn = True, fc_layers_size = 1024):
    """
    Builds the computation graph of the feature pyramid network classifier and regressor heads. 
    
    rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized coordinates. 
    feature_maps: List of feature maps from different layers of the pyramid, [P2, P3, P4, P5]. Each has a different resolution.
    image_meta: [batch, (meta data)] Image details. See compose_image_meta()
    pool_size: The width of the square feature map generated from ROI Pooling. 
    num_classes: number of classes, which determines the depth of the results. 
    train_bn: Boolean. Train or freeze Batch Norm layers. 
    fc_layers_size: Size of the 2 FC layers 

    Returns:
        logits: [batch, num_rois, NUM_CLASSES] classifiers logits (before softmax) 
        probs: [batch, num_rois, NUM_CLASSES] classifiers probabilities 
        bbox_deltas: [batch, num_rois, NUM_CLASSES, (dy, dx, log(dh), log(dw))] Deltas to apply to proposal boxes 
    """

    # ROI Pooling
    # Shape: [batch, num_rois, POOL_SIZE, POOL_SIZE, channels]

    x = PyramidROIAlign([pool_size, pool_size], name="roi_align_classifier")([rois, image_meta] + feature_maps)

In [0]:
def log2_graph(x):
    """Implementation of Log2. TF doesn't have a native implementation"""
    return tf.log(x) / tf.log(2.0)

import keras.engine as KE

class PyramidROIAlign(KE.Layer):
    """Implements ROI Pooling on multiple levels of the feature pyramid
    
    Params:
    - pool_shape : [pool_height, pool_width] of the output pooled regions. Usually [7, 7]

    Inputs:     # RPN 결과 나온 bounding boxes들 
    - boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized coordinates. Possibly padded with zeros if not enough boxes to fill the array. 
    - image_meta: [batch, (meta data)] Image details. See compose_image_meta() <-- image의 attribute을 1d로 만들어준다. 
    - feature_maps: List of feature maps from different levels of the pyramid. Each is [batch, height, width, channels]

    Output:
    Pooled regions in the shape: [batch, num_boxes, pool_height, pool_width, channels].
    The width and height are those specific in the pool_shape in the layer constructor 
    """

    def __init__(self, pool_shape, **kwargs):       # <-- pool shape를 input으로 받아야 함
        super(PyramidROIAlign, self).__init__(**kwargs)
        self.pool_shape = tuple(pool_shape)
    
    def call(self, inputs):         #<-- inputs: boxes, image_meta, feature_maps
        # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
        boxes = inputs[0]

        # Image meta 
        # Holds details about the image. See compose_image_meta()
        image_meta = inputs[1]

        # Feature Maps. List of feature maps from different level of the feature pyramid. Each is [batch, height, width, channels]
        feature_maps = inputs[2:]

        # Assign each ROI to a level in the pyramid based on the ROI area. 
        y1, x1, y2, x2 = tf.split(boxes, 4, axis = 2)   #<-- [batch, num_boxes, 1] x 4 
        h = y2 - y1
        w = x2 - x1

        # Use shape of first image. Images in a batch must have the same size. 
        image_shape = parse_image_meta_graph(image_meta)['image_shape'][0]  # image_shape = [H, W, C]

        # Equation 1 in the Feature Pyramid Networks paper. Account for the fact that our coordinates are normalized here. 
        image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
        roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))      # normalized -> feature map 사이즈로 sclae 조정  
        roi_level = tf.minumum(5, tf.maximum(2, 4 + tf.cast(tf.round(roi_level), tf.int32)))    # roi_level의 min=2, max=5
        roi_level = tf.squeeze(roi_level, 2)   # roi_level.shape = [batch, num_boxes], 2 이상 5 이하의 level 정보를 담고 있다.                                                                 

        # Loop through levels and apply ROI Pooling to each. P2 to P5       # P2부터 P5까지 돌며 각 level로 mapping된 roi들에 대해 RoI Pooling을 진행   # RoI Align
        pooled = []
        box_to_level = []
        for i, level in enumerate(range(2, 6)):         # 2, 3, 4, 5 <-- FPN layer index
            ix = tf.where(tf.equal(roi_level, level))   # 2d tensors. ix = [# of level-matched rois, 2]   true elements의 roi_level에서의 coordinates을 담는다.(ex. [1 0] : 1번째 batch, 0번째 box)
            level_boxes = tf.gather_nd(boxes, ix)       # box shape = [batch, num_boxes, (y1, x1, y2, x2)], box중 ix의 위치에 해당하는 box 좌표 가져올 것. 

            # Box indices for crop_and_resize
            box_indices = tf.cast(ix[:, 0], tf.int32)        # 1d tensor. ix의 column[0]의 값 = batch. 즉, 각 level_matched rois가 속하는 batch들의 값을 담는다. 

            # Keep track of which box is mapped to which level 
            box_to_level.append(ix)                     

            # Stop gradient propagation to ROI proposals        # box에 관한 정보는 RPN이 다룬다. 따라서 regression, classification 등의 head를 학습시키려면 RPN을 freeze 시켜서 변형을 막아야 한다. 
            level_boxes = tf.stop_gradient(level_boxes)         
            box_indices = tf.stop_gradient(box_indices)

            # Crop and Resize 
            # From Mask R-CNN paper: "We sample four regular locations, so that we can evaluate either max or average pooling. 
            # In fact, interpolating only a single value at each bin center (without pooling) is nearly as effective." <-- roi align
            # Here we use the simplified approach of a single value per bin, which is how it's done in tf.crop_and_resize()
            # Result: [batch * numb_boxes, pool_height, pool_width, channels]
            pooled.append(tf.image.crop_and_resize(
                feature_maps[i], level_boxes, box_indices, self.pool_shape, method = 'bilinear'     # level_boxes: 해당 level에 mapping되는 boxes   # box_indices: 1d
            ))                      # pooled.shape = [num_level_boxes, crop_height, crop_width, depth]

        # Pack pooled features into one tensor 
        pooled = tf.concat(pooled, axis = 0)      # pooled.shape = [num_boxes, crop_height, crop_width, depth]

        # Pack box_to_level mapping into one array and add another column representing the order of pooled boxes
        box_to_level = tf.concate(box_to_level, axis = 0)       # box_to_level.shape = [total # of rois, 2] ==> 2: roi의 batch, box 좌표
        box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)                  # box_range.shape = [total # of rois, 1] ==> 1: range(total # of rois)
        box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range], axis = 1)    # box_to_level.shape = [total # of rols, 3] ==> 3: roi의 batch, box 좌표, 0~ # total rois

        # Rearrange pooled features to match the order of the original boxes        
        # Sort box_to_level by batch then box index                               
        # TF doesn't have a way to sort by two columns, so merge them and sort. 
        sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]       # sorting_tensor = 1d tensor. box_to_level.shape = [total # of rois, 3] [:, 0]: batch좌표, [:, 1]: box좌표. batch 정렬 후 box 정렬  
        ix = tf.nn.top_k(sorting_tensor, k=tf.shape(                            # ix = 1d tensor. sorting tensor를 내림차순으로 정렬한 것의 역 indices, 즉 오름차순. batch와 box가 낮을 수록 ix의 앞부분 
            box_to_level)[0]).indices[::-1]
        ix = tf.gather(box_to_level[:, 2], ix)                                  # 그 indices에 해당하는 box_to_level[:, 2]: range(total # of rois)숫자를 뽑음. (original box의 순서)
        pooled = tf.gather(pooled, ix)                                          # pooled = [num_boxes, crop_height, crop_width, depth]

        # Re-add the batch dimension
        shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis = 0)           # [batch, num_boxes, crop_height, crop_width, depth]
        pooled = tf.reshape(pooled, shape)                                  
        return pooled

    def compute_output_shape(self, input_shape):
        return input_shape[0][:2] + self.pool_shape + (input_shape[2][-1], )       # [batch + 7 + #channels, num_boxes, 7]

Using TensorFlow backend.


In [0]:
# Data Formatting 
import numpy as np

def compose_image_meta(image_id, original_image_shape, image_shape, window, scale, active_class_ids):
    """
    Takes attributes of an image and puts them in one 1D array. 

    image_id: An int ID of the image. Useful for debugging. 
    original_image_shape: [H, W, C] before resizing or padding. 
    image_shape: [H, W, C] after resizing and padding
    window: (y1, x1, y2, x2) in pixels. The area of the image where the real image is (excluding the padding)
    scale: The scaling factor applied to the original image (float32)
    active_class_ids: List of class_ids available in the dataset from which the image came. 
    Useful if training on images from multiple datasets where not all classes are present in all datasets. 
    """

    meta = np.array(
        [image_id] +    # size = 1
        list(original_image_shape) +    # size = 3
        list(image_shape) +     # size = 3
        list(window) +  # size = 4 (y1, x1, y2, x2) in image coordinates
        [scale] +   # size = 1
        list(active_class_ids)  # size = num_classes
    )
    return meta 

In [0]:
# Data Formatting

def parse_image_meta(meta):
    """
    Parses an array that contains image attributes to its components. 

    meta: [batch, meta length] where meta length depends on NUM_CLASSES

    Returns a dict of the parsed values. 
    """
    image_id = meta[:, 0]
    original_image_shape = meta[:, 1:4]
    image_shape = meta[:, 4:7]
    window = meta[:, 7:11]      # (y1, x1, y2, x2) window of image in pixels 
    scale = meta[:, 11]
    active_class_ids = meta[:, 12:]
    return {
        "image_id" : image_id.astype(np.int32),
        "original_image_shape" : original_image_shape.astype(np.int32),
        "image_shape" : image_shape.astype(np.int32),
        "window" : window.astype(np.int32),
        "scale" : scale.astype(np.float32),
        "active_class_ids" : active_class_ids.astype(np.int32)
    }

In [0]:
# Data Formatting 

def parse_image_meta_graph(meta):
    """
    Parses a tensor that contains image attributes to its componenets. See compose_image_meta() for more details. 
    
    meta: [batch, meta length] where meta length depends on NUM_CLASSES

    returns a dict of the parsed tensors
    """
    image_id = meta[:, 0]
    original_image_shape = meta[:, 1:4]
    image_shape = meta[:, 4:7]
    window = meta[:, 7:11]      # (y1, x1, y2, x2) window of image in pixels 
    scale = meta[:, 11]
    active_class_ids = meta[:, 12:]
    return {
        "image_id" : image_id,
        "original_image_shape" : original_image_shape,
        "image_shape" : image_shape,
        "window" : window,
        "scale" : scale,
        "active_class_ids" : active_class_ids,
    }

In [0]:
import tensorflow as tf, os

a = tf.constant([[[1,1],[3,6]],[[7,8],[9,9]]])
b = 
b = tf.where(tf.equal(a,3))
with tf.Session() as sess:
    output = sess.run(b)

print(a.shape)      # batch 2, box 2, coord 2

print(output)
print(type(output))
print(output.ndim)
print(type(a))

c = [1, 2, 3]
d = [2]
e = tf.concat([c, d], 0)
d = e[:,z]
print(e)

SyntaxError: ignored