## 13.7. 单发多框检测（SSD）

### 13.7.1. 模型

#### 13.7.1.1. 类别预测层

In [1]:
%matplotlib inline
import mindspore
from mindspore import nn
from d2l import mindspore as d2l


def cls_predictor(num_inputs, num_anchors, num_classes):
    """Build the class-prediction layer for one feature map.

    Returns a 3x3 convolution (padding 1) whose output has
    ``num_anchors * (num_classes + 1)`` channels.
    """
    out_channels = num_anchors * (num_classes + 1)
    return nn.Conv2d(num_inputs, out_channels,
                     kernel_size=3, padding=1, pad_mode='pad')

#### 13.7.1.2. 边界框预测层

In [2]:
def bbox_predictor(num_inputs, num_anchors):
    """Build the bounding-box prediction layer for one feature map.

    Returns a 3x3 convolution (padding 1) with ``num_anchors * 4`` output
    channels.
    """
    out_channels = num_anchors * 4
    return nn.Conv2d(num_inputs, out_channels,
                     kernel_size=3, padding=1, pad_mode='pad')

#### 13.7.1.3. 连结多尺度的预测

In [3]:
def forward(x, block):
    """Apply ``block`` to ``x`` and return the result."""
    output = block(x)
    return output

# Class predictions for two feature maps of different size and anchor count.
# Output channels are num_anchors * (num_classes + 1): 5 * 11 = 55 and 3 * 11 = 33.
Y1 = forward(d2l.zeros((2, 8, 20, 20)), cls_predictor(8, 5, 10))
Y2 = forward(d2l.zeros((2, 16, 10, 10)), cls_predictor(16, 3, 10))
Y1.shape, Y2.shape

((2, 55, 20, 20), (2, 33, 10, 10))

In [4]:
def flatten_pred(pred):
    """Move axis 1 of ``pred`` to the end, then flatten it with ``d2l.flatten``."""
    channels_last = pred.permute(0, 2, 3, 1)
    # flatten leaves the size of axis 0 unchanged
    return d2l.flatten(channels_last)

def concat_preds(preds):
    """Flatten every prediction in ``preds`` and concatenate them along axis 1."""
    flattened = []
    for pred in preds:
        flattened.append(flatten_pred(pred))
    return d2l.concat(flattened, axis=1)

In [5]:
# 55 * 20 * 20 + 33 * 10 * 10 = 25300 values per example after concatenation.
concat_preds([Y1, Y2]).shape

(2, 25300)

#### 13.7.1.4. 高和宽减半块

In [6]:
def down_sample_blk(in_channels, out_channels):
    """Build a block of two (conv 3x3 -> batch norm -> ReLU) stages and a 2x2 max pool.

    The first convolution maps ``in_channels`` to ``out_channels``; the second
    keeps ``out_channels``. The trailing ``MaxPool2d(2, 2)`` does the
    down-sampling.
    """
    layers = []
    # Input widths of the two convolutions, in construction order.
    for conv_in in (in_channels, out_channels):
        layers.extend([
            nn.Conv2d(conv_in, out_channels,
                      kernel_size=3, padding=1, pad_mode='pad'),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ])
    layers.append(nn.MaxPool2d(2, 2))
    return nn.SequentialCell(*layers)

In [7]:
# Height and width are halved (20 -> 10) while channels go from 3 to 10.
forward(d2l.zeros((2, 3, 20, 20)), down_sample_blk(3, 10)).shape

(2, 10, 10, 10)

#### 13.7.1.5. 基本网络块

In [8]:
def base_net():
    """Build the base network: three chained down-sampling blocks (3 -> 16 -> 32 -> 64 channels)."""
    num_filters = [3, 16, 32, 64]
    # Pair each channel count with the next one to get (in, out) per block.
    stages = [down_sample_blk(c_in, c_out)
              for c_in, c_out in zip(num_filters[:-1], num_filters[1:])]
    return nn.SequentialCell(*stages)

# Three down-sampling blocks reduce a 256x256 input to 32x32 with 64 channels.
forward(d2l.zeros((2, 3, 256, 256)), base_net()).shape

(2, 64, 32, 32)

#### 13.7.1.6. 完整的模型

In [9]:
def get_blk(i):
    """Return the feature block for stage ``i``.

    Stage 0 is the base network, stage 1 a 64 -> 128 down-sampling block,
    stage 4 an adaptive max pool to 1x1, and every other stage a 128 -> 128
    down-sampling block.
    """
    if i == 0:
        return base_net()
    if i == 1:
        return down_sample_blk(64, 128)
    if i == 4:
        return nn.AdaptiveMaxPool2d((1,1))
    return down_sample_blk(128, 128)

In [10]:
def blk_forward(X, blk, size, ratio, cls_predictor, bbox_predictor):
    """Run one stage: compute its feature map, anchors and both predictions.

    Returns the tuple ``(feature_map, anchors, cls_preds, bbox_preds)``.
    """
    feature_map = blk(X)
    # Anchors come from d2l.multibox_prior applied to this stage's feature map.
    stage_anchors = d2l.multibox_prior(feature_map, sizes=size, ratios=ratio)
    stage_cls = cls_predictor(feature_map)
    stage_bbox = bbox_predictor(feature_map)
    return (feature_map, stage_anchors, stage_cls, stage_bbox)

In [11]:
# One entry per stage (5 stages); each entry is passed as `sizes` / `ratios`
# to d2l.multibox_prior in blk_forward.
sizes = [[0.2, 0.272], [0.37, 0.447], [0.54, 0.619], [0.71, 0.79],
         [0.88, 0.961]]
ratios = [[1, 2, 0.5]] * 5
# 2 sizes + 3 ratios - 1 = 4; used as the anchor count for the prediction layers.
num_anchors = len(sizes[0]) + len(ratios[0]) - 1

In [12]:
class TinySSD(nn.Cell):
    """Five-stage single-shot detector built from get_blk and the predictor layers.

    Reads the module-level `sizes`, `ratios` and `num_anchors`.
    """

    def __init__(self, num_classes, **kwargs):
        """Create, for each of the 5 stages, a feature block and its two predictors."""
        super(TinySSD, self).__init__(**kwargs)
        self.num_classes = num_classes
        # Input channel count of the predictors attached to each stage.
        idx_to_in_channels = [64, 128, 128, 128, 128]
        for i in range(5):
            # Equivalent to the assignment self.blk_i = get_blk(i)
            setattr(self, f'blk_{i}', get_blk(i))
            setattr(self, f'cls_{i}', cls_predictor(idx_to_in_channels[i],
                                                    num_anchors, num_classes))
            setattr(self, f'bbox_{i}', bbox_predictor(idx_to_in_channels[i],
                                                      num_anchors))

    def construct(self, X):
        """Run all stages on X and return (anchors, cls_preds, bbox_preds)."""
        anchors, cls_preds, bbox_preds = [None] * 5, [None] * 5, [None] * 5
        for i in range(5):
            # getattr(self, f'blk_{i}') accesses self.blk_i
            # X is overwritten with this stage's feature map and fed to the next stage.
            X, anchors[i], cls_preds[i], bbox_preds[i] = blk_forward(
                X, getattr(self, f'blk_{i}'), sizes[i], ratios[i],
                getattr(self, f'cls_{i}'), getattr(self, f'bbox_{i}'))
        anchors = d2l.concat(anchors, axis=1)
        cls_preds = concat_preds(cls_preds)
        # Regroup the flat class predictions so the last axis has num_classes + 1 entries.
        cls_preds = cls_preds.reshape(
            cls_preds.shape[0], -1, self.num_classes + 1)
        bbox_preds = concat_preds(bbox_preds)
        return anchors, cls_preds, bbox_preds

In [13]:
# Shape check: run a zero batch of 32 images (3 x 256 x 256) through a one-class model.
net = TinySSD(num_classes=1)
X = d2l.zeros((32, 3, 256, 256))
anchors, cls_preds, bbox_preds = net(X)

print('output anchors:', anchors.shape)
print('output class preds:', cls_preds.shape)
print('output bbox preds:', bbox_preds.shape)

output anchors: (1, 5444, 4)
output class preds: (32, 5444, 2)
output bbox preds: (32, 21776)


### 13.7.2. 训练模型

#### 13.7.2.1. 读取数据集和初始化

In [14]:
batch_size = 32
# Only the first return value is kept; the second one is discarded.
train_iter, _ = d2l.load_data_bananas(batch_size)

read 1000 training examples
read 100 validation examples


In [15]:
# Fresh one-class model, optimised with SGD (learning rate 0.2, weight decay 5e-4).
net = TinySSD(num_classes=1)
trainer = nn.SGD(net.trainable_params(), learning_rate=0.2, weight_decay=5e-4)

#### 13.7.2.2. 定义损失函数和评价函数

In [16]:
# Both losses are unreduced; calc_loss does the averaging itself.
cls_loss = nn.CrossEntropyLoss(reduction='none')
bbox_loss = nn.L1Loss(reduction='none')

def calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks):
    """Return the per-example sum of the class loss and the masked bounding-box loss.

    The class loss is averaged over axis 1 after being reshaped to
    (batch_size, -1). The box loss is an L1 loss on predictions and labels that
    are both multiplied by ``bbox_masks``, averaged over axis 1.
    """
    batch_size = cls_preds.shape[0]
    num_classes = cls_preds.shape[2]
    flat_cls_loss = cls_loss(cls_preds.reshape(-1, num_classes),
                             cls_labels.reshape(-1))
    cls_term = flat_cls_loss.reshape(batch_size, -1).mean(axis=1)
    masked_preds = bbox_preds * bbox_masks
    masked_labels = bbox_labels * bbox_masks
    bbox_term = bbox_loss(masked_preds, masked_labels).mean(axis=1)
    return cls_term + bbox_term

In [17]:
def cls_eval(cls_preds, cls_labels):
    """Return, as a float, how many predicted classes equal ``cls_labels``."""
    # Class scores sit on the last axis, so argmax is taken over that axis.
    predicted = cls_preds.argmax(axis=-1).type(cls_labels.dtype)
    num_correct = (predicted == cls_labels).sum()
    return float(num_correct)

def bbox_eval(bbox_preds, bbox_labels, bbox_masks):
    """Return, as a float, the sum of absolute masked differences between labels and predictions."""
    masked_diff = (bbox_labels - bbox_preds) * bbox_masks
    return float(masked_diff.abs().sum())

In [18]:
from mindspore import ops
def box_iou(boxes1, boxes2):
    """Compute the pairwise IoU between two lists of anchor or bounding boxes.

    Columns 0-1 of each box are treated as the upper-left corner and columns
    2-3 as the lower-right corner. The result has shape
    (number of boxes1, number of boxes2).
    """
    def _area(boxes):
        widths = boxes[:, 2] - boxes[:, 0]
        heights = boxes[:, 3] - boxes[:, 1]
        return widths * heights

    # areas1: (number of boxes1,), areas2: (number of boxes2,)
    areas1 = _area(boxes1)
    areas2 = _area(boxes2)
    # Corners of every pairwise intersection:
    # (number of boxes1, number of boxes2, 2)
    top_left = ops.maximum(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = ops.minimum(boxes1[:, None, 2:], boxes2[:, 2:])
    # Negative extents mean no overlap; clamp them to zero.
    overlap_wh = (bottom_right - top_left).clamp(min=0)
    # inter_areas and union_areas: (number of boxes1, number of boxes2)
    inter_areas = overlap_wh[:, :, 0] * overlap_wh[:, :, 1]
    union_areas = areas1[:, None] + areas2 - inter_areas
    return inter_areas / union_areas

def assign_anchor_to_bbox(ground_truth, anchors, iou_threshold=0.5):
    """Assign the closest ground-truth bounding box to each anchor box.

    Args:
        ground_truth: tensor of ground-truth boxes, one box per row.
        anchors: tensor of anchor boxes, one box per row.
        iou_threshold: an anchor whose best IoU is at least this value is
            assigned the ground-truth box that achieves it.

    Returns:
        An int64 tensor of shape (num_anchors,) holding, for every anchor, the
        row index of its assigned ground-truth box, or -1 if none is assigned.
    """
    num_anchors, num_gt_boxes = anchors.shape[0], ground_truth.shape[0]
    # Element x_ij (row i, column j) is the IoU of anchor i and ground-truth box j.
    jaccard = box_iou(anchors, ground_truth)
    # Assigned ground-truth index per anchor; -1 means "not assigned".
    anchors_bbox_map = ops.full((num_anchors,), -1, dtype=mindspore.int64)
    # Decide by threshold whether to assign a ground-truth box.
    # Tensor.max / Tensor.argmax are used instead of unpacking ops.max, because
    # the order of the (index, value) pair returned by ops.max is not the same
    # in every MindSpore release and a swap would go unnoticed.
    max_ious = jaccard.max(axis=1)
    indices = jaccard.argmax(axis=1)
    above_threshold = max_ious >= iou_threshold
    anc_i = ops.nonzero(above_threshold).reshape(-1)
    box_j = ops.masked_select(indices, above_threshold)
    anchors_bbox_map[anc_i] = box_j
    col_discard = ops.full((num_anchors,), -1)
    row_discard = ops.full((num_gt_boxes,), -1)
    # Give every ground-truth box its globally best remaining anchor, then
    # remove that anchor's row and that box's column from further consideration.
    for _ in range(num_gt_boxes):
        # Index into the flattened (num_anchors, num_gt_boxes) IoU matrix.
        max_idx = ops.argmax(jaccard)
        box_idx = (max_idx % num_gt_boxes).long()
        # Integer floor division: recovers the row index without a float round trip.
        anc_idx = (max_idx // num_gt_boxes).long()
        anchors_bbox_map[anc_idx] = box_idx
        jaccard[:, box_idx] = col_discard
        jaccard[anc_idx, :] = row_discard
    return anchors_bbox_map

def offset_boxes(anchors, assigned_bb, eps=1e-6):
    """Convert anchors and their assigned boxes into offsets.

    Both inputs are first converted with ``d2l.box_corner_to_center``. The
    first two offset columns are 10 * (centre difference / anchor size); the
    last two are 5 * log(eps + size ratio).
    """
    anchor_c = d2l.box_corner_to_center(anchors)
    target_c = d2l.box_corner_to_center(assigned_bb)
    anchor_wh = anchor_c[:, 2:]
    offset_xy = 10 * (target_c[:, :2] - anchor_c[:, :2]) / anchor_wh
    offset_wh = 5 * ops.log(eps + target_c[:, 2:] / anchor_wh)
    return ops.concat([offset_xy, offset_wh], axis=1)

def multibox_target(anchors, labels):
    """Label anchor boxes using ground-truth bounding boxes.

    Args:
        anchors: anchor tensor with a leading axis of size 1 (it is squeezed away).
        labels: rank-3 tensor indexed as labels[i, :, :]; in each row, column 0
            is read as the class and columns 1: as the box coordinates.

    Returns:
        A tuple (bbox_offset, bbox_mask, class_labels), each stacked over the batch.
    """
    batch_size, anchors = labels.shape[0], anchors.squeeze(0)
    batch_offset, batch_mask, batch_class_labels = [], [], []
    num_anchors = anchors.shape[0]
    for i in range(batch_size):
        label = labels[i, :, :]
        anchors_bbox_map = assign_anchor_to_bbox(
            label[:, 1:], anchors)
        # 1.0 for assigned anchors, 0.0 otherwise, repeated for the 4 box coordinates.
        bbox_mask = ops.tile((anchors_bbox_map >= 0).float().unsqueeze(-1), (1, 4))
        # Initialise class labels and assigned box coordinates to zero.
        class_labels = ops.zeros(num_anchors, dtype=mindspore.int32)
        assigned_bb = ops.zeros((num_anchors, 4), dtype=mindspore.float32)
        # Label the anchors' classes using the ground-truth boxes.
        # An anchor that was not assigned stays background (value zero).
        indices_true = ops.nonzero(anchors_bbox_map >= 0)
        bb_idx = anchors_bbox_map[indices_true]
        # +1 shifts the class ids so that 0 remains reserved for background.
        class_labels[indices_true] = label[bb_idx, 0].long() + 1 
        assigned_bb[indices_true] = label[bb_idx, 1:]
        # Offset transformation; offsets of unassigned anchors are zeroed by the mask.
        offset = offset_boxes(anchors, assigned_bb) * bbox_mask
        batch_offset.append(offset.reshape(-1))
        batch_mask.append(bbox_mask.reshape(-1))
        batch_class_labels.append(class_labels)
    bbox_offset = ops.stack(batch_offset)
    bbox_mask = ops.stack(batch_mask)
    class_labels = ops.stack(batch_class_labels)
    return (bbox_offset, bbox_mask, class_labels)

# print(anchors.shape, Y.shape) # (1, 5444, 4) (32, 1, 5)
# bbox_labels, bbox_masks, cls_labels = multibox_target(anchors, Y)

In [19]:
# Fetch only the first batch and print the shapes of its features and labels.
for X, Y in train_iter:
    print(X.shape, Y.shape)
    break

(32, 3, 256, 256) (32, 1, 5)


In [20]:
# Forward pass on the batch fetched above.
anchors, cls_preds, bbox_preds = net(X)
print('output anchors:', anchors.shape)
print('output class preds:', cls_preds.shape)
print('output bbox preds:', bbox_preds.shape)

output anchors: (1, 5444, 4)
output class preds: (32, 5444, 2)
output bbox preds: (32, 21776)


In [21]:
# Label every anchor with a class and an offset for this batch.
bbox_labels, bbox_masks, cls_labels = multibox_target(anchors, Y)
# NOTE(review): the shape is printed twice here; the two prints below show
# shape and dtype, so bbox_labels.dtype was probably intended - confirm.
print('bbox_labels:', bbox_labels.shape, bbox_labels.shape)
print('bbox_masks:', bbox_masks.shape, bbox_masks.dtype)
print('cls_labels:', cls_labels.shape, cls_labels.dtype)



bbox_labels: (32, 21776) (32, 21776)
bbox_masks: (32, 21776) Float32
cls_labels: (32, 5444) Int32


In [22]:
# calc_loss returns one value per example; .mean() averages them over the batch.
l = calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks).mean()
l

Tensor(shape=[], dtype=Float32, value= 0.69856)

> **BUG**：直接运行下面这个单元格时，程序会卡住。

In [None]:
# Run the whole pipeline (forward pass -> anchor labelling -> loss) on the
# first batch only, printing the shape and dtype of each intermediate result.
for X, Y in train_iter:
    print(X.shape, Y.shape)
    anchors, cls_preds, bbox_preds = net(X)
    print('output anchors:', anchors.shape)
    print('output class preds:', cls_preds.shape)
    print('output bbox preds:', bbox_preds.shape)

    bbox_labels, bbox_masks, cls_labels = multibox_target(anchors, Y)
    # Print shape and dtype (not the shape twice), like the two lines below.
    print('bbox_labels:', bbox_labels.shape, bbox_labels.dtype)
    print('bbox_masks:', bbox_masks.shape, bbox_masks.dtype)
    print('cls_labels:', cls_labels.shape, cls_labels.dtype)

    l = calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels,
                  bbox_masks)
    print(l)
    break

In [None]:
# Same pipeline check on a single batch, without a loop.
# iter() is applied first because next() needs an iterator, while the rest of
# the notebook only relies on train_iter being iterable (`for X, Y in train_iter`).
X, Y = next(iter(train_iter))
print(X.shape, Y.shape)
anchors, cls_preds, bbox_preds = net(X)
print('output anchors:', anchors.shape)
print('output class preds:', cls_preds.shape)
print('output bbox preds:', bbox_preds.shape)

bbox_labels, bbox_masks, cls_labels = multibox_target(anchors, Y)
# Print shape and dtype (not the shape twice), like the two lines below.
print('bbox_labels:', bbox_labels.shape, bbox_labels.dtype)
print('bbox_masks:', bbox_masks.shape, bbox_masks.dtype)
print('cls_labels:', cls_labels.shape, cls_labels.dtype)

l = calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels,
              bbox_masks)
print(l)


#### 13.7.2.3. 训练模型

In [None]:
# num_epochs, timer = 20, d2l.Timer()
# animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
#                         legend=['class error', 'bbox mae'])

# def forward_fn(X, Y):
#     # 生成多尺度的锚框，为每个锚框预测类别和偏移量
#     anchors, cls_preds, bbox_preds = net(X)
#     # 为每个锚框标注类别和偏移量
#     bbox_labels, bbox_masks, cls_labels = multibox_target(anchors, Y) # d2l.
#     # 根据类别和偏移量的预测和标注值计算损失函数
#     l = calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels,
#                   bbox_masks).mean()
#     return l, cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks, bbox_labels
    
# grad_fn = mindspore.value_and_grad(forward_fn, None, trainer.parameters, has_aux=True)
    
# def train_step(inputs, targets):
#     (l, cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks, bbox_labels), grads = grad_fn(inputs, targets)
#     trainer(grads)
#     return l, cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks, bbox_labels
    
# for epoch in range(num_epochs):
#     # 训练精确度的和，训练精确度的和中的示例数
#     # 绝对误差的和，绝对误差的和中的示例数
#     metric = d2l.Accumulator(4)
#     net.set_train()
#     for features, target in train_iter:
#         timer.start()
#         print(epoch)
#         l, cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks, bbox_labels = train_step(features, target)
#         metric.add(cls_eval(cls_preds, cls_labels), cls_labels.numel(),
#                    bbox_eval(bbox_preds, bbox_labels, bbox_masks),
#                    bbox_labels.numel())
#     cls_err, bbox_mae = 1 - metric[0] / metric[1], metric[2] / metric[3]
#     animator.add(epoch + 1, (cls_err, bbox_mae))
# print(f'class err {cls_err:.2e}, bbox mae {bbox_mae:.2e}')
# print(f'{len(train_iter.dataset) / timer.stop():.1f} examples/sec on '
#       f'{str(device)}')