## 算法简介

算法总体框架如下图所示：  
![structure.png](structure.png)  
其中，z代表模板图像，算法采用的是第一帧的groundtruth；x为search region，表示后面待跟踪帧中的候选框搜索区域；$\phi$表示一种特征映射操作，将原始图像映射到特定的特征空间，论文采用的是CNN中的卷积层和pooling层；6x6x128表示z经过$\phi$后得到的特征，同理，22x22x128是x经过$\phi$后得到的特征；图中的 * 代表卷积操作，将6x6x128作为卷积核对22x22x128进行卷积操作，最终得到17x17的score map，表示search region中各个位置和模板相似度值。上图中的$\phi$结构是一样的，是孪生网络结构，且整个模型只有conv层和pooling层，是一种全卷积网络结构。  
算法本身通过比较搜索区域和目标模板的相似度，最后得到搜索区域的score map。然后在score map中找到相似度最大的点，作为新的目标的中心，原理上类似于相关滤波方法。

## 一、实验目的

1. 结合论文Dataset curation部分，理解作者对数据的处理方式
2. 理解孪生网络（Siamese）结构
3. 结合论文Training with large search images理解loss设计以及ground truth的生成

## 二、实验步骤

** 1. 解析ILSVRC2015 VID数据，并对数据进行处理 (Dataset curation)**

In [65]:
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

import os
import glob
import cv2
import numpy as np
from tqdm import tqdm

def sub_process(root_dir, save_dir, split, subdir='', ):
    '''
    数据处理子进程
    root_dir:  数据总路径
    save_dir:  处理数据存储路径
    split:     包含 snippets，test，train，val
    subdir:    相应文件路径下的folders，当split='val'时，subdir=''
    '''
    data_dir = os.path.join(root_dir, 'Data', 'VID', split)#训练视频文件（以图像格式存储）
    anno_dir = os.path.join(root_dir, 'Annotations', 'VID', split, subdir)#object标注文件
    video_names = os.listdir(anno_dir)#视频文件名称
    
    for idx, video in enumerate(video_names):#显示进度条
        video_anno_path = os.path.join(anno_dir, video)
        xml_files = glob.glob(os.path.join(video_anno_path, '*.xml'))#xml标注文件,每个视频基于帧数有多个标注文件
        
        #解析xml文件
        for xml in xml_files:
            tree = ET.parse(xml)
            root = tree.getroot()
            
            folder = root.find('folder').text
            filename = root.find('filename').text
            
            #读取图像数据
            img_file = os.path.join(data_dir, folder, filename + '.JPEG')
            img = cv2.imread(img_file)
            
            #获取所有object的bounding box
            bboxes = []
            for object in root.iter('object'):
                bbox = object.find('bndbox')
                xmax = float(bbox.find('xmax').text)
                xmin = float(bbox.find('xmin').text)
                ymax = float(bbox.find('ymax').text)
                ymin = float(bbox.find('ymin').text)
                
                width = xmax - xmin + 1
                height = ymax - ymin + 1
                
                bboxes.append([xmin, ymin, width, height])#xmin, ymin, width, height
            
            for idx, object in enumerate(root.iter('object')):
                id = object.find('trackid').text#追踪目标，需要确认，因为一张图可能存在多个追踪目标
                class_name = object.find('name').text #追踪目标类别，在追踪场景中，可以忽略
                
                #创建文件存储路径
                track_save_dir = get_track_save_directory(save_dir, split, subdir, video)
                if not os.path.exists(track_save_dir):
                    os.makedirs(track_save_dir)#创建多层路径
                savexfile = os.path.join(track_save_dir, '{}.{:02d}.crop.x.jpg'.format(filename, int(id)))
                savezfile = os.path.join(track_save_dir, '{}.{:02d}.crop.z.jpg'.format(filename, int(id)))
                #跳过已经处理的文件
                if os.path.isfile(savexfile) and os.path.isfile(savezfile):
                    continue
                
                #获取最终的搜索图像数据
                target_box = convert_bbox_format(bboxes[idx])
                image_crop_z, pad_z, bbox_z, image_crop_x, pad_x, bbox_x = get_crops(img, target_box, size_z=127, size_x=255, context_amount=0.5)
                
                cv2.imwrite(savexfile, image_crop_x, [int(cv2.IMWRITE_JPEG_QUALITY), 90])#保存 xcrop
                cv2.imwrite(savezfile, image_crop_z, [int(cv2.IMWRITE_JPEG_QUALITY), 90])#保存 zcrop
    print('finished')

def get_track_save_directory(save_dir, split, subdir, video):
    '''
    返回存储路径，分别存储在a,b,c,d,e文件夹，其中，a,b,c,d为train文件, e为val文件
    save_dir: 处理文件存储路径
    split:    train or val
    subdir:   train文件夹下的subfolder
    video:    相应的视频文件路径
    '''
    subdir_map = {'ILSVRC2015_VID_train_0000': 'a',
                'ILSVRC2015_VID_train_0001': 'b',
                'ILSVRC2015_VID_train_0002': 'c',
                'ILSVRC2015_VID_train_0003': 'd',
                '': 'e'}
    
    return os.path.join(save_dir, 'Data', 'VID', 'train', subdir_map[subdir], video) 

def convert_bbox_format(bbox):
    '''
    将原始的bbox转化为center_x, center_y, width, height格式
    bbox: list, [xmin, ymin, width, height]
    
    返回：
       bbox: list, [center_x, center_y, width, height]
    '''
    xmin, ymin, width, height = bbox
    c_x = xmin + get_center(width)
    c_y = ymin + get_center(height)
    return [c_x, c_y, width, height]

def get_center(x):
    '''
    获取x的中点
    '''
    return (x-1.)/2.

def get_crops(img, bbox, size_z, size_x, context_amount):
    '''
    从原始图像数据中获取搜索图像（子图）,并根据论文Dataset curation部分进行padding处理
    img: 原始帧图像数据
    bbox: object bbox
    size_z: 模板图像size    论文采用127
    size_x: 搜索图像size    论文采用255
    context_amount: 0.5   对应论文公式参数 2p = 0.5*(w+h)
    '''
    c_x, c_y, width, height = bbox
    wc_z = width + context_amount * (width + height)   #对应公式 w+2p
    hc_z = height + context_amount * (width + height)  #对应公式 h+2p
    s_z = np.sqrt(wc_z * hc_z)    #未进行scale前的图像size
    scale_z = size_z / s_z  #需要进行的缩放因子，最终达到定值127
    #获取crop_z
    image_crop_z, left_pad_z, top_pad_z, right_pad_z, bottom_pad_z = get_subwindow_avg(img, [c_x, c_y],
                                               [size_z, size_z],
                                               [np.round(s_z), np.round(s_z)])#image_crop_x: 255 x 255
    pad_z = np.ceil([scale_z*(left_pad_z), scale_z*(top_pad_z), scale_z*(right_pad_z), scale_z*(bottom_pad_z)])
    
    d_search = (size_x - size_z) / 2.#scale之后的图，搜索图需要在模板图的基础上添加多少背景信息
    pad = d_search / scale_z         #映射到scale之前的图像大小
    s_x = s_z + 2 * pad              #缩放前搜索图像的size大小
    scale_x = size_x / s_x           #缩放因子，其实和scale_x基本一致，有精度差别
    
    image_crop_x, left_pad_x, top_pad_x, right_pad_x, bottom_pad_x = get_subwindow_avg(img, [c_x, c_y],
                                               [size_x, size_x],
                                               [np.round(s_x), np.round(s_x)])#image_crop_x: 255 x 255
    pad_x = np.ceil([scale_x*(left_pad_x), scale_x*(top_pad_x), scale_x*(right_pad_x), scale_x*(bottom_pad_x)])
    
    #计算bbox_z, bbox_x
    ws_z = width * scale_z
    hs_z = height * scale_z
    ws_x = width * scale_x
    hs_x = height * scale_x
    bbox_z = [(size_z - ws_z)/2, (size_z - hs_z)/2, ws_z, hs_z]
    bbox_x = [(size_x - ws_x)/2, (size_x - hs_x)/2, ws_x, hs_x]
    
    return image_crop_z, pad_z, bbox_z, image_crop_x, pad_x, bbox_x

def get_subwindow_avg(img, pos, model_sz, original_sz):
    '''
    从原始图像获取original_sz的子图，并进行resize，得到model_sz大小的子图，原图不足
    于original_sz大小的分别用各通道的均值进行padding
    img:         原始图像   BGR
    pos:         object中心坐标
    model_sz:    模型要求输入，论文采用255*255
    original_sz: crop后得到的原始图像大小
    '''
    
    avg_chans = [np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])]#B，G，R均值，用于后面进行padding
    if not original_sz:
        original_sz = model_sz
    im_sz = img.shape    #原始图像大小
    assert im_sz[0] > 2 and im_sz[1] > 2   #保证图像不能过小
    c = [get_center(s) for s in original_sz] #对应w/2   h/2
    
    #检查bounding box是否超过原图边界
    context_xmin = np.int(np.round(pos[0] - c[0]))   #xmin
    context_xmax = np.int(context_xmin + original_sz[0] - 1)#xmax
    context_ymin = np.int(np.round(pos[1] - c[1]))   #ymin
    context_ymax = np.int(context_ymin + original_sz[1] - 1) #ymax
    
    #计算是否需要进行padding
    left_pad = np.int(np.maximum(0, -context_xmin))#如果xmin是负值，则需要进行left padding
    top_pad = np.int(np.maximum(0, -context_ymin))#如果ymin是负值，则需要进行top padding
    right_pad = np.int(np.maximum(0, context_xmax - im_sz[1] + 1))#如果xmax超过原图width，则需要进行right padding
    bottom_pad = np.int(np.maximum(0, context_ymax - im_sz[0] + 1))#如果ymax超过原图height, 则需要进行bottom padding
    
    context_xmin = context_xmin + left_pad
    context_xmax = context_xmax + left_pad
    context_ymin = context_ymin + top_pad
    context_ymax = context_ymax + top_pad
    
    #基于计算出的padding元素对原图进行padding
    if top_pad > 0 or bottom_pad > 0 or left_pad > 0 or right_pad > 0:
        B = np.pad(img[:, :, 0], ((top_pad, bottom_pad), (left_pad, right_pad)),
                   'constant', constant_values=(avg_chans[0]))
        G = np.pad(img[:, :, 1], ((top_pad, bottom_pad), (left_pad, right_pad)),
                   'constant', constant_values=(avg_chans[1]))
        R = np.pad(img[:, :, 2], ((top_pad, bottom_pad), (left_pad, right_pad)),
                   'constant', constant_values=(avg_chans[2]))
        img = np.stack((B, G, R), axis=2)
        
    
    im_patch_original = img[context_ymin:context_ymax + 1,context_xmin:context_xmax + 1, :]
    
    if not (model_sz[0] == original_sz[0] and model_sz[1] == original_sz[1]):
        im_patch = cv2.resize(im_patch_original, tuple(model_sz))
    else:
        im_patch = im_patch_original
    return im_patch, left_pad, top_pad, right_pad, bottom_pad

In [66]:
vid_dir = './ILSVRC2015'
save_dir = './ILSVRC2015-VID-Curation'

one_work = lambda a, b: sub_process(vid_dir, save_dir, a, b)

one_work('train','ILSVRC2015_VID_train_0000')
one_work('train','ILSVRC2015_VID_train_0001')
one_work('train','ILSVRC2015_VID_train_0002')
one_work('train','ILSVRC2015_VID_train_0003')
one_work('val', '')

finished
finished
finished
finished
finished


** 2. 构建SiameseFC模型 **

2.1 构建 feature extractor---AlexNet

In [83]:
import keras
import tensorflow as tf
import numpy as np
from keras import initializers, regularizers, layers, optimizers, callbacks
from keras.engine.topology import Layer
import keras.backend as K
import random
import cv2

class Split(Layer):
    '''
    定义split层
    '''
    def __init__(self, num_or_size_splits =2, axis = 3, **kwargs):
        '''
        num_or_size_splits: 将feature map沿axis通道分割成几部分，x的维度必须能够整除num_or_size_splits，否则报错
        axis: 分割的通道
        '''
        self.num_or_size_splits = num_or_size_splits
        self.axis = axis
        super(Split, self).__init__(**kwargs)
    
    def call(self, x):
        b1, b2 = tf.split(x, self.num_or_size_splits, self.axis)
        return [b1, b2]
    
    def compute_output_shape(self, input_shape):

        new_shape = list(input_shape)
        new_shape[self.axis] = int(input_shape[self.axis] / self.num_or_size_splits)
        new_shape = tuple(new_shape)
        
        result = []
        for i in range(self.num_or_size_splits):
            result.append(new_shape)
        return result

def alexnet(input_shape):
    
    input_layer = layers.Input(shape = input_shape, name = 'input_layer')
    #注意：alexnet论文直接采用11x11的卷积操作，strides = 4
    # 此处通过结合conv2d strides=2 和maxpooling2d strides=2 进行
    net = layers.Conv2D(filters=96, kernel_size=(11,11), strides=2, 
                        kernel_initializer = initializers.VarianceScaling(scale=2.0, mode='fan_out'), 
                              kernel_regularizer = regularizers.l2(5e-4),
                              name='conv1')(input_layer)
    net = layers.BatchNormalization(momentum = 0.95, epsilon = 1e-06)(net)
    net = layers.Activation('relu')(net)
    net = layers.MaxPooling2D(pool_size=(3,3), strides=2, name = 'pool1')(net)
    
    b1, b2 = Split(num_or_size_splits = 2, axis=3, name = 'split1')(net)
    b1 = layers.Conv2D(filters = 128, kernel_size=(5, 5), 
                             kernel_initializer = initializers.VarianceScaling(scale=2.0, mode='fan_out'),
                             kernel_regularizer = regularizers.l2(5e-4),
                             name = 'conv2_b1')(b1)
    b2 = layers.Conv2D(filters = 128, kernel_size=(5, 5), 
                             kernel_initializer = initializers.VarianceScaling(scale=2.0, mode='fan_out'),
                             kernel_regularizer = regularizers.l2(5e-4),
                             name = 'conv2_b2')(b2)
    net = layers.Concatenate(axis=3)([b1, b2])
    net = layers.BatchNormalization(momentum=0.95, epsilon=1e-06)(net)
    net = layers.Activation('relu')(net)
    net = layers.MaxPooling2D(pool_size=(3,3), strides=2, name='pool2')(net)

    net = layers.Conv2D(filters = 384, kernel_size = (3,3), strides = 1, 
                        kernel_initializer = initializers.VarianceScaling(scale=2.0, mode='fan_out'),
                        kernel_regularizer = regularizers.l2(5e-4),
                        name = 'conv3')(net)
    net = layers.BatchNormalization(momentum = 0.95, epsilon = 1e-06)(net)
    net = layers.Activation('relu')(net)
    
    b1, b2 = Split(num_or_size_splits = 2, axis = 3, name = 'split2')(net)
    b1 = layers.Conv2D(filters = 192, kernel_size = (3,3), strides = 1, 
                             kernel_initializer = initializers.VarianceScaling(scale=2.0, mode='fan_out'),
                             kernel_regularizer = regularizers.l2(5e-4),
                             name = 'conv4_b1')(b1)
    b2 = layers.Conv2D(filters = 192, kernel_size = (3,3), strides = 1, 
                             kernel_initializer = initializers.VarianceScaling(scale=2.0, mode='fan_out'),
                             kernel_regularizer = regularizers.l2(5e-4),
                             name = 'conv4_b2')(b2)
    net = layers.Concatenate(axis=3)([b1, b2])
    net = layers.BatchNormalization(momentum=0.95, epsilon=1e-06)(net)
    net = layers.Activation('relu')(net)
    
    b1, b2 = Split(num_or_size_splits = 2, axis = 3, name = 'split3')(net)
    b1 = keras.layers.Conv2D(filters = 128, kernel_size = (3,3), strides = 1, 
                             kernel_initializer = initializers.VarianceScaling(scale=2.0, mode='fan_out'),
                             kernel_regularizer = regularizers.l2(5e-4),
                             name = 'conv5_b1')(b1)
    b2 = keras.layers.Conv2D(filters = 128, kernel_size = (3,3), strides = 1, 
                             kernel_initializer = initializers.VarianceScaling(scale=2.0, mode='fan_out'),
                             kernel_regularizer = regularizers.l2(5e-4),
                             name = 'conv5_b2')(b2)
    net = keras.layers.Concatenate(axis=3)([b1, b2])
    
    model = keras.models.Model(inputs = input_layer, outputs = net)
    return model

2.2 构建Siamese孪生网络

In [88]:
def Siamese(template_size, instance_size):

    def detection(inputs):
        '''
        将template_feature和instance_feature进行卷积计算，并在此基础上+bias
        获得1x17x17x1的score map
        '''
        f_x, f_z = inputs
        f_x = f_x[0]
        f_z = f_z[0]
        f_x = K.expand_dims(f_x, 0)
        f_z = K.expand_dims(f_z, -1)

        score_map = K.conv2d(f_x, f_z, strides=(1, 1), padding='valid')
        score_map = score_map[0:3]#删除最后一维
        bias = K.variable(value=[0], dtype=K.floatx(), name='detction_bias')
        detection = K.bias_add(0.001*score_map, bias)
        #尝试采用sigmoid激活  激活函数已经包含sigmoid激活步骤
        #detection = K.sigmoid(detection)#对detection结果进行sigmoid激活，将输出值约束到0-1之间
        return detection  #17x17
    
    base_model = alexnet(input_shape = (None, None, 3))#接收任意尺寸   
     
    template_input = layers.Input(shape = (template_size, template_size, 3))
    instance_input = layers.Input(shape = (instance_size, instance_size, 3))
    
    feature_template = base_model(template_input)       #(?, 6, 6, 256)
    feature_instance = base_model(instance_input)       #(?, 22, 22, 256)

    #不支持batch_size>1的情况
    score_map = layers.Lambda(detection, name='detection')([feature_instance, feature_template])
 
    siamese_model = keras.models.Model(inputs = [template_input, instance_input], outputs = score_map)
    
    return siamese_model

** 3. 构建ground truth **

In [72]:
def gt_construct(target_size = 17, stride = 8, config_rPos = 16, config_rNeg = 0):
    y = np.array(list(range(0, target_size))) - (target_size-1)/2
    x = np.array(list(range(0, target_size))) - (target_size-1)/2
    [Y,X] = np.meshgrid(y, x)

    def _logistic_label(X, Y, rPos, rNeg):
        dist_to_center = np.abs(X) + np.abs(Y)
        Z = np.where(dist_to_center <= rPos,
                    np.ones_like(X),
                    np.where(dist_to_center < rNeg,
                            0.5 * np.ones_like(X),
                            np.zeros_like(X)))
        return Z
    
    rPos = config_rPos / stride
    rNeg = config_rNeg / stride
    gt = _logistic_label(X, Y, rPos, rNeg)
    
    return gt

** 4. 构建train_generator和val_generator **

In [76]:
#数据预处理
def preprocess(img_data):
    '''
    对图像数据进行归一化处理
    '''
    img_data = img_data / 255  
    return img_data

4.1 train generator构建，对数据采用随机抽样

In [74]:
def data_generator(files, batch_size = 1):
    '''
    迭代器
    采用抽样的方式
    后期考虑：抽样过程是否需要考虑track_id的平衡
    '''
    while True:
        #数据随机打乱
        random.shuffle(files)
        
        trainx = []
        trainz = []
        trainY = []
        batch_files = files[0:batch_size]
        for file in batch_files:
            #file是*.crop.x.jpg
            crop_x_file = file
            crop_z_file = file.replace('.x.jpg', '.z.jpg')
            
            img_crop_x = cv2.imread(crop_x_file)
            img_crop_z = cv2.imread(crop_z_file)

            crop_x_input = preprocess(img_crop_x)#归一化处理
            crop_z_input = preprocess(img_crop_z)

            #
            trainx.append(crop_x_input)
            trainz.append(crop_z_input)
            
            #构建gt
            trainY.append(gt_construct())#采用默认参数
        
        trainx = np.array(trainx)
        trainz = np.array(trainz)
        trainY = np.array(trainY)

        #expand_dims
        trainY = np.expand_dims(trainY, axis=3)
        
        yield [trainz, trainx], trainY  #训练用X,Y

4.2 val generator,直接遍历所有validation数据

In [75]:
def val_data_generator(files, batch_size = 1):
    '''
    验证集validation
    验证集不需要进行随机抽样
    '''
    while True:
        batch_num = int(len(files)/batch_size)
        for i in range(batch_num):
            batch_files = files[i*batch_size: (i+1)*batch_size]
            trainx = []
            trainz = []
            trainY = []
            for file in batch_files:
                crop_x_file = file
                crop_z_file = file.replace('.x.jpg','.z.jpg')

                img_crop_x = cv2.imread(crop_x_file)
                img_crop_z = cv2.imread(crop_z_file)

                crop_x_input = preprocess(img_crop_x)
                crop_z_input = preprocess(img_crop_z)

                trainx.append(crop_x_input)
                trainz.append(crop_z_input)

                trainY.append(gt_construct())
            
            trainx = np.array(trainx)
            trainz = np.array(trainz)
            trainY = np.array(trainY)

            trainY = np.expand_dims(trainY, axis=3)
            yield[trainz, trainx], trainY

** 5. 构建loss函数 ** 

In [77]:
def weighted_loss(y_true, y_pred):
    '''
    y_true: shape is 1x17x17x1
    y_pred: shape is 1x17x17x1
    '''
    loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=y_pred,labels = y_true)
    
    #获取class_weights
    n_pos = tf.reduce_sum(tf.to_float(tf.equal(y_true[0,:,:,0], 1)))
    n_neg = tf.reduce_sum(tf.to_float(tf.equal(y_true[0,:,:,0], 0)))
    w_pos = 0.5 / n_pos
    w_neg = 0.5 / n_neg
    class_weights = tf.where(tf.equal(y_true, 0),
                            w_pos * tf.ones_like(y_true),
                            tf.ones_like(y_true))
    class_weights = tf.where(tf.equal(y_true, 0),
                            w_neg * tf.ones_like(y_true),
                            class_weights)
    
    loss = loss * class_weights
    loss = tf.reduce_mean(tf.reduce_sum(loss, [1, 2]))
    return loss

** 6. 模型训练 **

In [None]:
import glob
import os
import warnings

warnings.filterwarnings('ignore')

train_all_crop_img_files = []
val_all_crop_img_files = []

train_folder_path = './ILSVRC2015-VID-Curation/Data/VID/train'   #for test
train_folders = ['a','b','c','d']
for folder in train_folders:
    video_ids = os.listdir(os.path.join(train_folder_path, folder))
    for video_id in video_ids:
        train_all_crop_img_files = train_all_crop_img_files + glob.glob(os.path.join(train_folder_path, folder, video_id, '*.crop.x.jpg'))#获取训练数据文件

val_folder_path = './ILSVRC2015-VID-Curation/Data/VID/train/e'
video_ids = os.listdir(val_folder_path)
for video_id in video_ids:
    val_all_crop_img_files = val_all_crop_img_files + glob.glob(os.path.join(val_folder_path, video_id, '*.crop.x.jpg'))#获取验证数据文件

#受网络框架限制，只接受batch_size=1的情况    
train_gen = data_generator(train_all_crop_img_files, batch_size=1)
val_gen = val_data_generator(val_all_crop_img_files, batch_size=1)

model = Siamese(template_size=127, instance_size=255)

opt = optimizers.SGD(lr=0.001)
model.compile(optimizer=opt, loss=weighted_loss)

#model fit参数
modelcheckpoint = callbacks.ModelCheckpoint('simase_fc.h5')
tensorboard = callbacks.TensorBoard('./logs')

model.fit_generator(generator=train_gen, 
                    steps_per_epoch = 5000, 
                    epochs=100,
                    callbacks = [modelcheckpoint, tensorboard],
                    validation_data=val_gen,
                    validation_steps=len(val_all_crop_img_files))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
 390/5000 [=>............................] - ETA: 2:20:13 - loss: 1.4555

## 三、实验注意事项

1. 实验中，由template_feature$(\phi(z))$和instance_feature$(\phi(x))$卷积得到的output，在乘以0.001之后再加bias，论文中未体现此trick。    
2. 在构建detection过程中，忽略了feature map的batch大小，因此，模型只接受batch_size=1的情况。  
3. 处理后的template和instance图像数据，其追踪目标都位于图像中心位置，基于此前提条件，构建了样本的ground truth。

## 四、实验拓展

1. 尝试在理解注意事项1的基础上，将score_map的输出公式变为$f(z,x)=\omega\times(\phi(z)*\phi(x)) + b$，w和b均可训练模式，查看loss变换情况。 
2. 尝试对模型进行改进，能够接受batch_size>1的情况。  
3. 实验未集成数据增强方法，尝试优化代码，集成数据增强方法，以增强模型的泛化性能。  
4. 实验未集成基于训练好的model进行track，尝试自己加以完善。