In [1]:
from __future__ import print_function

import os
import cv2
import copy
import json
import numpy as np
from tqdm import tqdm

from keras.layers import *
from keras.models import Model
from bert4keras.layers import Loss
from bert4keras.optimizers import Adam
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.snippets import sequence_padding, is_string
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder

from caption_eval.custom_caption_eval import calculate_metrics

Using TensorFlow backend.


In [2]:
# bert配置
config_path = 'bert-model/uncased_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'bert-model/uncased_L-12_H-768_A-12/bert_model.ckpt'
dict_path = 'bert-model/uncased_L-12_H-768_A-12/vocab.txt'

In [3]:
# 加载并精简词表，建立分词器
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

In [4]:
# 模型配置
maxlen = 64
batch_size = 16
steps_per_epoch = 1000
epochs = 2

with_key_words = True
with_object_detection = False

In [5]:
def read_object_data(folder, valid=False):
    """读取并整理COCO的数据,包括caption, object, attributes 和 relationships , 同时提取目标特征.
    单个数据如下:
    [{'image_features': [2048]},
    {'objects': 
        [{'object_id': int,
            'name': str,
            'attributes': [],
            'object_features': [2048]}],
    'relationships': [],
    'caption': str}
    ]
    
    返回数据格式:
    [
    {'key_words': str,
    'condition_features': [2048],
    'objects_features': [[2048],[2048],...],
    'caption': str},
    ...
    ]
    """
    res = []
    
    files = os.listdir(folder)
    
    for file in tqdm(files[:1000], desc=''):
        file_path = folder + file
        data = np.load(file_path, allow_pickle=True)
        
        for d in data[1:]:
            condition_features = data[0]['image_features']
            objects_features = []
            
            objects = d.pop('objects')
            relationships = d.pop('relationships')
            
            key_words = ''
            for ob in objects:
                if with_key_words:
                    if ob['name'] != '':
                        key_words += ob['name'] + ' '
                
                if with_object_detection:
                    ob_f = ob['object_features']
                    if len(ob_f):
                        objects_features.append(ob_f)
                    
            d['condition_features'] = np.array(condition_features)
            d['objects_features']   = np.array(objects_features)
            d['key_words'] = key_words
            
            if valid:
                d['image_id'] = file.replace('npy', 'jpg')
            
            res.append(d)
        
    return res

def read_val_caption(f):
    """读取并整理COCO的Caption数据
    """
    data = json.load(open(f))
    images = {}
    for img in data['images']:
        images[img['id']] = {
            'image_id': img['file_name'],
            'caption': [],
        }
    for caption in data['annotations']:
        images[caption['image_id']]['caption'].append(caption['caption'])
        
    res = {}
    for img in images.values():
        res[img['image_id']] = img['caption']
        
    return res


class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=False):
        batch_objects_features, batch_condition_features, batch_token_ids, batch_segment_ids = [], [], [], []
        for is_end, D in self.sample(random):

            condition_features = D['condition_features']
            objects_features   = D['objects_features']
            
            caption = D['caption']
            inputs = D['key_words']

            token_ids, segment_ids = tokenizer.encode(
                inputs, caption, max_length=maxlen
            )
            
            batch_condition_features.append(condition_features)
            batch_objects_features.append(objects_features)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
                
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_condition_features = np.array(batch_condition_features)
                batch_objects_features   = np.array(batch_objects_features)
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                yield [batch_token_ids, batch_segment_ids, batch_condition_features], None
                batch_objects_features, batch_condition_features, batch_token_ids, batch_segment_ids = [], [], [], []

In [6]:
# 加载数据
train_data = read_object_data(
    './data/MSCOCO/annotation/features/train2014/', False
)
valid_data = read_object_data(
    './data/MSCOCO/annotation/features/val2014/', True
)

valid_caption_data = read_val_caption(
    'data/coco2014/annotations/captions_val2014.json'
)

100%|██████████| 100/100 [00:00<00:00, 381.30it/s]
100%|██████████| 100/100 [00:00<00:00, 373.85it/s]


In [7]:
class CrossEntropy(Loss):
    """交叉熵作为loss，并mask掉padding部分
    """
    def compute_loss(self, inputs, mask=None):
        y_true, y_mask, y_pred = inputs
        y_true = y_true[:, 1:]  # 目标token_ids
        y_mask = y_mask[:, 1:]  # segment_ids，刚好指示了要预测的部分
        y_pred = y_pred[:, :-1]  # 预测序列，错开一位
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss

In [8]:
# 使用全连接层将2048维的图片特征降维成512
x = Input(shape=(2048,), name='image_features')

# Bert模型
model = build_transformer_model(
    config_path,
    checkpoint_path,
    application='unilm',
    keep_tokens=keep_tokens,  # 只保留keep_tokens中的字，精简原字表
    layer_norm_cond=x,
    layer_norm_cond_hidden_size=512,
    layer_norm_cond_hidden_act='swish',
    additional_input_layers=x
)

model.inputs

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


[<tf.Tensor 'Input-Token:0' shape=(?, ?) dtype=float32>,
 <tf.Tensor 'Input-Segment:0' shape=(?, ?) dtype=float32>,
 <tf.Tensor 'image_features:0' shape=(?, 2048) dtype=float32>]

In [9]:
output = CrossEntropy(2)(model.inputs[:2] + model.outputs)

model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     multiple             22417920    Input-Token[0][0]                
                                                                 MLM-Norm[0][0]                   
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 768)    1536        Input-Segment[0][0]        

  'be expecting any data to be passed to {0}.'.format(name))


In [10]:
class AutoCaption(AutoRegressiveDecoder):
    """img2seq解码器
    """
    @AutoRegressiveDecoder.set_rtype('probas')
    def predict(self, inputs, output_ids, step):
        token_ids, segment_ids, condition_features, objects_features = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
        return model.predict([token_ids, segment_ids, condition_features, objects_features])[:, -1]

    def generate(self, inputs, condition_features, objects_features ,topk=3):
        token_ids, segment_ids = tokenizer.encode(inputs, max_length=maxlen)
        output_ids = self.beam_search([token_ids, segment_ids, condition_features, objects_features], topk)  # 基于beam search
        return tokenizer.decode(output_ids)

In [11]:
autocaption = AutoCaption(
    start_id=None,
    end_id=tokenizer._token_end_id,
    maxlen=maxlen
)

In [12]:
def caption_eval(epoch, loss):
    imgIds = []
        
    datasetGTS = {}
    datasetRES = {}
        
    GTS_annotations = []
    RES_annotations = []
    
    print(u'-Reading data ...')
    samples = [valid_data[i] for i in np.random.choice(len(valid_data), 100)]
        
    for D in samples:
        for caption in valid_caption_data[D['image_id']]:
            gts = {}
            gts[u'image_id'] = D['image_id']
            gts[u'caption']  = caption
            GTS_annotations.append(gts)
    
        res = {}
        res[u'image_id'] = D['image_id']
        res[u'caption']  = autocaption.generate(D['key_words'], D['condition_features'], D['objects_features'])
        
        RES_annotations.append(res)
        imgIds.append(D['image_id'])
        
    datasetGTS['annotations'] = GTS_annotations
    datasetRES['annotations'] = RES_annotations
    
    print(u'-Calculating scores ...')
    scores = calculate_metrics(imgIds, datasetGTS, datasetRES)
    print(scores)

    
def just_show():
    samples = [valid_data[i] for i in np.random.choice(len(valid_data), 2)]
    for D in samples:
        print(u'image_id:', D['image_id'])
        print(u'key_words:', inputs)
        print(u'predict:', autocaption.generate(D['key_words'], D['condition_features'], D['objects_features']))
        print(u'references:', valid_caption_data[D['image_id']])
        print()

    
class Evaluate(keras.callbacks.Callback):
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, epoch, logs=None):
        
        # 演示效果
        just_show()
        
        caption_eval()

In [13]:
evaluator = Evaluate()
train_generator = data_generator(train_data, batch_size)

In [14]:
model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=steps_per_epoch,
    epochs=epochs
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/2


ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 3 array(s), but instead got the following list of 4 arrays: [array([[    2,  1162,  2046, 13761,     3,    42,  1162,  1011,    42,
         2530,  2046,  1012,  1024,    50, 17475, 13761,     3,     0,
            0,     0,     0,     0,     0,     0],
      ...

In [33]:
for d in train_generator:
    print(d[0][3])

[array([[0.14536437, 0.34011772, 0.03523269, ..., 0.162845  , 1.01604259,
        0.45504972],
       [0.17094558, 0.34932253, 0.31953722, ..., 0.13969183, 0.66638255,
        0.50118119]])
 array([[0.        , 1.1073854 , 0.        , ..., 0.        , 0.87914252,
        0.57138073],
       [1.29315042, 0.        , 0.        , ..., 0.        , 0.87707627,
        0.        ],
       [0.99251759, 0.        , 0.        , ..., 0.12110248, 0.60137922,
        0.13493516]])
 array([[1.69064045, 0.        , 0.        , ..., 0.02755388, 0.66121358,
        0.60237092],
       [1.69064045, 0.        , 0.        , ..., 0.02755388, 0.66121358,
        0.60237092],
       [0.34857321, 0.        , 0.        , ..., 0.23797591, 0.66666728,
        0.        ]])
 array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.41874349, 0.01020338, 1.93365967, ..., 0.45249575, 0.78025728,
        3.76161742],
       [0.24728535, 0.15022062, 0.        , ..., 0.03



In [16]:
token_ids

[2, 19411, 3, 13412, 2262, 3]

In [17]:
segment_ids

[0, 0, 0, 1, 1, 1]

In [2]:
relu = activations('relu')

NameError: name 'activations' is not defined