In [2]:
from __future__ import print_function

import os
import cv2
import copy
import json
import numpy as np
from tqdm import tqdm


from keras.layers import *
from keras.models import Model
from bert4keras.layers import Loss
from bert4keras.optimizers import Adam
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.snippets import sequence_padding, is_string
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder

from caption_eval.custom_caption_eval import calculate_metrics

Using TensorFlow backend.


In [2]:
# bert配置
config_path = 'bert-model/uncased_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'bert-model/uncased_L-12_H-768_A-12/bert_model.ckpt'
dict_path = 'bert-model/uncased_L-12_H-768_A-12/vocab.txt'

# 加载并精简词表，建立分词器
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

# 模型配置
maxlen = 64
batch_size = 16
steps_per_epoch = 1000
epochs = 1

In [3]:
def read_object_data(folder, valid=False):
    """读取并整理COCO的数据,包括caption, object, attributes 和 relationships , 同时提取目标特征.
    单个数据如下:
    [
     {'region_feature'：[2048], keywords': str, 'caption': str},
     {'region_feature':[2048], keywords': str, 'caption': str},
     ...
    ]
    
    Returns:
    -train:
    [{'keywords': str,'region_feature': [2048],'caption': str},
    ...
    ]
    
    -valid:
    [{'image_id':str,
      'features': [2048],
      'caption': [str, str, str, str, str],
      'objects_key_words': [str, str, str, str, str]},
    ...  
    ]
    """
    print('-Read data ...')
    res = []
    
    files = os.listdir(folder)
    
    if valid:
        # 读取image features 和 关键字
        for _, file in tqdm(enumerate(files)):
            file_path = folder + file
            data = np.load(file_path, allow_pickle=True)
        
            image = {}
            image_id = file.replace('npy', 'jpg')
            image['image_id'] = image_id
            image['regions']  = []

            for region in data:
                region['region_feature'] = np.array(region['region_feature'])
                image['regions'].append(region)
            
            res.append(image)
    else:
        for _, file in tqdm(enumerate(files)):
            file_path = folder + file
            images = np.load(file_path, allow_pickle=True)
        
            for region in images:
                region['region_feature'] = np.array(region['region_feature'])
                res.append(region)
        
    return res


class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=False):
        batch_features, batch_token_ids, batch_segment_ids = [], [], []
        for is_end, D in self.sample(random):

            features = D['region_feature']
            caption = D['caption']
            inputs = D['keywords']

            token_ids, segment_ids = tokenizer.encode(
                inputs, caption, max_length=maxlen
            )
            
            batch_features.append(features)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
                
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_features = np.array(batch_features)
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                yield [batch_token_ids, batch_segment_ids, batch_features], None
                batch_features, batch_token_ids, batch_segment_ids = [], [], []

# 加载数据
train_data = read_object_data('./data/VisualGenome/train2016/', valid=False)
#valid_data = read_object_data('./data/VisualGenome/valid2016/', valid=True)
print('-Train data numbers: ', len(train_data))
#print('-Valid data numbers: ', len(valid_data))
print('-Train data steps per epoch', len(train_data)/16)
#print('-Valid data steps per epoch', len(train_data)/16)

9it [00:00, 81.58it/s]

-Read data ...


85283it [14:13, 99.87it/s] 

-Train data numbers:  2992034
-Train data steps per epoch 187002.125





In [5]:
class CrossEntropy(Loss):
    """交叉熵作为loss，并mask掉padding部分
    """
    def compute_loss(self, inputs, mask=None):
        y_true, y_mask, y_pred = inputs
        y_true = y_true[:, 1:]  # 目标token_ids
        y_mask = y_mask[:, 1:]  # segment_ids，刚好指示了要预测的部分
        y_pred = y_pred[:, :-1]  # 预测序列，错开一位
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss

    
# 条件全连接层
x_in = Input(shape=(2048,), name='image_features')
    
# Bert模型
model = build_transformer_model(
    config_path,
    checkpoint_path,
    application='unilm',
    keep_tokens=keep_tokens,  # 只保留keep_tokens中的字，精简原字表
    layer_norm_cond=x_in,
    layer_norm_cond_hidden_size=512,
    layer_norm_cond_hidden_act='swish',
    additional_input_layers=x_in,
)


output = CrossEntropy(2)(model.inputs[0:2] + model.outputs)

model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()


class AutoCaption(AutoRegressiveDecoder):
    """img2seq解码器
    """
    @AutoRegressiveDecoder.set_rtype('probas')
    def predict(self, inputs, output_ids, step):
        token_ids, segment_ids, image = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
        return model.predict([token_ids, segment_ids, image])[:, -1]

    def generate(self, inputs, features, topk=1):
        token_ids, segment_ids = tokenizer.encode(inputs, max_length=maxlen)
        output_ids = self.beam_search([token_ids, segment_ids, features], topk)  # 基于beam search
        return tokenizer.decode(output_ids)


autocaption = AutoCaption(
    start_id=None,
    end_id=tokenizer._token_end_id,
    maxlen=maxlen
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     multiple             22417920    Input-Token[0][0]                
                                                                 MLM-Norm[0][0]                   
__________________________________________________________________________________________________
Embedding-S

  'be expecting any data to be passed to {0}.'.format(name))


In [24]:
def just_show():
    samples = [valid_data[i] for i in np.random.choice(len(valid_data), 2, replace=False)]
    for img in samples:
        region = np.random.choice(img['regions'])
        print(u'image_id:', img['image_id'])
        print(u'key_words:', region['keywords'])
        print(u'predict:', autocaption.generate(region['keywords'], region['region_feature']))
        print(u'references:', region['caption'])
        print()
        
        
def caption_eval(epoch, loss):
    just_show()
        
    datasetGTS = {}
    datasetRES = {}
        
    GTS_annotations = []
    RES_annotations = []
    
    samples = [valid_data[i] for i in np.random.choice(len(valid_data), 5, replace=False)]
    
    imgId = 0
    for _, sample in tqdm(enumerate(samples), desc='Reading data'):
        for region in sample['regions']:
            res = {}
            res[u'image_id'] = imgId
            res[u'caption'] = autocaption.generate(region['keywords'], region['region_feature'])
            RES_annotations.append(res)
            
            gts = {}
            gts[u'image_id'] = imgId
            gts[u'caption'] = region['caption']
            GTS_annotations.append(gts)
            
            imgId += 1
            
    imgIds = range(imgId)
    datasetGTS['annotations'] = GTS_annotations
    datasetRES['annotations'] = RES_annotations
    
    print(u'-Calculating scores ...')
    scores = calculate_metrics(imgIds, datasetGTS, datasetRES)
    print(scores)
    
    scores['epoch'] = epoch
    scores['loss']  = loss
    
    save_path = 'models/VisualGenome/conditional_kw/'
    
    with open(save_path + 'caption_eval.txt', "a") as f:
        f.write(str(scores) + '\n')


class Evaluate(keras.callbacks.Callback):
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, epoch, logs=None):
        # 保存模型
        model.save_weights('models/VisualGenome/conditional_kw/model_{}.weights'.format(epoch))
        
#         scores = {}
#         scores['epoch'] = epoch
#         scores['loss']  = logs['loss']
#         save_path = 'models/VisualGenome/conditional_kw/'
    
#         with open(save_path + 'caption_eval.txt', "a") as f:
#             f.write(str(scores) + '\n')
        # 评价指标
        caption_eval(epoch, logs['loss'])

In [25]:
evaluator = Evaluate()
train_generator = data_generator(train_data, batch_size)

In [26]:
model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    callbacks=[evaluator]
)

Epoch 1/1
image_id: 2408745.jpg
key_words: teddy bear bear 


Reading data: 0it [00:00, ?it/s]

predict: the bear is wearing a bear
references: a brown fluffy teddy bear

image_id: 2401689.jpg
key_words: woman pant 
predict: the woman is wearing black pants
references: woman wearing black pants



Reading data: 5it [00:18,  3.75s/it]


-Calculating scores ...
-tokenization...
-setting up scorers...
-computing Bleu score...
{'testlen': 723, 'reflen': 743, 'guess': [723, 575, 427, 283], 'correct': [300, 74, 21, 5]}
ratio: 0.9730820995949219
Bleu_1: 0.404
Bleu_2: 0.225
Bleu_3: 0.134
Bleu_4: 0.080
-computing METEOR score...
METEOR: 0.187
-computing Rouge score...
ROUGE_L: 0.387
-computing CIDEr score...
CIDEr: 1.409
-computing SPICE score...
-Prepare temp input file for the SPICE scorer.
-Start job
-Read and process results.
-Average scores.
-Scores.
SPICE: 0.436
-computing WMD score...
WMD: 0.209
{'Bleu_1': 0.40361684084720756, 'Bleu_2': 0.22478107176531206, 'Bleu_3': 0.13420408604315684, 'Bleu_4': 0.08028173999222438, 'METEOR': 0.1872992422163111, 'ROUGE_L': 0.3872265826337303, 'CIDEr': 1.4085632458342823, 'SPICE': 0.4362373737373737, 'WMD': 0.20920237286456608}


<keras.callbacks.callbacks.History at 0x7f20e1471a90>