In [1]:
from __future__ import print_function

import os
import cv2
import copy
import json
import numpy as np
from tqdm import tqdm

from keras.layers import *
from keras.models import Model
from bert4keras.layers import Loss
from bert4keras.optimizers import Adam
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.snippets import sequence_padding, is_string
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder

from caption_eval.custom_caption_eval import calculate_metrics

Using TensorFlow backend.


In [2]:
# bert配置
config_path = 'bert-model/uncased_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'bert-model/uncased_L-12_H-768_A-12/bert_model.ckpt'
dict_path = 'bert-model/uncased_L-12_H-768_A-12/vocab.txt'

# 加载并精简词表，建立分词器
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

# 模型配置
maxlen = 64
batch_size = 16
steps_per_epoch = 10
epochs = 1

In [11]:
def read_object_data(folder, valid=False):
    """读取并整理COCO的数据,包括caption, object, attributes 和 relationships , 同时提取目标特征.
    单个数据如下:
    [
     {'image_features': [2048]},
     {'key_words': str, 'caption': str},
     {'key_words': str, 'caption': str},
     ...
    ]
    
    返回数据格式:
    -train:
    [{'key_words': str,
      'features': [2048],
      'caption': str},
    ...
    ]
    
    -valid:
    [{'image_id':str,
      'features': [2048],
      'caption': [str, str, str, str, str],
      'objects_key_words': [str, str, str, str, str]},
    ...  
    ]
    """
    print('-Read data ...')
    res = []
    
    files = os.listdir(folder)
    
    if valid:
        # 读取valid的caption
        data = json.load(open('data/coco2014/annotations/captions_val2014.json'))
        images = {}
        for img in data['images']:
            images[img['id']] = {
                'image_id': img['file_name'],
                'caption': [],
            }
        for caption in data['annotations']:
            images[caption['image_id']]['caption'].append(caption['caption'])
        captions = {}
        for img in images.values():
            captions[img['image_id']] = img['caption']
            
        # 读取image features 和 关键字
        for _, file in tqdm(enumerate(files)):
            file_path = folder + file
            data = np.load(file_path, allow_pickle=True)
        
            image = {}
            image_id = file.replace('npy', 'jpg')
            image['image_id'] = image_id
            image['features'] = np.array(data[0]['image_features'])
            image['caption']  = captions[image_id]
            image['objects_key_words']  = []

            for d in data[1:]:
                image['objects_key_words'].append(d['key_words'])
            
            res.append(image)
    else:
        for _, file in tqdm(enumerate(files)):
            file_path = folder + file
            data = np.load(file_path, allow_pickle=True)
        
            for d in data[1:]:
                d['features'] = np.array(data[0]['image_features'])
            
                res.append(d)
        
    return res


class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=False):
        batch_features, batch_token_ids, batch_segment_ids = [], [], []
        for is_end, D in self.sample(random):

            features = D['features']
            caption = D['caption']
            inputs = D['key_words']

            token_ids, segment_ids = tokenizer.encode(
                inputs, caption, max_length=maxlen
            )
            
            batch_features.append(features)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
                
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_features = sequence_padding(batch_features)
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                yield [batch_token_ids, batch_segment_ids, batch_features], None
                batch_features, batch_token_ids, batch_segment_ids = [], [], []

                
# 加载数据
train_data = read_object_data(
    './data/MSCOCO/annotation/features/train2014/', False
)
valid_data = read_object_data(
    './data/MSCOCO/annotation/features/val2014/', True
)

77it [00:00, 766.19it/s]

-Read data ...


82396it [01:59, 692.08it/s]


-Read data ...


40263it [00:30, 1324.62it/s]


In [5]:
class CrossEntropy(Loss):
    """交叉熵作为loss，并mask掉padding部分
    """
    def compute_loss(self, inputs, mask=None):
        y_true, y_mask, y_pred = inputs
        y_true = y_true[:, 1:]  # 目标token_ids
        y_mask = y_mask[:, 1:]  # segment_ids，刚好指示了要预测的部分
        y_pred = y_pred[:, :-1]  # 预测序列，错开一位
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss

In [6]:
# 条件全连接层
x_in = Input(shape=(2048,), name='image_features')
    
# Bert模型
model = build_transformer_model(
    config_path,
    checkpoint_path,
    application='unilm',
    keep_tokens=keep_tokens,  # 只保留keep_tokens中的字，精简原字表
    layer_norm_cond=x_in,
    layer_norm_cond_hidden_size=512,
    layer_norm_cond_hidden_act='swish',
    additional_input_layers=x_in,
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [7]:
output = CrossEntropy(2)(model.inputs[0:2] + model.outputs)

model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     multiple             22417920    Input-Token[0][0]                
                                                                 MLM-Norm[0][0]                   
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 768)    1536        Input-Segment[0][0]        

  'be expecting any data to be passed to {0}.'.format(name))


In [8]:
class AutoCaption(AutoRegressiveDecoder):
    """img2seq解码器
    """
    @AutoRegressiveDecoder.set_rtype('probas')
    def predict(self, inputs, output_ids, step):
        token_ids, segment_ids, image = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
        return model.predict([token_ids, segment_ids, image])[:, -1]

    def generate(self, inputs, features, topk=1):
        token_ids, segment_ids = tokenizer.encode(inputs, max_length=maxlen)
        output_ids = self.beam_search([token_ids, segment_ids, features], topk)  # 基于beam search
        return tokenizer.decode(output_ids)


autocaption = AutoCaption(
    start_id=None,
    end_id=tokenizer._token_end_id,
    maxlen=maxlen
)

In [9]:
def just_show():
    samples = [valid_data[i] for i in np.random.choice(len(valid_data), 2, replace=False)]
    for D in samples:
        features = D['features']
        inputs = np.random.choice(D['objects_key_words'])
        print(u'image_id:', D['image_id'])
        print(u'key_words:', inputs)
        print(u'predict:', autocaption.generate(inputs, features))
        print(u'references:', D['caption'])
        print()
    
        
def caption_eval():
        
    datasetGTS = {}
    datasetRES = {}
        
    GTS_annotations = []
    RES_annotations = []
    
    samples = [valid_data[i] for i in np.random.choice(len(valid_data), 2, replace=False)]
    
    imgIds = 0
    for _, sample in tqdm(enumerate(samples), desc='Reading data'):
        for inputs in sample['objects_key_words']:
            res = {}
            res[u'image_id'] = imgIds
            res[u'caption'] = autocaption.generate(inputs, sample['features'])
            RES_annotations.append(res)
            
            for caption in sample['caption']:
                gts = {}
                gts[u'image_id'] = imgIds
                gts[u'caption'] = caption
                GTS_annotations.append(gts)
            
            imgIds += 1
            
    imgIds = range(imgIds)
        
    datasetGTS['annotations'] = GTS_annotations
    datasetRES['annotations'] = RES_annotations
    
    print(u'-Calculating scores ...')
    scores = calculate_metrics(imgIds, datasetGTS, datasetRES)
    print(scores)
    
    scores['epoch'] = epoch
    scores['loss']  = loss
    
    if with_key_words:
        save_path = 'models/coco2014/base_kw/'
    else:
        save_path = 'models/coco2014/base/'
    
    with open(save_path + 'caption_eval.txt', "a") as f:
        f.write(str(scores) + '\n')
    model.save_weights(save_path + 'model_{}.weights'.format(epoch))
    

class Evaluate(keras.callbacks.Callback):
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, epoch, logs=None):
        just_show()
        caption_eval(epoch, logs['loss'])

In [10]:
evaluator = Evaluate()
train_generator = data_generator(train_data, batch_size)

In [11]:
model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    callbacks=[evaluator]
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/1
image_id: COCO_val2014_000000202093.jpg
key_words: motorcycle car 
predict: a a. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
references: ['Several large bulls standing around in a grassy field in front of a farm plantation.', 'A group of cows grazing in the green grass. ', 'A herd of cows on a farm in grassy field.', 'Several cows standing in the grass near a few buildings.', 'A small herd of cows in a large grassy field.']

image_id: COCO_val2014_000000278899.jpg
key_words: motorcycle roadway vehicle 


Reading data: 0it [00:00, ?it/s]

predict: a vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle vehicle
references: ['a lot of different types of luggage bags near one another', 'A group of purses and backpacks lumped together.', 'A pile of colorful luggage sitting side by side.', 'a large collection of various bags and suitcases', 'There are many suitcases piled up on top of each other. ']



Reading data: 2it [00:13,  6.95s/it]


-Calculating scores ...
-tokenization...
-setting up scorers...
-computing Bleu score...
{'testlen': 74, 'reflen': 83, 'guess': [74, 64, 62, 61], 'correct': [3, 1, 0, 0]}
ratio: 0.8915662650494992
Bleu_1: 0.036
Bleu_2: 0.022
Bleu_3: 0.000
Bleu_4: 0.000
-computing METEOR score...
METEOR: 0.032
-computing Rouge score...
ROUGE_L: 0.024
-computing CIDEr score...
CIDEr: 0.000
-computing SPICE score...
SPICE: 0.011
-computing WMD score...
WMD: 0.022
{'Bleu_1': 0.03589797376436835, 'Bleu_2': 0.022286156777206235, 'Bleu_3': 1.9214084279784522e-07, 'Bleu_4': 5.664706459066291e-10, 'METEOR': 0.03189847048799763, 'ROUGE_L': 0.024159561727004307, 'CIDEr': 4.290672868937101e-21, 'SPICE': 0.01111111111111111, 'WMD': 0.02167315840198584}


NameError: name 'epoch' is not defined

In [7]:
train_data

[{'caption': 'a woman riding a bike next to a stop sign',
  'key_words': 'woman bike sign ',
  'features': array([0.26279029, 0.23141171, 0.18918106, ..., 0.30394822, 0.02286127,
         0.27747229])},
 {'caption': 'a woman is riding a bicycle past a stop sign',
  'key_words': 'woman bicycle sign ',
  'features': array([0.26279029, 0.23141171, 0.18918106, ..., 0.30394822, 0.02286127,
         0.27747229])},
 {'caption': 'a woman her bike past a stop sign',
  'key_words': 'woman bike sign ',
  'features': array([0.26279029, 0.23141171, 0.18918106, ..., 0.30394822, 0.02286127,
         0.27747229])},
 {'caption': 'a pretty young lady riding a bike by a stop sign',
  'key_words': 'lady bike sign ',
  'features': array([0.26279029, 0.23141171, 0.18918106, ..., 0.30394822, 0.02286127,
         0.27747229])},
 {'caption': 'a large propeller airplane flying through a cloudy sky',
  'key_words': 'airplane sky ',
  'features': array([0.34272781, 0.66812372, 0.1074824 , ..., 0.0915942 , 0.22203

In [10]:
valid_data

[{'image_id': 'COCO_val2014_000000178078.jpg',
  'features': array([0.02026431, 0.36134467, 0.26312259, ..., 0.1018953 , 0.32963011,
         0.32357308]),
  'caption': ['a line of cars with a motorcycle in front',
   'A bike is posed in front of an old car.',
   'A red motorcycle parked in front of a parked car.',
   'Motorcycle parked on roadway with other vehicles nearby.',
   'The motorcycle is parallel parked sideways in front of cars.'],
  'objects_key_words': ['bike car ',
   'motorcycle car ',
   'motorcycle roadway vehicle ',
   'motorcycle car ']},
 {'image_id': 'COCO_val2014_000000121572.jpg',
  'features': array([0.12468677, 0.40851697, 0.19564483, ..., 1.2679621 , 0.00757444,
         0.13152997]),
  'caption': ['a couple of buses that are lined up by some buildings',
   'Some busses are parked along the city curb.',
   'Buses parked along a curb beside old buildings ',
   'A red double decker bus is in front of a white bus on the side of a road. ',
   'there are many doub

In [10]:
GTS_annotations, RES_annotations = caption_eval()

Reading data: 2it [00:00, 2007.32it/s]


In [11]:
GTS_annotations

[{'image_id': 0,
  'caption': 'two cats in an office space looking at the camera'},
 {'image_id': 0,
  'caption': 'two cats sitting together on a desk next to a keyboard'},
 {'image_id': 0, 'caption': 'two cats laying on a desk by a keyboard'},
 {'image_id': 0, 'caption': 'two cats sitting on a desk behind a keyboard'},
 {'image_id': 0,
  'caption': 'two cats lying on desk facing camera with a keyboard in foreground'},
 {'image_id': 1,
  'caption': 'two cats in an office space looking at the camera'},
 {'image_id': 1,
  'caption': 'two cats sitting together on a desk next to a keyboard'},
 {'image_id': 1, 'caption': 'two cats laying on a desk by a keyboard'},
 {'image_id': 1, 'caption': 'two cats sitting on a desk behind a keyboard'},
 {'image_id': 1,
  'caption': 'two cats lying on desk facing camera with a keyboard in foreground'},
 {'image_id': 2,
  'caption': 'two cats in an office space looking at the camera'},
 {'image_id': 2,
  'caption': 'two cats sitting together on a desk nex

In [12]:
RES_annotations

[{'image_id': 0, 'caption': ''},
 {'image_id': 1, 'caption': ''},
 {'image_id': 2, 'caption': ''},
 {'image_id': 3, 'caption': ''},
 {'image_id': 4, 'caption': ''},
 {'image_id': 5, 'caption': ''},
 {'image_id': 6, 'caption': ''},
 {'image_id': 7, 'caption': ''},
 {'image_id': 8, 'caption': ''}]

In [13]:
len(train_data)

347074

In [14]:
len(valid_data)

40263