In [1]:
## library
# GPU selection: set both CUDA variables before any GPU-using library is imported.
import os
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # enumerate devices in PCI bus order
    "CUDA_VISIBLE_DEVICES": "0",         # expose only device 0 to this process
})

# minimum
import numpy as np

In [326]:
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Input, Flatten, AveragePooling2D, GlobalAveragePooling2D
from tensorflow.keras.layers import Dense, LSTM, Embedding, Masking, Dropout
from tensorflow.keras.layers import Input, Lambda, RepeatVector, Reshape
from tensorflow.keras.layers import TimeDistributed, Concatenate, Dot, Add
from tensorflow.keras import backend as K

In [5]:
# Load the pre-processed Flickr30k arrays from disk.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects;
# only point these paths at files from a trusted source.
def _load_pickled_npy(path):
    """Read a .npy file with pickled-object support enabled."""
    return np.load(path, allow_pickle=True)

flickr30k_images = _load_pickled_npy('/home/librarian/corpus/flickr30k/flickr30k_vectors49-resnet50.npy')

test_image_ids = _load_pickled_npy('preprocessed_flicker30k/test_image_ids.npy')
train_image_ids = _load_pickled_npy('preprocessed_flicker30k/train_image_ids.npy')
# `[None][0]` adds a leading axis and then takes element 0.
# NOTE(review): presumably this unwraps a 0-d object array holding a Python
# container (e.g. a dict) -- confirm against the preprocessing script.
image_ids_indices = _load_pickled_npy('preprocessed_flicker30k/image_ids_indices.npy')[None][0]
ix2word = _load_pickled_npy('preprocessed_flicker30k/ix2word.npy')[None][0]
captions = _load_pickled_npy('preprocessed_flicker30k/captions.npy')

# Length of the first caption's token sequence, minus two.
# NOTE(review): the "- 2" presumably discounts start/end markers -- confirm.
first_caption_tokens = captions[0, 1]
max_len = len(first_caption_tokens) - 2

def ixs2sent(ixs):
    """Translate token indices into words via ``ix2word``, dropping '<pad/>' entries.

    Returns a list of the looked-up words, in the order of ``ixs``.
    """
    words = []
    for ix in ixs:
        word = ix2word[ix]
        if word == '<pad/>':
            continue
        words.append(word)
    return words

In [7]:
# Build the training and testing arrays: one entry per caption whose image id
# belongs to the corresponding split.
#
# Membership is checked against Python sets. `image_id in <ndarray>` compares
# against every element of the array on each test, which makes the original
# four comprehensions scale with len(captions) * len(ids).
_train_id_set = set(np.ravel(train_image_ids).tolist())
_test_id_set = set(np.ravel(test_image_ids).tolist())


def _select_captions(wanted_ids):
    """Return (image_indices, sentences) for captions whose image id is in ``wanted_ids``.

    Both arrays are aligned: position i of each refers to the same caption, in
    the order the captions appear in ``captions``.
    """
    rows = [
        (image_ids_indices[image_id], sent)
        for image_id, sent in captions
        if image_id in wanted_ids
    ]
    img_indices = np.array([img_index for img_index, _ in rows])
    sents = np.array([sent for _, sent in rows])
    return img_indices, sents


# train
X_img_indices_train, X_sents_train = _select_captions(_train_id_set)

# test
X_img_indices_test, X_sents_test = _select_captions(_test_id_set)


In [8]:
def X_generator(batch_size=32, is_train=True):
    """Yield shuffled mini-batches forever, for use with Keras generator training.

    Parameters
    ----------
    batch_size : int
        Number of captions per batch. Only full batches are produced; the
        remainder of each pass (len(split) % batch_size captions) is skipped.
    is_train : bool
        True selects the training arrays, False the test arrays.

    Yields
    ------
    ([sents[:, :-1], image_features], targets)
        ``sents[:, :-1]`` is each caption without its last token,
        ``image_features`` are the rows of ``flickr30k_images`` for the batch,
        and ``targets`` is ``sents[:, 1:]`` (the caption without its first
        token) with a trailing axis of size 1 added.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or is larger than the selected split.
        Without this check the generator would loop forever without yielding.
    """
    if is_train:
        X_img_indices = X_img_indices_train
        X_sents = X_sents_train
    else:
        X_img_indices = X_img_indices_test
        X_sents = X_sents_test

    n_samples = len(X_img_indices)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))
    steps_per_epoch = n_samples // batch_size
    if steps_per_epoch < 1:
        raise ValueError(
            "batch_size=%r exceeds the number of available samples (%d); "
            "no full batch can be produced" % (batch_size, n_samples)
        )

    X_indices = np.arange(n_samples)

    while True:
        # Reshuffle in place at the start of every pass over the data.
        np.random.shuffle(X_indices)

        for step in range(steps_per_epoch):
            # One slice of the shuffled order selects both captions and images,
            # so the two stay aligned.
            batch = X_indices[step * batch_size:(step + 1) * batch_size]
            sents = X_sents[batch]
            img_indices = X_img_indices[batch]
            yield ([sents[:, :-1], flickr30k_images[img_indices]], np.expand_dims(sents[:, 1:], 2))
            

In [365]:
def build_model(emb_size=128, vfs_size=128, grid_size=7, visual_feature_size=2048,
                dropout_rate=0.1, fusion='none'):
    """Build, summarize and compile the LSTM caption decoder conditioned on image features.

    The model takes a token-index sequence of length ``max_len + 1`` and a
    ``(grid_size**2, visual_feature_size)`` visual feature map, and outputs a
    softmax over ``len(ix2word)`` entries at every time step. ``max_len`` and
    ``ix2word`` are read from module scope. Calling with no arguments builds
    the same architecture as the original hard-coded version.

    Parameters
    ----------
    emb_size : int
        Size of the word embeddings and of the LSTM state.
    vfs_size : int
        Output size of the Dense layer applied to each region's visual features.
    grid_size : int
        Side of the square region grid (7 -> 49 regions).
    visual_feature_size : int
        Feature size per region in the visual input (2048 for ResNet50).
    dropout_rate : float
        Dropout rate for the embedding Dropout layer and the LSTM ``dropout``.
    fusion : {'none', 'simple', 'attention'}
        What is fed to the output layer:
        'none'      -- the LSTM outputs only (default, original behaviour);
        'simple'    -- Dense(relu) over LSTM outputs concatenated with the
                       region-averaged visual vector;
        'attention' -- LSTM outputs concatenated with a per-step weighted sum
                       of the region features (softmax weights over regions).

    Returns
    -------
    The compiled Keras ``Model`` with inputs ``[delayed_sentence, visual_features]``
    (optimizer 'adam', loss 'sparse_categorical_crossentropy').

    Raises
    ------
    ValueError
        If ``fusion`` is not one of the supported modes.
    """
    if fusion not in ('none', 'simple', 'attention'):
        raise ValueError(
            "fusion must be 'none', 'simple' or 'attention', got %r" % (fusion,)
        )

    # dimensionalities derived from module-level data and the arguments
    seq_len = max_len + 1
    vocab_size = len(ix2word)
    regions_size = grid_size * grid_size

    ### repeat a per-sample vector along the time axis and append it to a sequence
    # `steps` is bound as a default argument, as in the original helper, so the
    # function does not depend on a closure when wrapped in a Lambda layer.
    def feature_fusion(x, steps=seq_len):
        seq, vec = x
        tiled = K.repeat_elements(K.expand_dims(vec, 1), steps, 1)
        return K.concatenate([seq, tiled], 2)

    ### weighted sum over regions, one set of weights per time step
    def apply_attention(x, steps=seq_len):
        weights, regions = x
        # (batch, regions, feat) -> (batch, steps, regions, feat)
        tiled = K.repeat_elements(K.expand_dims(regions, 1), steps, 1)
        # weight every region and sum the region axis away -> (batch, steps, feat)
        return K.sum(K.expand_dims(weights, 3) * tiled, 2)

    ## inputs
    # word embeddings
    delayed_sentence = Input(shape=[seq_len])
    e_t = Embedding(vocab_size, emb_size)(delayed_sentence)
    e_t = Dropout(dropout_rate)(e_t)

    # visual features, projected region-wise to vfs_size
    visual_features = Input(shape=[regions_size, visual_feature_size])
    c = Dense(vfs_size, activation='relu')(visual_features)

    # average visual features over all regions
    c_g = GlobalAveragePooling2D()(Reshape([grid_size, grid_size, vfs_size])(c))
    # Flatten leaves the already 2-D pooled tensor unchanged; it is kept so the
    # layer layout matches models built by the original code.
    c_g = Flatten()(c_g)

    # fusing two modalities
    ec_t = Lambda(feature_fusion)([e_t, c_g])

    # LSTM-language model
    h_t = LSTM(emb_size, dropout=dropout_rate, return_sequences=True)(ec_t)

    # optional second fusion of the language state with the image
    if fusion == 'attention':
        _hc_t = Lambda(feature_fusion)([h_t, c_g])
        z_t = Dense(emb_size, activation='tanh')(_hc_t)
        a_t = Dense(regions_size, activation='softmax')(z_t)
        c_t = Lambda(apply_attention)([a_t, c])
        final = Concatenate()([h_t, c_t])
    elif fusion == 'simple':
        _hc_t = Lambda(feature_fusion)([h_t, c_g])
        final = Dense(emb_size, activation='relu')(_hc_t)
    else:
        # no extra fuse
        final = h_t

    out = Dense(vocab_size, activation='softmax')(final)
    model = Model([delayed_sentence, visual_features], out)

    model.summary()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    return model


In [366]:
# Build the captioning model, then start an empty list that collects the
# object returned by each training run.
model = build_model()
history = []

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_79 (InputLayer)           (None, 49, 2048)     0                                            
__________________________________________________________________________________________________
dense_104 (Dense)               (None, 49, 128)      262272      input_79[0][0]                   
__________________________________________________________________________________________________
input_78 (InputLayer)           (None, 26)           0                                            
__________________________________________________________________________________________________
reshape_37 (Reshape)            (None, 7, 7, 128)    0           dense_104[0][0]                  
__________________________________________________________________________________________________
embedding_

In [367]:
# Staged training: each (epochs, batch_size) pair is one separate fit run.
# NOTE(review): fit_generator is deprecated in newer TensorFlow releases in
# favour of Model.fit -- confirm against the installed version before upgrading.
training_schedule = [(1, 64), (1, 128), (16, 512)]

for epochs, batch_size in training_schedule:
    # Only full batches are counted.
    train_steps = len(X_sents_train) // batch_size
    val_steps = len(X_sents_test) // batch_size

    h = model.fit_generator(
        generator=X_generator(batch_size=batch_size, is_train=True),
        steps_per_epoch=train_steps,
        validation_data=X_generator(batch_size=batch_size, is_train=False),
        validation_steps=val_steps,
        epochs=epochs,
    )
    history.append(h)


Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [369]:
# Write the trained model to disk (file name relative to the working directory).
model.save(filepath="simple_caption_model.h5")