# Encode the image as a feature vector by running it through a pre-trained InceptionV3 net.

In [None]:
import json
from PIL import Image
import numpy as np
import os
from keras.applications.inception_v3 import InceptionV3


def get_lists(anno_path):
    """Read a COCO-style caption file and build two lookup tables.

    Args:
        anno_path: Path to a JSON file whose top level holds an
            ``annotations`` list and an ``images`` list.

    Returns:
        A tuple ``(anno_list, pic_list)``. ``anno_list`` maps ``image_id`` to
        a caption; if the same image id appears in several annotation
        entries, the entry read last wins. ``pic_list`` maps an image ``id``
        to its ``file_name``.
    """
    with open(anno_path, 'r') as handle:
        payload = json.load(handle)

    captions, pictures = payload['annotations'], payload['images']

    anno_list = {}
    for entry in captions:
        # A repeated image id overwrites the caption stored earlier.
        anno_list[entry['image_id']] = entry['caption']

    pic_list = {}
    for entry in pictures:
        pic_list[entry['id']] = entry['file_name']

    return anno_list, pic_list


if __name__ == '__main__':
    # Script-local import: the pre-trained InceptionV3 weights expect inputs
    # scaled by this helper, not raw 0-255 pixel values.
    from keras.applications.inception_v3 import preprocess_input

    # Headless InceptionV3 with global average pooling, used purely as a
    # fixed feature extractor for 480x640 RGB images.
    model = InceptionV3(include_top=False, input_shape=(480, 640, 3), pooling='avg')
    anno_path = '/home/minheng/coco/annotations/annotations/captions_train2014.json'
    anno_list, pic_list = get_lists(anno_path)
    X = []
    Y = []
    path = '/home/minheng/coco/images/train2014/'
    for key, caption in anno_list.items():
        file_name = pic_list[key]
        # The context manager closes the file handle once the pixels have
        # been copied into the array; PIL's resize takes (width, height).
        with Image.open(os.path.join(path, file_name)) as img:
            pic = np.expand_dims(np.array(img.resize((640, 480))), axis=0)
        # Only 3-channel images fit the network input; anything else
        # (presumably grayscale or other modes) is skipped.
        if pic.shape == (1, 480, 640, 3):
            # Cast first: the scaling is a float operation and the decoded
            # image is uint8.
            x = model.predict(preprocess_input(pic.astype('float32')))
            X.append(x)
            Y.append(caption)
    # Each prediction keeps its batch axis, so X stacks to (N, 1, feature_dim).
    X = np.array(X)
    Y = np.array(Y)
    np.save('coco_pre_prop_X_new', X)
    np.save('coco_pre_prop_Y_new', Y)

In [6]:
import numpy as np
# Reload the arrays written by the feature-extraction step:
# X holds the InceptionV3 features, Y the matching caption strings.
X = np.load('coco_pre_prop_X_new.npy')
Y = np.load('coco_pre_prop_Y_new.npy')

In [5]:
# Sanity check: each stored feature still carries the batch axis from model.predict.
X[0].shape

(1, 2048)

In [6]:
# Sanity check: the caption paired with the first feature vector.
Y[0]

'A clock that blends in with the wall hangs in a bathroom. '

# Encode the label using a pre-trained word embedding model (GloVe)

# First tokenize the words and pad each caption to a fixed length with 0s. Then map each caption to the embedding space. Finally, generate a mask for the loss calculation: in the mask, 1 marks an actual word token and 0 marks a padding token.

In [12]:
import numpy as np
import os
import math
from keras.layers import Embedding, Input, Dense, LSTM, RepeatVector,concatenate, Reshape,multiply
from keras.models import Sequential, Model
import keras
from keras.callbacks import ModelCheckpoint, Callback, LearningRateScheduler
from keras import optimizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd

def get_emb(tokenizer, glove_path='glove.6B.300d.txt', num_words=5000,
            embedding_dim=300):
    """Build an embedding matrix for the tokenizer's vocabulary from GloVe.

    Args:
        tokenizer: Object exposing ``word_index``, a dict mapping each word
            to its integer index.
        glove_path: Text file with one ``word v1 v2 ...`` entry per line.
        num_words: Number of rows in the matrix; words whose index is
            ``>= num_words`` are ignored.
        embedding_dim: Length of each vector in ``glove_path``.

    Returns:
        A ``(num_words, embedding_dim)`` array. Row ``i`` holds the vector of
        the word with index ``i``; rows for words missing from the GloVe file
        (and any index no word maps to) stay all-zero.
    """
    # Only these words can end up in the matrix, so only their vectors need
    # to be parsed; everything else in the (large) GloVe file is skipped.
    wanted = {word: idx for word, idx in tokenizer.word_index.items()
              if idx < num_words}

    embedding_matrix = np.zeros((num_words, embedding_dim))
    # Explicit encoding: the GloVe file is UTF-8 and the platform default
    # may not be. The context manager closes the file even on error.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # tolerate blank lines
            idx = wanted.get(values[0])
            if idx is not None:
                embedding_matrix[idx] = np.asarray(values[1:], dtype='float32')
    return embedding_matrix


def emb_model(n, embedding_matrix, y):
    """Map padded index sequences to their embedding vectors.

    Args:
        n: Length of every sequence in ``y``.
        embedding_matrix: ``(vocab_size, embedding_dim)`` array of frozen
            embedding weights.
        y: Integer array of word indices, ``n`` per row.

    Returns:
        The output of the frozen Embedding layer for ``y``, i.e. one
        ``embedding_dim``-long vector per index.
    """
    # Take the layer sizes from the matrix and the sequence length from `n`
    # so the function stays correct for other vocabularies or pad lengths.
    vocab_size, embedding_dim = embedding_matrix.shape
    input_data = Input(shape=[n])
    embedding_layer = Embedding(vocab_size, embedding_dim,
                                weights=[embedding_matrix],
                                input_length=n, trainable=False)(input_data)
    model = Model(input_data, embedding_layer)
    return model.predict(y)

if __name__ == '__main__':
    # Captions and image features saved by the feature-extraction step.
    Y_emb = np.load('coco_pre_prop_Y_new.npy')
    X = np.load('coco_pre_prop_X_new.npy')
    #X = np.reshape(X, [X.shape[0], X.shape[-1]])
    # Tokenizer capped at 5000 words, matching the 5000-row matrix get_emb builds.
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(Y_emb)
    sequences = tokenizer.texts_to_sequences(Y_emb)
    # Every caption becomes exactly 25 indices: longer ones are cut at the
    # end, shorter ones are padded at the end.
    data = pad_sequences(sequences, maxlen=25,truncating='post', padding='post')
    embedding_matrix = get_emb(tokenizer)
    # Regression targets: one embedding vector per caption position.
    Y_target = emb_model(25,embedding_matrix, data)
    # Positions whose target vector sums to zero are masked out.
    # NOTE(review): words with no GloVe entry also have all-zero rows in
    # embedding_matrix, so they are masked exactly like padding — confirm
    # this is intended.
    mask = Y_target.sum(axis = -1, keepdims = True) != 0
    # Repeat along the last axis so the mask has the same shape as Y_target.
    mask = np.repeat(mask, 300,axis = 2)

# Use 100 samples for base case testing

In [3]:
def get_lists(anno_path):
    """Parse a COCO-style caption JSON file into two dictionaries.

    Args:
        anno_path: Path to a JSON file with top-level ``annotations`` and
            ``images`` lists.

    Returns:
        ``(anno_list, pic_list)``: the first maps ``image_id`` to a caption
        (the last annotation seen for an image id is the one kept), the
        second maps an image ``id`` to its ``file_name``.
    """
    with open(anno_path, 'r') as handle:
        payload = json.load(handle)

    captions, pictures = payload['annotations'], payload['images']

    anno_list = {}
    for record in captions:
        # Same image id again: the newer caption replaces the older one.
        anno_list[record['image_id']] = record['caption']

    pic_list = {}
    for record in pictures:
        pic_list[record['id']] = record['file_name']

    return anno_list, pic_list

In [4]:
import json
from PIL import Image
import numpy as np
import os
import re
# COCO 2014 training captions; get_lists returns the image_id -> caption
# and id -> file_name lookups.
anno_path = '/home/minheng/coco/annotations/annotations/captions_train2014.json'
anno_list, pic_list = get_lists(anno_path)

In [5]:
# Collect the first 100 usable (image, caption) pairs for a quick baseline.
X_100 = []
Y_100 = []
path = '/home/minheng/coco/images/train2014/'
i = 0
for key in anno_list.keys():
    # Stop as soon as 100 samples are collected instead of walking the rest
    # of the annotation dict without doing any work.
    if i >= 100:
        break
    file_name = pic_list[key]
    # The context manager closes the file handle once the pixels have been
    # copied into the array; PIL's resize takes (width, height).
    with Image.open(os.path.join(path, file_name)) as img:
        pic = np.expand_dims(np.array(img.resize((320, 240))), axis=0)
    # Keep only 3-channel images; anything else is skipped and not counted.
    if pic.shape == (1, 240, 320, 3):
        X_100.append(pic)
        Y_100.append(anno_list[key])
        i += 1
X_100 = np.array(X_100)
Y_100 = np.array(Y_100)

# Use a bag-of-words approach to encode y

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Encode each caption as a vector of per-word counts over the vocabulary
# found in the 100 captions.
vectorizer = CountVectorizer(min_df=1)
Y_tran = vectorizer.fit_transform(Y_100)
# Densify the result; Y_100 is rebound from caption strings to the count matrix.
Y_100 = Y_tran.toarray()

In [8]:
# Drop the singleton axis that np.expand_dims added to every image when it was loaded.
X_100 = np.reshape(X_100, newshape = [100,240,320,3])

# Build a base case using one dense layer

In [9]:
from keras.models import Sequential
from keras.layers import Dense, Flatten

Using TensorFlow backend.


In [10]:
# Baseline: flatten the raw pixels and predict the bag-of-words vector with
# a single dense layer.
base = Sequential()
base.add(Flatten(input_shape=[240, 320, 3]))
# One output unit per vocabulary word. The width is read from the target
# matrix so it keeps matching if the sample set (and its vocabulary) changes.
base.add(Dense(Y_100.shape[1], activation='sigmoid'))

In [11]:
# NOTE(review): the targets are bag-of-words count vectors and the output
# layer is sigmoid, while categorical_crossentropy assumes a single class
# distribution per sample — check whether binary_crossentropy (with binary
# targets) was intended.
base.compile(optimizer = 'adam',loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [12]:
# Print the layer table and parameter counts of the baseline.
base.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 230400)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 368)               84787568  
Total params: 84,787,568.0
Trainable params: 84,787,568.0
Non-trainable params: 0.0
_________________________________________________________________


In [13]:
# Fit the baseline on the 100 samples with the library's default training settings.
base.fit(X_100,Y_100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ffa2f290f98>

# Develop the LSTM model; this is one-to-many learning. The Google model feeds the output of the current time step as the input to the next time step. Due to the restrictions of Keras, this approach is not possible unless I write a custom LSTM layer, so instead I modify the Google approach a little.

# Model structure

In [7]:
def show_and_tell():
    """Build the captioning model: one image feature -> 25 word vectors.

    The 2048-long image feature is projected to 300 dimensions, repeated for
    25 time steps and passed through two stacked LSTMs. A linear Dense layer
    then produces one 300-long vector per step, which is multiplied
    element-wise by a mask input so masked positions contribute zeros.

    Returns:
        An uncompiled Keras ``Model`` with inputs ``img_fea`` of shape
        ``(2048,)`` and ``mask_input`` of shape ``(25, 300)``, and a single
        output of shape ``(25, 300)``.
    """
    initializer = keras.initializers.RandomUniform(minval=-0.08, maxval=0.08)

    img_fea = Input(shape=[2048], name='img_fea')
    dense1 = Dense(300, kernel_initializer=initializer)(img_fea)
    # Feed the same projected image vector to every one of the 25 steps.
    rep = RepeatVector(n=25)(dense1)

    lstm_cell = LSTM(300, return_sequences=True,
                     kernel_initializer=initializer, dropout=0.25)(rep)
    lstm_cell2 = LSTM(300, return_sequences=True,
                      kernel_initializer=initializer, dropout=0.25)(lstm_cell)
    # The targets are real-valued embedding vectors, so the output must be
    # unconstrained. A softmax here would force every step's 300 values to be
    # positive and sum to 1, which can never match such targets.
    word_vec = Dense(300, kernel_initializer=initializer)(lstm_cell2)

    mask_input = Input(shape=[25, 300], name='mask_input')
    final_output = multiply([word_vec, mask_input])

    # Named `model` rather than `show_and_tell` so the function's own name is
    # not shadowed inside its body.
    model = Model(inputs=[img_fea, mask_input], outputs=[final_output])
    return model

In [19]:
# Build the model. NOTE(review): this rebinds the name `show_and_tell` from
# the builder function to the model instance, so the builder cannot be
# called again in this session without re-running its defining cell.
show_and_tell = show_and_tell()

In [20]:
# Print the layer table and parameter counts of the caption model.
show_and_tell.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
img_fea (InputLayer)             (None, 2048)          0                                            
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 300)           614700                                       
____________________________________________________________________________________________________
repeat_vector_1 (RepeatVector)   (None, 25, 300)       0                                            
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 25, 300)       721200                                       
___________________________________________________________________________________________

# Develop the model

# Overfit 5 samples

In [22]:
from sklearn.metrics import r2_score
def r2(y_true, y_pred):
    """Return scikit-learn's ``r2_score`` for ``y_true`` and ``y_pred``.

    NOTE(review): a later cell redefines ``r2`` with Keras backend ops; once
    that cell has run, this version is no longer the one bound to the name.
    """
    return r2_score(y_true, y_pred)

In [25]:
from keras import backend as K

def r2(y_true, y_pred):
    """Coefficient of determination (R^2) expressed with Keras backend ops.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predictions, same shape as ``y_true``.

    Returns:
        ``1 - SS_res / (SS_tot + epsilon)``, where ``SS_res`` is the summed
        squared error and ``SS_tot`` the summed squared deviation of
        ``y_true`` from its overall mean.
    """
    residual = y_true - y_pred
    centered = y_true - K.mean(y_true)
    ss_res = K.sum(K.square(residual))
    ss_tot = K.sum(K.square(centered))
    # Epsilon keeps the division finite when y_true has no variance.
    return 1 - ss_res / (ss_tot + K.epsilon())

In [31]:
# Sanity check: try to overfit the first 5 samples. Building and compiling
# are commented out here, so this relies on `show_and_tell` already being a
# compiled model from an earlier run.
#show_and_tell = show_and_tell()
#show_and_tell.compile(loss='mse', optimizer='sgd', metrics=[r2])
show_and_tell.fit({'img_fea':X[:5],'mask_input':mask[:5]}, Y_target[:5],epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd7469d6048>

# Train the whole thing

In [None]:
def step_decay(epoch):
    """Step-wise learning-rate schedule: start at 2 and halve periodically.

    Args:
        epoch: Zero-based epoch index.

    Returns:
        ``2 * 0.5 ** floor((epoch + 1) / 8)`` — the rate is halved each time
        ``epoch + 1`` crosses a multiple of 8.
    """
    base_rate, factor, period = 2, 0.5, 8
    completed_periods = math.floor((epoch + 1) / period)
    return base_rate * factor ** completed_periods

if __name__ == '__main__':
    # lr=0.0 is only a placeholder: the LearningRateScheduler callback below
    # supplies the rate from step_decay.
    op = optimizers.SGD(lr=0.0, decay=0.0, nesterov=False)
    # NOTE(review): this rebinds `show_and_tell` from the builder function to
    # the model instance; if the name already refers to a model (an earlier
    # cell does the same rebinding), this call will not build a new model.
    show_and_tell = show_and_tell()
    show_and_tell.compile(loss='mse', optimizer=op, metrics = [r2])
    lrate = LearningRateScheduler(step_decay)
    board = keras.callbacks.TensorBoard()
    # Train on the full data set, holding out 30% for validation.
    show_and_tell.fit({'img_fea':X,'mask_input':mask}, Y_target,epochs = 50,validation_split=0.3, callbacks=[lrate,board])
    show_and_tell.save('modify8.h5')

# Output 

In [1]:
from keras import backend as K
from keras.models import load_model
# Reload the trained model. The custom `r2` metric is passed through
# custom_objects, so `r2` must already be defined in this session.
show_and_tell = load_model('modify8.h5',custom_objects={'r2':r2})

Using TensorFlow backend.


In [13]:
import numpy as np
# All-ones mask so that none of the 25 output steps is zeroed at inference.
test_mask = np.ones([1, 25, 300])

pre = show_and_tell.predict({'img_fea': X[0], 'mask_input': test_mask})
# Drop the batch axis: one 300-long predicted vector per time step.
pre = np.reshape(pre, newshape=[25, 300])

# Score every predicted vector against every embedding row with one matrix
# product, then keep the best-scoring word index for each step.
# NOTE(review): the score is a raw dot product, which favours embedding rows
# with a large norm — confirm whether cosine similarity or Euclidean distance
# was intended for the nearest-word lookup.
out = np.argmax(pre.dot(embedding_matrix.T), axis=1).tolist()

In [15]:
# Predicted word index for each of the 25 time steps.
out

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]