In [16]:
import math
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
import cv2
import skimage

import tensorflow.python.platform
from keras.preprocessing import sequence
from collections import Counter

In [17]:
# Directory where training checkpoints are saved and looked up when resuming.
model_path = './models/tensorflow'
# NumPy file with the image feature matrix, loaded via np.load in get_data.
feature_path = './data/feats.npy'
# Tab-separated caption file; get_data reads it as two columns: image, caption.
annotation_path = './data/results_20130124.token'

## Loading data
Load the image embedding features for the Flickr30k dataset from `./data/feats.npy` with NumPy, and load the captions with `pandas` from `./data/results_20130124.token`.

In [18]:
def get_data(annotation_path, feature_path):
    """Load the image features and their captions.

    Args:
        annotation_path: tab-separated file without a header row; the two
            columns are read as ``image`` and ``caption``.
        feature_path: file readable by ``np.load``.

    Returns:
        A tuple ``(features, captions)`` where ``features`` is whatever
        ``np.load`` returns and ``captions`` is the ``caption`` column as a
        NumPy array.
    """
    # Read the annotations first so a missing caption file is reported
    # before the (potentially large) feature file is touched.
    caption_table = pd.read_table(
        annotation_path,
        sep='\t',
        header=None,
        names=['image', 'caption'],
    )
    image_features = np.load(feature_path)
    return image_features, caption_table['caption'].values

In [19]:
# Load the feature matrix and caption array once for the inspection cells below.
feats, captions = get_data(annotation_path, feature_path)

In [20]:
# Sanity check: features and captions should share the same leading dimension.
print(feats.shape)
print(captions.shape)

(158915, 4096)
(158915,)


In [21]:
# Peek at the first raw caption (tokens are separated by single spaces).
print(captions[0])

Two young guys with shaggy hair look at their hands while hanging out in the yard .


In [22]:
def preProBuildWordVocab(sentence_iterator, word_count_threshold=30): # function from Andre Karpathy's NeuralTalk
    """Build the vocabulary lookups and the output-bias initialiser.

    Sentences are lower-cased and split on single spaces; words occurring
    fewer than ``word_count_threshold`` times are dropped.

    Args:
        sentence_iterator: iterable of caption strings.
        word_count_threshold: minimum count for a word to be kept.

    Returns:
        ``(wordtoix, ixtoword, bias_init_vector)``:
        ``wordtoix`` maps ``'#START#'`` to 0 and each kept word to 1..N;
        ``ixtoword`` maps 0 to ``'.'`` and 1..N back to the kept words;
        ``bias_init_vector`` holds the log relative frequency of each index,
        shifted so that its maximum is 0.
    """
    print('Create word counts using word count threshold', word_count_threshold)

    word_counts = Counter()
    nsents = 0
    for nsents, sent in enumerate(sentence_iterator, 1):
        word_counts.update(sent.lower().split(' '))

    # Iteration follows first-seen order, which fixes the index assignment below.
    vocab = [word for word, count in word_counts.items() if count >= word_count_threshold]
    print('Filtered vocabulary from %d to %d' % (len(word_counts), len(vocab)))

    # Index 0 is the start token on the input side and '.' on the output side.
    wordtoix = {'#START#': 0}
    ixtoword = {0: '.'}
    for ix, word in enumerate(vocab, 1):
        wordtoix[word] = ix
        ixtoword[ix] = word

    # The period count is forced to one per sentence before computing frequencies.
    word_counts['.'] = nsents
    frequencies = np.array([1.0 * word_counts[ixtoword[ix]] for ix in ixtoword])
    log_probs = np.log(frequencies / np.sum(frequencies))
    return wordtoix, ixtoword, log_probs - np.max(log_probs)

In [23]:
def crop_image(x, height=227, width=227, flt=True): #boilerplate image reshaping code
    """Load an image with OpenCV, centre-crop its longer side and resize it.

    Args:
        x: path handed to ``cv2.imread``.
        height: target size, passed as the first element of the ``dsize``
            tuple given to ``cv2.resize``.
        width: target size, passed as the second element of ``dsize``.
        flt: when true, convert the pixel data to ``float32`` before resizing.

    Returns:
        The resized image array. Note that OpenCV interprets ``dsize`` as
        (columns, rows), so the array shape is ``(width, height, channels)``;
        this only matters when ``height != width``.

    Raises:
        IOError: if ``cv2.imread`` returns ``None`` (file missing or unreadable).
    """
    img = cv2.imread(x)
    if img is None:
        # Fail with the offending path instead of an opaque AttributeError.
        raise IOError('Could not read image: %s' % x)
    if flt:
        img = img.astype(np.float32)

    if len(img.shape) == 2:
        # Single-channel input: replicate it into three channels.
        img = np.tile(img[:,:,None], 3)
    elif len(img.shape) == 4:
        img = img[:,:,:,0]

    h, w = img.shape[:2]
    if w == h:
        resized_img = cv2.resize(img, (height, width))

    elif h < w:
        # Landscape: scale so the short side fits, then trim the columns evenly.
        resized_img = cv2.resize(img, (int(w * float(height)/h), width))
        crop_len = int((resized_img.shape[1] - height) / 2)
        resized_img = resized_img[:,crop_len:resized_img.shape[1] - crop_len]

    else:
        # Portrait: scale so the short side fits, then trim the rows evenly.
        resized_img = cv2.resize(img, (height, int(h * float(width) / w)))
        crop_len = int((resized_img.shape[0] - width) / 2)
        resized_img = resized_img[crop_len:resized_img.shape[0] - crop_len,:]

    # The integer crop can be off by one pixel; the final resize makes the size exact.
    return cv2.resize(resized_img, (height, width))

In [24]:
class Caption_Generator():
    """LSTM caption model: the image embedding is fed at step 0, words afterwards.

    NOTE(review): the image embedding (width ``dim_hidden``) and the word
    embeddings (width ``dim_embed``) are fed to the same LSTM cell, so the
    model appears to require ``dim_embed == dim_hidden`` - confirm before
    using different sizes.

    The TensorFlow calls follow the pre-1.0 API already used in this file
    (``tf.concat(axis, values)``, ``tf.pack``).
    """

    def __init__(self, dim_in, dim_embed, dim_hidden, batch_size, n_lstm_steps, n_words, init_b):
        """Create the trainable variables.

        Args:
            dim_in: width of the incoming image feature vectors.
            dim_embed: width of the word embeddings.
            dim_hidden: LSTM hidden size (also the image-embedding width).
            batch_size: fixed batch size baked into the placeholders.
            n_lstm_steps: number of unrolled LSTM steps.
            n_words: vocabulary size (rows of the embedding, width of the logits).
            init_b: initial value for the output bias, one entry per word.
        """
        self.dim_in = dim_in
        self.dim_embed = dim_embed
        self.dim_hidden = dim_hidden
        self.batch_size = batch_size
        self.n_lstm_steps = n_lstm_steps
        self.n_words = n_words

        with tf.device("/cpu:0"):
            self.word_embedding = tf.Variable(tf.random_uniform([self.n_words, self.dim_embed], -0.1, 0.1), name='word_embedding')

        self.embedding_bias = tf.Variable(tf.zeros([dim_embed]), name='embedding_bias')

        self.lstm = tf.nn.rnn_cell.BasicLSTMCell(dim_hidden)

        self.img_embedding = tf.Variable(tf.random_uniform([dim_in, dim_hidden], -0.1, 0.1), name='img_embedding')
        self.img_embedding_bias = tf.Variable(tf.zeros([dim_hidden]), name='img_embedding_bias')

        self.word_encoding = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='word_encoding')

        # NumPy builds init_b as float64; cast so it can be added to the float32 logits.
        self.word_encoding_bias = tf.Variable(tf.cast(init_b, tf.float32), name='word_encoding_bias')

    def build_model(self):
        """Build the training graph.

        Returns:
            ``(total_loss, img, caption_placeholder, mask)``: the masked
            cross-entropy summed over all steps and divided by the number of
            unmasked target positions, followed by the three placeholders
            that must be fed (image features, word indices, float mask).
        """
        img = tf.placeholder(tf.float32, [self.batch_size, self.dim_in])
        caption_placeholder = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])
        mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])

        image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias

        state = self.lstm.zero_state(self.batch_size, dtype=tf.float32)

        total_loss = 0.0
        with tf.variable_scope("RNN"):
            for i in range(self.n_lstm_steps):
                if i > 0:
                    # Input at step i is the embedding of the previous word.
                    with tf.device("/cpu:0"):
                        current_embedding = tf.nn.embedding_lookup(self.word_embedding, caption_placeholder[:,i-1]) + self.embedding_bias
                    # Share the LSTM weights created at step 0.
                    tf.get_variable_scope().reuse_variables()
                else:
                    # Step 0 only primes the LSTM with the image; it has no target word.
                    current_embedding = image_embedding

                out, state = self.lstm(current_embedding, state)

                if i > 0:
                    # Build one-hot targets from (row, word-index) pairs.
                    labels = tf.expand_dims(caption_placeholder[:, i], 1)
                    ixs = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
                    concated = tf.concat(1, [ixs, labels])
                    onehot = tf.sparse_to_dense(
                            concated, tf.pack([self.batch_size, self.n_words]), 1.0, 0.0)

                    logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias
                    xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=onehot)
                    # Zero out positions past the end of each caption.
                    xentropy = xentropy * mask[:,i]

                    # Accumulate across steps; overwriting would keep only the last step.
                    total_loss += tf.reduce_sum(xentropy)

            # Average over the unmasked target positions (step 0 has no target).
            total_loss = total_loss / tf.reduce_sum(mask[:,1:])
            return total_loss, img, caption_placeholder, mask


In [25]:
### Parameters ###
# Width of the word embeddings.
dim_embed = 256
# LSTM hidden-state size.
dim_hidden = 256
# Width of one image feature vector; feats.shape printed above is (158915, 4096).
dim_in = 4096
# Number of caption/feature pairs per training step.
batch_size = 128
# NOTE(review): not referenced by the code visible in this notebook (training uses Adam).
momentum = 0.9
# Number of passes over the shuffled training data.
n_epochs = 25

def train(learning_rate=0.001, continue_training=False):
    """Train the caption generator and save a checkpoint after every epoch.

    Reads the module-level settings ``annotation_path``, ``feature_path``,
    ``model_path``, ``dim_in``, ``dim_embed``, ``dim_hidden``, ``batch_size``
    and ``n_epochs``.

    Side effects: writes the index-to-word dictionary with
    ``np.save('data/ixtoword', ...)``, prints the loss for every batch and
    saves checkpoints under ``model_path``.

    Args:
        learning_rate: initial Adam learning rate; multiplied by 0.95 after
            each epoch.
        continue_training: when true, restore the latest checkpoint found in
            ``model_path`` before training.
    """
    tf.reset_default_graph()

    feats, captions = get_data(annotation_path, feature_path)
    wordtoix, ixtoword, init_b = preProBuildWordVocab(captions)

    np.save('data/ixtoword', ixtoword)

    # Shuffle features and captions with the same permutation.
    index = np.arange(len(feats))
    np.random.shuffle(index)

    feats = feats[index]
    captions = captions[index]

    n_words = len(wordtoix)
    maxlen = np.max([len(cap.split(' ')) for cap in captions])

    sess = tf.InteractiveSession()
    try:
        # Argument order matches Caption_Generator(dim_in, dim_embed, dim_hidden, ...).
        caption_generator = Caption_Generator(dim_in, dim_embed, dim_hidden, batch_size, maxlen+2, n_words, init_b)

        loss, image, sentence, mask = caption_generator.build_model()

        saver = tf.train.Saver(max_to_keep=100)
        # Feed the learning rate at run time so the per-epoch decay below
        # actually reaches the optimizer (a Python float would be frozen in).
        lr_placeholder = tf.placeholder(tf.float32, shape=[])
        train_op = tf.train.AdamOptimizer(lr_placeholder).minimize(loss)
        tf.initialize_all_variables().run()

        if continue_training:
            saver.restore(sess,tf.train.latest_checkpoint(model_path))

        for epoch in range(n_epochs):

            # Only full batches are used because the placeholders have a fixed
            # batch dimension; "+ 1" keeps the last batch when it is exactly full.
            batch_starts = range(0, len(feats), batch_size)
            batch_ends = range(batch_size, len(feats) + 1, batch_size)
            for start, end in zip(batch_starts, batch_ends):

                current_feats = feats[start:end]
                current_captions = captions[start:end]

                # Drop the last token of each caption and any out-of-vocabulary word.
                current_caption_ind = [
                    [wordtoix[word] for word in cap.lower().split(' ')[:-1] if word in wordtoix]
                    for cap in current_captions
                ]

                current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=maxlen+1)
                # Prepend a column of zeros (index 0 is '#START#').
                start_column = np.zeros((len(current_caption_matrix), 1), dtype=int)
                current_caption_matrix = np.hstack([start_column, current_caption_matrix]).astype(int)

                # Mask covers the first (non-zero count + 2) positions of every row.
                nonzeros = (current_caption_matrix != 0).sum(axis=1) + 2
                positions = np.arange(current_caption_matrix.shape[1])
                current_mask_matrix = (positions[None, :] < nonzeros[:, None]).astype(np.float32)

                _, loss_value = sess.run([train_op, loss], feed_dict={
                    image: current_feats,
                    sentence : current_caption_matrix,
                    mask : current_mask_matrix,
                    lr_placeholder : learning_rate
                    })

                print("Current Cost: ", loss_value, "\t Epoch {}/{}".format(epoch, n_epochs), "\t Iter {}/{}".format(start,len(feats)))

            print("Saving the model from epoch: ", epoch)
            # NOTE(review): the "+9" offset looks like a leftover from resuming an
            # earlier run - confirm whether checkpoints should be numbered from 0.
            saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch+9)
            learning_rate *= 0.95
    finally:
        # Release the session even when training is interrupted.
        sess.close()

In [15]:
# Start training; interrupting the kernel (KeyboardInterrupt) ends it with a
# short message instead of a traceback.
try:
    train()
except KeyboardInterrupt:
    print('Exiting Training')

Create word counts using word count threshold 30
Filtered vocabulary from 20326 to 2942


TypeError: Tensors in list passed to 'values' of 'Concat' Op have types [int32, float32] that don't all match.