In [31]:
def preProBuildWordVocab(sentence_iterator, word_count_threshold=5):
    # borrowed this function from NeuralTalk
    print ('preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold, ))

    word_counts = {}
    nsents = 0

    for sent in sentence_iterator:
        nsents += 1
        tmp_sent = sent.lower().split(' ')
        if '' in tmp_sent:
            tmp_sent.remove('')

        for w in tmp_sent:
           word_counts[w] = word_counts.get(w, 0) + 1

    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print ('filtered words from %d to %d' % (len(word_counts), len(vocab)))

    ixtoword = {}
    ixtoword[0] = '<bos>'
    ixtoword[1] = '<eos>'
    ixtoword[2] = '<pad>'
    ixtoword[3] = '<unk>'

    wordtoix = {}
    wordtoix['<bos>'] = 0
    wordtoix['<eos>'] = 1
    wordtoix['<pad>'] = 2
    wordtoix['<unk>'] = 3

    for idx, w in enumerate(vocab):
        wordtoix[w] = idx + 4
        ixtoword[idx+4] = w

    word_counts['<eos>'] = nsents
    word_counts['<bos>'] = nsents
    word_counts['<pad>'] = nsents
    word_counts['<unk>'] = nsents

    bias_init_vector = np.array([1.0 * word_counts[ ixtoword[i] ] for i in ixtoword])
    bias_init_vector /= np.sum(bias_init_vector) # normalize to frequencies
    bias_init_vector = np.log(bias_init_vector)
    bias_init_vector -= np.max(bias_init_vector) # shift to nice numeric range

    return wordtoix, ixtoword, bias_init_vector


In [36]:


import pickle
import numpy as np
batch_size = 50 # Being support batch_size
num_boxes = 50 # number of Detected regions in each image
feats_dim = 4096 # feature dimensions of each regions
project_dim = 1024 # project the features to one vector, which is 1024 dimensions

sentRNN_lstm_dim = 512 # the sentence LSTM hidden units
sentRNN_FC_dim = 1024 # the fully connected units
wordRNN_lstm_dim = 512 # the word LSTM hidden units
word_embed_dim = 1024 # the learned embedding vectors for the words

S_max = 6
N_max = 30
T_stop = 0.5

n_epochs = 500
learning_rate = 0.0001

open('E:\Projects\Story-Telling-Using-Show-Attend-and-tell\genomedata\paragraphs_v1.json').read()

img2paragraph = pickle.load(open('E:\Projects\Story-Telling-Using-Show-Attend-and-tell\img2paragraph','rb'))
all_sentences = []
for key, paragraph in img2paragraph.items():
    for each_sent in paragraph[1]:
        each_sent.replace(',', ' ,')
        all_sentences.append(each_sent)
word2idx, idx2word, bias_init_vector = preProBuildWordVocab(all_sentences, word_count_threshold=2)
np.save('E:\Projects\Story-Telling-Using-Show-Attend-and-tell\idx2word_batch', idx2word)

img2paragraph_modify = {}
for img_name, img_paragraph in img2paragraph.items():
    img_paragraph_1 = img_paragraph[1]

    if '' in img_paragraph_1:
        img_paragraph_1.remove('')
    if ' ' in paragraph[1]:
        img_paragraph_1.remove(' ')   
    img_num_sents = len(img_paragraph_1)
    if img_num_sents > S_max:
        img_num_sents = S_max
    img_num_distribution = np.zeros([S_max], dtype=np.int32)
    img_num_distribution[img_num_sents-1:] = 1

    img_captions_matrix = np.ones([S_max, N_max+1], dtype=np.int32) * 2 # zeros([6, 50])
    for idx, img_sent in enumerate(img_paragraph_1):
        # the number of sentences is img_num_sents
        if idx == img_num_sents:
            break

        # because we treat the ',' as a word
        img_sent = img_sent.replace(',', ' ,')

       
        if len(img_sent)>=3 and img_sent[0] == ' ' and img_sent[1] != ' ':
            img_sent = img_sent[1:]
        elif len(img_sent)>=3 and img_sent[0] == ' ' and img_sent[1] == ' ' and img_sent[2] != ' ':
            img_sent = img_sent[2:]

        # Be careful the last part in a sentence, like this:
        # '...world.'
        # '...world. '
        if len(img_sent)>=1 and img_sent[-1] == '.':
            img_sent = img_sent[0:-1]
        elif len(img_sent)>=2 and img_sent[-1] == ' ' and img_sent[-2] == '.':
            img_sent = img_sent[0:-2]

        # Last, we add the <bos> and the <eos> in each sentences
        img_sent = '<bos> ' + img_sent + ' <eos>'

        for idy, word in enumerate(img_sent.lower().split(' ')):
            # because the biggest number of words in a sentence is N_max, here is 50
            if idy == N_max:
                break

            if word in word2idx:
                img_captions_matrix[idx, idy] = word2idx[word]
            else:
                img_captions_matrix[idx, idy] = word2idx['<unk>']

    img2paragraph_modify[str(img_name)] = [img_num_distribution, img_captions_matrix]
with open('E:\Projects\Story-Telling-Using-Show-Attend-and-tell\img2paragraph_modify_batch', 'wb') as f:
    pickle.dump(img2paragraph_modify, f)

preprocessing word counts and creating vocab based on word count threshold 2
filtered words from 18926 to 9933


In [64]:
class RegionPooling_HierarchicalRNN():
    def __init__(self, n_words,
                       batch_size,
                       num_boxes,
                       feats_dim,
                       project_dim,
                       sentRNN_lstm_dim,
                       sentRNN_FC_dim,
                       wordRNN_lstm_dim,
                       S_max,
                       N_max,
                       word_embed_dim,
                       bias_init_vector=None):

        self.n_words = n_words
        self.batch_size = batch_size
        self.num_boxes = num_boxes # 50
        self.feats_dim = feats_dim # 4096
        self.project_dim = project_dim # 1024
        self.S_max = S_max # 6
        self.N_max = N_max # 50
        self.word_embed_dim = word_embed_dim # 1024

        self.sentRNN_lstm_dim = sentRNN_lstm_dim # 512 hidden size
        self.sentRNN_FC_dim = sentRNN_FC_dim # 1024 in fully connected layer
        self.wordRNN_lstm_dim = wordRNN_lstm_dim # 512 hidden size

        # word embedding, parameters of embedding
        # embedding shape: n_words x wordRNN_lstm_dim
        with tf.device('/cpu:0'):
            self.Wemb = tf.Variable(tf.random_uniform([n_words, word_embed_dim], -0.1, 0.1), name='Wemb')
        #self.bemb = tf.Variable(tf.zeros([word_embed_dim]), name='bemb')

        # regionPooling_W shape: 4096 x 1024
        # regionPooling_b shape: 1024
        self.regionPooling_W = tf.Variable(tf.random_uniform([feats_dim, project_dim], -0.1, 0.1), name='regionPooling_W')
        self.regionPooling_b = tf.Variable(tf.zeros([project_dim]), name='regionPooling_b')

        # sentence LSTM
        self.sent_LSTM = tf.nn.rnn_cell.BasicLSTMCell(sentRNN_lstm_dim, state_is_tuple=True)

        # logistic classifier
        self.logistic_Theta_W = tf.Variable(tf.random_uniform([sentRNN_lstm_dim, 2], -0.1, 0.1), name='logistic_Theta_W')
        self.logistic_Theta_b = tf.Variable(tf.zeros(2), name='logistic_Theta_b')

        # fc1_W: 512 x 1024, fc1_b: 1024
        # fc2_W: 1024 x 1024, fc2_b: 1024
        self.fc1_W = tf.Variable(tf.random_uniform([sentRNN_lstm_dim, sentRNN_FC_dim], -0.1, 0.1), name='fc1_W')
        self.fc1_b = tf.Variable(tf.zeros(sentRNN_FC_dim), name='fc1_b')
        self.fc2_W = tf.Variable(tf.random_uniform([sentRNN_FC_dim, 1024], -0.1, 0.1), name='fc2_W')
        self.fc2_b = tf.Variable(tf.zeros(1024), name='fc2_b')

        # word LSTM
        self.word_LSTM = tf.nn.rnn_cell.BasicLSTMCell(wordRNN_lstm_dim, state_is_tuple=True)
        self.word_LSTM = tf.nn.rnn_cell.MultiRNNCell([self.word_LSTM] * 2, state_is_tuple=True)

        self.embed_word_W = tf.Variable(tf.random_uniform([wordRNN_lstm_dim, n_words], -0.1,0.1), name='embed_word_W')
        if bias_init_vector is not None:
            self.embed_word_b = tf.Variable(bias_init_vector.astype(np.float32), name='embed_word_b')
            self.embed_word_b = tf.Variable(tf.zeros([n_words]), name='embed_word_b')
    def build_model(self):
        # receive the feats in the current image
        # it's shape is 10 x 50 x 4096
        # tmp_feats: 500 x 4096
        feats = tf.placeholder(tf.float32, [self.batch_size, self.num_boxes, self.feats_dim])
        tmp_feats = tf.reshape(feats, [-1, self.feats_dim])

        # project_vec_all: 500 x 4096 * 4096 x 1024 --> 500 x 1024
        # project_vec: 10 x 1024
        project_vec_all = tf.matmul(tmp_feats, self.regionPooling_W) + self.regionPooling_b
        project_vec_all = tf.reshape(project_vec_all, [self.batch_size, 50, self.project_dim])
        project_vec = tf.reduce_max(project_vec_all, reduction_indices=1)

        # receive the [continue:0, stop:1] lists
        # example: [0, 0, 0, 0, 1, 1], it means this paragraph has five sentences
        num_distribution = tf.placeholder(tf.int32, [self.batch_size, self.S_max])

        # receive the ground truth words, which has been changed to idx use word2idx function
        captions = tf.placeholder(tf.int32, [self.batch_size, self.S_max, self.N_max+1])
        captions_masks = tf.placeholder(tf.float32, [self.batch_size, self.S_max, self.N_max+1])
        # ---------------------------------------------------------------------------------------------------------------------
        sent_state = self.sent_LSTM.zero_state(batch_size=self.batch_size, dtype=tf.float32)

        probs = []
        loss = 0.0
        loss_sent = 0.0
        loss_word = 0.0
        lambda_sent = 5.0
        lambda_word = 1.0

        print ('Start build model:')
        for i in range(0, self.S_max):
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            with tf.variable_scope('sent_LSTM'):
                sent_output, sent_state = self.sent_LSTM(project_vec, sent_state)
            with tf.name_scope('fc1'):
                hidden1 = tf.nn.relu( tf.matmul(sent_output, self.fc1_W) + self.fc1_b )
            with tf.name_scope('fc2'):
                sent_topic_vec = tf.nn.relu( tf.matmul(hidden1, self.fc2_W) + self.fc2_b )
        sentRNN_logistic_mu = tf.nn.xw_plus_b( sent_output, self.logistic_Theta_W, self.logistic_Theta_b )
        sentRNN_label = tf.stack([ 1 - num_distribution[:, i], num_distribution[:, i] ])
        sentRNN_label = tf.transpose(sentRNN_label)
        sentRNN_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits = sentRNN_logistic_mu, labels=sentRNN_label)
        sentRNN_loss = tf.reduce_sum(sentRNN_loss)/self.batch_size
        loss += sentRNN_loss * lambda_sent
        loss_sent += sentRNN_loss
        topic = tf.nn.rnn_cell.LSTMStateTuple(sent_topic_vec[:, 0:512], sent_topic_vec[:, 512:])
        word_state = (topic, topic)
        for j in range(0, self.N_max):
            if j > 0:
                tf.get_variable_scope().reuse_variables()

            with tf.device('/cpu:0'):
                current_embed = tf.nn.embedding_lookup(self.Wemb, captions[:, i, j])

            #with tf.variable_scope('word_LSTM',reuse = tf.AUTO_REUSE):
                #word_output, word_state = self.word_LSTM(current_embed, word_state)

                # How to make one-hot encoder, I refer from this excellent web:
                # http://stackoverflow.com/questions/33681517/tensorflow-one-hot-encoder
            labels = tf.reshape(captions[:, i, j+1], [-1, 1])
            indices = tf.reshape(tf.range(0, self.batch_size, 1), [-1, 1])
            concated = tf.concat(1, [indices, labels])
            onehot_labels = tf.sparse_to_dense(concated, tf.stack([self.batch_size, self.n_words]), 1.0, 0.0)
            logit_words = tf.nn.xw_plus_b(word_output[:], self.embed_word_W, self.embed_word_b)
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits = logit_words, labels =onehot_labels)
            cross_entropy = cross_entropy * captions_masks[:, i, j]
            loss_wordRNN = tf.reduce_sum(cross_entropy) / self.batch_size
            loss += loss_wordRNN * lambda_word
            loss_word += loss_wordRNN

        return feats, num_distribution, captions, captions_masks, loss, loss_sent, loss_word

In [65]:
import h5py
import os
import tensorflow as tf
def train():
    ##############################################################################
    # some preparing work
    ##############################################################################
    model_path = 'E:\Projects\Story-Telling-Using-Show-Attend-and-tell\models_batch\\'
    train_feats_path = 'E:\Projects\Story-Telling-Using-Show-Attend-and-tell\genomedata\opt.output_h5-feats1'
    #if os.path.isfile(train_feats_path) and os.access(train_feats_path, os.R_OK):
        #train_output_file = h5py.File(train_feats_path, 'r')
    #else :
    #    print("Either file is missing or is not readable")
        
    #train_feats = train_output_file.get('feats')
    train_imgs_full_path_lists = open('E:\Projects\Story-Telling-Using-Show-Attend-and-tell\imgs_train_path.txt').read().splitlines()
    train_imgs_names = map(lambda x: os.path.basename(x).split('.')[0], train_imgs_full_path_lists)
   # print(list(train_imgs_names))
    model = RegionPooling_HierarchicalRNN(n_words = len(word2idx),
                                          batch_size = batch_size,
                                          num_boxes = num_boxes,
                                          feats_dim = feats_dim,
                                          project_dim = project_dim,
                                          sentRNN_lstm_dim = sentRNN_lstm_dim,
                                          sentRNN_FC_dim = sentRNN_FC_dim,
                                          wordRNN_lstm_dim = wordRNN_lstm_dim,
                                          S_max = S_max,
                                          N_max = N_max,
                                          word_embed_dim = word_embed_dim,
                                          bias_init_vector = bias_init_vector)

    tf_feats, tf_num_distribution, tf_captions_matrix, tf_captions_masks, tf_loss, tf_loss_sent, tf_loss_word = model.build_model()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver(max_to_keep=500, write_version=1)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.global_variables_initializer().run()



In [66]:
train()

Start build model:


ValueError: Shapes (2, 50, 1) and () are incompatible