In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

import numpy as np

class Inference(object):
    """Greedy caption decoding on top of an already built inference-mode model.

    The wrapped ``model`` has to expose the graph endpoints referenced by the
    methods below: ``rnn_input``, ``is_training``, ``infer_hidden_state_1``,
    ``infer_hidden_state_2``, ``state_feed_1``, ``state_feed_2``,
    ``word_input``, ``infer_predictions``, ``infer_last_state_l1`` and
    ``infer_last_state_l2``.
    """

    def __init__(self, model, word_to_idx, idx_to_word):
        """Keep references to the model and to both vocabulary lookups.

        Args:
            model: object exposing the inference endpoints listed on the class.
            word_to_idx: mapping word -> index; must contain "<bos>" and "<eos>".
            idx_to_word: inverse lookup, indexable by a predicted word index.
        """
        self.model = model
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word

    def feed_video(self, sess, encoded_video):
        """Run the video through the encoder and return the two RNN states.

        Args:
            sess: session-like object with ``run(fetches, feed_dict=...)``.
            encoded_video: batch of encoded videos fed to ``model.rnn_input``.

        Returns:
            ``[state_1, state_2]`` as fetched from ``infer_hidden_state_1`` and
            ``infer_hidden_state_2``.
        """
        fetches = [self.model.infer_hidden_state_1,
                   self.model.infer_hidden_state_2]
        feed_dict = {self.model.rnn_input: encoded_video,
                     self.model.is_training: False}
        initial_state_1, initial_state_2 = sess.run(fetches, feed_dict=feed_dict)
        # Original author's note: stacked, this pair is shaped (2,3,2,?,1000).
        return [initial_state_1, initial_state_2]

    def inference_step(self, sess, input_feed, state_feed):
        """Advance the decoder by one word.

        Args:
            sess: session-like object with ``run(fetches, feed_dict=...)``.
            input_feed: previous word indices, fed to ``model.word_input``.
            state_feed: pair of states; element 0 feeds ``state_feed_1`` and
                element 1 feeds ``state_feed_2``.
                Original author's note: the placeholders want (?,6000).

        Returns:
            ``(preds, [next_state_1, next_state_2])``.
        """
        feed_dict = {
            self.model.state_feed_1: state_feed[0],
            self.model.state_feed_2: state_feed[1],
            self.model.word_input: input_feed,
        }
        preds, next_state_1, next_state_2 = sess.run(
            [self.model.infer_predictions,
             self.model.infer_last_state_l1,
             self.model.infer_last_state_l2],
            feed_dict=feed_dict)
        return preds, [next_state_1, next_state_2]

    def generate_caption_batch(self, sess, video_batch, max_len=20):
        """Greedily decode one caption per video.

        At every step the most probable word of each unfinished caption is
        appended and its negative log-probability added to that caption's
        loss. A caption is finished once "<eos>" is predicted; "<eos>" itself
        is not written into the text but its probability is still counted.

        Args:
            sess: session-like object with ``run(fetches, feed_dict=...)``.
            video_batch: array whose first dimension is the number of videos.
            max_len: maximum number of decoding steps.

        Returns:
            One single-element list per video: ``[(normalised_loss, caption)]``
            where the loss is divided by (number of space-separated tokens in
            the caption + 1). An empty caption still counts as one token
            because ``"".split(" ")`` yields one element.
        """
        batch_size = video_batch.shape[0]
        eos_idx = self.word_to_idx["<eos>"]

        input_batch = np.full((batch_size, 1), self.word_to_idx["<bos>"])
        state = self.feed_video(sess, video_batch)

        finished = np.zeros(batch_size, dtype=bool)
        captions = ["" for _ in range(batch_size)]
        loss = [0.0 for _ in range(batch_size)]

        for _ in range(max_len):
            # Test for completion BEFORE running the graph: the original ran a
            # full inference step and then threw its result away.
            if finished.all():
                break

            pred, state = self.inference_step(sess, input_batch, state)
            input_batch = np.argmax(pred, axis=2)      # next input, one word per video
            step_probs = np.squeeze(pred, axis=1)      # drop the length-1 time axis
            best_words = input_batch[:, 0]
            best_probs = np.max(step_probs, axis=1)

            # Only captions that were still open before this step are extended.
            for idx in np.flatnonzero(~finished):
                loss[idx] -= np.log(best_probs[idx])
                if best_words[idx] != eos_idx:
                    captions[idx] += " " + self.idx_to_word[best_words[idx]]

            finished = np.logical_or(finished, best_words == eos_idx)

        # cap[1:] strips the leading space added in front of the first word.
        return [[(l / (len(cap[1:].split(" ")) + 1), cap[1:])]
                for l, cap in zip(loss, captions)]


In [2]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

import os
import tensorflow as tf
import numpy as np
import time
import json

from model import Model_S2VT
from data_generator import Data_Generator
from inference_util import Inference

import inception_base
import configuration

# Command-line options of the caption generation script. They are registered
# on TensorFlow's global flag registry and read back through FLAGS.
FLAGS = tf.app.flags.FLAGS

tf.flags.DEFINE_integer("batch_size", 64,
                       "Batch size of train data input.")
# NOTE(review): the beam-search cells of this notebook assign beam_size = 3
# directly instead of reading this flag.
tf.flags.DEFINE_integer("beam_size", 3,
                       "Beam size.")
# None -> the latest checkpoint from the configured checkpoint_dir is used.
tf.flags.DEFINE_string("checkpoint_model", None,
                       "Model Checkpoint to use.")
# None -> the limit is derived from the size of the selected dataset split.
tf.flags.DEFINE_integer("max_captions", None,
                       "Maximum number of captions to generate")
# None -> falls back to max_caption_length from the data configuration.
tf.flags.DEFINE_integer("max_len_captions", None,
                       "Maximum length of captions to generate")
# Must be one of "val", "test" or "train" (asserted where the data is loaded).
tf.flags.DEFINE_string("dataset", "test",
                       "Dataset to use")
tf.flags.DEFINE_string("outfile_name", "generated_caption.json",
                       "Name of the output result file")


In [3]:
    # Parse the registered flags, then load the data configuration.
    FLAGS._parse_flags()
    data_config = configuration.DataConfig().config

    # An explicitly requested checkpoint wins; otherwise fall back to the most
    # recent one found in the configured checkpoint directory.
    model_path = (FLAGS.checkpoint_model
                  or tf.train.latest_checkpoint(data_config["checkpoint_dir"]))

In [4]:
        # Load the data configuration and prepare vocabulary + dataset splits.
        data_config = configuration.DataConfig().config
        data_gen = Data_Generator(
            processed_video_dir=data_config["processed_video_dir"],
            caption_file=data_config["caption_file"],
            unique_freq_cutoff=data_config["unique_frequency_cutoff"],
            max_caption_len=data_config["max_caption_length"],
        )
        data_gen.load_vocabulary(data_config["caption_data_dir"])
        data_gen.load_dataset(data_config["caption_data_dir"])

        assert FLAGS.dataset in ("val", "test", "train")

        # The flag overrides the configured maximum caption length when set.
        max_len = FLAGS.max_len_captions or data_config['max_caption_length']

        # Every constructor option except `mode` is copied verbatim from the
        # model configuration under the same name.
        model_config = configuration.ModelConfig(data_gen).config
        model = Model_S2VT(
            mode="inference",
            **{option: model_config[option] for option in (
                "num_frames",
                "image_width",
                "image_height",
                "image_channels",
                "num_caption_unroll",
                "num_last_layer_units",
                "image_embedding_size",
                "word_embedding_size",
                "hidden_size_lstm1",
                "hidden_size_lstm2",
                "vocab_size",
                "initializer_scale",
                "learning_rate",
                "rnn1_input_keep_prob",
                "rnn1_output_keep_prob",
                "rnn2_input_keep_prob",
                "rnn2_output_keep_prob",
                "num_layers_per_rnn",
            )}
        )
        model.build()

In [5]:
infer_util = Inference(model,data_gen.word_to_idx,data_gen.idx_to_word)

In [6]:
self = infer_util

In [7]:
        # Upper bound on processed captions: the flag when given, otherwise the
        # size of the selected split plus 10 (+10 is just to be safe).
        max_iter = FLAGS.max_captions or len(data_gen.dataset[FLAGS.dataset]) + 10

        # file name -> path for every entry of the selected split
        video_paths = dict((entry["file_name"], entry["path"])
                           for entry in data_gen.dataset[FLAGS.dataset])

        gen_captions = list()


In [8]:
sess = tf.Session()

In [9]:
model_path = tf.train.latest_checkpoint(data_config["checkpoint_dir"])

In [10]:
beam_size = 3

In [15]:
video_batch = dataset["video"]

In [106]:
        # Beam search set-up: every video starts with a single hypothesis that
        # holds its encoder state, an empty caption, zero loss and <bos>.
        assert beam_size >= 2
        input_batch = np.array([[self.word_to_idx["<bos>"]]] * video_batch.shape[0])
        state = np.asarray(self.feed_video(sess, video_batch))  # (2,3,2,?,1000)

        batch_of_beams = []
        for i in range(video_batch.shape[0]):
            # Slice out video i along the batch axis (4th axis of `state`).
            beam = [dict(st=state[:, :, :, i, :],
                         current_cap="",
                         loss=0,
                         prev_word=self.word_to_idx["<bos>"])]
            batch_of_beams.append(beam)



In [108]:
batch_of_beams[0]

[{'current_cap': '',
  'loss': 0,
  'prev_word': 1,
  'st': array([[[[ -9.99838333e+01,   2.04102921e+00,   9.14099274e+01, ...,
             -1.59060247e-02,  -9.60721054e+01,  -7.53348589e+00],
           [ -2.09867994e-05,   6.77062869e-01,   2.60653610e-06, ...,
             -8.73918623e-07,  -1.94410095e-04,  -1.73635719e-08]],
  
          [[  9.88782346e-01,  -2.07560749e+01,   7.47688770e-01, ...,
             -1.04166579e+00,   2.61445105e-01,   1.73813291e-02],
           [  8.42868462e-02,  -4.74841446e-02,   3.69045347e-01, ...,
             -6.29446447e-01,   1.42354354e-01,   1.72829512e-03]],
  
          [[ -7.28070021e-01,  -1.00246280e-01,  -4.21963283e-04, ...,
             -7.55462980e+00,   4.20989394e-01,   7.29404545e+00],
           [ -3.50736737e-01,  -1.82715780e-03,  -8.16863394e-05, ...,
             -5.00280440e-01,   1.39130965e-01,   2.68254399e-01]]],
  
  
         [[[ -9.73954010e+01,  -8.10542755e+01,   9.58131409e+01, ...,
              6.73773575e+0

In [109]:
batch_of_beams[0][0]["st"].shape

(2, 3, 2, 1000)

In [110]:
batch_of_beams[0][0]["st"]

array([[[[ -9.99838333e+01,   2.04102921e+00,   9.14099274e+01, ...,
           -1.59060247e-02,  -9.60721054e+01,  -7.53348589e+00],
         [ -2.09867994e-05,   6.77062869e-01,   2.60653610e-06, ...,
           -8.73918623e-07,  -1.94410095e-04,  -1.73635719e-08]],

        [[  9.88782346e-01,  -2.07560749e+01,   7.47688770e-01, ...,
           -1.04166579e+00,   2.61445105e-01,   1.73813291e-02],
         [  8.42868462e-02,  -4.74841446e-02,   3.69045347e-01, ...,
           -6.29446447e-01,   1.42354354e-01,   1.72829512e-03]],

        [[ -7.28070021e-01,  -1.00246280e-01,  -4.21963283e-04, ...,
           -7.55462980e+00,   4.20989394e-01,   7.29404545e+00],
         [ -3.50736737e-01,  -1.82715780e-03,  -8.16863394e-05, ...,
           -5.00280440e-01,   1.39130965e-01,   2.68254399e-01]]],


       [[[ -9.73954010e+01,  -8.10542755e+01,   9.58131409e+01, ...,
            6.73773575e+01,   9.24972076e+01,  -9.67322388e+01],
         [ -1.74283847e-01,  -2.67621642e-03,   2.1358

In [111]:
batch_of_beams[0][1]["st"]

IndexError: list index out of range

In [112]:
        completed_captions = [[] for d in range(video_batch.shape[0])] 


In [185]:
 i = 2

In [186]:
beam_squared_list = [[] for d in range(video_batch.shape[0])] 

In [187]:
video_batch.shape[0]

64

In [188]:
len(batch_of_beams[0])

3

In [189]:
vv = 0

In [190]:
                # Gather, for beam slot `vv` of every video, the last emitted
                # word (as a length-1 row) to use as the next decoder input.
                input_batch = [[video[vv]["prev_word"]] for video in batch_of_beams]

                # Stack the per-video states of that slot: ? * 2 * 3 * 2 * 1000
                state = np.array([video[vv]["st"] for video in batch_of_beams]) #? * 2 * 3 * 2 * 1000
                # Flatten each of the two RNN states to one 6000-wide vector
                # per video (3*2*1000 for the shapes shown in this notebook),
                # giving (?, 2, 6000).
                state = np.reshape(state,[-1,2,6000])


In [191]:
state.shape


(64, 2, 6000)

In [192]:
state[0]

array([[ -7.18862305e+01,   1.45790863e+00,   2.56467018e+01, ...,
         -1.32005705e-04,  -5.85127389e-03,   1.77215447e-03],
       [ -9.39445190e+01,  -7.88581924e+01,   9.63216171e+01, ...,
         -4.79316682e-01,   2.35233217e-01,  -1.36554256e-01]], dtype=float32)

In [193]:
state[1]

array([[ -8.10724106e+01,  -6.73338711e-01,   3.68155251e+01, ...,
         -1.21881385e-05,   4.77400608e-04,   5.91508928e-04],
       [ -9.08534393e+01,  -6.09099655e+01,   8.04283676e+01, ...,
         -2.66367465e-01,   1.11403890e-01,  -1.10604897e-01]], dtype=float32)

In [194]:
                # swapaxes(0,1) moves the "which RNN" axis to the front, so
                # inference_step can pick state_feed[0] / state_feed[1].
                pred,state = self.inference_step(sess,input_batch,state.swapaxes(0,1))
                # Back to one array so a single video can be sliced out along
                # the batch axis afterwards.
                state = np.asarray(state)
                # Drop the length-1 axis 1 of the predictions.
                pred = np.squeeze(pred,axis=1)


In [195]:
state.shape

(2, 3, 2, 64, 1000)

In [196]:
state[:,:,:,0,:]

array([[[[ -6.05223579e+01,   6.38768196e-01,   6.17066526e+00, ...,
            4.06917892e-02,  -5.38499298e+01,  -3.69563401e-01],
         [ -1.02895215e-01,   2.15328764e-02,   4.47077379e-02, ...,
            2.97516887e-03,  -4.58392911e-02,  -5.94801679e-02]],

        [[ -3.54041278e-01,  -1.81968384e+01,  -4.81483251e-01, ...,
           -5.93857393e-02,   6.79109514e-01,   9.84028056e-02],
         [ -5.06397849e-03,  -8.08976352e-01,  -3.72972488e-02, ...,
           -4.57791612e-04,   4.84963227e-03,   1.45565733e-04]],

        [[ -3.96058476e-03,  -9.68905449e-01,   1.78015127e-03, ...,
           -8.49343777e+00,  -4.11529876e-02,   2.66327858e+00],
         [ -1.43273239e-04,  -8.33399186e-04,   4.88800936e-07, ...,
           -8.27363328e-05,  -5.19499707e-04,   1.25170627e-03]]],


       [[[ -8.24110107e+01,  -7.19346161e+01,   8.73680801e+01, ...,
            6.69161682e+01,   9.16233673e+01,  -7.74344482e+01],
         [ -6.48962334e-02,  -1.22694662e-02,   4.3225

In [197]:
state[:,:,:,1,:]

array([[[[ -6.68741379e+01,  -5.03829122e-01,   1.05510778e+01, ...,
           -4.65097837e-03,  -5.82719383e+01,  -9.94058251e-02],
         [ -6.77893236e-02,  -1.94962993e-02,   2.37251241e-02, ...,
           -3.21857515e-04,  -4.78835516e-02,  -1.93420984e-02]],

        [[ -8.18027306e+00,  -2.85391521e+00,  -1.67784795e-01, ...,
           -7.98211247e-03,   4.77696925e-01,   1.53137967e-01],
         [ -2.39530881e-03,  -5.86891174e-01,  -3.01803686e-02, ...,
           -1.18742784e-04,   5.03763149e-04,   2.78546304e-05]],

        [[ -3.75485932e-03,  -9.80784655e-01,   4.87186573e-03, ...,
           -6.30064917e+00,  -1.13337822e-01,   1.70398402e+00],
         [ -1.86717516e-04,  -1.45497639e-03,   3.12047541e-06, ...,
           -1.36619099e-04,  -1.81789312e-03,   1.89936673e-03]]],


       [[[ -8.26215134e+01,  -5.77710609e+01,   7.32980042e+01, ...,
            6.40386810e+01,   7.31793594e+01,  -4.03797722e+01],
         [ -5.71778193e-02,  -1.23174358e-02,   3.4308

In [198]:
                # Expand beam slot `vv` of every video j with its beam_size
                # most probable next words.
                # NOTE(review): this appends to beam_squared_list and
                # completed_captions, so re-running the cell duplicates entries.
                for j in range(len(beam_squared_list)):
                    #print("-------- Vid ------",j, " Current cap ", batch_of_beams[j][vv]["current_cap"] )
                    # argsort is ascending: take the last beam_size indices and
                    # reverse them so the most probable word comes first.
                    for pred_word in pred[j].argsort()[-beam_size:][::-1]:
                        #print(pred_word, " Pred Word-- ", self.idx_to_word[pred_word])
                        # Accumulate the negative log-probability of the word.
                        new_loss = batch_of_beams[j][vv]["loss"] - np.log(pred[j][pred_word])
                        if (pred_word == self.word_to_idx["<eos>"]) : 
                            # Finished hypothesis: store the loss divided by
                            # (i+1) with the caption text (without <eos>).
                            completed_captions[j].append(( new_loss / (i+1) , # did  +1 avoiding divide by 0
                                                          batch_of_beams[j][vv]["current_cap"]))
                        else:
                            # Open hypothesis: keep the new state of video j,
                            # the extended caption and the accumulated loss.
                            beam_squared_list[j].append({"st":  state[:,:,:,j,:], # state slice of video j
                                                         "current_cap":batch_of_beams[j][vv]["current_cap"] + 
                                                            " " + self.idx_to_word[pred_word],
                                                         "loss":new_loss,
                                                         "prev_word":pred_word })


In [199]:
beam_squared_list[0]

[{'current_cap': ' a woman is',
  'loss': 1.4964427649974823,
  'prev_word': 5,
  'st': array([[[[ -6.05223579e+01,   6.38768196e-01,   6.17066526e+00, ...,
              4.06917892e-02,  -5.38499298e+01,  -3.69563401e-01],
           [ -1.02895215e-01,   2.15328764e-02,   4.47077379e-02, ...,
              2.97516887e-03,  -4.58392911e-02,  -5.94801679e-02]],
  
          [[ -3.54041278e-01,  -1.81968384e+01,  -4.81483251e-01, ...,
             -5.93857393e-02,   6.79109514e-01,   9.84028056e-02],
           [ -5.06397849e-03,  -8.08976352e-01,  -3.72972488e-02, ...,
             -4.57791612e-04,   4.84963227e-03,   1.45565733e-04]],
  
          [[ -3.96058476e-03,  -9.68905449e-01,   1.78015127e-03, ...,
             -8.49343777e+00,  -4.11529876e-02,   2.66327858e+00],
           [ -1.43273239e-04,  -8.33399186e-04,   4.88800936e-07, ...,
             -8.27363328e-05,  -5.19499707e-04,   1.25170627e-03]]],
  
  
         [[[ -8.24110107e+01,  -7.19346161e+01,   8.73680801e+01, ...,

In [200]:
beam_squared_list[3]

[{'current_cap': ' a man is',
  'loss': 2.342376172542572,
  'prev_word': 5,
  'st': array([[[[ -6.03195152e+01,  -1.38572991e-01,   1.32158136e+01, ...,
              3.01691115e-01,  -4.20910454e+01,   1.76077560e-02],
           [ -1.91515610e-01,  -3.95993423e-03,   1.10232875e-01, ...,
              2.02649347e-02,  -4.91376854e-02,   4.31244168e-03]],
  
          [[ -2.22946143e+00,  -6.13076627e-01,  -4.72688884e-01, ...,
             -6.34076297e-02,   1.06641412e+00,   6.49412572e-01],
           [ -5.07369228e-02,  -4.66130108e-01,  -2.73120198e-02, ...,
             -2.14057349e-04,   2.27711890e-02,   1.09344088e-02]],
  
          [[ -1.46801560e-03,  -9.89146888e-01,   5.92224009e-04, ...,
             -1.72135563e+01,  -2.05453053e-01,   2.26917577e+00],
           [ -3.00105476e-05,  -4.49942017e-04,   3.26415588e-08, ...,
             -2.22821709e-05,  -3.39500536e-03,   3.24071734e-04]]],
  
  
         [[[ -8.20154648e+01,  -1.35776825e+01,   6.87278061e+01, ...,
  

In [201]:
            # Prune: per video, order the expanded candidates by accumulated
            # loss (in place) and keep the beam_size best as the new beam.
            for j in range(len(beam_squared_list)):
                beam_squared_list[j].sort(key=lambda candidate: candidate["loss"])
                batch_of_beams[j] = list(beam_squared_list[j][:beam_size])


In [202]:
len(beam_squared_list[0])

3

In [None]:
len(state[0][0][0][0])

In [None]:
state[:,:,:,0,:]

In [None]:
sss = tf.convert_to_tensor(np.asarray(state))

In [None]:
sss.shape

In [None]:
one = sss[:,:,:,0,:]

In [None]:
one.shape

In [13]:
batch_size = 64

In [11]:
# Restore the trained weights and set up the bookkeeping used while
# iterating over the videos.
gen_caption = []
saver = tf.train.Saver()

# The original only printed "Exiting" and carried on, so every later cell ran
# against a model whose weights had never been restored. Stop for real.
if model_path is None:
    raise RuntimeError("No checkpoint found. Exiting")

print("Restoring weights from %s" %model_path)
saver.restore(sess,model_path)

video_files = list(video_paths.keys())

# NOTE(review): `iter` shadows the Python builtin of the same name; the name is
# kept because other cells may rely on it as the iteration counter.
iter = 0
btch = 0


Restoring weights from ./models/train\model-30540
INFO:tensorflow:Restoring parameters from ./models/train\model-30540


In [14]:
        # Load the first batch of videos (offset 0) together with the paths
        # and file names they came from.
        print("Processing batch %d" %(int(0/batch_size)+1))
        start = 0
        end = min(len(video_files), 0 + batch_size)
        dataset = {
            "video": np.asarray([np.load(video_paths[video_files[i]])
                                 for i in range(start, end)]),
            "path": [video_paths[video_files[i]] for i in range(start, end)],
            "file": [video_files[i] for i in range(start, end)],
        }


Processing batch 1


In [None]:
beam_size = 3

In [None]:
video_batch  = dataset["video"]

In [None]:
        assert beam_size >= 2
        input_batch = np.array([[self.word_to_idx["<bos>"]]]*dataset["video"].shape[0])
        state = self.feed_video(sess,dataset["video"])

        batch_of_beams = []


In [None]:
model_config["num_layers_per_rnn"]

In [None]:
state[0].shape

In [None]:
model.infer_hidden_state_1.shape

In [None]:
model.cell_1.state_size


In [None]:
image_padding = tf.zeros([tf.shape(model.word_encoded)[0],1,model.num_last_layer_units])

In [None]:
                state_tuple_1 = tf.split(value=model.state_feed_1,
                                    num_or_size_splits=model.num_layers_per_rnn, axis=1)


In [None]:
                state_tuple_1 = tuple([tf.contrib.rnn.LSTMStateTuple(*tf.split(
                                value=LayerTuple,
                                num_or_size_splits=2, axis=1)
                                ) for LayerTuple in state_tuple_1])


In [None]:
                    outputs_l11, last_state_l11 = tf.nn.dynamic_rnn(
                                                cell=tf.contrib.rnn.MultiRNNCell(
                    [tf.contrib.rnn.BasicLSTMCell(
                    model.hidden_size_lstm1,
                    reuse=tf.get_variable_scope().reuse) for _ in range(model.num_layers_per_rnn)]),
                                                inputs=image_padding,
                                                initial_state=state_tuple_1,
                                                dtype=tf.float32)


In [None]:
last_state_l11

In [None]:
infer_hidden_state_1 = tf.concat(last_state_l11,axis=1,name="hidden_state_1_concat")


In [None]:
tf.

In [None]:
model.infer_last_state_l1

In [None]:
last_state_l11[0]


In [None]:
infer_hidden_state_1

In [None]:
infer_hidden_state_111 = tf.concat(last_state_l11,axis=2,name="hidden_state_1_concat")


In [None]:
infer_hidden_state_111