In [3]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

import os

import tensorflow as tf
import numpy as np
import time
import json

from model import Caption_Model
from data_generator import Data_Generator
from inference_util import Inference

import inception_base
import configuration

# Handle to TensorFlow's global flag registry.
FLAGS = tf.app.flags.FLAGS

# Command-line style options for caption generation.
# NOTE(review): none of these flags are read in the visible cells; a later cell
# defines plain variables with the same names and those are what the notebook
# actually uses.
# NOTE(review): the recorded output of this cell is
# "ArgumentError: argument --batch_size: conflicting option string", i.e. the
# flag was already registered in the running kernel -- presumably because the
# cell was executed twice or an imported module defines the same flag; confirm
# before relying on this cell in a fresh "Restart & Run All".
tf.flags.DEFINE_integer("batch_size", 64,
                       "Batch size of train data input.")
tf.flags.DEFINE_integer("beam_size", 3,
                       "Beam size.")
tf.flags.DEFINE_string("checkpoint_model", None,
                       "Model Checkpoint to use.")
tf.flags.DEFINE_integer("max_captions", None,
                       "Maximum number of captions to generate")
tf.flags.DEFINE_integer("max_len_captions", None,
                       "Maximum length of captions to generate")
tf.flags.DEFINE_string("dataset", "test",
                       "Dataset to use")
tf.flags.DEFINE_string("outfile_name", "generated_caption.json",
                       "Name of the output result file")

ArgumentError: argument --batch_size: conflicting option string: --batch_size

In [None]:
# Bare expression left over from interactive exploration: it only displays the
# repr of `tf.constant` and assigns nothing.
tf.constant

In [None]:
# Inference settings, kept as plain variables so the notebook does not depend
# on the flag registry.
dataset = "test"                                   # split name: "val", "test" or "train"
checkpoint_model = "./models/train/model-21160"    # trained checkpoint prefix
outfile_name = "generated_caption.json"            # name of the result file

batch_size, beam_size = 64, 3

# A falsy max_captions means "no explicit cap"; a falsy max_len_captions makes
# the setup cell fall back to the data configuration's max caption length.
max_captions, max_len_captions = None, 30

In [None]:
# --- Data: build the generator, then load the vocabulary and dataset splits ---
data_config = configuration.DataConfig().config
data_gen = Data_Generator(
    processed_video_dir=data_config["processed_video_dir"],
    caption_file=data_config["caption_file"],
    unique_freq_cutoff=data_config["unique_frequency_cutoff"],
    max_caption_len=data_config["max_caption_length"],
)

data_gen.load_vocabulary(data_config["caption_data_dir"])
data_gen.load_dataset(data_config["caption_data_dir"])

# Only these split names are accepted.
assert dataset in ("val", "test", "train")

# An explicit (truthy) caption-length limit wins over the configured default.
max_len = max_len_captions or data_config['max_caption_length']

# --- Model: every hyper-parameter is copied from the model config under the
# same name; only `mode` is fixed here. ---
model_config = configuration.ModelConfig(data_gen).config
model = Caption_Model(
    mode="inference",
    **{
        key: model_config[key]
        for key in (
            "num_frames",
            "image_width",
            "image_height",
            "image_channels",
            "num_caption_unroll",
            "num_last_layer_units",
            "image_embedding_size",
            "word_embedding_size",
            "hidden_size_lstm",
            "num_lstm_layer",
            "vocab_size",
            "initializer_scale",
            "learning_rate",
            "rnn1_input_keep_prob",
            "rnn1_output_keep_prob",
            "rnn2_input_keep_prob",
            "rnn2_output_keep_prob",
        )
    }
)
model.build()

# Helper object that runs the model and maps between words and vocabulary ids.
infer_util = Inference(model, data_gen.word_to_idx, data_gen.idx_to_word)


In [None]:
# Iteration budget: an explicit cap if one was given, otherwise the size of the
# selected split plus a safety margin of 10.
max_iter = max_captions if max_captions else len(data_gen.dataset[dataset]) + 10

# Map each video's file name to its path on disk for the selected split.
video_paths = dict(
    (entry["file_name"], entry["path"]) for entry in data_gen.dataset[dataset]
)

# Accumulator for generated captions.
gen_captions = list()

In [None]:
# Checkpoint prefix restored by the saver cell.
# NOTE(review): this repeats the literal assigned to `checkpoint_model` in the
# settings cell; the two must be kept in sync by hand.
model_path = "./models/train/model-21160"

In [None]:
# Single TensorFlow session shared by the restore and inference cells; it is
# never closed in the visible cells.
sess = tf.Session()

In [None]:
# NOTE(review): `gen_caption` (singular) is not read in the visible cells; an
# earlier cell already creates `gen_captions`.
gen_caption = []

# Saver over the variables of the graph built so far.
saver = tf.train.Saver()

# Restore trained weights only when a checkpoint path is configured.
if model_path is not None:
    print("Restoring weights from %s" % model_path)
    saver.restore(sess, model_path)

In [None]:
# File names of the selected split, in the mapping's iteration order; batch
# cells index into this list.
video_files = [file_name for file_name in video_paths]

# Offset of the current batch inside `video_files`.
btch = 0
# NOTE(review): `iter` shadows the Python builtin of the same name and is not
# read in the visible cells.
iter = 0

In [None]:
print("Processing batch %d" % (btch // batch_size + 1))

# Debug-sized batch of at most 3 videos starting at `btch`; a full-size batch
# would use `btch + batch_size` here. The upper bound is clamped to the number
# of available videos so a short tail cannot index past the end of the list.
start = btch
end = min(len(video_files), btch + 3)
batch_files = video_files[start:end]

# NOTE(review): this rebinds `dataset`, which until now held the split name
# string; cells that evaluate `data_gen.dataset[dataset]` cannot be re-run
# after this one without resetting it.
dataset = {
    # Pre-processed video arrays loaded from disk, stacked into one array.
    "video": np.asarray([np.load(video_paths[name]) for name in batch_files]),
    # Path each array was loaded from.
    "path": [video_paths[name] for name in batch_files],
    # File name of each video, in the same order as "video" and "path".
    "file": batch_files,
}

In [None]:
# Debugging alias: lets the later cells run `self.`-style code (self.word_to_idx,
# self.feed_video, self.inference_step) directly against the Inference helper.
self = infer_util

In [None]:
# Caption the loaded videos with generate_caption_batch, limited to max_len,
# and store the result next to the inputs under "gen_caption".
dataset["gen_caption"] = infer_util.generate_caption_batch(sess,dataset["video"],max_len=max_len)

In [None]:
# Display the whole batch dict. Note that this includes the raw video arrays
# stored under "video", not just the captions.
dataset

In [None]:
# Caption the same videos with generate_caption_batch_beam (beam width
# beam_size); this overwrites the previous "gen_caption" entry.
dataset["gen_caption"] = infer_util.generate_caption_batch_beam(sess,beam_size,dataset["video"],max_len=max_len)

In [None]:
# Display only the generated captions for the current batch.
dataset["gen_caption"]

In [None]:
# Short alias for the batch's video array, used by the step-by-step beam cells.
video_batch = dataset["video"]

In [None]:
# First decoder input: one row per video, each holding the id of "<bos>".
bos_idx = self.word_to_idx["<bos>"]
input_batch = np.array([[bos_idx] for _ in range(video_batch.shape[0])])

# Run the videos through feed_video and keep the returned state as an array.
encoded_state = self.feed_video(sess, video_batch)
state = np.array(encoded_state)

In [None]:
# Split the state array along axis 2 into state.shape[2] pieces of width 1.
# NOTE(review): this rebinds `state` from an array to a list, so the cell
# cannot be re-run without first re-running the cell that creates `state`.
state = np.split(state,state.shape[2],axis=2)
# Per the original note: state is now a list of size batch_size, each element
# of shape num_layer * 2 * 1 * hidden_size.
# Will hold one list of live hypotheses (a "beam") per video.
batch_of_beams = []

In [None]:
# Seed every video's beam with a single empty hypothesis.
# Each hypothesis is a dict with:
#   "st"          - that video's slice of the split state
#   "current_cap" - caption text generated so far
#   "loss"        - accumulated negative log-probability
#   "prev_word"   - id of the last emitted word (starts at "<bos>")
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell does not leave duplicate beams behind.
bos_idx = self.word_to_idx["<bos>"]
batch_of_beams = [
    [{"st": state[vid],
      "current_cap": "",
      "loss": 0,
      "prev_word": bos_idx}]
    for vid in range(video_batch.shape[0])
]

In [None]:
# One list per video for finished hypotheses; the beam step appends a
# (loss / (i+1), caption) tuple whenever "<eos>" is predicted.
completed_captions = [[] for d in range(video_batch.shape[0])]

In [None]:
# Decoding step index read by the beam step, which divides a finished
# caption's loss by (i+1).
i=0

In [None]:
# One beam-search expansion step, unrolled for interactive debugging.
# Every live hypothesis is extended by its beam_size most probable next words.
# Extensions ending in "<eos>" move to completed_captions; the rest are
# collected per video and pruned back to the beam_size lowest-loss candidates.
beam_squared_list = [[] for _ in range(video_batch.shape[0])]
eos_idx = self.word_to_idx["<eos>"]

# NOTE(review): indexing video[vv] for every video assumes all videos hold as
# many live hypotheses as video 0.
for vv in range(len(batch_of_beams[0])):
    # Run the vv-th hypothesis of every video through the decoder in one call.
    input_batch = [[video[vv]["prev_word"]] for video in batch_of_beams]
    state = [video[vv]["st"] for video in batch_of_beams]
    state = np.concatenate(state, axis=2)
    pred, state = self.inference_step(sess, input_batch, state)

    # Carry forward the decoder state returned by inference_step. The previous
    # code replaced it with a fresh feed_video() encoding, so every hypothesis
    # kept decoding from the initial state and the encoder ran once per beam.
    # assumes inference_step returns the state in the same layout it accepts
    # (splittable per video along axis 2) -- TODO confirm
    state = np.array(state)
    state = np.split(state, state.shape[2], axis=2)
    pred = np.squeeze(pred, axis=1)

    for j in range(len(beam_squared_list)):
        parent = batch_of_beams[j][vv]
        # Indices of the beam_size highest-scoring words, best first.
        for pred_word in pred[j].argsort()[-beam_size:][::-1]:
            new_loss = parent["loss"] - np.log(pred[j][pred_word])
            if pred_word == eos_idx:
                # Finished caption: store the loss divided by (i+1); the +1
                # avoids dividing by zero at step 0.
                completed_captions[j].append((new_loss / (i + 1),
                                              parent["current_cap"]))
            else:
                beam_squared_list[j].append({
                    "st": state[j],
                    "current_cap": parent["current_cap"] + " " + self.idx_to_word[pred_word],
                    "loss": new_loss,
                    "prev_word": pred_word,
                })

# Keep only the beam_size lowest-loss candidates per video (a slice is already
# a new list, so no extra copy is needed).
for j in range(len(beam_squared_list)):
    beam_squared_list[j].sort(key=lambda x: x["loss"])
    batch_of_beams[j] = beam_squared_list[j][:beam_size]