In [None]:
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import cv2
import os
import pysrt
import sys
import argparse
import numpy as np

import tensorflow as tf

from scipy import misc

from inference_util import Inference
from model import Model_S2VT
import inception_base
import configuration
from data_generator import Data_Generator

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Checkpoints restored further down: the captioning model and the Inception weights.
model_checkpoint = "./models/train/model-10144"
inception_checkpoint = "../../dataset/inception_v4.ckpt"

# Sampling / decoding settings shared by the cells below.
num_frames_per_sec = 10    # frames sampled per (whole) second of video
num_frames_per_clip = 100  # frames grouped into one captioned clip
max_cap_len = 20           # forwarded as max_len to caption generation
batch_size = 64            # clips captioned per batch

In [None]:
# --- Data generator: built from the data configuration, then vocabulary and
# --- dataset are loaded from the caption data directory.
data_config = configuration.DataConfig().config
data_gen = Data_Generator(
    processed_video_dir=data_config["processed_video_dir"],
    caption_file=data_config["caption_file"],
    unique_freq_cutoff=data_config["unique_frequency_cutoff"],
    max_caption_len=data_config["max_caption_length"],
)
data_gen.load_vocabulary(data_config["caption_data_dir"])
data_gen.load_dataset(data_config["caption_data_dir"])

# --- Model: every constructor argument below is read from the model config
# --- under a key identical to the argument name, so they are gathered by key.
model_config = configuration.ModelConfig(data_gen).config
model_config_keys = (
    "image_width",
    "image_height",
    "image_channels",
    "num_caption_unroll",
    "num_last_layer_units",
    "image_embedding_size",
    "word_embedding_size",
    "hidden_size_lstm1",
    "hidden_size_lstm2",
    "vocab_size",
    "initializer_scale",
    "learning_rate",
    "rnn1_input_keep_prob",
    "rnn1_output_keep_prob",
    "rnn2_input_keep_prob",
    "rnn2_output_keep_prob",
)
model = Model_S2VT(
    num_frames=num_frames_per_clip,
    mode="inference",
    **dict((key, model_config[key]) for key in model_config_keys)
)
model.build()

# --- Session: restore the trained weights from the model checkpoint.
sess = tf.Session()
saver = tf.train.Saver()
saver.restore(sess, model_checkpoint)

# Inference helper wired to the model and both vocabulary mappings.
infer_util = Inference(model, data_gen.word_to_idx, data_gen.idx_to_word)

In [None]:
# Second restore pass: only the model's Inception variables, taken from the
# separate Inception checkpoint. This rebinds `saver` to the restricted Saver.
saver = tf.train.Saver(var_list=model.inception_variables)
saver.restore(sess, save_path=inception_checkpoint)


In [None]:
# Video file to caption; opened with cv2.VideoCapture in the next cell.
video_path = "../../dataset/raw/MSR-VTT_2016/TestVideo/video9975.mp4"


In [None]:
# Open the video, read its metadata, decide which frame indices to sample and
# initialise the accumulators used by the captioning loop.
video = cv2.VideoCapture(video_path)
# VideoCapture does not raise on a missing/unreadable file; without this check
# the properties below read as 0 and the duration computation divides by zero.
if not video.isOpened():
    raise IOError("Could not open video: %s" % video_path)

# Graph endpoints: input placeholder for a preprocessed frame and the raw
# Inception output fetched for it.
image_feed = model.processed_video_feed
inception_output = model.inception_output_raw

length = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
width  = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps    = video.get(cv2.CAP_PROP_FPS)
if fps <= 0:
    raise ValueError("Video reports a non-positive frame rate (%r): %s" % (fps, video_path))

# Duration in seconds (true division; `division` is imported from __future__).
time_length = length/fps

# Sample num_frames_per_sec frames per whole second, rounded DOWN to a multiple
# of num_frames_per_clip so every clip is complete.
num_frames_to_read = ((int(time_length)*num_frames_per_sec)//num_frames_per_clip)*num_frames_per_clip
if num_frames_to_read == 0:
    print("Video is too short to fill a single clip of %d frames; nothing will be captioned." % num_frames_per_clip)
# Evenly spaced frame indices over the video, stored as a set for O(1) lookup.
frames_to_read = set(np.linspace(0,length-1,num=num_frames_to_read,endpoint=False,dtype=np.int32))

# NOTE: despite its name this is the number of FRAMES in one batch
# (frames per clip * clips per batch); the name is kept for compatibility.
num_clips_per_batch = (num_frames_per_clip*batch_size)
# Ceiling division: number of batches needed to cover all sampled frames.
num_batches = (num_frames_to_read+num_clips_per_batch-1)//num_clips_per_batch

captions=[]        # one dict per captioned clip, appended by the loop
frame_list = []    # Inception outputs collected for the current batch

start_time=0       # start time (seconds) of the next clip to be captioned
processed_batch=[]
batch_index=1      # 1-based counter used in progress messages

In [None]:
# Collects every preprocessed (resized, rescaled) frame that the loop below samples.
frames = []

In [None]:
# Walk the video frame by frame. Sampled frames are preprocessed and pushed
# through Inception; as soon as a full batch of clips has been collected it is
# captioned and the per-clip captions (with start/end times) are recorded.
frames_per_batch = num_frames_per_clip*batch_size  # loop-invariant, hoisted
for i in range(length):
    ret, frame = video.read()
    print(str(i)+" is in frames "+str(i in frames_to_read))
    if not ret:
        # No frame could be read: stop early.
        break
    if i in frames_to_read:
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = misc.imresize(frame,[299,299,3])
        # Map pixel values from 0..255 to [-1, 1] (assumes 8-bit pixel data).
        frame = ((2*(frame.astype(np.float32) / 255 ))-1)
        frames.append(frame)
        # Replace the image with its Inception output for this single frame.
        frame = sess.run(inception_output,feed_dict={image_feed:[frame]})
        frame_list.append(frame)
        print("---"+str(i))
        if len(frame_list)%100==0:
            print(len(frame_list))
    if len(frame_list)==frames_per_batch:
        print("Processing batch %d of %d" %(batch_index, num_batches))
        batch_index+= 1
        processed_batch = np.array(frame_list,dtype=np.float32)
        print(processed_batch.shape)
        # Regroup the flat list of frame features into clips.
        embedded_frames = np.reshape(processed_batch,[-1,num_frames_per_clip,inception_base.num_end_units_v4])
        # Fixed: the original passed the undefined name `max_len` here, which
        # raised NameError on the first full batch; the constant is max_cap_len.
        caption_batch = infer_util.generate_caption_batch(sess,embedded_frames,max_len=max_cap_len)
        for cap in caption_batch:
            caption = {}
            caption["start_time"] = start_time
            # Each clip spans num_frames_per_clip/num_frames_per_sec seconds.
            caption["end_time"] = start_time+ (num_frames_per_clip/num_frames_per_sec)
            start_time = caption["end_time"]
            caption["caption"] = cap
            captions.append(caption)
        # Start a fresh batch and drop the large intermediates.
        frame_list = []
        del processed_batch
        del embedded_frames
        del caption_batch

In [None]:
# Package the frame features still sitting in frame_list after the loop
# (the loop empties the list only when a full batch has been captioned).
print("Processing batch %d of %d" % (batch_index, num_batches))
processed_batch = np.asarray(frame_list, dtype=np.float32)

In [None]:
# Inspect the shape of the leftover frame-feature array.
processed_batch.shape

In [None]:
# Drop singleton axes, then add a leading axis of size 1.
# NOTE(review): the following cell reassigns embedded_frames via reshape, so
# this value is only an intermediate experiment.
embedded_frames = np.squeeze(processed_batch)[np.newaxis]

In [None]:
# Regroup the flat frame features into clips of num_frames_per_clip frames,
# each frame having inception_base.num_end_units_v4 values.
embedded_frames = processed_batch.reshape(
    [-1, num_frames_per_clip, inception_base.num_end_units_v4]
)

In [None]:
# Inspect the shape of the clip-grouped features.
embedded_frames.shape

In [None]:
# Caption the clips built from the leftover frames, limiting length via max_cap_len.
caption_batch = infer_util.generate_caption_batch(sess,embedded_frames,max_len=max_cap_len)

In [None]:
# Display the captions generated from the frames processed in this notebook.
caption_batch

In [None]:
# Load the stored .npy array for the same video (video9975).
# NOTE(review): presumably the precomputed per-frame features — confirm.
input_video = np.load("../../dataset/msrvtt_processed_video/test/video9975.mp4.npy")

In [None]:
# Inspect the shape of the loaded array.
input_video.shape

In [None]:
# Prepend an axis of size 1 so the loaded array forms a one-element batch.
video_batch = input_video[np.newaxis, ...]

In [None]:
# Inspect the shape of the one-element batch.
video_batch.shape

In [None]:
# Caption the array loaded from disk with the same helper and length limit,
# overwriting the earlier caption_batch.
caption_batch = infer_util.generate_caption_batch(sess,video_batch,max_len=max_cap_len)

In [None]:
# Display the captions generated from the array loaded from disk.
caption_batch

In [None]:
# First entry of the first batch element of the loaded array (for comparison).
video_batch[0][0]

In [None]:
# First entry of the first clip computed in this notebook (for comparison).
embedded_frames[0][0]

In [1]:
# Scratch check: join three strings with a newline separator.
s = "\n".join(("haha", "jaja", "kka"))

In [2]:
# Display the joined string.
s

'haha\njaja\nkka'

In [5]:
import ipdb

MultipleInstanceError: Multiple incompatible subclass instances of TerminalIPythonApp are being created.