In [1]:
import pyttsx3
# Text-to-speech engine shared by the later cells (generate_captions speaks through it).
engine = pyttsx3.init()
# 'rate' is the speaking rate in words per minute; 'volume' is set to 0.9.
for prop_name, prop_value in (('rate', 200), ('volume', 0.9)):
    engine.setProperty(prop_name, prop_value)
# Audible smoke test: queue one sentence and block until it has been spoken.
engine.say("Hello this is me talking")
engine.runAndWait()

In [2]:
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import cv2
import os
import pysrt
import sys
import argparse
import numpy as np
import time
import threading

import tensorflow as tf

from scipy import misc

from inference_util import Inference
from model import Caption_Model
import inception_base
import configuration
from data_generator import Data_Generator

In [3]:
# Number of webcam frames collected before one caption is generated; the same
# value is passed to Caption_Model as num_frames.
POOL_SIZE = 60

In [4]:
# Checkpoints restored by the session cells below.
model_checkpoint = "./new_models/model-12180"               # caption model weights
inception_checkpoint = "../../dataset/inception_v4.ckpt"    # InceptionV4 weights

# Decoding settings read by generate_captions.
max_cap_len = 20   # passed as max_len to the caption generator
beam_size = 3      # 1 selects generate_caption_batch, anything else the beam variant

In [5]:
# Data side: build the generator from the data configuration, then load the
# vocabulary and dataset from the caption data directory.
data_config = configuration.DataConfig().config
data_gen = Data_Generator(
    processed_video_dir=data_config["processed_video_dir"],
    caption_file=data_config["caption_file"],
    unique_freq_cutoff=data_config["unique_frequency_cutoff"],
    max_caption_len=data_config["max_caption_length"],
)
data_gen.load_vocabulary(data_config["caption_data_dir"])
data_gen.load_dataset(data_config["caption_data_dir"])

# Model side: every keyword below is copied verbatim from the model configuration
# under the same name; only num_frames and mode are set here.
model_config = configuration.ModelConfig(data_gen).config
model = Caption_Model(
    num_frames=POOL_SIZE,
    mode="inference",
    **{
        option: model_config[option]
        for option in (
            "image_width",
            "image_height",
            "image_channels",
            "num_caption_unroll",
            "num_last_layer_units",
            "image_embedding_size",
            "word_embedding_size",
            "hidden_size_lstm",
            "num_lstm_layer",
            "vocab_size",
            "initializer_scale",
            "learning_rate",
            "rnn1_input_keep_prob",
            "rnn1_output_keep_prob",
            "rnn2_input_keep_prob",
            "rnn2_output_keep_prob",
            "embedding_file",
        )
    }
)

In [6]:
# Separate graphs: one holds the caption model, the other the Inception network.
model_graph, inception_graph = tf.Graph(), tf.Graph()

In [7]:
# Build the caption model inside its own graph and restore its trainable
# variables from model_checkpoint.
with model_graph.as_default() as g:
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # grow GPU memory on demand
    model_sess = tf.Session(config=config)
    model.build()
    caption_variables = tf.trainable_variables()
    saver = tf.train.Saver(var_list=caption_variables)
    print(caption_variables)
    saver.restore(model_sess, model_checkpoint)

[<tf.Variable 'embedding/spc_embedding:0' shape=(4, 300) dtype=float32_ref>, <tf.Variable 'embedding/glove_embedding:0' shape=(12905, 300) dtype=float32_ref>, <tf.Variable 'RNN_1/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0' shape=(2536, 4000) dtype=float32_ref>, <tf.Variable 'RNN_1/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0' shape=(4000,) dtype=float32_ref>, <tf.Variable 'RNN_2/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0' shape=(1300, 4000) dtype=float32_ref>, <tf.Variable 'RNN_2/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0' shape=(4000,) dtype=float32_ref>, <tf.Variable 'word_decoding/weights:0' shape=(1000, 12909) dtype=float32_ref>, <tf.Variable 'word_decoding/biases:0' shape=(12909,) dtype=float32_ref>]
INFO:tensorflow:Restoring parameters from ./new_models/model-12180


In [8]:
# Build the Inception base network in its own graph and restore the variables
# under the "InceptionV4" scope from inception_checkpoint.
with inception_graph.as_default():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # grow GPU memory on demand
    inception_sess = tf.Session(config=config)

    # Input placeholder: a batch of 299x299 three-channel float images.
    image_feed = tf.placeholder(
        dtype=tf.float32,
        shape=[None, 299, 299, 3],
        name="image_feed",
    )
    inception_output = inception_base.get_base_model(image_feed)

    inception_variables = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES,
        scope="InceptionV4",
    )
    saver = tf.train.Saver(var_list=inception_variables)
    saver.restore(inception_sess, inception_checkpoint)

INFO:tensorflow:Restoring parameters from ../../dataset/inception_v4.ckpt


In [9]:
# Inference helper built from the model and the generator's two vocabulary
# mappings; generate_captions calls its caption-generation methods.
infer_util = Inference(
    model,
    data_gen.word_to_idx,
    data_gen.idx_to_word,
)

In [10]:
def generate_captions(processed_video):
    """Caption one pool of frames, print the candidates and speak the first one.

    Args:
        processed_video: frames fed to the Inception ``image_feed`` placeholder
            (declared with shape [None, 299, 299, 3]); ``show_webcam`` passes a
            list of POOL_SIZE preprocessed frames.

    Side effects:
        Prints every candidate caption, each followed by ", ", and speaks the
        first candidate through the module-level ``engine``, blocking until
        ``engine.runAndWait()`` returns. Nothing is spoken when no candidate
        was produced.
    """
    # Run the frames through the Inception network.
    with inception_graph.as_default():
        frame_features = inception_sess.run(
            inception_output, feed_dict={image_feed: processed_video})

    with model_graph.as_default():
        # Add a leading axis so the single pool forms a batch of one.
        embedded_frames = np.expand_dims(frame_features, 0)
        if beam_size == 1:
            caption_batch = infer_util.generate_caption_batch(
                model_sess, embedded_frames, max_len=max_cap_len)
        else:
            caption_batch = infer_util.generate_caption_batch_beam(
                model_sess, beam_size, embedded_frames, max_len=max_cap_len)

    # Only one pool was submitted, so only the first batch entry is used.
    candidates = caption_batch[0]
    # Each candidate carries its text at index 1.
    print("".join(cap[1] + ", " for cap in candidates))

    if len(candidates) == 0:
        # Nothing to speak; avoids an IndexError on candidates[0].
        return
    engine.say(candidates[0][1])
    engine.runAndWait()

In [11]:
def show_webcam(mirror=False):
    """Show the webcam feed and caption every POOL_SIZE frames in the background.

    Each captured frame is displayed, converted from BGR to RGB, resized to
    299x299 and scaled to the range [-1, 1] before being added to the pool.
    When the pool holds POOL_SIZE frames it is handed to ``generate_captions``
    on a worker thread so the preview keeps running while the caption is
    computed and spoken.

    Args:
        mirror: when True, flip each frame horizontally before display and
            preprocessing.

    The loop ends when Esc is pressed in a preview window or when the camera
    stops delivering frames. The capture device and the OpenCV windows are
    always released on exit.
    """
    cam = cv2.VideoCapture(0)
    pool_frames = []
    caption_worker = None
    try:
        while True:
            ret_val, frame = cam.read()
            if not ret_val:
                # No frame delivered (camera missing or disconnected); the
                # calls below would fail on an empty frame.
                break
            if mirror:
                frame = cv2.flip(frame, 1)
            cv2.imshow('my webcam 2', frame)

            # Preprocess for the Inception input: RGB, 299x299, values in [-1, 1].
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = misc.imresize(frame, [299, 299, 3])
            frame = ((2 * (frame.astype(np.float32) / 255)) - 1)
            pool_frames.append(frame)

            if len(pool_frames) == POOL_SIZE:
                full_pool, pool_frames = pool_frames, []
                # Pass the callable and its argument to Thread; calling
                # generate_captions(...) inline would run it on this thread
                # and block the preview. A pool is skipped while the previous
                # worker is still busy so speech calls never overlap.
                if caption_worker is None or not caption_worker.is_alive():
                    caption_worker = threading.Thread(
                        target=generate_captions, args=(full_pool,))
                    caption_worker.start()

            # Second window shows the preprocessed frame.
            cv2.imshow('my webcam', frame)
            if cv2.waitKey(1) == 27:
                break  # esc to quit
    finally:
        cam.release()
        cv2.destroyAllWindows()

def main():
    """Entry point: run the webcam captioning loop without mirroring the preview."""
    show_webcam(mirror=False)


In [13]:
# Start the live webcam captioning loop; press Esc in a preview window to stop it.
main()

a man is talking to a camera, a man is talking to a man in a room, a man is talking to a man, 
a man is talking to a man in a room, a man is talking, a man is talking to a man, 
a man is talking to a man in a room, a man is talking to a man, a man is talking to another man in a room, 
a man is talking to a man in a room, a man is talking to a man, a man is talking to a camera, 
a man is talking, a man is talking to a man in a room, a man is talking to a man, 
a man is talking, a man is talking to a camera, a man is talking to a man in a room, 
a man is talking, a man is talking to a camera, a man is talking to a man in a room, 
a man is sitting in a room, a man is talking to a man in a room, a man is talking to a camera, 
a man is sitting in a room, a man is talking to a camera, a man is sitting in a room and talking to the camera, 
a man is sitting in a room, a man is talking to a man in a room, a man is talking to a camera, 
a man is talking about a computer program, a man is sitting

In [None]:
# NOTE(review): `processed_video` is never assigned at notebook (global) scope in
# the visible cells, so on a fresh kernel this cell would raise NameError. It looks
# like a leftover debugging cell; confirm before keeping.
processed_video[0].shape

In [15]:
# NOTE(review): the argument is a quoted string literal, so the engine speaks the
# text "caption_batch[0][0][1]" itself rather than a generated caption. It looks
# like a leftover speech test; confirm before keeping.
engine.say("caption_batch[0][0][1]")
engine.runAndWait()