In [1]:
from general_tools.notebook.gpu_utils import setup_one_gpu
# Index of the GPU this notebook should use.
GPU = 0
# Project helper; it reports the chosen device ("Picking GPU <id>").
# NOTE(review): presumably it also hides the other GPUs from TensorFlow -- confirm in general_tools.
setup_one_gpu(GPU)

Picking GPU 0


In [2]:
import tensorflow as tf
import numpy as np
from general_tools.in_out.basics import unpickle_data
from tflearn.layers.core import fully_connected

from general_tools.notebook.tf import reset_tf_graph
from tf_lab.point_clouds.encoders_decoders import decoder_with_fc_only
from language_3d_io import GeoWordsDataSet
from tf_lab.rnn import deep_lstm, length_of_sequence, last_relevant_rnn_output, get_state_variables,\
                       get_state_update_op, get_state_reset_op

In [3]:
# Re-import edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook.
%matplotlib inline



In [13]:
# `unpickle_data` is consumed as an iterator over the objects stored in a pickle file.
# For the two dataset files only the first yielded object is used; the builtin `next()`
# replaces the Python 2-only `.next()` method and behaves identically.
# NOTE(review): unpickling executes arbitrary code -- only load files from a trusted source.
training_data = next(unpickle_data('train_130_all_cond_max_seq_15_.pkl'))
test_data = next(unpickle_data('test_10_all_cond_max_seq_15_.pkl'))

# The vocabulary file yields exactly three objects: the word->id map, the id->word map and
# the vector used to initialise the output-layer bias of the caption model.
word_to_int, int_to_word, bias_init_word_vec = unpickle_data('word_context.pickle')

In [15]:
# Sanity check: the bias-initialisation vector is added to the [batch, n_words] logits of the
# caption model, so its length should equal the vocabulary size, len(word_to_int).
bias_init_word_vec.shape

(489,)

In [16]:
class Caption_Generator(object):
    '''LSTM caption model conditioned on a fixed-size feature vector.

    The conditioning vector is linearly projected and fed to the LSTM as its
    first input; every later input is the embedding of a caption word. The
    LSTM output is mapped to vocabulary logits by a linear layer.
    '''

    def __init__(self, dim_in, dim_embed, dim_hidden, batch_size, n_lstm_steps, n_words, init_b):
        '''Create the model variables (no graph ops besides variable creation).

        dim_in: size of the conditioning (geometric feature) vector.
        dim_embed: word-embedding size, i.e. the width of every LSTM input.
        dim_hidden: number of LSTM hidden units.
        batch_size: batch size used by `build_model`.
        n_lstm_steps: number of unrolled LSTM steps (maximum caption length).
        n_words: vocabulary size.
        init_b: initial value of the vocabulary bias; it is added to the
                [batch, n_words] logits.
        '''
        self.dim_in = dim_in
        self.dim_embed = dim_embed
        self.dim_hidden = dim_hidden
        self.batch_size = batch_size
        self.n_lstm_steps = n_lstm_steps
        self.n_words = n_words

        # Word-embedding table, kept on the CPU.
        with tf.device("/cpu:0"):
            self.word_embedding = tf.Variable(
                tf.random_uniform([self.n_words, self.dim_embed], -0.1, 0.1), name='word_embedding')

        self.embedding_bias = tf.Variable(tf.zeros([dim_embed]), name='embedding_bias')

        # Projection of the conditioning vector into the word-embedding space.
        # It must be dim_embed wide (not dim_hidden): the projected vector and the
        # word embeddings go through the same LSTM cell, whose input weights are
        # shared across steps, so every input needs the same width.
        self.img_embedding = tf.Variable(
            tf.random_uniform([dim_in, dim_embed], -0.1, 0.1), name='img_embedding')
        self.img_embedding_bias = tf.Variable(tf.zeros([dim_embed]), name='img_embedding_bias')

        # Linear layer mapping an LSTM output to vocabulary logits.
        self.word_encoding = tf.Variable(
            tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='word_encoding')
        # The vocabulary bias starts from the caller-supplied vector.
        self.word_encoding_bias = tf.Variable(init_b, dtype=tf.float32, name='word_encoding_bias')

        # The recurrent cell itself.
        self.lstm = tf.nn.rnn_cell.BasicLSTMCell(self.dim_hidden, state_is_tuple=False)

    def build_model(self):
        '''Build the training graph.

        Returns (total_loss, img, caption_placeholder, mask) where
          img: float32 placeholder [batch_size, dim_in] for the conditioning vectors,
          caption_placeholder: int32 placeholder [batch_size, n_lstm_steps] of word ids,
          mask: float32 placeholder [batch_size, n_lstm_steps], weighting each caption
                position in the loss,
          total_loss: masked cross-entropy summed over steps 1..n_lstm_steps-1 and
                divided by the sum of mask[:, 1:].
        '''
        img = tf.placeholder(tf.float32, [self.batch_size, self.dim_in])
        caption_placeholder = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])
        mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])

        # Projected conditioning vector: the LSTM input at step 0.
        image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias

        state = self.lstm.zero_state(self.batch_size, dtype=tf.float32)

        total_loss = 0.0
        with tf.variable_scope("RNN"):
            for i in range(self.n_lstm_steps):
                if i == 0:
                    current_embedding = image_embedding
                else:
                    # Later steps consume the embedding of the previous caption word.
                    with tf.device("/cpu:0"):
                        current_embedding = tf.nn.embedding_lookup(
                            self.word_embedding, caption_placeholder[:, i - 1]) + self.embedding_bias
                    # The LSTM weights were created at step 0; share them from now on.
                    tf.get_variable_scope().reuse_variables()

                out, state = self.lstm(current_embedding, state)

                if i > 0:
                    # Predict word i from the output produced after reading word i-1.
                    logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias
                    # Sparse cross-entropy takes the integer word ids directly, which is
                    # equivalent to building a one-hot target by hand.
                    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=caption_placeholder[:, i], logits=logit)
                    total_loss += tf.reduce_sum(xentropy * mask[:, i])

            # Normalise by the total mask weight of the positions that were scored.
            total_loss = total_loss / tf.reduce_sum(mask[:, 1:])

        return total_loss, img, caption_placeholder, mask

    def build_generator(self, maxlen, batchsize=1):
        '''Build the greedy decoding graph.

        Must be called after `build_model`, because it reuses the LSTM weights
        that live under the "RNN" variable scope.

        maxlen: number of words to generate.
        batchsize: number of conditioning vectors decoded in parallel.

        Returns (img, all_words): the float32 placeholder [batchsize, dim_in] and a
        list of `maxlen` tensors, each holding the arg-max word id per batch row.
        '''
        # Placeholder and initial state must agree on the batch dimension.
        img = tf.placeholder(tf.float32, [batchsize, self.dim_in])
        image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias
        state = self.lstm.zero_state(batchsize, dtype=tf.float32)

        all_words = []
        with tf.variable_scope("RNN"):
            tf.get_variable_scope().reuse_variables()
            # The first step only consumes the conditioning vector; its output is unused.
            _, state = self.lstm(image_embedding, state)

            # Every row starts decoding from the start token (word id 0).
            start_tokens = tf.zeros([batchsize], dtype=tf.int32)
            previous_word = tf.nn.embedding_lookup(self.word_embedding, start_tokens) + self.embedding_bias

            for _ in range(maxlen):
                out, state = self.lstm(previous_word, state)

                # Greedy choice: the highest-scoring word at this step.
                logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias
                best_word = tf.argmax(logit, 1)

                with tf.device("/cpu:0"):
                    # Its embedding becomes the next LSTM input.
                    previous_word = tf.nn.embedding_lookup(self.word_embedding, best_word)

                previous_word += self.embedding_bias

                all_words.append(best_word)

        return img, all_words

In [17]:
# Hyper-parameters of the training run.
# Size of the geometric conditioning vector (passed to the model as dim_in).
# NOTE(review): the meaning of the 128 * 3 factorisation is not visible here -- confirm.
geo_feat_size = 128 * 3
# Word-embedding size.
word_emb_dim = 100
# LSTM hidden size, tied to the embedding size.
n_hidden = word_emb_dim
# Number of unrolled LSTM steps (the dataset files are named "max_seq_15").
max_steps = 15
# Training batch size.
batch_size = 40
# Vocabulary size.
n_words = len(word_to_int)

In [18]:
# Project helper.
# NOTE(review): presumably it clears the default TensorFlow graph -- confirm in general_tools.
reset_tf_graph()

# Keyword arguments keep each size bound to the right constructor parameter; the positional
# call used before passed the hidden size as dim_embed and the embedding size as dim_hidden.
cp = Caption_Generator(dim_in=geo_feat_size,
                       dim_embed=word_emb_dim,
                       dim_hidden=n_hidden,
                       batch_size=batch_size,
                       n_lstm_steps=max_steps,
                       n_words=n_words,
                       init_b=bias_init_word_vec)

# Training graph: scalar loss plus the three placeholders that must be fed.
problem_loss, geo_pl, word_pl, mask_pl = cp.build_model()



In [19]:
def optimizer_step(loss, learning_rate=0.003, clip_value=5.0):
    '''Return an Adam update op for `loss` with element-wise gradient clipping.

    loss: scalar tensor to minimise.
    learning_rate: Adam learning rate.
    clip_value: every gradient entry is clipped to [-clip_value, clip_value].

    Variables that receive no gradient from `loss` (compute_gradients reports
    them with a gradient of None) are left out of the update, because
    tf.clip_by_value cannot be applied to None.
    '''
    opt = tf.train.AdamOptimizer(learning_rate)
    grad_params = opt.compute_gradients(loss)
    capped_gp = [(tf.clip_by_value(grad, -clip_value, clip_value), param)
                 for grad, param in grad_params
                 if grad is not None]
    return opt.apply_gradients(capped_gp)

# Training op for the caption loss; running it applies one clipped Adam update.
opt_step = optimizer_step(problem_loss)

In [20]:
# Initialiser for every variable defined so far (model weights and the Adam slots).
init = tf.global_variables_initializer()

# Let TensorFlow grow its GPU memory allocation on demand.
gpu_config = tf.ConfigProto()
gpu_config.gpu_options.allow_growth = True

# Open the training session and give all variables their initial values.
sess = tf.Session(config=gpu_config)
sess.run(init)

In [41]:
# NOTE: this cell needs `sess`, `opt_step`, `problem_loss` and the placeholders of the
# *training* graph. Running it after the checkpoint-restore cell (which resets the graph and
# rebinds those names) fails with "... is not an element of this graph".
display_step = 1   # report the loss every `display_step` epochs
stats = []         # mean loss of every reported epoch
n_epochs = 100

# Whole batches per epoch; explicit floor division keeps this an int on Python 2 and 3.
batches_for_epoch = training_data.num_examples // batch_size
if batches_for_epoch < 1:
    # Otherwise the per-epoch mean below would divide by zero.
    raise ValueError('batch_size (%s) is larger than the number of training examples (%s)'
                     % (batch_size, training_data.num_examples))

for epoch in range(n_epochs):
    epoch_loss = 0

    for _ in range(batches_for_epoch):
        words_i, geo_i, labels_i, mask_i, cond_i = training_data.next_batch(batch_size)

        feed_dict = {word_pl: words_i,
                     geo_pl: geo_i,
                     mask_pl: mask_i}

        # One optimisation step; also fetch the loss of this batch.
        _, step_loss = sess.run([opt_step, problem_loss], feed_dict=feed_dict)

        epoch_loss += step_loss

    # Mean batch loss over the epoch.
    epoch_loss /= batches_for_epoch

    if epoch % display_step == 0:
        stats.append(epoch_loss)
        print('%d %s' % (epoch + 1, stats[-1]))

ValueError: Fetch argument <tensorflow.python.framework.ops.Operation object at 0x7f1c4440f510> cannot be interpreted as a Tensor. (Operation name: "Adam"
op: "NoOp"
input: "^Adam/NoOp"
input: "^Adam/NoOp_1"
 is not an element of this graph.)

In [23]:
# Checkpoint every global variable of the current graph (model weights and Adam slots).
saver = tf.train.Saver(var_list=tf.global_variables())
# The epoch count is appended to the file name; the call returns the path it wrote.
saver.save(sess, save_path='models.ckpt', global_step=n_epochs)

'models.ckpt-50'

In [24]:
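# Sanity check: uncomment to list the names of all global variables in the graph
# (model weights plus the Adam slot variables); the listing below comes from such a run.
# [v.name for v in tf.global_variables()]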

[u'word_embedding:0',
 u'embedding_bias:0',
 u'img_embedding:0',
 u'img_embedding_bias:0',
 u'word_encoding:0',
 u'word_encoding_bias:0',
 u'RNN/BasicLSTMCell/Linear/Matrix:0',
 u'RNN/BasicLSTMCell/Linear/Bias:0',
 u'beta1_power:0',
 u'beta2_power:0',
 u'word_embedding/Adam:0',
 u'word_embedding/Adam_1:0',
 u'embedding_bias/Adam:0',
 u'embedding_bias/Adam_1:0',
 u'img_embedding/Adam:0',
 u'img_embedding/Adam_1:0',
 u'img_embedding_bias/Adam:0',
 u'img_embedding_bias/Adam_1:0',
 u'word_encoding/Adam:0',
 u'word_encoding/Adam_1:0',
 u'word_encoding_bias/Adam:0',
 u'word_encoding_bias/Adam_1:0',
 u'RNN/BasicLSTMCell/Linear/Matrix/Adam:0',
 u'RNN/BasicLSTMCell/Linear/Matrix/Adam_1:0',
 u'RNN/BasicLSTMCell/Linear/Bias/Adam:0',
 u'RNN/BasicLSTMCell/Linear/Bias/Adam_1:0']

In [25]:
# Inference set-up: same model sizes as in training, but a batch of one.
geo_feat_size = 128 * 3
word_emb_dim = 100
n_hidden = word_emb_dim
max_steps = 15
batch_size = 1
n_words = len(word_to_int)

# NOTE: this discards the training graph; `opt_step` and the training placeholders created
# earlier become unusable, so the training cell cannot be re-run after this one.
reset_tf_graph()

# Build the model first so that its variables exist before the initialiser and the saver are
# created. Argument order is (dim_in, dim_embed, dim_hidden, ...).
cp = Caption_Generator(geo_feat_size, word_emb_dim, n_hidden, batch_size, max_steps, n_words, bias_init_word_vec)
problem_loss, geo_pl, word_pl, mask_pl = cp.build_model()

gpu_config = tf.ConfigProto()
gpu_config.gpu_options.allow_growth = True
sess = tf.Session(config=gpu_config)

# Created after the model so that it actually covers the model variables (on the freshly
# reset, still empty graph it was a no-op); the restore below then overwrites the values.
init = tf.global_variables_initializer()
sess.run(init)

# Only the model variables exist in this graph (no optimiser was built), so only they are
# read back from the checkpoint.
saver = tf.train.Saver(tf.global_variables())
# NOTE(review): hard-coded absolute path to a machine-specific location -- consider deriving
# it from the path returned by saver.save() or from a configurable directory.
saver.restore(sess, '/orions4-zfs/projects/lins2/Panos_Space/Git_Repos/tf_lab/notebooks/panos/rnn/models.ckpt-50')



In [40]:
# Greedy decoder producing 10 words, built on top of the restored weights.
# NOTE: every execution of this cell adds another decoder to the graph.
img_pl, res_words = cp.build_generator(maxlen=10)

# Take a single test example and decode a caption for its geometric features.
words_i, geo_i, labels_i, mask_i, cond_i = test_data.next_batch(1)
r = sess.run(res_words, feed_dict={img_pl: geo_i})

# Show the example's condition label, then the generated words.
print(cond_i)
# `r` has one entry per decoding step; element 0 is the word id chosen for the only batch row.
[int_to_word[j[0]] for j in r]

['far_31']


['straight',
 'legs',
 'are',
 'connected',
 'to',
 'it',
 'of',
 'the',
 'middle',
 'on']

['close_45']


['not', 'the', 'one', 'with', 'the', 'slatted']