In [1]:
import math
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle

import tensorflow.python.platform
from keras.preprocessing import sequence
from collections import Counter

Using TensorFlow backend.


In [2]:
# Directory whose latest checkpoint is restored by test().
model_path = '../models/tensorflow_kr'
# model_path = '../models/tensorflow_kr_dong'
# Not referenced by any cell visible in this notebook.
model_path_transfer = '../models/tf_final'
# feature_path = '../data/feats.npy'
# annotation_path_kr = '../data/korean_data/ko_translateGoogle_caption.txt'
# Both paths below are passed to get_data().
# NOTE(review): get_data() as written opens hard-coded file names in the
# current working directory and does not read these two values.
feature_path = '../data/word_vec_200.txt'
annotation_path_kr = '../data/korean_data/sentence.txt'

In [3]:
def get_data(annotation_path, feature_path):
    """Load the pickled feature vectors and their sentences.

    NOTE(review): both arguments are accepted but never read. The data is
    always loaded from 'sentence.txt' and 'words_vec_200.txt', resolved
    against the current working directory.

    SECURITY: pickle.load can execute arbitrary code; only use it on files
    you produced yourself.

    Args:
        annotation_path: unused.
        feature_path: unused.

    Returns:
        (feats, captions): two numpy arrays built from the unpickled objects,
        features first and sentences second.
    """
    def _unpickle_as_array(filename):
        # Read one pickle file and wrap its content in a numpy array.
        with open(filename, 'rb') as handle:
            return np.array(pickle.load(handle))

    # Sentences are read before features, as in the original implementation.
    captions = _unpickle_as_array('sentence.txt')
    feats = _unpickle_as_array('words_vec_200.txt')

    return feats, captions

In [4]:
# Commented-out alternative settings, kept for reference:
# model_path = '../models/tensorflow_kr'
# feature_path = '../data/feats.npy'
# annotation_path_kr = '../data/korean_data/ko_translateGoogle_caption.txt'

### Set Hyperparameters ###
dim_embed = 256  # word-embedding width handed to Caption_Generator
dim_hidden = 256  # LSTM hidden size handed to Caption_Generator
# dim_in = 4096
dim_in = 200  # width of one input feature vector (feats prints as (187400, 200) below)
batch_size = 1  # batch dimension handed to Caption_Generator
learning_rate = 0.001  # not referenced by the cells visible in this notebook
momentum = 0.9  # not referenced by the cells visible in this notebook
n_epochs = 25  # not referenced by the cells visible in this notebook

# The bare string below is a disabled, text-file based variant of get_data.
# It is an expression statement with no effect; the pickle-based get_data
# defined in the earlier cell is the one actually called.
'''
def get_data(annotation_path, feature_path):
    
    with open(annotation_path_kr, 'r', encoding='utf-8') as f:
        data = f.readlines()
    
    for i in range(len(data)):
        data[i] = data[i].replace('\n', '')
        data[i] = data[i].replace('.', '')
    annotations = np.array(data)
    return np.load(feature_path,'r'), annotations
'''

# Load the full data set once at notebook level; `captions` is inspected by a later cell.
feats, captions = get_data(annotation_path_kr, feature_path)

# Sanity-check that features and sentences have the same number of rows.
print(feats.shape)
print(captions.shape)

# Show the first sentence as a spot check.
print(captions[0])

(187400, 200)
(187400,)
‘무임승차하지 않는 삶’ 중학 시절, 은사님께서 해주신 말씀입니다


# Build Model

In [5]:
class Caption_Generator():
    """LSTM word-sequence model conditioned on an input feature vector (TF1 graph API).

    The input feature vector is projected linearly and fed to the LSTM as the
    first time step; every later step is fed the embedding of a word.
    `build_model` builds the masked cross-entropy training loss and
    `build_generator` builds a greedy (argmax) decoder.

    NOTE: the projected feature vector has width `dim_hidden` while word
    embeddings have width `dim_embed`, and both are fed to the same LSTM cell
    with shared weights, so the two sizes are expected to be equal.
    """

    def __init__(self, dim_in, dim_embed, dim_hidden, batch_size, n_lstm_steps, n_words, init_b=None):
        """Create the trainable variables shared by the training and generation graphs.

        Args:
            dim_in: width of one input feature vector.
            dim_embed: width of a word embedding.
            dim_hidden: number of LSTM units.
            batch_size: batch dimension of the training placeholders.
            n_lstm_steps: number of unrolled steps in `build_model`
                (step 0 consumes the feature vector).
            n_words: vocabulary size.
            init_b: optional initial value for the output bias; zeros when None.
        """
        self.dim_in = dim_in
        self.dim_embed = dim_embed
        self.dim_hidden = dim_hidden
        self.batch_size = batch_size
        self.n_lstm_steps = n_lstm_steps
        self.n_words = n_words

        # Word-embedding table, pinned to the CPU.
        with tf.device("/cpu:0"):
            self.word_embedding = tf.Variable(
                tf.random_uniform([self.n_words, self.dim_embed], -0.1, 0.1), name='word_embedding')

        self.embedding_bias = tf.Variable(tf.zeros([dim_embed]), name='embedding_bias')

        # The recurrent cell; its weights are created on the first dynamic_rnn call.
        self.lstm = tf.contrib.rnn.BasicLSTMCell(dim_hidden)

        # Linear projection of the input feature vector into the LSTM input space.
        self.img_embedding = tf.Variable(
            tf.random_uniform([dim_in, dim_hidden], -0.1, 0.1), name='img_embedding')
        self.img_embedding_bias = tf.Variable(tf.zeros([dim_hidden]), name='img_embedding_bias')

        # Projection from an LSTM output to vocabulary logits.
        self.word_encoding = tf.Variable(
            tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='word_encoding')

        # Output bias, optionally initialised by the caller.
        if init_b is not None:
            self.word_encoding_bias = tf.Variable(init_b, name='word_encoding_bias')
        else:
            self.word_encoding_bias = tf.Variable(tf.zeros([n_words]), name='word_encoding_bias')

    def build_model(self):
        """Build the training graph.

        Returns:
            (total_loss, img, caption_placeholder, mask) where
            total_loss is the masked cross-entropy summed over steps 1.. and
            divided by the sum of mask[:, 1:];
            img is a float32 placeholder [batch_size, dim_in];
            caption_placeholder is an int32 placeholder [batch_size, n_lstm_steps];
            mask is a float32 placeholder [batch_size, n_lstm_steps] weighting
            each step's loss.
        """
        img = tf.placeholder(tf.float32, [self.batch_size, self.dim_in])
        caption_placeholder = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])
        mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])

        # The projected feature vector is the LSTM input at step 0.
        image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias

        state = self.lstm.zero_state(self.batch_size, dtype=tf.float32)

        total_loss = 0.0
        with tf.variable_scope("RNN"):
            for i in range(self.n_lstm_steps):
                if i > 0:
                    # After the first step the LSTM weights already exist; share them.
                    tf.get_variable_scope().reuse_variables()
                    # Feed the embedding of the previous word of the caption.
                    with tf.device("/cpu:0"):
                        current_embedding = tf.nn.embedding_lookup(
                            self.word_embedding, caption_placeholder[:, i-1]) + self.embedding_bias
                else:
                    current_embedding = image_embedding

                # dynamic_rnn expects [batch, time, dim]; run a single time step.
                current_embedding = tf.expand_dims(current_embedding, 1)
                out, state = tf.nn.dynamic_rnn(cell=self.lstm, inputs=current_embedding,
                                               initial_state=state)

                if i > 0:
                    # Drop the length-1 time axis so `out` is [batch, dim_hidden].
                    # Without this the logits stay rank 3 and the per-example loss
                    # no longer lines up with mask[:, i].
                    out = tf.squeeze(out, [1])

                    # One-hot target for the word at position i.
                    onehot = tf.one_hot(caption_placeholder[:, i], self.n_words, dtype=tf.float32)

                    # Softmax classification of the next word.
                    logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias
                    xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=onehot)
                    xentropy = xentropy * mask[:, i]

                    total_loss += tf.reduce_sum(xentropy)

            total_loss = total_loss / tf.reduce_sum(mask[:, 1:])
            return total_loss, img, caption_placeholder, mask

    def build_generator(self, maxlen, batchsize=1):
        """Build the greedy decoding graph.

        Args:
            maxlen: number of words to generate.
            batchsize: number of feature vectors decoded at once; the input
                placeholder, the LSTM state and the start tokens all use it.

        Returns:
            (img, all_words) where img is a float32 placeholder
            [batchsize, dim_in] and all_words is a list of `maxlen` int64
            tensors of shape [batchsize] holding the argmax word index per step.
        """
        img = tf.placeholder(tf.float32, [batchsize, self.dim_in])
        image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias
        state = self.lstm.zero_state(batchsize, dtype=tf.float32)

        all_words = []
        with tf.variable_scope("RNN"):
            # Step 0: only the feature vector is fed; its output is not used.
            image_embedding = tf.expand_dims(image_embedding, 1)
            _, state = tf.nn.dynamic_rnn(cell=self.lstm, inputs=image_embedding, initial_state=state)

            # Start every sequence from the embedding of word index 0.
            start_tokens = tf.zeros([batchsize], dtype=tf.int32)
            previous_word = tf.nn.embedding_lookup(self.word_embedding, start_tokens) + self.embedding_bias

            for _ in range(maxlen):
                tf.get_variable_scope().reuse_variables()

                previous_word = tf.expand_dims(previous_word, 1)
                out, state = tf.nn.dynamic_rnn(cell=self.lstm, inputs=previous_word, initial_state=state)
                # Remove the length-1 time axis (axis 1), leaving [batchsize, dim_hidden].
                out = tf.squeeze(out, [1])

                # Pick the highest-scoring word.
                logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias
                best_word = tf.argmax(logit, 1)

                with tf.device("/cpu:0"):
                    # Its embedding is the LSTM input of the next step.
                    previous_word = tf.nn.embedding_lookup(self.word_embedding, best_word)

                previous_word += self.embedding_bias

                all_words.append(best_word)

        return img, all_words

In [6]:
# Build the generation graph, provided the vocabulary written by the training
# notebook is available.
if not os.path.exists('../data/ixtoword_kr.npy'):
    print ('You must run 1. O\'reilly Training.ipynb first.')
else:
    # Index -> word lookup; its length is the vocabulary size.
    ixtoword = np.load('../data/ixtoword_kr.npy').tolist()
    n_words = len(ixtoword)
    maxlen=15

    # Start from an empty graph so re-running this cell does not stack variables.
    tf.reset_default_graph()
    sess = tf.InteractiveSession()

    # Keyword arguments keep dim_embed and dim_hidden bound to the right
    # parameters (the positional call passed them in swapped order).
    caption_generator = Caption_Generator(
        dim_in=dim_in,
        dim_embed=dim_embed,
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=maxlen + 2,
        n_words=n_words,
    )

    image, generated_words = caption_generator.build_generator(maxlen=maxlen)

# Test

In [7]:
def test(sess, image, generated_words, ixtoword, idx=0):  # Naive greedy search
    """Generate and print a word sequence for one stored feature vector.

    Args:
        sess: TensorFlow session holding the generator graph.
        image: input placeholder returned by build_generator.
        generated_words: list of word-index tensors returned by build_generator.
        ixtoword: lookup from word index to word.
        idx: row of the feature array to decode.

    Raises:
        ValueError: if no checkpoint is found under `model_path`.

    NOTE: the data set is reloaded through get_data and a new Saver is created
    on every call.
    """
    feats, _ = get_data(annotation_path_kr, feature_path)
    # Wrap the selected row so the fed value has a leading batch axis of 1.
    feat = np.array([feats[idx]])

    saver = tf.train.Saver()
    # Set to True to run with freshly initialised weights instead of a checkpoint.
    sanity_check = False
    if not sanity_check:
        saved_path = tf.train.latest_checkpoint(model_path + '/')
        if saved_path is None:
            # latest_checkpoint returns None when the directory has no checkpoint;
            # fail here with a message that names the directory.
            raise ValueError("No checkpoint found in '%s/'" % model_path)
        saver.restore(sess, saved_path)
    else:
        tf.global_variables_initializer().run()

    generated_word_index = sess.run(generated_words, feed_dict={image: feat})
    # One index array per step -> flat sequence of word indices.
    generated_word_index = np.hstack(generated_word_index)
    generated_sentence = [ixtoword[x] for x in generated_word_index]
    print(generated_sentence)

In [40]:
# Decode the feature vector stored at row 19.
test(sess, image, generated_words, ixtoword, idx=19)

INFO:tensorflow:Restoring parameters from ../models/tensorflow_kr/model-29
['', '수업', '중', '수업이', '끝나고', '학교에', '남아서', '학교에', '.', '.', '.', '.', '교환학생을', '온', '멘토링']


In [12]:
# Preview the first 20 sentences.
captions[:20]

array(['‘무임승차하지 않는 삶’ 중학 시절, 은사님께서 해주신 말씀입니다',
       ' 직접 노력해서 결과를 얻어야 한다는 기본적 원칙은 곧 제 인생의 가치관이 되었습니다',
       ' 인생의 가치관이 생기자 새로운 목표가 생겼습니다',
       ' 그 계기는 고등학생 시절 학창시절 우연히 하게 된 일용직 근로자 경험에서 시작되었습니다',
       ' 작은 벽돌이 쌓여서 큰 건물이 완성되는 모습은 저에겐 큰 놀라움이었습니다',
       ' 당시 어린 눈으로 봤던 건설현장은 저의 가치관이 그대로 투영된 곳으로 다가왔고 곧, 건설현장은 저에게 목표가 되었습니다',
       ' 저는 건축엔지니어로서의 기본을 다지기 위해 아래와 같은 노력을 했습니다', ' 다양한 분야의 지식을 습득했습니다',
       ' 건축은 구조, 시공, 안전 등 다양한 분야로 이루어져 있습니다',
       ' 그 때문에 한 가지 분야만 능통해서는 온전한 건물을 지을 수 없습니다',
       ' 그래서 저는 설계, 안전, 환경, 시공 등 다양한 전공수업을 이수하였습니다',
       ' 다양한 변수가 있는 현장에서 습득한 지식을 토대로 공기단축에 이바지하도록 하겠습니다',
       ' 다양한 경험을 위해 경진대회에 참여했습니다', ' 학부 시절 시공경진 대회에 참여했습니다',
       ' 대회를 준비하는 과정에서의 공부는 시공과 하자에 대한 다양한 지식을 얻을 수 있었습니다',
       " 또한, 21개 팀이 참가한 경진대회에서 100명이 넘는 인원과 충남지역 대기업 실무자분들 앞에서 발표한 경험은 저를 더욱 단단하게 만들어 주었고, '대상'이라는 성과도 얻을 수 있었습니다",
       '가장 기억에 남는 전공은 안전공학입니다',
       ' 현직에 종사하시는 교수님 아래에서 안전의 이론과 더불어 현장에서의 적용을 동시에 배울 수 있었습니다',
       ' 안전교육법, 감성접근법 등을 배움은 분명 저의 강점이 될 것입니다',
  

In [23]:
## Load the pickled word list.
# SECURITY: pickle.load can execute arbitrary code; only open files you created yourself.
with open('words_1_dic.txt', 'rb') as f:
    words_1_dic = pickle.load(f)

In [33]:
# Preview the first 20 entries of the word list.
words_1_dic[:20]

['무임승차',
 '노력',
 '인생',
 '계기',
 '벽돌',
 '당시',
 '건축',
 '분야',
 '건축',
 '때문',
 '설계',
 '변수',
 '경험',
 '학부',
 '대회',
 '참가',
 '기억',
 '현직',
 '안전',
 '수업']