In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from tensorflow.contrib.slim.python.slim.nets.inception_v3 import inception_v3_base

slim = tf.contrib.slim

Instructions for updating:
Use the retry module or similar alternatives.


## Config

In [9]:
class ModelConfig(object):
    """Default settings for the captioning model and its input pipeline."""

    def __init__(self):
        """Fills in every model/input setting with its default value."""
        # ---- Input files and prefetch queue ----
        # Glob pattern of the sharded TFRecord files holding SequenceExample
        # protos (i.e. where the image records live). Defaults to None and
        # must be set for training and evaluation.
        self.input_file_pattern = None
        # Encoding of the stored images: "jpeg" or "png".
        self.image_format = "jpeg"
        # Rough number of records per input shard; used to size the input
        # queue so that examples from different shards mix well in training.
        self.values_per_input_shard = 2300
        # Minimum number of shards' worth of records kept in the input queue.
        self.input_queue_capacity_factor = 2
        # Number of threads prefetching SequenceExample protos.
        self.num_input_reader_threads = 1

        # ---- SequenceExample layout ----
        # Context feature holding the encoded image bytes.
        self.image_feature_name = "image/data"
        # Feature list holding the integer caption ids.
        self.caption_feature_name = "image/caption_ids"
        # Vocabulary size (unique words plus one for <UNK>).
        self.vocab_size = 12000
        # Number of image preprocessing threads; should be a multiple of 2.
        self.num_preprocess_threads = 4

        # ---- Batching and Inception initialisation ----
        self.batch_size = 32
        # Inception v3 checkpoint used to initialise the Inception variables.
        self.inception_checkpoint_file = None

        # ---- Image geometry and variable initialisation ----
        # Height and width of the images fed to Inception v3.
        self.image_height = 299
        self.image_width = 299
        # Scale used to initialise model variables.
        self.initializer_scale = 0.08

        # ---- LSTM ----
        # LSTM input dimensionality (embedding size) and output dimensionality.
        self.embedding_size = 512
        self.num_lstm_units = 512
        # Dropout keep probability applied to the LSTM when it is < 1.0.
        self.lstm_dropout_keep_prob = 0.7

        

In [None]:
class TrainingConfig(object):
    """Default hyperparameters for the training loop."""

    def __init__(self):
        """Fills in every training setting with its default value."""
        # Size of the training set: number of examples seen in one epoch.
        self.num_examples_per_epoch = 586363

        # Name of the optimizer used to train the model.
        self.optimizer = "SGD"

        # Learning-rate schedule for the initial training phase: starting
        # rate, decay factor, and number of epochs between decays.
        self.initial_learning_rate = 2.0
        self.learning_rate_decay_factor = 0.5
        self.num_epochs_per_decay = 8.0

        # Learning rate used when the Inception v3 parameters are fine-tuned.
        self.train_inception_learning_rate = 0.0005

        # Gradient clipping value; gradients are clipped when this is not None.
        self.clip_gradients = 5.0

        # Number of model checkpoints to keep.
        self.max_checkpoints_to_keep = 5

## Inception

이미지를 받아서 representation을 내놓는 inception 모델을 그리는 함수

In [5]:
def inception_v3(images,
                trainable=True,
                is_training=True,
                weight_decay=0.00004,
                stddev=0.1,
                dropout_keep_prob=0.8,
                use_batch_norm=True,
                batch_norm_params=None,
                add_summaries=True,
                scope="InceptionV3"):
    """Builds an Inception V3 subgraph that turns images into feature vectors.

    Args:
        images: A float32 Tensor of shape [batch, height, width, channels].
        trainable: Whether the Inception parameters themselves are trained.
        is_training: Whether the surrounding model is in training mode (as
            opposed to eval/inference).
        weight_decay: Coefficient for L2 weight regularization (only used
            when `trainable` is true).
        stddev: Standard deviation of the truncated normal weight initializer.
        dropout_keep_prob: Dropout keep probability.
        use_batch_norm: When false, `batch_norm_params` is discarded.
        batch_norm_params: Parameters for batch normalization; defaults are
            built when this is empty and `use_batch_norm` is true.
        add_summaries: Whether to add an activation summary for every end
            point returned by `inception_v3_base`.
        scope: Optional variable scope.

    Returns:
        The flattened output of the final Inception end point after average
        pooling and dropout (per the notes in this notebook, [batch, 2048]
        for 8x8x2048 final feature maps -- TODO confirm).
    """
    # Inception only runs in training mode when its parameters are trainable.
    # `trainable` decides whether Inception's own weights are updated;
    # `is_training` merely distinguishes train from eval/inference.
    is_inception_model_training = trainable and is_training

    if use_batch_norm:
        # Default parameters for batch normalization.
        if not batch_norm_params:
            batch_norm_params = {
                "is_training": is_inception_model_training,
                "trainable": trainable,
                # Decay for the moving averages.
                "decay": 0.9997,
                # Epsilon to prevent 0s in variance.
                "epsilon": 0.001,
                # Collections containing the moving mean and moving variance.
                "variables_collections": {
                    "beta": None,
                    "gamma": None,
                    "moving_mean": ["moving_vars"],
                    "moving_variance": ["moving_vars"],
                },
            }
    else:
        # NOTE(review): normalizer_fn below is still slim.batch_norm in this
        # case; only its parameters are cleared.
        batch_norm_params = None

    # Regularize the weights only when Inception itself is being trained.
    if trainable:
        weights_regularizer = tf.contrib.layers.l2_regularizer(scale=weight_decay)
    else:
        weights_regularizer = None

    with tf.variable_scope(scope, "InceptionV3", [images]) as scope:
        with slim.arg_scope(
                [slim.conv2d, slim.fully_connected],
                weights_regularizer=weights_regularizer,
                trainable=trainable):
            # Convolutions use ReLU activations and batch normalization.
            # Fix: the keyword was misspelled `weights_initalizer`, which is
            # not an argument of slim.conv2d.
            with slim.arg_scope(
                    [slim.conv2d],
                    weights_initializer=tf.truncated_normal_initializer(stddev=stddev),
                    activation_fn=tf.nn.relu,
                    normalizer_fn=slim.batch_norm,
                    normalizer_params=batch_norm_params):
                # net: output tensor of the final end point.
                # end_points: activations exposed for summaries or losses.
                net, end_points = inception_v3_base(inputs=images, scope=scope)

                with tf.variable_scope("logits"):
                    shape = net.get_shape()
                    # Average-pool over the full spatial extent (-> 1x1).
                    net = slim.avg_pool2d(net, shape[1:3], padding="VALID", scope="pool")
                    # Dropout is only active when Inception is being trained.
                    net = slim.dropout(
                        net,
                        keep_prob=dropout_keep_prob,
                        is_training=is_inception_model_training,
                        scope="dropout")
                    # Collapse the pooled maps to one vector per image.
                    net = slim.flatten(net, scope="flatten")

    # Add summaries.
    if add_summaries:
        for v in end_points.values():
            tf.contrib.layers.summaries.summarize_activation(v)

    return net

## Process_image
TFRecords에 인코딩된 이미지를 실제 float32 텐서로 바꾸고 전처리를 해준다.

In [19]:
def process_image(encoded_image, is_training, height, width, resize_height=346,
                 resize_width=346, thread_id=0, image_format="jpeg"):
    """Decodes an image, resizes it and crops it to the final dimensions.

    Args:
        encoded_image: String Tensor containing the encoded image.
        is_training: Boolean; random crop when true, central crop/pad otherwise.
        height: Height of the output image.
        width: Width of the output image.
        resize_height: If > 0, resize height before cropping to final dimensions.
        resize_width: If > 0, resize width before cropping to final dimensions.
        thread_id: Preprocessing thread id. Currently unused by this function.
        image_format: "jpeg" or "png".

    Returns:
        A float32 Tensor of shape [height, width, 3] with values in [-1, 1].

    Raises:
        ValueError: If `image_format` is neither "jpeg" nor "png".
    """
    # Decode the string tensor into a uint8 image with 3 channels.
    with tf.name_scope("decode", values=[encoded_image]):
        # Fix: the original tested the undefined name `image_foramt`.
        if image_format == "jpeg":
            image = tf.image.decode_jpeg(contents=encoded_image, channels=3)
        elif image_format == "png":
            image = tf.image.decode_png(encoded_image, channels=3)
        else:
            raise ValueError("Invalid image format: %s" % image_format)
    # Convert from uint8 to float32.
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)

    # Enlarge the image before cropping; both sizes must be set or unset together.
    assert (resize_height > 0) == (resize_width > 0)
    if resize_height:
        image = tf.image.resize_images(image,
                                       size=[resize_height, resize_width],
                                       method=tf.image.ResizeMethod.BILINEAR)

    if is_training:
        # Crop at a random location during training to improve generalization.
        image = tf.random_crop(image, [height, width, 3])
    else:
        # Crop (or pad) around the centre for evaluation/inference.
        image = tf.image.resize_image_with_crop_or_pad(image, height, width)

    # Rescale the values to [-1, 1].
    image = tf.subtract(image, 0.5)
    image = tf.multiply(image, 2.0)
    return image

In [None]:
def prefetch_input_data(reader,
                       file_pattern,
                       is_training,
                       batch_size,
                       values_per_shard,
                       input_queue_capacity_factor=16,
                       num_reader_threads=1,
                       shard_queue_name="filename_queue",
                       value_queue_name="input_queue"):
    """Reads string records from disk into a queue via background reader ops.

    In training the records go through a shuffling queue whose minimum fill
    level is `values_per_shard * input_queue_capacity_factor`; a larger factor
    mixes examples from different shards better at the cost of memory. Outside
    training a FIFO queue is used so records come out in file order.

    Args:
        reader: Instance of tf.ReaderBase.
        file_pattern: Comma-separated list of file patterns
            (e.g. /tmp/train_data-?????-of-00100).
        is_training: Boolean; whether prefetching for training or eval.
        batch_size: Model batch size, used to determine queue capacity.
        values_per_shard: Approximate number of values per shard.
        input_queue_capacity_factor: Minimum number of values to keep in the
            queue, in multiples of `values_per_shard`.
        num_reader_threads: Number of reader ops that fill the queue.
        shard_queue_name: Name for the shards filename queue.
        value_queue_name: Name for the values input queue.

    Returns:
        A Queue containing prefetched string values.
    """
    # Expand every comma-separated pattern into concrete file names.
    data_files = [
        path
        for pattern in file_pattern.split(",")
        for path in tf.gfile.Glob(pattern)
    ]
    if data_files:
        tf.logging.info("Prefetching values from %d files matching %s",
                        len(data_files), file_pattern)
    else:
        tf.logging.fatal("Found no input files matching %s", file_pattern)

    if not is_training:
        # Evaluation/inference: keep file and record order.
        filename_queue = tf.train.string_input_producer(
            data_files, shuffle=False, capacity=1, name=shard_queue_name)
        capacity = values_per_shard + 3 * batch_size
        values_queue = tf.FIFOQueue(capacity=capacity,
                                    dtypes=[tf.string],
                                    name="fifo_" + value_queue_name)
    else:
        # Training: shuffle file names and dequeue records in random order.
        filename_queue = tf.train.string_input_producer(
            data_files, shuffle=True, capacity=16, name=shard_queue_name)
        min_queue_examples = values_per_shard * input_queue_capacity_factor
        capacity = min_queue_examples + 100 * batch_size
        values_queue = tf.RandomShuffleQueue(capacity=capacity,
                                             min_after_dequeue=min_queue_examples,
                                             dtypes=[tf.string],
                                             name="random_" + value_queue_name)

    def _read_and_enqueue():
        """Creates one op that reads the next record and enqueues its value."""
        _, record = reader.read(queue=filename_queue)
        return values_queue.enqueue([record])

    # One enqueue op per reader thread; the queue runner executes them later.
    enqueue_ops = [_read_and_enqueue() for _ in range(num_reader_threads)]

    # Register a QueueRunner for these ops in the graph's queue-runner collection.
    runner = tf.train.queue_runner.QueueRunner(values_queue, enqueue_ops)
    tf.train.queue_runner.add_queue_runner(runner)

    # Track how full the queue is.
    summary_name = "queue/%s/fraction_of_%d_full" % (values_queue.name, capacity)
    fill_fraction = tf.cast(values_queue.size(), tf.float32) * (1. / capacity)
    tf.summary.scalar(summary_name, fill_fraction)
    return values_queue

dequeue해서 나온 SequenceExample proto를 parsing 한다.

In [20]:
def parse_sequence_example(serialized, image_feature, caption_feature):
    """Splits one serialized SequenceExample into its image and caption.

    Args:
        serialized: A scalar string Tensor; a single serialized SequenceExample.
        image_feature: Name of the SequenceExample context feature containing
            the image data.
        caption_feature: Name of the SequenceExample feature list containing
            the integer caption.

    Returns:
        encoded_image: The string Tensor stored under `image_feature`.
        caption: The int64 Tensor stored under `caption_feature`.
    """
    # The image is a single string in the context; the caption is a
    # variable-length list of int64 ids in the feature lists.
    context_spec = {image_feature: tf.FixedLenFeature([], dtype=tf.string)}
    sequence_spec = {caption_feature: tf.FixedLenSequenceFeature([], dtype=tf.int64)}

    # Returns (context dict, feature-list dict) for a single proto.
    context, sequence = tf.parse_single_sequence_example(
        serialized=serialized,
        context_features=context_spec,
        sequence_features=sequence_spec)

    return context[image_feature], sequence[caption_feature]


In [21]:
def batch_with_dynamic_pad(images_and_captions, batch_size, queue_capacity,add_summaries=True):
    """Batches images with padded input/target caption sequences.

    Every caption is split into an input sequence (the caption without its
    last token) and a target sequence (the caption without its first token),
    plus a vector of ones of the same length. The batching queue pads each of
    these to the longest sequence in the batch, so the padded ones-vector
    serves as a mask separating real words from padding.

    Args:
        images_and_captions: A list of pairs [image, caption], where image is
            a Tensor of shape [height, width, channels] and caption is a 1-D
            Tensor of any length. Each pair is enqueued separately.
        batch_size: Batch size.
        queue_capacity: Queue capacity.
        add_summaries: If true, add caption length summaries.

    Returns:
        images: A Tensor of shape [batch_size, height, width, channels].
        input_seqs: A Tensor of shape [batch_size, padded_length].
        target_seqs: A Tensor of shape [batch_size, padded_length].
        mask: An int32 0/1 Tensor of shape [batch_size, padded_length].
    """
    def _to_queue_entry(image, caption):
        """Builds the [image, input, target, ones] entry for one example."""
        num_tokens = tf.shape(caption)[0]
        # Length of both sub-sequences, as a 1-element vector: [num_tokens - 1].
        seq_len = tf.expand_dims(tf.subtract(num_tokens, 1), 0)
        # Tokens 0 .. n-2 (last token dropped).
        inputs = tf.slice(input_=caption, begin=[0], size=seq_len)
        # Tokens 1 .. n-1 (first token dropped).
        targets = tf.slice(input_=caption, begin=[1], size=seq_len)
        # Ones for every real position; padding added on dequeue marks the rest.
        ones = tf.ones(seq_len, dtype=tf.int32)
        return [image, inputs, targets, ones]

    enqueue_list = [_to_queue_entry(image, caption)
                    for image, caption in images_and_captions]

    # batch_join enqueues each entry separately and registers a QueueRunner;
    # dynamic_pad pads tensors within a batch to a common shape on dequeue.
    images, input_seqs, target_seqs, mask = tf.train.batch_join(
        enqueue_list,
        batch_size=batch_size,
        capacity=queue_capacity,
        dynamic_pad=True,
        name="batch_and_pad")

    if add_summaries:
        # Caption length = number of real positions in the mask plus one.
        lengths = tf.add(tf.reduce_sum(mask, 1), 1)
        for tag, reducer in (("caption_length/batch_min", tf.reduce_min),
                             ("caption_length/batch_max", tf.reduce_max),
                             ("caption_length/batch_mean", tf.reduce_mean)):
            tf.summary.scalar(tag, reducer(lengths))

    return images, input_seqs, target_seqs, mask

## Show and Tell model

In [2]:
class ShowAndTellModel(object):
    
    def __init__(self, config, mode, train_inception=False):
        
        assert mode in ["train", "eval", "inference"]
        self.config = config
        self.mode = mode
        # inception도 같이 학습할 것인지 여부
        self.train_inception = train_inception
        # A Reader that outputs the records from a TFRecords file.
        self.reader = tf.TFRecordReader()
        # 기본적으로 사용될 initializer
        self.initializer = tf.random_uniform_initializer(minval=-self.config.initializer_scale,
                                                        maxval=self.config.initializer_scale)
        
        # float32 Tensor with shape [batch_size, height, width, channels]
        self.images = None
        # int32 Tensor with shape [batch_size, padded_length]
        self.input_seqs = None
        # int32 Tensor with shape [batch_size, padded_length]
        self.target_seqs = None
        # int32 0/1 Tesnor with shape [batch_size, padded_length]
        self.input_mask = None
        # float32 Tensor with shape [batch_size, embeddign_size]
        self.image_embeddings = None
        # A float32 Tensor with shape [batch_size, padded_length, embedding_size].
        self.seq_embeddings = None
        
        # A float32 scalar Tensor; the total loss for the trainer to optimize.
        self.total_loss = None
        # A float32 Tensor with shape [batch_size * padded_length].
        self.target_cross_entropy_losses = None
        # A float32 Tensor with shape [batch_size * padded_length].
        # seq2seq loss를 계산하기 위한 mask값
        self.target_cross_entropy_loss_weights = None
        
        # Collection of variables from the inception submodel.
        self.inception_variables = []
        # Function to restore the inception submodel from checkpoint.
        self.init_fn = None
        # Global step Tensor.
        self.global_step = None
    
    # 그냥 train 모드인지 확인하는 함수
    def is_training(self):
        """Returns true if the model is built for training mode."""
        return self.mode == "train"
    
    # config에 맞춰서 이미지 string으로 부터 처리하는 함수
    def process_image(self, encoded_image, thread_id=0):
        """
        Decodes and processes an image string.
        Args:
            encoded_image: A scalar string Tensor; the encoded image.
            thread_id: Preprocessing thread id used to select the ordering of color distortions.
            
        Returns:
            A float32 Tensor of shape [height, width, 3]; the processed image.
        """
        
        return process_image(encoded_image, is_training=self.is_training(),
                            height=self.config.image_height,
                            width=self.config.image_width,
                            thread_id=thread_id,
                            image_format=self.config.image_format)
    
    
    def build_inputs(self):
        """Input prefetching, preprocessing and batching
        
        Outputs:
            self.images
            self.input_seqs
            self.target_seqs
            self.input_mask
        """
        # 만약 inference 모드라면 사용자가 임의의 주는 데이터를 받아야 한다. 
        if self.mode == "inference":
            # In Inference mode, images and inputs are fed via placeholders
            # encoded image를 받는 placeholder
            image_feed = tf.placeholder(dtype=tf.string, shape=[], name="image_feed")
            # Inference 할때에 [batch_size] 짜리 input_feed를 받는다.
            input_feed = tf.placeholder(dtype=tf.int64, shape=[None],  name="input_feed")
            
            # Process image and insert batch dimensions
            # tf.expand_dims: Inserts a dimension of 1 into a tensor's shape.
            # [height, width, 3] -> [batch,height, width, 3] 
            images = tf.expand_dims(self.process_image(image_feed), 0)
            # [batch] -> [batch, length]
            input_seqs = tf.expand_dims(input_feed, 1)
            
            # No target sequneces or input mask in inference mode
            # inference 중에는 loss를 계산할 필요가 없으므로
            target_seqs = None
            input_mask = None
        # mode is train or evaluation    
        else:
            # Prefetch serialized SequenceExample protos.
            # 자신의 TFRecoder reader와 인풋 파일 위치, configuration 값을 넘겨서 queue를 얻는다.
            # 이 queue에는 prefetched된 string 값들이 있다.
            input_queue = prefetch_input_data(self.reader,
                                             self.config.input_file_pattern,
                                             is_training=self.is_training(),
                                             batch_size=self.config.batch_size,
                                             values_per_shard=self.config.values_per_input_shard,
                                             input_queue_capacity_factor=self.config.input_queue_capacity_factor,
                                             num_reader_threads=self.config.num_input_reader_threads)
            
            
            # Image processing and radnom distortion. Split across multiple threads
            # with each thread applying a slightly difference distortion.
            assert self.config.num_preprocess_threads % 2 ==0
            images_and_captions = []
            # thread 갯수만큼 TFRecorde에 있는 SeqeunceExample protos를 본격적으로FIFOQueue float32 텐서로 바꾼다.
            for thread_id in range(self.config.num_preprocess_threads):
                # queue에서 SequenceExample을 dequeue를 한다.
                serialized_sequence_example = input_queue.dequeue()
                
                # dequeue한 녀석에서 실제 이미지에 해당되는 위치(string)와 이에 대응되는 캡션을 얻는다.
                encoded_image, caption = parse_sequence_example(
                serialized_seqeunce_example,
                image_feature=self.config.image_feature_name,
                caption_feature=self.config.caption_feature_name)
                
                # thread 별로 distorting을 걸 수 있게 된다.
                image = self.process_image(encoded_image, thread_id=thread_id)
                # float32 텐서와 캡션을 리스트로 엮는다.
                images_and_captions.append([image, caption])
            
            # Batch inputs
            queue_capacity = (2*self.config.num_preprocess_threads * 
                             self.config.batch_size)
            
            # 배치단위로 나오는 이미지, 그리고 padding이 적용된 input_seq, target_seqs, 
            # padding words를 구분짓는 mask를 얻는다.
            # target은 그냥 input_seq가 오른쪽으로 한칸 shift된 것이다.
            images, input_seqs, target_seqs, input_masks = batch_with_dynamic_pad(images_and_captions,
                                            batch_size=self.config.batch_size,
                                            queue_capacity=queue_capacity)
        
        # [batch_size, height, width, channels]
        self.images = images
        # [batch_size, padded_length].
        self.input_seqs = input_seqs
        # [batch_size, padded_length].
        self.target_seqs = target_seqs
        # [batch_size, padded_length].
        self.input_masks = input_masks
    

    # 배치단위로 이미지를 inception에 넣고 나온 결과를 LSTM에 들어갈 수 있도록
    # embedding 사이즈에 맞추도록 한다.
    def build_image_embeddings(self):
        """Builds the image model subgraph and generates image embeddigns
        
        Inputs:
        self.images
        
        Outputs:
        self.image_embeddings
        """
        inception_output = inception_v3(images=self.images,
                                        trainable=self.train_inception,
                                        is_training=self.is_training())
        self.inception_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="InceptionV3")
        
        # Map inception output into embedding space.
        with tf.variable_scope("image_embedding") as scope:
            # self.config.embedding_size = 512
            image_embeddings = tf.contrib.layers.fully_connected(inputs=inception_output,
                                                                num_outputs=self.config.embedding_size,
                                                                activation_fn=None,
                                                                weights_initializer=self.initializer,
                                                                biases_initializer=None,
                                                                scope=scope)
        # Save the embedding size in the graph.
        tf.constant(self.config.embedding_size, name="embedding_size")
        
        # [bactch, embedding_size]
        self.image_embeddings = image_embeddings
    
    # word embedding을 사이즈에 맞춰서 얻도록 한다. 
    def build_seq_embeddings(self):
        """Builds the input sequence embeddings
        
        Inputs:
        self.input_seqs
        
        Outputs:
        self.seq_embeddings
        """
        
        with tf.variable_scope("seq_embedding"), tf.device("/cpu:0"):
            embedding_map = tf.get_variable(name="map", shape=[self.config.vocab_size, self.config.embedding_size],
                                           initializer=self.initializer)
            # [batch_size, padded_length]
            seq_embeddings = tf.nn.embedding_lookup(embedding_map, self.input_seqs)
        # [batch_size, padded_length, embedding_size]    
        self.seq_embeddings = seq_embeddings
    
    def build_model(self):
        """Builds the model.
        
        Inputs:
            self.image_embeddings
            self.seq_embeddings
            self.target_seqeunces(training and eval only)
            self.input_mask(training and eval only)
        
        Outputs:
            self.total_loss(training and eval only)
            self.target_cross_entropy_losses(training and eval only)
            self.target_cross_entropy_loss_weights(training and eval only)
        
        """
        
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.config.num_lstm_units, 
                                                state_is_tuple=True)
        # 만약 train 모드일 경우 LSTM cell의 인풋과 아웃풋 connection에 dropout을 건다.
        if self.mode == "train":
            lstm_cell = tf.nn.rnn_cell.DropoutWrapper(cell=lstm_cell, 
                                                      input_keep_prob=self.config.lstm_dropout_keep_prob,
                                                     output_keep_prob=self.config.lstm_dropout_keep_prob)
            
        with tf.variable_scope("lstm", initializer=self.initializer) as lstm_scope:
            # Feed the image embeddings to set the initial LSTM state.
            # self.image_embeddings : [bactch, embedding_size]
            zero_state = lstm_cell.zero_state(batch_size=self.image_embeddings.get_shape()[0], 
                                              dtype=tf.float32)
            _, initial_state = lstm_cell(self.image_embeddings, zero_state)
            
            # Allow the LSTM variavles to be reused.
            # scope안에 AUTO_REUSE 하는 것과 같다.
            lstm_scope.reuse_variables()
            self.state_size = lstm_cell.state_size
            
            # inference일 경우에는 state_feed placeholder로 생성된 이전 step의 캡션에 대한
            # state값을 넘겨서 결과를 계산한다.
            if self.mode == "inference":
                # In inference mode, use concatenated states for convinient feeding and fetcing
                # [bacth_size, 2*num_lstm_units]
                tf.concat(axis=1, values=initial_state, name="initial_state")
                
                # Placeholder for feeding a batch of concatenated states.
                # lstm_cell.state_size=LSTMStateTuple(c=512, h=512)
                # sum(lstm_cell.state_size) = 1024
                state_feed = tf.placeholder(dtype=tf.float32,
                                           shape=[None, sum(lstm_cell.state_size)],
                                           name="state_feed")
                # Splits a tensor into sub tensors.
                # [<tf.Tensor 'split:0' shape=(?, 512) dtype=float32>,
                # <tf.Tensor 'split:1' shape=(?, 512) dtype=float32>]
                state_tuple = tf.split(value=state_feed, num_or_size_splits=2, axis=1)
                
                # Run a single LSTM step.
                # embedding된 단어와 placeholder로 동작하는 state_tuple을 넣어서
                # 새로운 time step에 해당되는 결과와 state_tuple을 얻는다.
                lstm_outputs, state_tuple = lstm_cell(inputs=tf.squeeze(self.seq_embeddings, axis=[1]),
                                                     state=state_tuple)
                
                # Concatenate the resulting state.
                tf.concat(values=state_tuple, axis=1, name="state")
            
            # train or eval 이라면, 훈련용 seq_embedding이 들어가서
            # 전체 step에 해당되는 lstm_output을 얻는다.
            else:
                # Run the batch of sequence embeddings through the LSTM.
                # 1의 갯수가 groud truth가 되는 sequence의 길이이다.
                # sequence_length = [batch_size]
                seqeunce_length = tf.reduce_sum(self.input_mask, axis=1)
                lstm_outputs, _ = tf.nn.dynamic_rnn(cell=lstm_cell,
                                                   inputs=self.seq_embeddings,
                                                   sequence_length=sequence_length,
                                                   initial_state=initial_state,
                                                   dtype=tf.float32,
                                                   scope=lstm_scope)
        
        # Stack batches vertically. [batch_size*sequence_length, output_size]
        lstm_outputs = tf.reshape(lstm_outputs, [-1, lstm_cell.output_size])
        
        with tf.variable_scope("logits") as logits_scope:
            # [batch_size*sequence_length, self.config.vocab_size]
            logits = tf.contrib.layers.fully_connected(
            inputs=lstm_outputs, num_outputs=self.config.vocab_size,
            activation_fn=None, weights_initializer=self.initializer,
            scope=logits_scope)
        
        # Inference일떄에는 loss를 구할 필요가 없다.
        # 바로 softmax를 돌려서 vocab에 대한 확률분포를 얻도록 하자
        if self.mode == "inference":
            tf.nn.softmax(logits, name="softmax")
            
        else:
            # int32, [batch_size, padded_length] -> [-1]
            targets = tf.reshape(self.target_seqs, [-1])
            
            # int32 0/1 Tesnor with shape [batch_size, padded_length] -> [-1], float
            weights = tf.to_float(tf.reshape(self.input_mask, [-1]))
            
            
            # Compute losses
            # logits =[d_0, d_1, ..., d_{r-1}, num_classes]
            # lables = [d_0, d_1, ..., d_{r-1}]
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(lables=targets, logits=logits)
            # 배치 단위로 loss를 나눈다.
            batch_loss = tf.div(tf.reduce_sum(tf.multiply(losses, weights)),
                               tf.reduce_sum(weights), name="batch_loss")
            # tf.losses.add_loss(loss, loss_collection='losses')
            tf.losses.add_loss(batch_loss)
            # tf.losses.get_total_loss(add_regularization_losses=True, name='total_loss')
            total_loss = tf.losses.get_total_loss()
            
            # Add summaries.
            tf.summary.scalar("losses/batch_loss", batch_loss)
            tf.summary.scalar("losses/total_loss", total_loss)
            
            for var in tf.trainable_variables():
                tf.summary.histogram("parameters/" + var.op.name, var)
                
            self.total_loss = total_loss
            self.target_cross_entropy_losses = losses
            self.target_cross_entropy_loss_weights = weights
    
    # inception의 파라미터를 불러오는 saver와 closure를 설정한다.
    def setup_inception_initializer(self):
        """Sets up `self.init_fn`, which restores Inception variables from a checkpoint.

        Nothing is created in "inference" mode. In every other mode a saver
        restricted to `self.inception_variables` is built and a one-argument
        callable (taking the session) is stored on `self.init_fn`.
        """
        if self.mode == "inference":
            return

        # Saver covering the Inception variables only. Per the original note,
        # self.inception_variables is populated by build_image_embeddings().
        inception_saver = tf.train.Saver(var_list=self.inception_variables)

        def _restore_inception(sess):
            # The checkpoint location is looked up on the config when the
            # callable runs, not when it is created.
            tf.logging.info("Restoring Inception variables from checkpoint file %s",
                            self.config.inception_checkpoint_file)
            inception_saver.restore(sess, self.config.inception_checkpoint_file)

        self.init_fn = _restore_inception
            
    def setup_global_step(self):
        """Sets up the global step Tensor and stores it as `self.global_step`."""
        # collections: list of graph collection keys the new variable is added
        # to. The default would be [GraphKeys.GLOBAL_VARIABLES] only, so the
        # GLOBAL_STEP collection is listed explicitly.
        global_step = tf.Variable(
            initial_value=0,
            name="global_step",
            trainable=False,
            collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

        # The training cell reads `model.global_step`, so the variable must be
        # kept on the instance rather than dropped as a local.
        self.global_step = global_step
    def build(self):
        """Creates all ops for training and evaluation.

        Runs the graph-construction steps in dependency order: inputs, image
        embeddings, sequence embeddings, the model itself, the Inception
        restore function and finally the global step.
        """
        # Obtain the images, input sequences, target sequences and mask.
        self.build_inputs()
        # Put a fully connected layer on the Inception v3 output to get the
        # image representation.
        self.build_image_embeddings()
        # Apply an embedding lookup to the input sequences.
        self.build_seq_embeddings()
        # Compute total_loss and the cross-entropy losses.
        self.build_model()
        # For train/eval, register the closure that restores the Inception
        # parameters. (The original call misspelled this method's name.)
        self.setup_inception_initializer()
        # Create the global step.
        self.setup_global_step()
        

## Train

In [3]:
# Change the logging verbosity from the default WARN to INFO.
tf.logging.set_verbosity(tf.logging.INFO)

20

In [15]:
# Model configuration shared by the training and inference cells.
model_config = ModelConfig()
# File pattern of the sharded TFRecord input files (see ModelConfig).
model_config.input_file_pattern = "E:/mscoco/outputs"
# Pre-trained Inception checkpoint.
model_config.inception_checkpoint_file = "./inception_v3/inception_v3.ckpt"
# Training hyper-parameters (TrainingConfig is defined elsewhere in the notebook).
training_config = TrainingConfig()
# Both values below are passed to slim.learning.train in the training cell.
log_every_n_steps = 1
number_of_steps = 1000000

In [7]:
# Directory for saving and loading model checkpoints.
# Written as a raw string: in a normal string literal "\P", "\C" and "\S" are
# invalid escape sequences (a warning today, an error in future Python
# versions). The resulting value is unchanged.
train_dir = r"D:\PythonLab\CS20\Show and Tell"

In [8]:
# Whether the Inception model is trained as well. Passed to ShowAndTellModel
# and used to choose the learning rate in the training cell.
train_inception = False

In [None]:
# Build the TensorFlow graph.
g = tf.Graph()

with g.as_default():
    # Build the model and draw its graph.
    model = ShowAndTellModel(config=model_config, mode="train", train_inception=train_inception)
    model.build()

    # Print a summary of the trainable variables.
    t_vars = tf.trainable_variables()
    slim.model_analyzer.analyze_vars(t_vars, print_info=True)

    # Set up the learning rate. The decay function stays None unless the
    # branch below builds one. (The original assigned a misspelled name here,
    # leaving `learning_rate_decay_fn` undefined on the no-decay paths.)
    learning_rate_decay_fn = None
    if train_inception:
        # The Inception parameters are trained too: use the dedicated
        # (constant) learning rate from the training config.
        learning_rate = tf.constant(training_config.train_inception_learning_rate)
    else:
        # First phase only (Inception parameters excluded from training):
        # start from the initial learning rate and optionally decay it.
        learning_rate = tf.constant(training_config.initial_learning_rate)
        if training_config.learning_rate_decay_factor > 0:
            # Number of batches per epoch (true division; see the
            # `from __future__ import division` at the top of the notebook).
            num_batches_per_epoch = (training_config.num_examples_per_epoch /
                                     model_config.batch_size)

            # Number of batches between two decays.
            decay_steps = int(num_batches_per_epoch * training_config.num_epochs_per_decay)

            def _learning_rate_decay_fn(learning_rate, global_step):
                """Applies exponential decay to the learning rate."""
                return tf.train.exponential_decay(
                    learning_rate,
                    global_step,
                    decay_steps=decay_steps,
                    decay_rate=training_config.learning_rate_decay_factor,
                    # Fixed keyword: the original spelled it "staricase",
                    # which raises TypeError when the function is called.
                    staircase=True)

            learning_rate_decay_fn = _learning_rate_decay_fn

    # Set up the training ops.
    train_op = tf.contrib.layers.optimize_loss(loss=model.total_loss,
                                               global_step=model.global_step,
                                               learning_rate=learning_rate,
                                               optimizer=training_config.optimizer,
                                               clip_gradients=training_config.clip_gradients,
                                               learning_rate_decay_fn=learning_rate_decay_fn)

    # Set up the saver for saving and restoring model checkpoints.
    saver = tf.train.Saver(max_to_keep=training_config.max_checkpoints_to_keep)

    # Run training.
    # logdir: the directory where training logs are written to.
    # init_fn: an optional callable executed after `init_op` is called; it
    # must accept one argument, the session being initialized.
    tf.contrib.slim.learning.train(train_op=train_op, logdir=train_dir,
                                   log_every_n_steps=log_every_n_steps,
                                   graph=g, global_step=model.global_step,
                                   number_of_steps=number_of_steps, init_fn=model.init_fn,
                                   saver=saver)
    

## Inference

In [18]:
# Vocabulary를 담당하는 클래스
class Vocabulary(object):
    """Maps words to integer ids and back, based on a vocabulary text file."""

    def __init__(self, vocab_file, start_word="<S>", end_word="</S>", unk_word="<UNK>"):
        """Initializes the vocabulary.

        Args:
            vocab_file: Path of a text file whose lines each start with a word;
                anything after the first whitespace-separated token is ignored.
                A word's id is its position among the non-blank lines.
            start_word: Token marking the start of a sentence; must be in the file.
            end_word: Token marking the end of a sentence; must be in the file.
            unk_word: Token used for unknown words; appended if it is missing.
        """
        # Log at FATAL level when the vocabulary file is missing.
        if not tf.gfile.Exists(vocab_file):
            tf.logging.fatal("Vocab file %s not found.", vocab_file)

        tf.logging.info("Initializing vocabulary from file: %s", vocab_file)

        with tf.gfile.GFile(name=vocab_file, mode="r") as f:
            lines = f.readlines()
        # Keep only the first token of each line. Blank lines are skipped:
        # `"".split()[0]` would otherwise raise IndexError.
        reverse_vocab = [line.split()[0] for line in lines if line.strip()]
        # The special tokens must be present in the vocabulary list.
        assert start_word in reverse_vocab
        assert end_word in reverse_vocab
        if unk_word not in reverse_vocab:
            reverse_vocab.append(unk_word)

        # word -> id lookup; the id is the word's position in reverse_vocab.
        vocab = {word: word_id for word_id, word in enumerate(reverse_vocab)}

        tf.logging.info("Created vocabulary with %d words" % len(vocab))

        # Register as attributes.
        self.vocab = vocab
        self.reverse_vocab = reverse_vocab

        self.start_id = vocab[start_word]
        self.end_id = vocab[end_word]
        self.unk_id = vocab[unk_word]

    def word_to_id(self, word):
        """Returns the integer id of `word`, or the unknown-word id if absent."""
        return self.vocab.get(word, self.unk_id)

    def id_to_word(self, word_id):
        """Returns the word for `word_id`, or the unknown word if out of range."""
        # Negative ids are treated as out of range too; plain list indexing
        # would silently wrap around to the end of the vocabulary.
        if 0 <= word_id < len(self.reverse_vocab):
            return self.reverse_vocab[word_id]
        return self.reverse_vocab[self.unk_id]

In [14]:
class InferenceWrapper(InferenceWrapperBase):
    """Wrapper for performing inference with the ShowAndTellModel image-to-text model."""

    def __init__(self):
        pass

    def _create_restore_fn(self, checkpoint_path, saver):
        """Creates a function that restores a model from checkpoint.

        Args:
            checkpoint_path: Checkpoint file, or a directory in which the
                latest checkpoint is looked up.
            saver: Saver used to restore the checkpoint.

        Returns:
            A callable taking a session that restores the checkpoint into it.

        Raises:
            ValueError: If `checkpoint_path` is a directory that contains no
                checkpoint file.
        """
        if tf.gfile.IsDirectory(checkpoint_path):
            # Resolve the directory to the filename of its latest checkpoint.
            # Keep the directory name so the error below can report it; the
            # original message formatted the already-overwritten (None) value.
            checkpoint_dir = checkpoint_path
            checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
            if not checkpoint_path:
                raise ValueError("No checkpoint file found in: %s" % checkpoint_dir)

        # Closure that restores the resolved checkpoint into a session.
        def _restore_fn(sess):
            tf.logging.info("Loading model from checkpoint: %s", checkpoint_path)
            saver.restore(sess, checkpoint_path)
            tf.logging.info("Successfully loaded checkpoint: %s", os.path.basename(checkpoint_path))

        return _restore_fn

    def build_graph_from_config(self, model_config, checkpoint_path):
        """Builds the inference graph from a configuration object.

        Args:
            model_config: Object containing configuration for building the model.
            checkpoint_path: Checkpoint file or directory containing one.

        Returns:
            The restore function produced by `_create_restore_fn`.
        """
        tf.logging.info("Building model.")
        # Draw the model graph in the current default graph.
        self.build_model(model_config)
        # Saver used to restore the checkpoint.
        saver = tf.train.Saver()

        # Hand the restore closure back to the caller.
        return self._create_restore_fn(checkpoint_path, saver)

    def build_model(self, model_config):
        """Builds the model for inference.

        Args:
            model_config: Object containing configuration for building the model.

        Returns:
            model: The model object.
        """
        model = ShowAndTellModel(model_config, mode="inference")
        # Draw the model graph.
        model.build()
        return model

    def feed_image(self, sess, encoded_image):
        """Feeds an image and returns the initial model state.

        Args:
            sess: TensorFlow Session object.
            encoded_image: An encoded image string.

        Returns:
            state: A numpy array of shape [1, state_size].
        """
        # Feed the encoded image into the "image_feed" placeholder and fetch
        # the LSTM's concatenated initial state.
        initial_state = sess.run(fetches="lstm/initial_state:0",
                                 feed_dict={"image_feed:0": encoded_image})
        # [batch_size, 2 * num_lstm_units]
        return initial_state

    def inference_step(self, sess, input_feed, state_feed):
        """Runs one step of inference.

        Args:
            sess: TensorFlow Session object.
            input_feed: A numpy array of shape [batch_size].
            state_feed: A numpy array of shape [batch_size, state_size].

        Returns:
            softmax_output: A numpy array of shape [batch_size, vocab_size].
            new_state: A numpy array of shape [batch_size, state_size].
            metadata: Always None for this wrapper.
        """
        # "input_feed" is the placeholder the model creates in inference mode
        # and "lstm/state_feed" is the concatenated-state placeholder created
        # in build_model(). The fetched tensors are the ops named "softmax"
        # and "lstm/state" in the same graph.
        softmax_output, state_output = sess.run(fetches=["softmax:0", "lstm/state:0"],
                                                feed_dict={
                                                    "input_feed:0": input_feed,
                                                    "lstm/state_feed:0": state_feed,
                                                })
        return softmax_output, state_output, None

In [12]:
import heapq

class Caption(object):
    """A complete or partial caption hypothesis, ordered by its score."""

    def __init__(self, sentence, state, logprob, score, metadata=None):
        """Stores the caption's fields unchanged.

        Args:
            sentence: List of word ids making up the caption so far.
            state: Model state after generating the last word.
            logprob: Log-probability of the caption.
            score: Score used to rank the caption.
            metadata: Optional metadata associated with the caption.
        """
        self.metadata = metadata
        self.score = score
        self.logprob = logprob
        self.state = state
        self.sentence = sentence

    def __cmp__(self, other):
        """Three-way comparison by score (Python 2 protocol)."""
        assert isinstance(other, Caption)
        if self.score < other.score:
            return -1
        return 0 if self.score == other.score else 1

    # Rich comparisons for Python 3, where __cmp__ is no longer used.
    def __lt__(self, other):
        assert isinstance(other, Caption)
        return self.score < other.score

    def __eq__(self, other):
        assert isinstance(other, Caption)
        return self.score == other.score
    
class TopN(object):
    """Keeps the n largest elements of an incrementally provided set.

    The elements live in a min-heap, so once the container is full the
    smallest element is the one evicted by each further push.
    """

    def __init__(self, n):
        # n is the capacity (the beam size in beam search).
        self._data = []
        self._n = n

    def size(self):
        """Returns the number of elements currently held."""
        assert self._data is not None
        return len(self._data)

    def push(self, x):
        """Pushes a new element, evicting the smallest one when already full."""
        assert self._data is not None
        has_room = len(self._data) < self._n
        heap_op = heapq.heappush if has_room else heapq.heappushpop
        heap_op(self._data, x)

    def extract(self, sort=False):
        """Extracts all elements from the TopN. This is a destructive operation.

        Args:
            sort: Whether to return the elements in descending sorted order.

        Returns:
            A list of data; the top n elements provided to the set.
        """
        assert self._data is not None
        # Detach the storage; reset() must be called before further use.
        items, self._data = self._data, None
        if sort:
            items.sort(reverse=True)
        return items

    def reset(self):
        """Returns the TopN to an empty state."""
        self._data = []
        

class CaptionGenerator(object):
    """Class to generate captions from an image-to-text model."""

    def __init__(self, model, vocab, beam_size=3, max_caption_length=20,
                 length_normalization_factor=0.0):
        """Initializes the generator.

        Args:
            model: Object encapsulating a trained image-to-text model. Must have
                methods feed_image() and inference_step(). For example, an
                instance of InferenceWrapperBase.
            vocab: A Vocabulary object.
            beam_size: Beam size to use when generating captions.
            max_caption_length: The maximum caption length before stopping the
                search.
            length_normalization_factor: If != 0, a number x such that captions
                are scored by logprob/length^x, rather than logprob. This
                changes the relative scores of captions depending on their
                lengths. For example, if x > 0 then longer captions will be
                favored.
        """
        self.vocab = vocab
        self.model = model

        self.beam_size = beam_size
        self.max_caption_length = max_caption_length
        self.length_normalization_factor = length_normalization_factor

    def beam_search(self, sess, encoded_image):
        """Runs beam search caption generation on a single image.

        Args:
            sess: TensorFlow Session object.
            encoded_image: An encoded image string.

        Returns:
            A list of Caption sorted by descending score.
        """
        # Feed in the image to get the initial state, so that the LSTM can run
        # its first word step.
        initial_state = self.model.feed_image(sess, encoded_image)

        initial_beam = Caption(sentence=[self.vocab.start_id],
                               # Concatenated LSTM state for the single image.
                               state=initial_state[0],
                               logprob=0.0,
                               score=0.0,
                               metadata=[""])
        # Only the beam_size best partial captions are kept.
        partial_captions = TopN(self.beam_size)
        partial_captions.push(initial_beam)
        complete_captions = TopN(self.beam_size)

        for _ in range(self.max_caption_length - 1):
            # Take out the current partial captions (at most beam_size).
            partial_captions_list = partial_captions.extract()
            partial_captions.reset()
            # Last word id of every partial caption.
            input_feed = np.array([c.sentence[-1] for c in partial_captions_list])
            # Model state reached after producing that last word.
            state_feed = np.array([c.state for c in partial_captions_list])
            # One LSTM step: next-word distribution and new state per caption.
            softmax, new_states, metadata = self.model.inference_step(sess,
                                                                      input_feed,
                                                                      state_feed)
            for i, partial_caption in enumerate(partial_captions_list):
                word_probabilities = softmax[i]
                state = new_states[i]
                # For this partial caption, get the beam_size most probable next words.
                words_and_probs = list(enumerate(word_probabilities))
                words_and_probs.sort(key=lambda x: -x[1])
                words_and_probs = words_and_probs[0:self.beam_size]
                # Each next word gives a new partial caption.
                for w, p in words_and_probs:
                    if p < 1e-12:
                        continue  # Avoid log(0).
                    sentence = partial_caption.sentence + [w]
                    logprob = partial_caption.logprob + math.log(p)
                    score = logprob
                    if metadata:
                        metadata_list = partial_caption.metadata + [metadata[i]]
                    else:
                        metadata_list = None

                    if w == self.vocab.end_id:
                        # End-of-sentence token: the caption is complete.
                        if self.length_normalization_factor > 0:
                            score /= len(sentence)**self.length_normalization_factor
                        beam = Caption(sentence, state, logprob, score, metadata_list)
                        complete_captions.push(beam)
                    else:
                        beam = Caption(sentence, state, logprob, score, metadata_list)
                        partial_captions.push(beam)
            if partial_captions.size() == 0:
                # We have run out of partial candidates; happens when beam_size = 1.
                break

        # The fallback and the return below run once, after the search loop.
        # In the original they were indented inside the loop, so the search
        # returned after its very first step (and returned None when the loop
        # broke early or never ran).

        # If we have no complete captions then fall back to the partial captions.
        # But never output a mixture of complete and partial captions because a
        # partial caption could have a higher score than all the complete captions.
        if not complete_captions.size():
            complete_captions = partial_captions

        return complete_captions.extract(sort=True)

In [None]:
# Location of the checkpoint file(s) saved once training has finished.
checkpoint_path = ""
# Text file containing the vocabulary.
vocab_file = "E:/mscoco/outputs/word_counts.txt"
# Image file to caption.
input_file = "E:/mscoco/test.jpg"

사용법

- 1. build_graph_from_config()을 써서 model inference graph를 만든다.
- 2. 결과로 나오는 restore_fn을 불러서 모델의 체크포인트를 로드한다.
- 3. 배치에 있는 각 이미지에 대해
    - feed_image()를 불러서 initial_state를 얻는다.
    - 캡션 생성의 각 단계마다 inference_step()을 호출한다.

In [None]:
# Build the inference graph and get the checkpoint-restoring function.
g = tf.Graph()
with g.as_default():
    model = InferenceWrapper()
    restore_fn = model.build_graph_from_config(model_config, checkpoint_path)
g.finalize()

# Object in charge of the vocabulary.
vocab = Vocabulary(vocab_file=vocab_file)

# Paths of the images to caption.
filenames = []
# `input_file` may hold several comma-separated file patterns. (The original
# read an undefined `FLAGS.input_files` instead of the notebook variable.)
for file_pattern in input_file.split(","):
    # Returns a list of files that match the given pattern(s).
    filenames.extend(tf.gfile.Glob(file_pattern))


with tf.Session(graph=g) as sess:

    # Restore the trained checkpoint into this session.
    restore_fn(sess)

    # Caption generator that uses beam search. CaptionGenerator is defined in
    # this notebook, so no `caption_generator` module prefix is needed.
    generator = CaptionGenerator(model, vocab)

    for filename in filenames:
        # Read the encoded image bytes. (The original passed a positional
        # argument after a keyword argument, which is a SyntaxError.)
        with tf.gfile.GFile(name=filename, mode="rb") as f:
            image = f.read()
        # Generate the captions.
        captions = generator.beam_search(sess, image)
        print("Captions for image %s:" % os.path.basename(filename))
        # Print every returned caption.
        for i, caption in enumerate(captions):
            # Drop the first and last tokens of the caption and map ids to words.
            sentence = [vocab.id_to_word(w) for w in caption.sentence[1:-1]]
            # Join the words into a single string.
            sentence = " ".join(sentence)
            print("  %d) %s (p=%f)" % (i, sentence, math.exp(caption.logprob)))