# TensorFlow Mechanics: MNIST

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import tensorflow as tf

# Number of output classes the classifier distinguishes.
NUM_CLASSES = 10

# Input images are square: IMAGE_SIZE pixels per side, and IMAGE_PIXELS
# values once an image is flattened into a single vector.
IMAGE_SIZE = 28
IMAGE_PIXELS = IMAGE_SIZE ** 2

## Part I. Build the network
四个构造的步骤
1. Inference: 构造计算图用来得到prediction
2. Loss: 在Inference之上，添加节点来计算网络的损失
3. Train: 在Loss之上，添加训练网络的节点
4. Evaluation: 模型性能评估的计算节点

#### Some functions
- tf.name_scope(): 返回一个context manager，来管理属于这个scope的操作。
- tf.to_int64(): 进行tensor数据类型的变换
- tf.summary: 用来进行训练过程的记录的模块
- tf.nn.in_top_k(prediction, target, k, name): target的分类（ground truth）是否在prediction的各类概率的top-k之内
- tf.Variable(0, trainable = False): 指定不可训练的变量 （往往将训练的步数作为不可训练的变量）

#### Some concepts:
- tensorflow会通过collection来记录可以训练的变量，并在optimize的时候更新这个collection中的变量

In [22]:
# The network consists of two fully connected hidden layers followed by a
# linear (softmax) output layer.
# tf.name_scope is used to keep the variable names short and grouped.

def inference(images, hidden1_units, hidden2_units):
  """Build the forward pass of the MNIST model and return its logits.

  Each layer creates a `weights` and a `biases` variable inside its own
  name scope (`hidden1`, `hidden2`, `softmax_linear`).  Weights are drawn
  from a truncated normal with stddev 1/sqrt(fan_in); biases start at zero.

  Args:
    images: Images tensor; it is multiplied by a
      [IMAGE_PIXELS, hidden1_units] weight matrix.
    hidden1_units: Size of the first hidden layer.
    hidden2_units: Size of the second hidden layer.

  Returns:
    Output tensor with the computed logits (no softmax applied).
  """

  def _affine(inputs, fan_in, fan_out):
    """Create weights/biases in the current name scope; return inputs*W + b."""
    initial_weights = tf.truncated_normal(
        [fan_in, fan_out], stddev=1.0 / math.sqrt(float(fan_in)))
    weights = tf.Variable(initial_weights, name='weights')
    biases = tf.Variable(tf.zeros([fan_out]), name='biases')
    return tf.matmul(inputs, weights) + biases

  # First hidden layer: ReLU over an affine map of the raw pixels.
  with tf.name_scope('hidden1'):
    hidden1 = tf.nn.relu(_affine(images, IMAGE_PIXELS, hidden1_units))

  # Second hidden layer.
  with tf.name_scope('hidden2'):
    hidden2 = tf.nn.relu(_affine(hidden1, hidden1_units, hidden2_units))

  # Output layer: purely linear, one logit per class.
  with tf.name_scope('softmax_linear'):
    logits = _affine(hidden2, hidden2_units, NUM_CLASSES)

  return logits

In [23]:
# The network loss is defined as the cross entropy.

def loss(logits, labels):
  """Calculates the loss from the logits and the labels.

  Args:
    logits: Logits tensor, float - [batch_size, NUM_CLASSES].
    labels: Labels tensor, int32 - [batch_size].

  Returns:
    loss: Loss tensor of type float; the mean cross entropy over the batch.
  """
  with tf.name_scope('loss'):
    # tf.to_int64 is a deprecated alias; tf.cast performs the same dtype
    # conversion.
    labels = tf.cast(labels, tf.int64)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits, name='xentropy')
    return tf.reduce_mean(cross_entropy, name='xentropy_mean')

In [24]:
# Parameters are optimized with gradient descent; the number of steps run so
# far is tracked in a global (non-trainable) variable.
# A summary records how a node's value changes over time.

def training(loss, learning_rate):
  """Add the ops needed to train the model.

  Registers a scalar summary named 'loss' so the loss can be tracked over
  time, then builds a gradient-descent update that minimizes `loss` and
  increments a non-trainable `global_step` counter on every run.

  The returned op is what must be passed to `sess.run()` to perform one
  training step.

  Args:
    loss: Loss tensor, from loss().
    learning_rate: The learning rate to use for gradient descent.

  Returns:
    train_op: The Op for training.
  """
  with tf.name_scope('training'):
    # Snapshot of the loss for the summary writer.
    tf.summary.scalar('loss', loss)
    sgd = tf.train.GradientDescentOptimizer(learning_rate)
    # Step counter; excluded from the trainable variables on purpose.
    step_counter = tf.Variable(0, name='global_step', trainable=False)
    # A single op that applies the gradients and bumps the step counter.
    return sgd.minimize(loss, global_step=step_counter)

In [25]:
# Build the graph nodes used to evaluate model performance.

def evaluation(logits, labels):
  """Count how many examples in the batch are classified correctly.

  Args:
    logits: Logits tensor, float - [batch_size, NUM_CLASSES].
    labels: Labels tensor, int32 - [batch_size], with values in the
      range [0, NUM_CLASSES).

  Returns:
    A scalar int32 tensor with the number of examples (out of batch_size)
    that were predicted correctly.
  """
  with tf.name_scope('eval'):
    # in_top_k with k=1 yields one bool per example: true when the label is
    # the highest-scoring class for that example.
    hits = tf.nn.in_top_k(logits, labels, 1)
    # Booleans become 0/1 so they can be summed into a count.
    hit_count = tf.reduce_sum(tf.cast(hits, tf.int32))
  return hit_count

## Part II. Train the network


#### Some functions:
- //用来取商；%用来取余数
- tf.Graph(): 创建一个新的graph(图是包含数据节点和操作节点的一个collection)
- graph.as_default(): 将graph作为当前默认的graph，所有产生的node都会添加到这个graph对应的collection中
- xrange和range: 在Python 2中，range返回list，而xrange返回惰性求值的xrange对象；在Python 3中，内置的xrange已被移除，range本身返回惰性的range对象（six.moves.xrange在Python 3中就是range）

#### Some concepts:
- graph: 其中定义了计算图的数据节点(placeholder & variable)，以及相应的计算操作
- session: 定义了运行一次graph所对应的环境，同时会保存graph运行过程中各个变量的值
- graph和session可以分别与static和runtime相对应
- tf.group是一个将多个计算操作打包成单个操作的函数
- tf.global_variables_initializer是对所有包含initializer的变量进行初始化的操作
- Tensor节点在session.run()返回时，数据类型是numpy.ndarray；如果操作没有输出，则返回None

In [33]:
# pylint: disable=missing-docstring
import argparse  #解析命令行参数
import os.path
import sys
import time

from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.examples.tutorials.mnist import mnist

# Basic model parameters as external flags.
# Acts as the set of control settings for the network and the training run:
# batch_size, log_dir, input_data_dir, fake_data, hidden1, hidden2,
# learning_rate, max_steps.  Populated from the command line in __main__.
FLAGS = None

In [27]:
# Create the placeholders through which data is fed into the model.
# Only fully connected layers are used here, so each image has to be
# flattened into a one-dimensional vector.

def placeholder_inputs(batch_size):
  """Generate placeholder variables to represent the input tensors.

  These placeholders are used as inputs by the rest of the model building
  code and will be fed from the downloaded data in the .run() loop, below.

  Args:
    batch_size: The batch size will be baked into both placeholders.

  Returns:
    images_placeholder: float32 placeholder of shape
      (batch_size, mnist.IMAGE_PIXELS).
    labels_placeholder: int32 placeholder of shape (batch_size,).
  """
  # Note that the shapes of the placeholders match the shapes of the full
  # image and label tensors, except the first dimension is now batch_size
  # rather than the full size of the train or test data sets.
  images_placeholder = tf.placeholder(
      tf.float32, shape=(batch_size, mnist.IMAGE_PIXELS))
  # The trailing comma matters: `(batch_size)` is just an int, which only
  # works because TensorFlow coerces a scalar into a rank-1 shape.
  labels_placeholder = tf.placeholder(tf.int32, shape=(batch_size,))
  return images_placeholder, labels_placeholder

In [28]:
# Fetch the data to feed for the next batch.

def fill_feed_dict(data_set, images_pl, labels_pl):
  """Build the feed_dict for one step from the next batch of `data_set`.

  A feed_dict takes the form of:
  feed_dict = {
      <placeholder>: <tensor of values to be passed for placeholder>,
      ....
  }

  The batch size and the fake-data switch are read from the module-level
  FLAGS.

  Args:
    data_set: The set of images and labels, from input_data.read_data_sets()
    images_pl: The images placeholder, from placeholder_inputs().
    labels_pl: The labels placeholder, from placeholder_inputs().

  Returns:
    feed_dict: The feed dictionary mapping from placeholders to values.
  """
  # Pull the next FLAGS.batch_size examples and pair each array with the
  # placeholder it should be fed into.
  batch_images, batch_labels = data_set.next_batch(
      FLAGS.batch_size, FLAGS.fake_data)
  return {images_pl: batch_images, labels_pl: batch_labels}

In [29]:
# Evaluate the performance of the network.
# One epoch is run over the given data set; the epoch is split into batches
# and the per-batch results are accumulated.

def do_eval(sess,
            eval_correct,
            images_placeholder,
            labels_placeholder,
            data_set):
  """Runs one evaluation against the full epoch of data and prints the result.

  Only whole batches are evaluated: any examples left over after dividing
  the data set into FLAGS.batch_size chunks are ignored, and the reported
  example count reflects that.

  Args:
    sess: The session in which the model has been trained.
    eval_correct: The Tensor that returns the number of correct predictions.
    images_placeholder: The images placeholder.
    labels_placeholder: The labels placeholder.
    data_set: The set of images and labels to evaluate, from
      input_data.read_data_sets().
  """
  steps_per_epoch = data_set.num_examples // FLAGS.batch_size
  num_examples = steps_per_epoch * FLAGS.batch_size
  if num_examples == 0:
    # The data set holds less than one batch; without this guard the
    # precision computation below would divide by zero.
    print('  Num examples: 0  (data set is smaller than one batch; '
          'nothing to evaluate)')
    return

  true_count = 0  # Counts the number of correct predictions.
  for _ in xrange(steps_per_epoch):
    # Fetch the next batch to feed.
    feed_dict = fill_feed_dict(data_set, images_placeholder, labels_placeholder)
    # Run the evaluation node and accumulate its count.
    true_count += sess.run(eval_correct, feed_dict=feed_dict)
  precision = float(true_count) / num_examples
  print('  Num examples: %d  Num correct: %d  Precision @ 1: %0.04f' %(num_examples, true_count, precision))

#### About Summary module:
- 在定义网络结构的时候，添加不同的summary，用名字来区分
- 在运行之前，将所有的summary合并成一个summary节点
- 创建session实例之后，可以用日志目录和sess.graph来创建summary_writer（writer只和日志文件及graph相关联，并不依赖session本身）
- 在session内部，完成一步训练需要记录的时候，运行summary节点获得summary_str
- 通过summary_writer添加这条summary(需要flush writer)
- <b>summary是计算图的一部分，可以产生summary string; 而summary writer只和日志文件（以及可选的graph）相关，将summary_str写到文件中</b>

#### About Checkpoint module:
- 创建saver实例：tf.train.Saver()（saver的创建不与session相关）
- 需要保存的时候，使用saver.save接口保存session的运行状态，包括参数的当前值；可以指定global_step来标记不同的save文件
- 可以通过saver.restore接口来恢复保存的parameter

In [30]:
def run_training():
  """Train MNIST for a number of steps.

  Builds the graph (placeholders, inference, loss, training and evaluation
  ops), then runs FLAGS.max_steps training steps.  Every 100 steps the loss
  is printed and a summary is written to FLAGS.log_dir; every 1000 steps,
  and on the final step, a checkpoint is saved and the model is evaluated on
  the training, validation and test sets.

  All settings are read from the module-level FLAGS.
  """
  print("starting")
  # Get the sets of images and labels for training, validation, and
  # test on MNIST.
  data_sets = input_data.read_data_sets(FLAGS.input_data_dir, FLAGS.fake_data)
  print('load data: finished')
  # Tell TensorFlow that the model will be built into the default Graph.
  with tf.Graph().as_default():
    # --- Build the computation graph ---
    # Generate placeholders for the images and labels.
    images_placeholder, labels_placeholder = placeholder_inputs(FLAGS.batch_size)
    # Build a Graph that computes predictions from the inference model.
    logits = inference(images_placeholder, FLAGS.hidden1, FLAGS.hidden2)
    # Add to the Graph the Ops for loss calculation.
    loss_ = loss(logits, labels_placeholder)
    # Add to the Graph the Ops that calculate and apply gradients.
    train_op = training(loss_, FLAGS.learning_rate)
    # Add the Op to compare the logits to the labels during evaluation.
    eval_correct = evaluation(logits, labels_placeholder)
    # Build the summary Tensor based on the TF collection of Summaries.
    summary = tf.summary.merge_all()

    # --- Variable initializer, checkpoint saver, session, summary writer ---
    # Add the variable initializer Op.
    init = tf.global_variables_initializer()
    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver()
    # The checkpoint path does not change between steps; saver.save appends
    # the step number to it.
    checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')

    # The session is used as a context manager so that it is closed even if
    # training raises.
    with tf.Session() as sess:
      # Instantiate a SummaryWriter to output summaries and the Graph.
      summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
      try:
        # Run the Op to initialize the variables.
        sess.run(init)

        # Start the training loop.
        for step in xrange(FLAGS.max_steps):
          start_time = time.time()

          # Fill a feed dictionary with the actual set of images and labels
          # for this particular training step.
          feed_dict = fill_feed_dict(data_sets.train, images_placeholder, labels_placeholder)

          # Run one step of the model.  The return values are the activations
          # from the `train_op` (which is discarded) and the `loss` Op.  To
          # inspect the values of your Ops or variables, you may include them
          # in the list passed to sess.run() and the value tensors will be
          # returned in the tuple from the call.
          _, loss_value = sess.run([train_op, loss_], feed_dict=feed_dict)

          duration = time.time() - start_time

          # Write the summaries and print an overview fairly often.
          if step % 100 == 0:
            # Print status to stdout.
            print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
            # Update the events file.
            summary_str = sess.run(summary, feed_dict=feed_dict)
            summary_writer.add_summary(summary_str, step)
            summary_writer.flush()

          # Save a checkpoint and evaluate the model periodically.
          if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
            saver.save(sess, checkpoint_file, global_step=step)
            # Evaluate against the training set.
            print('Training Data Eval:')
            do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.train)
            # Evaluate against the validation set.
            print('Validation Data Eval:')
            do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.validation)
            # Evaluate against the test set.
            print('Test Data Eval:')
            do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.test)
      finally:
        # Flush pending events and release the events file.
        summary_writer.close()

In [32]:
def main(_):
  """Reset the log directory to an empty state, then run training.

  The single positional argument (the argv list passed by tf.app.run) is
  ignored; all settings come from the module-level FLAGS.
  """
  log_dir = FLAGS.log_dir
  # Start from a clean directory so old event files and checkpoints from a
  # previous run are not mixed with the new ones.
  if tf.gfile.Exists(log_dir):
    tf.gfile.DeleteRecursively(log_dir)
  tf.gfile.MakeDirs(log_dir)
  run_training()


if __name__ == '__main__':
  # Command-line flags as (name, add_argument keyword arguments) pairs, listed
  # in the order they are registered with the parser.
  _FLAG_SPECS = [
      ('--learning_rate', dict(
          type=float,
          default=0.01,
          help='Initial learning rate.',
      )),
      ('--max_steps', dict(
          type=int,
          default=5000,
          help='Number of steps to run trainer.',
      )),
      ('--hidden1', dict(
          type=int,
          default=128,
          help='Number of units in hidden layer 1.',
      )),
      ('--hidden2', dict(
          type=int,
          default=32,
          help='Number of units in hidden layer 2.',
      )),
      ('--batch_size', dict(
          type=int,
          default=50,
          help='Batch size.  Must divide evenly into the dataset sizes.',
      )),
      ('--input_data_dir', dict(
          type=str,
          default='/tmp/tensorflow/mnist/input_data',
          help='Directory to put the input data.',
      )),
      ('--log_dir', dict(
          type=str,
          default='/tmp/tensorflow/mnist/logs/fully_connected_feed',
          help='Directory to put the log data.',
      )),
      ('--fake_data', dict(
          default=False,
          help='If true, uses fake data for unit testing.',
          action='store_true',
      )),
  ]

  parser = argparse.ArgumentParser()
  for _flag_name, _flag_options in _FLAG_SPECS:
    parser.add_argument(_flag_name, **_flag_options)

  # Flags this script knows go into FLAGS; anything else is passed through to
  # tf.app.run together with the program name.
  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

starting
Extracting /tmp/tensorflow/mnist/input_data/train-images-idx3-ubyte.gz
Extracting /tmp/tensorflow/mnist/input_data/train-labels-idx1-ubyte.gz
Extracting /tmp/tensorflow/mnist/input_data/t10k-images-idx3-ubyte.gz
Extracting /tmp/tensorflow/mnist/input_data/t10k-labels-idx1-ubyte.gz
load data: finished
Step 0: loss = 2.30 (0.129 sec)
Step 100: loss = 2.07 (0.003 sec)
Step 200: loss = 1.82 (0.002 sec)
Step 300: loss = 1.26 (0.002 sec)
Step 400: loss = 1.05 (0.002 sec)
Step 500: loss = 0.84 (0.002 sec)
Step 600: loss = 0.79 (0.002 sec)
Step 700: loss = 0.68 (0.002 sec)
Step 800: loss = 0.69 (0.002 sec)
Step 900: loss = 0.60 (0.002 sec)
Training Data Eval:
  Num examples: 55000  Num correct: 47506  Precision @ 1: 0.8637
Validation Data Eval:
  Num examples: 5000  Num correct: 4373  Precision @ 1: 0.8746
Test Data Eval:
  Num examples: 10000  Num correct: 8736  Precision @ 1: 0.8736
Step 1000: loss = 0.48 (0.021 sec)
Step 1100: loss = 0.57 (0.129 sec)
Step 1200: loss = 0.39 (0.001 s

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
