In [1]:
# Step 1: load the MNIST data
# one_hot=True asks for one-hot label vectors; "MNIST_data" is the directory the
# archives are read from (see the "Extracting MNIST_data/..." messages in the output).
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data", one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [2]:
# Start a TensorFlow InteractiveSession.
# TensorFlow relies on a highly efficient C++ backend to do its computation; the connection
# to this backend is called a session.
# The common usage for TensorFlow programs is to first create a graph and then launch it in a session.
import tensorflow as tf
sess = tf.InteractiveSession()

## Computation graph
Tensorflow lets us describe a graph of interacting operations that run entirely outside Python, which is similar to that used in Theano or Torch. The role of the Python code is therefore to build this external computation graph, and to dictate which parts of the computation graph should be run.

In [3]:
# Softmax regression model: graph inputs (placeholders).
# `x` is fed the vectorized images, 784 values per row; `y_` is fed the matching
# one-hot 10-dimensional label vectors. The leading None leaves the batch size open.
x = tf.placeholder(dtype=tf.float32, shape=[None, 784])
y_ = tf.placeholder(dtype=tf.float32, shape=[None, 10])

In [7]:
# Model parameters.
# A variable is a value that lives in TensorFlow's computation graph; in machine
# learning applications the model parameters are generally kept as variables.
# Both the weight matrix W (784 x 10) and the bias vector b (10) start at zero.
W = tf.Variable(initial_value=tf.zeros(shape=[784, 10]))
b = tf.Variable(initial_value=tf.zeros(shape=[10]))

In [8]:
# Initialize the variables
# Runs the global variables initializer in `sess`, so the variables defined so far
# (W and b) are initialized before they are used.
sess.run(tf.global_variables_initializer())

In [9]:
# Model output used for the predicted class and the loss function.
# The regression model multiplies the vectorized input images x by the weight
# matrix W and then adds the bias b.
y = tf.add(tf.matmul(x, W), b)

## Specify the loss function
Loss indicates how bad the model's prediction was on a single example; we try to minimize that while training across all the examples.
The loss function in this tutorial is the cross-entropy between the target and the softmax activation function applied to the model's prediction.

In [10]:
# Mean softmax cross-entropy between the one-hot targets y_ and the model output y.
# The arguments are passed by keyword: TensorFlow 1.0+ rejects positional arguments
# for this op, and the keywords make it explicit which tensor holds the labels and
# which holds the logits.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

## Train the model
Because Tensorflow knows the entire computation graph, it can use automatic differentiation to find the gradients of the loss with respect to each of the variables.
Tensorflow has a variety of built-in optimization algorithms.

In [11]:
# Steepest gradient descent with a step of 0.5, used to descend the cross entropy.
# minimize() adds new operations to the computation graph: ones that compute the
# gradients, compute the parameter update steps, and apply those updates to the parameters.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.5)
train_step = optimizer.minimize(cross_entropy)

In [12]:
# Train the model: run train_step once per mini-batch, for 1000 batches of 100 examples.
for i in range(1000):
    batch = mnist.train.next_batch(100)
    images, labels = batch[0], batch[1]
    # `sess` is the InteractiveSession created earlier, so running the op through it
    # is the same as calling train_step.run(...).
    sess.run(train_step, feed_dict={x: images, y_: labels})

## Evaluate the model
First we'll figure out where we predicted the correct label. `tf.argmax` is an extremely useful function which gives you the index of the highest entry in a tensor along some axis. Use `tf.equal` to check if the prediction matches the truth.

In [13]:
# Index of the largest entry along axis 1, for the model output and for the one-hot truth.
predicted_label = tf.argmax(y, 1)
true_label = tf.argmax(y_, 1)
correct_prediction = tf.equal(predicted_label, true_label)
# Accuracy: correct_prediction is boolean, so cast it to float before averaging.
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Display the result on the test set.
test_feed = {x: mnist.test.images, y_: mnist.test.labels}
print(sess.run(accuracy, feed_dict=test_feed))

0.9164


# Build a multilayer convolutional network
## weight initialization
1. Initialize weights with a small amount of noise for symmetry breaking, and to prevent 0 gradients
2. Since we use ReLU neurons, it is also good practice to initialize them with a slightly positive initial bias to avoid "dead neurons"

In [4]:
def weight_variable(shape, stddev=0.1):
    """Create a weight variable initialized with truncated-normal noise.

    Args:
        shape: Shape of the variable to create, e.g. [5, 5, 1, 32].
        stddev: Standard deviation passed to tf.truncated_normal. Defaults to
            0.1, the value that was previously hard-coded, so existing callers
            are unaffected.

    Returns:
        A tf.Variable whose initial value is drawn from tf.truncated_normal.
    """
    initial = tf.truncated_normal(shape, stddev=stddev)
    return tf.Variable(initial)

def bias_variable(shape, value=0.1):
    """Create a bias variable filled with a constant initial value.

    Args:
        shape: Shape of the variable to create, e.g. [32].
        value: Constant every entry starts at. Defaults to 0.1, the value that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        A tf.Variable whose initial value is tf.constant(value, shape=shape).
    """
    initial = tf.constant(value, shape=shape)
    return tf.Variable(initial)

In [5]:
# Convolution and pooling
# TensorFlow gives us flexibility in convolution and pooling operations, including
# how the boundaries are handled and what the stride size is.
# This tutorial uses a stride of one and zero ('SAME') padding for the convolutions.
def conv2d(x, W):
    """Convolve `x` with the filter `W` using stride 1 and 'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
    """Max-pool `x` with ksize [1, 2, 2, 1], strides [1, 2, 2, 1] and 'SAME' padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')

In [6]:
# First convolutional layer: a convolution followed by max pooling.
# The weight tensor has shape [5, 5, 1, 32], i.e. 32 features for each 5x5 patch
# of the single input channel, with one bias per output channel.
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])

# Reshape the flat input rows into 28x28 single-channel images.
x_image = tf.reshape(x, [-1, 28, 28, 1])

# Convolve x_image with the weight tensor and add the bias ...
conv1_linear = conv2d(x_image, W_conv1) + b_conv1
# ... then apply the ReLU function and max pooling.
h_conv1 = tf.nn.relu(conv1_linear)
h_pool1 = max_pool_2x2(h_conv1)

In [7]:
# Second convolutional layer.
# 64 features for each 5x5 patch, computed over the 32 channels of the first layer.
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])

# Same pattern as the first layer: convolution + bias, ReLU, then 2x2 max pooling.
conv2_linear = conv2d(h_pool1, W_conv2) + b_conv2
h_conv2 = tf.nn.relu(conv2_linear)
h_pool2 = max_pool_2x2(h_conv2)

In [8]:
# Densely connected layer with 1024 units.
# Width of one flattened row of the second pooling layer's output.
flat_size = 7 * 7 * 64
W_fc1 = weight_variable([flat_size, 1024])
b_fc1 = bias_variable([1024])

# Reshape the tensor from the pooling layer into a batch of vectors, then apply
# the affine map followed by ReLU.
h_pool2_flat = tf.reshape(h_pool2, [-1, flat_size])
fc1_linear = tf.matmul(h_pool2_flat, W_fc1) + b_fc1
h_fc1 = tf.nn.relu(fc1_linear)

In [10]:
# Dropout
# Applied before the readout layer to reduce overfitting.
# keep_prob is a placeholder for the probability that a neuron's output is kept
# during dropout, so a different value can be fed for training and for evaluation.
keep_prob = tf.placeholder(dtype=tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

In [11]:
# Readout layer
# Same form as the softmax regression layer: a matrix product plus a bias,
# mapping the 1024 dense features to 10 outputs.
W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])

y_conv = tf.add(tf.matmul(h_fc1_drop, W_fc2), b_fc2)

## Train and evaluate the model
- Replace the steepest gradient descent optimizer with the more sophisticated ADAM optimizer.
- Include the additional parameter keep_prob in feed_dict to control the dropout rate.
- Add logging to every 100th iteration in the training process.

In [12]:
# Loss, optimizer and accuracy for the convolutional model.
# softmax_cross_entropy_with_logits is called with keyword arguments: TensorFlow 1.0+
# rejects positional arguments for this op, and the keywords make it explicit which
# tensor holds the labels and which holds the logits.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Initialize only after the whole graph (including the optimizer) has been built.
sess.run(tf.global_variables_initializer())

for i in range(1000):
    batch = mnist.train.next_batch(100)
    if i % 100 == 0:
        # Log accuracy on the current batch with dropout disabled (keep_prob 1.0).
        train_accuracy = accuracy.eval(feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
        print("Step %d, training accuracy %g" % (i, train_accuracy))
    # Train with dropout active: each output is kept with probability 0.5.
    train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})

# Final result on the test set, again with dropout disabled.
print("test accuracy %g" % accuracy.eval(feed_dict={x: mnist.test.images,
                                                    y_: mnist.test.labels,
                                                    keep_prob: 1.0}))

Step 0, training accuracy 0.11
Step 100, training accuracy 0.87
Step 200, training accuracy 0.93
Step 300, training accuracy 0.95
Step 400, training accuracy 0.95
Step 500, training accuracy 0.97
Step 600, training accuracy 0.94
Step 700, training accuracy 0.94
Step 800, training accuracy 0.95
Step 900, training accuracy 0.96
test accuracy 0.9679
