# MNIST example notebook

This is an example of how a notebook for an ML task should be structured when using TF Eager Execution.

In [None]:
# Capture the notebook's current working directory via the IPython %pwd magic.
wd = %pwd
print('Current directory:', wd)
# If the notebook was started from a directory whose path ends in 'models',
# move up one level.
# NOTE(review): presumably this makes the `models` and `resources` packages
# imported in later cells resolvable from the project root -- confirm.
if wd.endswith('models'):
    %cd ..

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all modules before
# executing each cell, so edits to project modules are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

import tensorflow as tf
import tensorflow.contrib.eager as tfe
import sys, os, time, datetime

# Switch TensorFlow to eager execution. A ValueError is treated as
# "already enabled" so that this cell can be re-run.
# NOTE(review): any other cause of ValueError would be reported with the same
# message -- confirm this is acceptable.
try:
    tfe.enable_eager_execution()
except ValueError:
    print('Eager exec already enabled.')

from models.base import BaseModel
from resources.flags import FLAGS, define_flags

In [None]:
# This can only be run once.
# NOTE(review): presumably re-running this cell in the same kernel fails
# because the flags are already registered -- confirm; restart the kernel if so.
define_flags()

## Read the input data

This example loads MNIST, then shuffles and batches it. For our own tasks, the input pipeline will look quite different.

In [None]:
from tensorflow.examples.tutorials.mnist import input_data

def load_data(data_dir):
    """Build the MNIST training and test datasets.

    Args:
      data_dir: Directory handed to `input_data.read_data_sets`.

    Returns:
      A `(train_ds, test_ds)` tuple of `tf.data.Dataset` objects. The training
      dataset is sliced along the first axis of the (images, labels) arrays,
      whereas the test dataset holds the whole test split as one single
      element.
    """
    mnist = input_data.read_data_sets(data_dir, one_hot=True)
    train_pair = (mnist.train.images, mnist.train.labels)
    test_pair = (mnist.test.images, mnist.test.labels)
    # from_tensor_slices -> one element per row;
    # from_tensors -> the entire tuple as a single element.
    return (tf.data.Dataset.from_tensor_slices(train_pair),
            tf.data.Dataset.from_tensors(test_pair))

# Pick the GPU with the channels_first layout when a GPU is both allowed and
# present; otherwise fall back to the CPU with the channels_last layout.
if not FLAGS.no_gpu and tfe.num_gpus() > 0:
    device, data_format = '/gpu:0', 'channels_first'
else:
    device, data_format = '/cpu:0', 'channels_last'
print('Using device %s, and data format %s.' % (device, data_format))

# Load the datasets; only the training set is shuffled and batched.
train_ds, test_ds = load_data(FLAGS.in_data_dir)
train_ds = train_ds.shuffle(60000)
train_ds = train_ds.batch(FLAGS.batch_size)

In [None]:
class MNISTModel(BaseModel):
    """MNIST model.
    Network structure is equivalent to:
    https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/examples/tutorials/mnist/mnist_deep.py
    and
    https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py

    But written using the tf.layers API.
    """

    def __init__(self, data_format, name=''):
        """Creates a model for classifying a hand-written digit.

        Args:
          data_format: Either 'channels_first' or 'channels_last'.
            'channels_first' is typically faster on GPUs while 'channels_last' is
            typically faster on CPUs. See
            https://www.tensorflow.org/performance/performance_guide#data_formats
          name: Name forwarded unchanged to the base model constructor.
        """
        super(MNISTModel, self).__init__(name=name)
        # Shape that flat input images are reshaped to in `call`; the leading
        # -1 lets the batch dimension be inferred.
        if data_format == 'channels_first':
            self._input_shape = [-1, 1, 28, 28]
        else:
            assert data_format == 'channels_last', (
                "data_format must be 'channels_first' or 'channels_last'")
            self._input_shape = [-1, 28, 28, 1]
        # Two 5x5 convolutions; the second has twice as many filters.
        self.conv1 = self.track_layer(tf.layers.Conv2D(
                FLAGS.conv_size,
                5,
                data_format=data_format,
                activation=tf.nn.relu))
        self.conv2 = self.track_layer(tf.layers.Conv2D(
                FLAGS.conv_size * 2,
                5,
                data_format=data_format,
                activation=tf.nn.relu))
        self.fc1 = self.track_layer(tf.layers.Dense(1024, activation=tf.nn.relu))
        self.fc2 = self.track_layer(tf.layers.Dense(10))
        self.dropout = self.track_layer(tf.layers.Dropout(0.5))
        # A single pooling layer object is applied after each convolution.
        self.max_pool2d = self.track_layer(tf.layers.MaxPooling2D(
                (2, 2), (2, 2), padding='SAME', data_format=data_format))

    def call(self, inputs, training):
        """Computes labels from inputs.

        Users should invoke __call__ to run the network, which delegates to this
        method (and not call this method directly).

        Args:
          inputs: A batch of images as a Tensor with shape [batch_size, 784].
          training: True if invoked in the context of training (causing dropout to
            be applied).  False otherwise.

        Returns:
          A Tensor with shape [batch_size, 10] containing the predicted logits
          for each image in the batch, for each of the 10 classes.
        """

        x = tf.reshape(inputs, self._input_shape)
        x = self.conv1(x)
        x = self.max_pool2d(x)
        x = self.conv2(x)
        x = self.max_pool2d(x)
        x = tf.layers.flatten(x)
        x = self.fc1(x)
        if training:
            # tf.layers.Dropout defaults to training=False, in which case it
            # simply returns its input; the flag must be passed explicitly for
            # dropout to actually be applied.
            x = self.dropout(x, training=True)
        x = self.fc2(x)
        return x


In [None]:
# Create a unique experiment name for each run. The layer sizes in the name
# are derived from FLAGS.conv_size so that the label always matches the
# network actually built (conv_size and 2 * conv_size filters, then 1024- and
# 10-unit dense layers).
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
expname = 'MNIST_conv_demo-{}-conv{}_{}_fc1024_10'.format(
    timestamp, FLAGS.conv_size, FLAGS.conv_size * 2)

# Create the model and optimizer
model = MNISTModel(data_format)
optimizer = tf.train.MomentumOptimizer(FLAGS.learning_rate, FLAGS.momentum)

# Summaries go to <out_data_dir>/<expname> (training) and
# <out_data_dir>/<expname>/eval (testing). Without an output directory both
# log dirs are None.
if FLAGS.out_data_dir:
    train_dir = os.path.join(FLAGS.out_data_dir, expname)
    test_dir = os.path.join(FLAGS.out_data_dir, expname, 'eval')
    tf.gfile.MakeDirs(FLAGS.out_data_dir)
else:
    train_dir = None
    test_dir = None

summary_writer = tf.contrib.summary.create_file_writer(
    train_dir, flush_millis=10000, name='train')
test_summary_writer = tf.contrib.summary.create_file_writer(
    test_dir, flush_millis=10000, name='test')
# Checkpoint files are written as <checkpoint_dir>/ckpt-<global_step>.
checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')


In [None]:
# Train for FLAGS.epochs epochs on the selected device; after every epoch,
# evaluate on the test set and write a checkpoint.
with tf.device(device):
    for epoch in range(FLAGS.epochs):
        # Variables created inside this scope are initialised from the most
        # recent checkpoint found in FLAGS.checkpoint_dir.
        latest_checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
        with tfe.restore_variables_on_create(latest_checkpoint):
            global_step = tf.train.get_or_create_global_step()
            start = time.time()
            with summary_writer.as_default():
                model.train_one_epoch(optimizer, train_ds, FLAGS.log_interval)
            end = time.time()
            print('\nTrain time for epoch #%d (global step %d): %f' % (
                epoch, global_step.numpy(), end - start))

        with test_summary_writer.as_default():
            model.test(test_ds)

        # Persist model weights, optimizer slots and the step counter together.
        all_variables = model.variables + optimizer.variables() + [global_step]
        saver = tfe.Saver(all_variables)
        saver.save(checkpoint_prefix, global_step=global_step)