### Path setup

In [None]:
# noqa
# Decide whether the notebook runs on Colab and make the repository root the
# working directory in either environment.
import os
# NOTE(review): presumably Colab sets the DATALAB_DEBUG variable — confirm.
COLAB = 'DATALAB_DEBUG' in os.environ
if COLAB:
    %cd /content
    ROOT_DIR = '/content'
    REPO_DIR = os.path.join(ROOT_DIR, 'ml_project_template')
    LOG_DIR = os.path.join(REPO_DIR, 'data_out')

    # Clone only when the checkout is missing; later runs reuse it.
    if not os.path.isdir(REPO_DIR):
      !git clone https://github.com/oskopek/ml_project_template.git
    if not os.path.isdir(LOG_DIR):
      os.makedirs(LOG_DIR)
    # Enter the checkout (relative to /content) and bring it up to date.
    %cd 'ml_project_template'
    !git pull
    %ls
else:
    wd = %pwd
    print('Current directory:', wd)
    # When started from a `notebooks` folder, step one level up.
    # NOTE(review): presumably so that relative paths such as `flags/cnn.json`
    # resolve from the repository root — confirm.
    if wd.endswith('notebooks'):
        %cd ..

### Install missing packages

We deliberately avoid `pip install -r requirements.txt` here, because it would overwrite the package versions preinstalled on Colab; only the packages that are missing get installed.

In [None]:
# noqa
# Install the single pinned extra dependency, and only when running on Colab.
if COLAB:
    !pip install dotmap==1.2.20

### Branch selection

In [None]:
# noqa
# On Colab, check out the branch of the repository that the notebook should run.
if COLAB:
    !git checkout master

### Tensorboard setup

In [None]:
# Launch TensorBoard through the project's Colab helper; skipped outside Colab.
if COLAB:
    import os
    import resources.colab_utils.tboard as tboard

    # will install `ngrok`, if necessary
    # will create `log_dir` if path does not exist
    tboard.launch_tensorboard(bin_dir=REPO_DIR, log_dir=LOG_DIR)

# MNIST example notebook

This is an example of how a notebook for an ML task should be structured when using TF Eager Execution.

In [None]:
# noqa
%load_ext autoreload
%autoreload 2

from datetime import datetime
import os
import sys
import time

import tensorflow as tf
import tensorflow.contrib.eager as tfe
import tensorflow.contrib.summary as tf_summary

# Enable TF eager execution. A ValueError is taken to mean that it is already
# on (as the message below states), which lets this cell be re-run.
try:
    tfe.enable_eager_execution()
except ValueError:
    print('Eager exec already enabled.')

from models.base import BaseModel

# Flags
# Parse the CNN flag file and expose the result as the module-level FLAGS
# object that the cells below read their configuration from.
# NOTE(review): the meaning of the second `parse` argument (None) is not
# visible from this notebook — confirm against `flags_parser`.
from flags import flags_parser
flags_parser.parse('flags/cnn.json', None)
FLAGS = flags_parser.FLAGS
assert FLAGS is not None

## Read the input data

In this example we load MNIST, then shuffle and batch it. For a real project, the input pipeline will look quite different.

In [None]:
from tensorflow.examples.tutorials.mnist import input_data


def load_data(data_dir):
    """Build the MNIST training and test datasets.

    Args:
      data_dir: Directory handed to `input_data.read_data_sets`.

    Returns:
      A `(train_ds, test_ds)` tuple of `tf.data.Dataset` objects with one-hot
      labels. The training set holds one (image, label) pair per element; the
      test set is a single element containing all test images and labels.
    """
    mnist = input_data.read_data_sets(data_dir, one_hot=True)
    train_pairs = (mnist.train.images, mnist.train.labels)
    test_pairs = (mnist.test.images, mnist.test.labels)
    # `from_tensor_slices` yields one example per element, whereas
    # `from_tensors` wraps the whole split into one element (one big batch).
    training_set = tf.data.Dataset.from_tensor_slices(train_pairs)
    evaluation_set = tf.data.Dataset.from_tensors(test_pairs)
    return (training_set, evaluation_set)


# Pick the compute device together with the matching image layout: the CPU
# pair is used when GPUs are disabled by flag or none is available.
if FLAGS.no_gpu or tfe.num_gpus() <= 0:
    device, data_format = '/cpu:0', 'channels_last'
else:
    device, data_format = '/gpu:0', 'channels_first'
print('Using device %s, and data format %s.' % (device, data_format))

# Load both splits, then shuffle and batch the training split only.
train_ds, test_ds = load_data(FLAGS.data.in_dir)
train_ds = train_ds.shuffle(60000)
train_ds = train_ds.batch(FLAGS.model.optimization.batch_size)

In [None]:
class MNISTModel(tfe.Network):
    """MNIST model.
    Network structure is equivalent to:
    https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/examples/tutorials/mnist/mnist_deep.py
    and
    https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py

    But written using the tf.layers API.

    Layer stack: conv(32) -> max-pool -> conv(64) -> max-pool -> flatten ->
    dense(1024) -> dropout(0.5) -> dense(10).
    """

    def __init__(self, data_format, name=''):
        """Creates a model for classifying a hand-written digit.

        Args:
          data_format: Either 'channels_first' or 'channels_last'.
            'channels_first' is typically faster on GPUs while 'channels_last' is
            typically faster on CPUs. See
            https://www.tensorflow.org/performance/performance_guide#data_formats
          name: Accepted for interface compatibility; currently not used.
        """
        super(MNISTModel, self).__init__()
        conv_size = 32
        # Shape that flat input images are reshaped to before the first convolution.
        if data_format == 'channels_first':
            self._input_shape = [-1, 1, 28, 28]
        else:
            assert data_format == 'channels_last'
            self._input_shape = [-1, 28, 28, 1]
        self.conv1 = self.track_layer(tf.layers.Conv2D(conv_size, 5, data_format=data_format, activation=tf.nn.relu))
        self.conv2 = self.track_layer(
            tf.layers.Conv2D(conv_size * 2, 5, data_format=data_format, activation=tf.nn.relu))
        self.fc1 = self.track_layer(tf.layers.Dense(1024, activation=tf.nn.relu))
        self.fc2 = self.track_layer(tf.layers.Dense(10))
        self.dropout = self.track_layer(tf.layers.Dropout(0.5))
        # A single pooling layer instance is applied after both convolutions.
        self.max_pool2d = self.track_layer(
            tf.layers.MaxPooling2D((2, 2), (2, 2), padding='SAME', data_format=data_format))

    def call(self, inputs, training):
        """Computes labels from inputs.

        Users should invoke __call__ to run the network, which delegates to this
        method (and not call this method directly).

        Args:
          inputs: A batch of images as a Tensor with shape [batch_size, 784].
          training: True if invoked in the context of training (causing dropout to
            be applied).  False otherwise.

        Returns:
          A Tensor with shape [batch_size, 10] containing the predicted logits
          for each image in the batch, for each of the 10 classes.
        """

        x = tf.reshape(inputs, self._input_shape)
        x = self.conv1(x)
        x = self.max_pool2d(x)
        x = self.conv2(x)
        x = self.max_pool2d(x)
        x = tf.layers.flatten(x)
        x = self.fc1(x)
        # `tf.layers.Dropout` defaults to `training=False`, in which case it is
        # the identity. The mode has to be passed explicitly, otherwise dropout
        # is silently never applied, even while training.
        x = self.dropout(x, training=training)
        x = self.fc2(x)
        return x

In [None]:
def loss(logits, labels):
    """Mean softmax cross-entropy of a batch.

    Args:
      logits: Unnormalized class scores, one row per example.
      labels: Target class distribution per example (one-hot in this
        notebook), with the same shape as `logits`.

    Returns:
      A scalar tensor: the cross-entropy averaged over the batch.
    """
    # The per-call shape prints were leftover debugging output; they fired on
    # every training batch and every evaluation and drowned the progress log.
    return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=labels))

In [None]:
def compute_accuracy(logits, labels):
    """Fraction of examples in the batch whose predicted class is correct.

    Args:
      logits: Tensor of class scores, one row per example.
      labels: Either one-hot / per-class labels with the same rank as
        `logits`, or a vector of integer class indices.

    Returns:
      A scalar float tensor in [0, 1].
    """
    predictions = tf.argmax(logits, axis=1, output_type=tf.int64)
    labels = tf.convert_to_tensor(labels)
    if len(labels.shape) == len(logits.shape):
        # One-hot labels (as produced by `load_data`) must be reduced to class
        # indices first; merely casting them would compare a [batch] vector
        # against a [batch, classes] matrix.
        labels = tf.argmax(labels, axis=1, output_type=tf.int64)
    else:
        labels = tf.cast(labels, tf.int64)
    batch_size = int(logits.shape[0])
    correct = tf.cast(tf.equal(predictions, labels), dtype=tf.float32)
    return tf.reduce_sum(correct) / batch_size

In [None]:
def train(model, optimizer, dataset, log_interval=None):
    """Run one pass over `dataset`, updating `model` with `optimizer`.

    Args:
      model: Callable network; invoked as `model(images, training=True)`.
      optimizer: Optimizer whose `apply_gradients` performs the update and
        advances the global step.
      dataset: Dataset yielding `(images, labels)` batches.
      log_interval: If truthy, print loss and throughput every that many
        batches.
    """
    step_counter = tf.train.get_or_create_global_step()

    tick = time.time()
    for step_idx, (images, labels) in enumerate(tfe.Iterator(dataset)):
        with tf_summary.record_summaries_every_n_global_steps(10):
            # The forward pass and the loss run under the tape so that the
            # loss can be differentiated with respect to the model variables.
            with tfe.GradientTape() as tape:
                logits = model(images, training=True)
                loss_value = loss(logits=logits, labels=labels)
                tf_summary.scalar('loss', loss_value)
                tf_summary.scalar('accuracy', compute_accuracy(logits=logits, labels=labels))

            # Read the variables only after the forward pass has run.
            variables = model.variables
            gradients = tape.gradient(loss_value, variables)
            optimizer.apply_gradients(zip(gradients, variables), global_step=step_counter)

            if not log_interval or step_idx % log_interval != 0:
                continue
            elapsed = time.time() - tick
            rate = log_interval / elapsed
            print('Step #%d\tLoss: %.6f (%d steps/sec)' % (step_idx, loss_value, rate))
            tick = time.time()

In [None]:
def test(model, dataset):
    """Perform an evaluation of `model` on the examples from `dataset`.

    Prints the average loss and the accuracy over the dataset and records both
    as summaries on the currently active summary writer.

    Args:
      model: Callable network; invoked as `model(images, training=False)`.
      dataset: Dataset yielding `(images, labels)` with one-hot labels.
    """
    avg_loss = tfe.metrics.Mean('loss')
    accuracy = tfe.metrics.Accuracy('accuracy')

    for (images, labels) in tfe.Iterator(dataset):
        logits = model(images, training=False)
        avg_loss(loss(logits, labels))
        # The labels are one-hot (the loss above requires that), so both sides
        # are reduced to class indices. Casting the one-hot matrix instead
        # would compare a [batch] vector with a [batch, classes] matrix.
        true_classes = tf.argmax(labels, axis=1, output_type=tf.int64)
        predicted_classes = tf.argmax(logits, axis=1, output_type=tf.int64)
        accuracy(true_classes, predicted_classes)

    print('Test set: Average loss: %.4f, Accuracy: %4f%%\n' % (avg_loss.result(), 100 * accuracy.result()))
    with tf_summary.always_record_summaries():
        tf_summary.scalar('loss', avg_loss.result())
        tf_summary.scalar('accuracy', accuracy.result())

In [None]:
# Create a unique experiment name for each run:
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
expname = "MNIST_conv32_64_fc1024_10-{}".format(timestamp)

# Create the model and optimizer
model = MNISTModel(data_format)
optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.model.optimization.learning_rate)

# Summaries go to <out_dir>/<expname> (training) and <out_dir>/<expname>/eval
# (evaluation). Without an output directory both log dirs stay None.
if FLAGS.data.out_dir:
    train_dir = os.path.join(FLAGS.data.out_dir, expname)
    test_dir = os.path.join(FLAGS.data.out_dir, expname, 'eval')
    tf.gfile.MakeDirs(FLAGS.data.out_dir)
else:
    train_dir = None
    test_dir = None

# NOTE(review): per the tf.contrib.summary docs, a None logdir yields a writer
# that records nothing — confirm for the TF version in use.
summary_writer = tf_summary.create_file_writer(train_dir, flush_millis=10000, name='train')
test_summary_writer = tf_summary.create_file_writer(test_dir, flush_millis=10000, name='test')

# The training loop always restores from and saves to `checkpoint_dir`, so it
# must be a real path even when no output directory is configured. Joining
# onto a None `train_dir` would raise TypeError; in that case fall back to a
# directory named after the experiment, relative to the working directory.
checkpoint_dir = os.path.join(train_dir or expname, 'ckpt')

In [None]:
# Train and evaluate for the configured number of epochs
# (FLAGS.model.optimization.epochs), saving a checkpoint after every epoch.
# TODO: Fix this, sometime in the future, when the API changes 10 more times.

# Filename prefix for the checkpoint files. It was previously referenced
# without ever being defined, which raised NameError at the first save.
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
# Make sure the target directory exists before the first save.
tf.gfile.MakeDirs(checkpoint_dir)

with tf.device(device):
    for epoch in range(FLAGS.model.optimization.epochs):
        # Variables created inside this context are initialised from the most
        # recent checkpoint, if one exists.
        with tfe.restore_variables_on_create(tf.train.latest_checkpoint(checkpoint_dir)):
            global_step = tf.train.get_or_create_global_step()
            start = time.time()
            with summary_writer.as_default():
                train(model, optimizer, train_ds, log_interval=FLAGS.training.log_interval)
            end = time.time()
            print('\nTrain time for epoch #%d (global step %d): %f' % (epoch, global_step.numpy(), end - start))

        with test_summary_writer.as_default():
            test(model, test_ds)

        # Persist model weights, optimizer slots and the step counter together.
        all_variables = (model.variables + optimizer.variables() + [global_step])
        tfe.Saver(all_variables).save(checkpoint_prefix, global_step=global_step)