# TensorFlow Mechanics 101

#### Changes Chong Min made

* Optimizer: from Gradient Descent to Adam
* batch size: set to 10
* size of hidden layer: 8
* weight initializer: random_normal_initializer
* max steps: 20000

Among the above changes, `batch size`, `optimizer`, and `max steps` values were critical.

Because the batches were randomly generated, the accuracies fluctuated from run to run. This should be fixed.

- This tutorial is meant as a companion to the code [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/tutorials/mnist/)
- The goal of this tutorial is to show how to use TensorFlow to train and evaluate a simple feed-forward neural network for handwritten digit classification using the (classic) MNIST data set. 

- [`mnist.py`](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist.py), the code for making a fully-connected MNIST model
- [`fully_connected_feed.py`](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/fully_connected_feed.py), the main code to train the built MNIST model against the downloaded dataset using a feed dictionary.

In [1]:
import math
import time
import random
from os import getcwd
from os.path import join

import pandas as pd
import tensorflow as tf
import numpy as np

## Using a Custom Data-set

In [2]:
# Download test_data_revised.zip (in email since it can't be shared) and
# save it somewhere, preferably in the directory in which this notebook is
# stored. If it is somewhere else, just make sure to pass in the path when
# this function is used.

def _load_split(data_path, dataset, split, macro_or_micro):
    """Load one CSV split and return its (ids, features, labels)."""
    suffix = "macro" if macro_or_micro == "macro" else "micro_revised"
    csv_path = join(data_path,
                    "{}_dataset_{}_{}.csv".format(dataset, split, suffix))
    frame = pd.read_csv(csv_path, dtype={'appointment_id': str})

    ids = frame['appointment_id']
    # Shift every label down by one (presumably the files hold 1-based
    # scores while the model needs 0-based class indices -- verify).
    labels = frame['H1'] - 1
    features = frame[[column for column in frame.columns
                      if column not in ['appointment_id', 'H1']]]
    return ids, features, labels


def _split_at(data, n_rows):
    """Split `data` positionally into its first `n_rows` rows and the rest.

    Both parts get a fresh 0..n-1 index so that IDs, features, and labels
    coming out of the same split stay aligned with each other.
    """
    head = data.iloc[:n_rows].reset_index(drop=True)
    tail = data.iloc[n_rows:].reset_index(drop=True)
    return head, tail


def read_data(dataset, macro_or_micro="macro",
              dev_set=True,
              data_path=join(getcwd(), "test_data_revised")):
    """
    Read in data from a directory

    Either read in the "macro" files or the "micro" files (any value of
    `macro_or_micro` other than "macro" selects the "micro_revised"
    files).

    Args:
        dataset: Name prefix of the data set files (e.g. "first").
        macro_or_micro: Which feature set to load.
        dev_set: If True, the first 1000 rows of the "testing" file are
                 split off as a development set.
        data_path: Directory that contains the CSV files.

    Returns 1) training IDs, features, and labels,
            2) test IDs, features, and labels, and
            3) development set IDs, features, and labels (if `dev_set` is
               False, these are all None and the test set will contain
               all data originally in the "testing" file).
    """

    train_ids, train_features, train_labels = _load_split(
        data_path, dataset, "training", macro_or_micro)
    test_ids, test_features, test_labels = _load_split(
        data_path, dataset, "testing", macro_or_micro)

    dev_ids = None
    dev_features = None
    dev_labels = None
    if dev_set:
        dev_size = 1000
        # Split IDs, features, and labels the same way so that all three
        # share one index afterwards.
        dev_ids, test_ids = _split_at(test_ids, dev_size)
        dev_features, test_features = _split_at(test_features, dev_size)
        dev_labels, test_labels = _split_at(test_labels, dev_size)

    return (train_ids,
            train_features,
            train_labels,
            test_ids,
            test_features,
            test_labels,
            dev_ids,
            dev_features,
            dev_labels)

In [3]:
# Define some parameters
# Directory (inside the current working directory) to which the training
# code in this notebook writes its summary event files and checkpoints.
log_dir_path = join(getcwd(), "logs")


In [4]:
class DataSet:
    """
    Holds aligned IDs, features, and labels and serves them in batches.

    Args:
        ids: pandas Series of example IDs.
        features: pandas DataFrame with one row per example.
        labels: pandas Series of labels.
        random_: If True, shuffle the examples once with a fixed seed
                 (12345) so that the order is reproducible.
    """

    def __init__(self, ids, features, labels, random_=True):
        self._ids = ids
        self._features = features
        self._labels = labels
        self._index_in_epoch = 0
        self._num_examples = len(self._ids)
        if random_:
            prng = np.random.RandomState(12345)
            # Shuffle by position rather than by index label: the three
            # containers are aligned row by row, but nothing guarantees
            # that they carry the same index labels.
            order = prng.permutation(len(self._features))
            self._features = self._features.iloc[order]
            self._ids = self._ids.iloc[order]
            self._labels = self._labels.iloc[order]

    def get_size(self):
        """Return the number of examples in the data set."""
        return self._num_examples

    def next_batch(self, batch_size):
        """
        Return the next `batch_size` examples as (ids, features, labels).

        When fewer than `batch_size` examples remain, the batch is
        completed with examples from the start of the data, and the
        three parts are returned as NumPy arrays instead of pandas
        objects.
        """

        start = self._index_in_epoch
        end = start + batch_size
        columns = (self._ids, self._features, self._labels)

        if end <= self._num_examples:
            self._index_in_epoch = end
            return tuple(column.iloc[start:end] for column in columns)

        # Not enough examples left: take the rest of this epoch and wrap
        # around to the start of the data for the missing ones.
        wrapped = end - self._num_examples
        self._index_in_epoch = wrapped
        return tuple(
            np.concatenate((column.iloc[start:self._num_examples],
                            column.iloc[0:wrapped]), axis=0)
            for column in columns)

## Functions Based on `mnist.py`

In [5]:
def _dense_variables(in_units, out_units):
    """Create the 'weights' and 'biases' variables of one dense layer.

    Weights are drawn from a normal distribution (mean 0.0, stddev 0.05);
    biases start at zero. Call this inside a `tf.name_scope` so that the
    variable names are prefixed with the layer name.
    """
    weights = tf.Variable(
        tf.random_normal_initializer(0.0, 0.05)([in_units, out_units]),
        name='weights')
    biases = tf.Variable(tf.zeros([out_units]), name='biases')
    return weights, biases


def inference(inputs, num_features, num_classes, hidden1_units,
              hidden2_units, hidden3_units=None):
    """
    Build a model on the inputs up to where it may be used for
    inference.

    The network is two (optionally three) ReLU hidden layers followed by
    a linear output layer.

    Args:
        inputs: Placeholder for input data samples.
        num_features: Number of features in input data.
        num_classes: Number of classes/score labels.
        hidden1_units: Size of the first hidden layer.
        hidden2_units: Size of the second hidden layer.
        hidden3_units: Size of the third hidden layer (None if no
                       third layer).

    Returns:
        softmax_linear: Output tensor with the computed logits.
    """

    hidden_sizes = [hidden1_units, hidden2_units]
    if hidden3_units is not None:
        hidden_sizes.append(hidden3_units)

    activations = inputs
    in_units = num_features

    # Hidden layers: 'hidden1', 'hidden2' and, if requested, 'hidden3'.
    for layer_number, out_units in enumerate(hidden_sizes, start=1):
        with tf.name_scope('hidden{}'.format(layer_number)):
            weights, biases = _dense_variables(in_units, out_units)
            activations = tf.nn.relu(
                tf.matmul(activations, weights) + biases)
        in_units = out_units

    # Linear output layer. Its input width must be the width of the LAST
    # hidden layer, whichever one that is.
    with tf.name_scope('softmax_linear'):
        weights, biases = _dense_variables(in_units, num_classes)
        logits = tf.matmul(activations, weights) + biases

    return logits


def loss(logits, labels):
    """
    Compute the mean sparse softmax cross-entropy of a batch.

    Args:
        logits: Logits tensor, float - [batch_size, NUM_CLASSES].
        labels: Labels tensor, int32 - [batch_size].

    Returns:
        loss: Loss tensor of type float.
    """

    # The labels are converted to int64 before being handed to the
    # cross-entropy op.
    per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.to_int64(labels), logits=logits, name='xentropy')

    mean_loss = tf.reduce_mean(per_example_loss, name='xentropy_mean')
    return mean_loss


def training_adam(_loss, learning_rate):
    """
    Build the training Op that minimizes `_loss` with Adam.

    Also registers a scalar summary named 'loss' so the loss can be
    tracked over time in TensorBoard.

    The Op returned by this function is what must be passed to the
    `sess.run()` call to cause the model to train.

    Args:
        _loss: Loss tensor, from loss().
        learning_rate: The learning rate for the Adam optimizer.

    Returns:
        train_op: The Op for training.
    """

    # Record the snapshot loss for TensorBoard.
    tf.summary.scalar('loss', _loss)

    # Non-trainable counter that is incremented by one on every
    # training step.
    step_counter = tf.Variable(0, name='global_step', trainable=False)

    # A single Op that applies the gradients and bumps the counter.
    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    return adam.minimize(_loss, global_step=step_counter)


def training_gradient_descent(_loss, learning_rate):
    """
    Build the training Op that minimizes `_loss` with gradient descent.

    Also registers a scalar summary named 'loss' so the loss can be
    tracked over time in TensorBoard.

    The Op returned by this function is what must be passed to the
    `sess.run()` call to cause the model to train.

    Args:
        _loss: Loss tensor, from loss().
        learning_rate: The learning rate to use for gradient descent.

    Returns:
        train_op: The Op for training.
    """

    # Record the snapshot loss for TensorBoard.
    tf.summary.scalar('loss', _loss)

    # Non-trainable counter that is incremented by one on every
    # training step.
    step_counter = tf.Variable(0, name='global_step', trainable=False)

    # A single Op that applies the gradients and bumps the counter.
    sgd = tf.train.GradientDescentOptimizer(learning_rate)
    return sgd.minimize(_loss, global_step=step_counter)


def training_momentum(_loss, learning_rate, momentum=0.9):
    """
    Sets up the training Ops.

    Creates a summarizer to track the loss over time in TensorBoard.
    Creates a MomentumOptimizer and applies the gradients to
    all trainable variables.

    The Op returned by this function is what must be passed to the
    `sess.run()` call to cause the model to train.

    Args:
        _loss: Loss tensor, from loss().
        learning_rate: The learning rate for the momentum optimizer.
        momentum: Momentum coefficient. It should be below 1: with a
                  coefficient of 1 or more (this used to be hard-coded
                  to 2) the accumulated velocity grows geometrically
                  instead of decaying, so training diverges.

    Returns:
        train_op: The Op for training.
    """

    # Add a scalar summary for the snapshot loss.
    tf.summary.scalar('loss', _loss)

    # Create the momentum optimizer with the given learning rate and
    # momentum coefficient.
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)

    # Create a variable to track the global step.
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Use the optimizer to apply the gradients that minimize the loss
    # (and also increment the global step counter) as a single
    # training step.
    train_op = optimizer.minimize(_loss, global_step=global_step)

    return train_op


def training_adadelta(_loss, learning_rate):
    """
    Build the training Op that minimizes `_loss` with Adadelta.

    Also registers a scalar summary named 'loss' so the loss can be
    tracked over time in TensorBoard.

    The Op returned by this function is what must be passed to the
    `sess.run()` call to cause the model to train.

    Args:
        _loss: Loss tensor, from loss().
        learning_rate: The learning rate for the Adadelta optimizer.

    Returns:
        train_op: The Op for training.
    """

    # Record the snapshot loss for TensorBoard.
    tf.summary.scalar('loss', _loss)

    # Non-trainable counter that is incremented by one on every
    # training step.
    step_counter = tf.Variable(0, name='global_step', trainable=False)

    # A single Op that applies the gradients and bumps the counter.
    adadelta = tf.train.AdadeltaOptimizer(learning_rate)
    return adadelta.minimize(_loss, global_step=step_counter)

def evaluation(logits, labels):
    """
    Count how many examples in a batch were classified correctly.

    Args:
        logits: Logits tensor, float - [batch_size, NUM_CLASSES].
        labels: Labels tensor, int32 - [batch_size], with values in the
                range [0, NUM_CLASSES).

    Returns:
        A scalar int32 tensor with the number of examples (out of
        batch_size) that were predicted correctly.
    """

    # in_top_k with k=1 yields one bool per example: True when the
    # example's label is its highest-scoring class.
    is_hit = tf.nn.in_top_k(logits, labels, 1)

    # Summing the bools (as int32) gives the number of correct examples.
    hits_as_int = tf.cast(is_hit, tf.int32)
    return tf.reduce_sum(hits_as_int)

In [6]:
# From fully_connected_feed.py
def fill_feed_dict(data, inputs_pl, labels_pl, batch_size):
    """
    Build the feed_dict for one step from the next batch of `data`.

    A feed_dict takes the form of:
    feed_dict = {
        <placeholder>: <tensor of values to be passed for placeholder>,
        ....
    }

    Args:
        data: Object whose `next_batch(batch_size)` returns a tuple of
              (ids, features, labels).
        inputs_pl: The input data placeholder.
        labels_pl: The input labels placeholder.
        batch_size: Size of each batch.

    Returns:
        feed_dict: The feed dictionary mapping from placeholders to values.
    """

    # Pull the next `batch_size` samples; the IDs are not fed to the
    # graph, so they are dropped here.
    batch = data.next_batch(batch_size)
    batch_inputs = batch[1]
    batch_labels = batch[2]

    return {inputs_pl: batch_inputs, labels_pl: batch_labels}


def do_eval(sess, eval_correct, inputs_placeholder, labels_placeholder, data):
    """
    Runs one evaluation against the full epoch of data.

    The batch size is read from the module-level `batch_size`. Only
    whole batches are evaluated, so up to `batch_size - 1` examples are
    left out of the count.

    Args:
        sess: The session in which the model has been trained.
        eval_correct: The Tensor that returns the number of correct
                      predictions.
        inputs_placeholder: The input data placeholder.
        labels_placeholder: The labels placeholder.
        data: The data set to evaluate; it must provide `get_size()`
              and `next_batch()`.

    Returns:
        A dict with the keys 'num_examples' (examples evaluated),
        'num_correct' (correct predictions among them), and 'accuracy'
        (their ratio; 0.0 if not even one full batch was available).
    """

    # And run one epoch of eval.
    true_count = 0  # Counts the number of correct predictions.
    steps_per_epoch = data.get_size() // batch_size
    num_examples = steps_per_epoch * batch_size
    for _ in range(steps_per_epoch):
        feed_dict = fill_feed_dict(data,
                                   inputs_placeholder,
                                   labels_placeholder,
                                   batch_size)
        true_count += sess.run(eval_correct, feed_dict=feed_dict)

    # A data set smaller than one batch yields zero evaluated examples;
    # report 0.0 instead of dividing by zero.
    acc = float(true_count) / num_examples if num_examples else 0.0
    return {'num_examples': num_examples, 'num_correct': true_count, 'accuracy': acc}

In [7]:
def run_graph_for_evaluation(train_data, test_data, optimizer_type, max_steps, dev_data=None):
    """
    Build a fresh graph, train it for `max_steps` steps, and evaluate it.

    The model size, learning rate, and batch size are read from the
    module-level `NUM_FEATURES`, `NUM_CLASSES`, `hidden1`, `hidden2`,
    `hidden3`, `learning_rate`, and `batch_size`. After the last
    training step a summary and a checkpoint are written to
    `log_dir_path`.

    Args:
        train_data: Data set used for training (and evaluated at the
                    end).
        test_data: Data set evaluated at the end.
        optimizer_type: One of "adam", "gradient_descent", "momentum",
                        or "adadelta".
        max_steps: Number of training steps to run.
        dev_data: Optional development data set; if given, it is
                  evaluated at the end and the result is printed.

    Returns:
        A two-element list [train_result, test_result] of the dicts
        returned by do_eval(), each extended with the keys 'datatype',
        'optimizer', and 'max_steps'. Returns None if `max_steps` is
        not positive, because no step is ever the last one.

    Raises:
        ValueError: If `optimizer_type` is not one of the supported
                    names.
    """

    trainers = {
        "adam": training_adam,
        "gradient_descent": training_gradient_descent,
        "momentum": training_momentum,
        "adadelta": training_adadelta,
    }
    if optimizer_type not in trainers:
        raise ValueError("Choose one of \"adam\", \"gradient_descent\", "
                         "\"momentum\" or \"adadelta\" for "
                         "`optimizer_type`.")

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():

        # Generate placeholders for the input feature data and labels.
        inputs_placeholder = tf.placeholder(tf.float32, shape=(batch_size,
                                                               NUM_FEATURES))
        labels_placeholder = tf.placeholder(tf.int32, shape=(batch_size,))

        # Build a Graph that computes predictions from the inference model.
        logits = inference(inputs_placeholder,
                           NUM_FEATURES,
                           NUM_CLASSES,
                           hidden1,
                           hidden2,
                           hidden3_units=hidden3)

        # Add to the Graph the Ops for loss calculation.
        loss_ = loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = trainers[optimizer_type](loss_, learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = evaluation(logits, labels_placeholder)

        # Build the summary Tensor based on the TF collection of Summaries.
        summary = tf.summary.merge_all()

        # Add the variable initializer Op.
        init = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # This function is called once per experiment, so the session and
        # the summary writer must be released when it returns; otherwise
        # every call leaks one of each.
        with tf.Session() as sess:

            # Instantiate a SummaryWriter to output summaries and the Graph.
            summary_writer = tf.summary.FileWriter(log_dir_path, sess.graph)
            try:
                # Run the Op to initialize the variables.
                sess.run(init)

                # Start the training loop.
                for step in range(max_steps):

                    # Fill a feed dictionary with the next training batch
                    # and run one step of the model.
                    feed_dict = fill_feed_dict(train_data,
                                               inputs_placeholder,
                                               labels_placeholder,
                                               batch_size)
                    sess.run(train_op, feed_dict=feed_dict)

                    # Only the last step saves a checkpoint and evaluates.
                    if (step + 1) != max_steps:
                        continue

                    # Update the events file.
                    summary_str = sess.run(summary, feed_dict=feed_dict)
                    summary_writer.add_summary(summary_str, step)
                    summary_writer.flush()

                    checkpoint_file = join(log_dir_path, 'model.ckpt')
                    saver.save(sess, checkpoint_file, global_step=step)

                    # Evaluate against the training set.
                    train_data_eval_dict = do_eval(sess,
                                                   eval_correct,
                                                   inputs_placeholder,
                                                   labels_placeholder,
                                                   train_data)

                    # Evaluate against the development set, if the caller
                    # supplied one.
                    if dev_data is not None:
                        print('Development Data Eval:')
                        print(do_eval(sess,
                                      eval_correct,
                                      inputs_placeholder,
                                      labels_placeholder,
                                      dev_data))

                    # Evaluate against the test set.
                    test_data_eval_dict = do_eval(sess,
                                                  eval_correct,
                                                  inputs_placeholder,
                                                  labels_placeholder,
                                                  test_data)

                    train_data_eval_dict['datatype'] = 'train'
                    train_data_eval_dict['optimizer'] = optimizer_type
                    train_data_eval_dict['max_steps'] = max_steps

                    test_data_eval_dict['datatype'] = 'test'
                    test_data_eval_dict['optimizer'] = optimizer_type
                    test_data_eval_dict['max_steps'] = max_steps

                    return [train_data_eval_dict, test_data_eval_dict]
            finally:
                summary_writer.close()

In [8]:
# Choose "micro" or "macro". This will change the types of features we're
# using. There are 220 "micro" features in total while there are 9 macro
# features.
dataset_type = "macro"
show_data=False
random_sampler = True

# Model and training hyper-parameters; they depend on the feature set.
if dataset_type == "macro":
    learning_rate = 0.01
    hidden1 = 8
    hidden2 = 8
    hidden3 = None
    NUM_FEATURES = 9
    batch_size = 10
else:
    learning_rate = 0.01
    hidden1 = 512
    hidden2 = 128
    hidden3 = 16
    NUM_FEATURES = 220
    batch_size = 200

for dataset in ['first', 'second', 'third', 'fourth', 'fifth']:
    all_evaluation_result = []

    # The 'third' and 'fourth' data sets are run with 5 classes, all
    # others with 6.
    NUM_CLASSES = 5 if dataset in ('third', 'fourth') else 6

    # Read in data
    (train_ids, train_features, train_labels,
     test_ids, test_features, test_labels,
     dev_ids, dev_features, dev_labels) = read_data(dataset, macro_or_micro=dataset_type,
                                                    dev_set=False)
    train_data = DataSet(train_ids, train_features, train_labels)
    test_data = DataSet(test_ids, test_features, test_labels)

    # Always (re)bind `dev_data` so that a value left over from an
    # earlier data set can never be picked up by mistake.
    dev_data = None
    if dev_labels is not None:
        dev_data = DataSet(dev_ids, dev_features, dev_labels)

    if show_data:
        print("dataset: {}".format(dataset))
        print("Shape of data:\n\tTraining: {}\n\t{}Test: {}"
              .format(train_features.shape,
                      "" if dev_features is None
                         else "Development: {}".format(dev_features.shape),
                      test_features.shape))

        print("train features\n")
        print(train_features.head())

        print("test features\n")
        print(test_features.head())
        print("train labels\n")
        print(train_labels[:10])

    # Run one training/evaluation experiment per optimizer and step count.
    for optimizer_type in ['gradient_descent', 'adam', 'momentum', 'adadelta']:
        for max_steps in [10000, 20000, 40000, 60000, 80000, 100000]:
            print("dataset: {}, optimizer: {}, max steps: {}".format(dataset,
                                                                     optimizer_type,
                                                                     max_steps))
            train_data_eval_dict, test_data_eval_dict = run_graph_for_evaluation(
                train_data,
                test_data,
                optimizer_type,
                max_steps,
                dev_data=dev_data)
            train_data_eval_dict['dataset'] = dataset
            test_data_eval_dict['dataset'] = dataset
            all_evaluation_result.extend([train_data_eval_dict,
                                          test_data_eval_dict])
    print("")

    # Collect this data set's results in one table, save it, and show it.
    df = pd.DataFrame(all_evaluation_result)
    df = df[['dataset', 'optimizer', 'max_steps', 'datatype',
             'num_examples', 'num_correct',
             'accuracy']].sort_values(by=['dataset', 'optimizer',
                                          'max_steps', 'datatype'])
    df.to_csv("{}_evaluation_results.csv".format(dataset), index=False)
    print(df)



dataset: first, optimizer: gradient_descent, max steps: 10000
dataset: first, optimizer: gradient_descent, max steps: 20000
dataset: first, optimizer: gradient_descent, max steps: 40000
dataset: first, optimizer: gradient_descent, max steps: 60000
dataset: first, optimizer: gradient_descent, max steps: 80000
dataset: first, optimizer: gradient_descent, max steps: 100000
dataset: first, optimizer: adam, max steps: 10000
dataset: first, optimizer: adam, max steps: 20000
dataset: first, optimizer: adam, max steps: 40000
dataset: first, optimizer: adam, max steps: 60000
dataset: first, optimizer: adam, max steps: 80000
dataset: first, optimizer: adam, max steps: 100000
dataset: first, optimizer: momentum, max steps: 10000
dataset: first, optimizer: momentum, max steps: 20000
dataset: first, optimizer: momentum, max steps: 40000
dataset: first, optimizer: momentum, max steps: 60000
dataset: first, optimizer: momentum, max steps: 80000
dataset: first, optimizer: momentum, max steps: 100000
d