In [1]:
import numpy as np
import os
import tensorflow as tf

###### Do not modify here ###### 

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

reset_graph()

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")

# training on MNIST but only on digits 0 to 4
X_train1 = mnist.train.images[mnist.train.labels < 5]
y_train1 = mnist.train.labels[mnist.train.labels < 5]
X_valid1 = mnist.validation.images[mnist.validation.labels < 5]
y_valid1 = mnist.validation.labels[mnist.validation.labels < 5]
X_test1 = mnist.test.images[mnist.test.labels < 5]
y_test1 = mnist.test.labels[mnist.test.labels < 5]

###### Do not modify here ###### 



Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [2]:
from IPython.display import clear_output, Image, display, HTML
def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = graph_def
    #strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

In [3]:
reset_graph()

# Specified params
num_layers = 5
neurons_per_layer = 128
input_dims = X_train1.shape[1]
n_classes = 5

# Costomized params
batch_size = 64
learning_rate = 0.001
dropout_rate = 0.0
train_steps = 20000

# If validation accuracy does not improve after certain steps of training, apply early stopping
early_stopping_steps = 1000

############ Network construction ###############

x = tf.placeholder(tf.float32, [None, input_dims])
y = tf.placeholder(tf.int32, [None]) # label

# Indicating if current operation is training (apply dropout) or testing (no dropout)
training_mode = tf.placeholder(tf.bool, shape=[], name='is_training')

# Layers
hidden_layers = []
dropouts = []

# First layer
hidden_layers.append(tf.layers.dense(inputs=x, units=neurons_per_layer, activation=tf.nn.elu, kernel_initializer=
                                    tf.contrib.layers.variance_scaling_initializer(), name='dense0'))
dropouts.append(tf.layers.dropout(inputs=hidden_layers[-1], rate=dropout_rate, 
                                  training=training_mode))

# Middle layers
for i in range(1, num_layers):
    hidden_layers.append(tf.layers.dense(inputs=dropouts[-1], units=neurons_per_layer, activation=tf.nn.elu,
                                         kernel_initializer=tf.contrib.layers.variance_scaling_initializer(), name='dense'+str(i)))
    dropouts.append(tf.layers.dropout(inputs=hidden_layers[-1], rate=dropout_rate, 
                                      training=training_mode))

# Logits layer
y_ = tf.layers.dense(inputs=dropouts[-1], units=n_classes, kernel_initializer=
                                    tf.contrib.layers.variance_scaling_initializer(), name='logits')

# dropout = tf.layers.dropout(inputs=hidden_layer, rate=dropout_rate, 
#                                   training=training_mode)

# # Logits layer
# y_ = tf.layers.dense(inputs=dropouts, units=n_classes, kernel_initializer=
#                                     tf.contrib.layers.variance_scaling_initializer(), name='logits')

loss = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=y_))

train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# Test trained model
prediction = tf.argmax(y_, 1, output_type=tf.int32) # get the index of y_ of max prob and value will be 0,1,2,3,4
correct_prediction = tf.equal(prediction, tf.cast(y, tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))



init_g = tf.global_variables_initializer()
saver = tf.train.Saver()


# for _ in range(1000):
#     batch_xs, batch_ys = mnist.train.next_batch(100)
#     sess.run(train_step, feed_dict={x: batch_xs, y: batch_ys, training_mode:True})
    

# print(sess.run(accuracy, feed_dict={x: mnist.test.images,
#                                       y: mnist.test.labels, training_mode:False}))


In [4]:
# Calculate precision and recall
def print_pr_each_label(predictions, labels, classes):
    from collections import Counter
    tps = Counter()
    tns = Counter()
    fps = Counter()
    fns = Counter()
    
    for prediction, label in zip(predictions, labels):
        if prediction == label:
            tps[label] += 1
            for i in range(classes):
                if i != label:
                    tns[i] += 1
        else:
            fps[prediction] += 1
            fns[label] += 1
    print(tps,tns)
    print(fps, fns)
    print("Label\tPrecision\tRecall")
    for i in range(classes):
        print("{}\t{}\t\t{}".format(i, round(float(tps[i])/max((tps[i]+fps[i]), 1),5), 
                                      round(float(tps[i])/max((tps[i]+fns[i]), 1),5)))

def train_tf_and_print_result(X_train, y_train, X_valid, y_valid, X_test, y_test):

    dataset_batch = tf.contrib.data.Dataset.from_tensor_slices((X_train, y_train)).repeat().batch(batch_size)
    dataset_batch = dataset_batch.make_initializable_iterator()

    # Early stopping variables
    best_accuracy = 0
    best_step = 0
    early_stopped = False
    sess.run(init_g)
    sess.run(dataset_batch.initializer)

    # Training steps
    for i in range(train_steps):
        X_in, y_in = sess.run(dataset_batch.get_next())
        sess.run(train_step, feed_dict={x: X_in, y: y_in, training_mode: True})

        # Validate accuracy every 100 steps
        if i % 100 == 0:
            curr_accuracy = sess.run(accuracy, feed_dict={x: X_valid, y: y_valid, training_mode: False})

            # Print accuracy every 1000 steps
            if i % 1000 == 0:
                print("Step {}: ".format(i), curr_accuracy)

            # Save checkpoint of current model if it performs better
            if best_accuracy < curr_accuracy:
                best_accuracy = curr_accuracy
                save_path = saver.save(sess, "./Team20_HW2.ckpt")
                best_step = i

            # Early stop if model does not improve for certain steps
            elif i - best_step >= early_stopping_steps:
                early_stopped = True
                break

    # Save checkpoint in case the training is not early-stopped
    if not early_stopped:
        print("save best model")
        save_path = saver.save(sess, "./Team20_HW2.ckpt")
#     Get the best model
    saver.restore(sess, "./Team20_HW2.ckpt")

    # Total accuracy
    final_accuracy = sess.run(accuracy, feed_dict={x: X_test, y: y_test, training_mode: False})
    print("Test accuracy: ", final_accuracy)

    # Precision and recall
    ps = sess.run(prediction, feed_dict={x: X_test, training_mode: False})
    print_pr_each_label(ps, y_test, n_classes)

    return final_accuracy



In [5]:
# Draw the graph in separate session so the data batching and processing will not freeze the program
with tf.Session() as sess:
    show_graph(tf.get_default_graph().as_graph_def())

###### Start TF session ######
# with tf.Session() as sess:
#     train_tf_and_print_result(X_train1, y_train1, X_valid1, y_valid1, X_test1, y_test1)

In [None]:
# One Hidden Layer result

# Step 0:  0.423769
# Step 1000:  0.988663
# Step 2000:  0.982017
# INFO:tensorflow:Restoring parameters from ./Team20_HW2.ckpt
# Test accuracy:  0.9928
# Counter({1: 1132, 2: 1018, 3: 1006, 0: 975, 4: 971}) Counter({0: 5102, 1: 5102, 2: 5102, 3: 5102, 4: 5102})
# Counter({2: 12, 1: 9, 0: 7, 3: 5, 4: 4}) Counter({2: 14, 4: 11, 0: 5, 3: 4, 1: 3})
# Label	Precision	Recall
# 0	0.99287		0.9949
# 1	0.99211		0.99736
# 2	0.98835		0.98643
# 3	0.99505		0.99604
# 4	0.9959		0.9888

# Training process explanation

In this homework, instead of applying matrix multiplications and additions directly, we try to use the *layer* classes provided by Tensorflow as an attempt. It turns out that the *layers* classes are easy to use if the operations are relatively simple as the basic methods are encapsulated, we just need to follow the document and provide correct input.

The dropout operations need to be applied as layers in this approach, so the graph looks longer.

Using the specs provided by TAs in the homework document, we construct the 5-layer NN with dropouts. The precision and recall are calculated by our own function as it seems that Tensorflow does not provide methods to calculate precision and recall for each label.

Early stopping is applied when the validation accuracy is not improving for 1000 steps (or minibatches) of training, although we can see there are still some improvements if we disabled early stopping. We decide to still use this hyperparameter because of the training time, however we expect the model can achieve better accuracy if we allow more steps without improvement.

Implementation of cross validation is done below, using *KFold* class in *sklearn* package.

# Bonus: Cross validation and dropout

In [None]:
from sklearn.model_selection import KFold
from sklearn.utils import shuffle

folds = 5
dropout_rates = [0.2, 0.5]

# Merge and shuffle original train and test dataset
X_cross = np.concatenate((X_train1, X_valid1), axis=0)
y_cross = np.concatenate((y_train1, y_valid1), axis=0)

# 5-fold cross validation
k_fold = KFold(n_splits=folds, random_state=87)

# Uses 5-fold cross validation to test which dropout rate is better
avg_accuracy = []

with tf.Session() as sess:
    for dropout_rate in dropout_rates:
        print('Current dropout: ', dropout_rate)
        acc = 0
        for train_index, valid_index in k_fold.split(X_cross):
            print("TRAIN:", train_index, "VALID:", valid_index)
            X_train, X_valid = X_cross[train_index], X_cross[valid_index]
            y_train, y_valid = y_cross[train_index], y_cross[valid_index]
            acc += train_tf_and_print_result(X_train, y_train, X_valid, y_valid, X_test1, y_test1)
            
        # Compute average dropout
        avg_accuracy.append(acc/folds)

# Final result
for i, dropout_rate in enumerate(dropout_rates):
    print('Dropout rate {}: {}'.format(dropout_rate, avg_accuracy[i]))

Current dropout:  0.2
TRAIN: [ 6120  6121  6122 ..., 30593 30594 30595] VALID: [   0    1    2 ..., 6117 6118 6119]
Step 0:  0.620425
Step 1000:  0.976797
Step 2000:  0.978268
Step 3000:  0.98317
INFO:tensorflow:Restoring parameters from ./Team20_HW2.ckpt
Test accuracy:  0.991243
Counter({1: 1127, 2: 1019, 3: 1006, 0: 979, 4: 963}) Counter({4: 4131, 0: 4115, 3: 4088, 2: 4075, 1: 3967})
Counter({0: 17, 2: 11, 3: 11, 1: 3, 4: 3}) Counter({4: 19, 2: 13, 1: 8, 3: 4, 0: 1})
Label	Precision	Recall
0	0.98293		0.99898
1	0.99735		0.99295
2	0.98932		0.9874
3	0.98918		0.99604
4	0.99689		0.98065
TRAIN: [    0     1     2 ..., 30593 30594 30595] VALID: [ 6120  6121  6122 ..., 12236 12237 12238]
Step 0:  0.569211
Step 1000:  0.983004
Step 2000:  0.988724
Step 3000:  0.985782
INFO:tensorflow:Restoring parameters from ./Team20_HW2.ckpt
Test accuracy:  0.992605
Counter({1: 1132, 2: 1017, 3: 1002, 0: 977, 4: 973}) Counter({4: 4128, 0: 4124, 3: 4099, 2: 4084, 1: 3969})
Counter({2: 16, 3: 10, 0: 8, 4: 4})

Seems like adding dropout can help a little bit on the final accuracy, however the original network is good enough so the benefit of dropout is not very significant.