# Setup

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import time
import itertools as it

import helpers_08

%matplotlib inline

# From Old NB lab 06 (make VGG)

# Modern CNN Architectures

## Loading TensorBoard Graphs for pre-built models

Inside of the `prebuilt` folder, there are TensorBoard graphs exported for VGGNet, InceptionV1, and ResNet models. You will use these as guidance for creating your own layer functions. Load them up in TensorBoard by using the following command (assuming you're running this command from the `notebooks` directory):

```shell
tensorboard --logdir=prebuilt
```

Navigate to `localhost:6006` in your browser. After you click on the "Graphs" link, you'll be able to switch to the various reference graphs by choosing from the dropdown "runs" option.

![](images/tb1.png)

![](images/tb2.png)

Below is a description of each graph:

* **vgg_19**: The entire VGGNet network (19-layer version)

The goal of this notebook/lab is to recreate VGG in TensorFlow.

### Provided Layer functions

#### `conv()`

Creates a 2D convolutional layer with Xavier-initialized weights. Automatically detects depth from previous layer

* **Arguments**
    * `inputs`: 4D `Tensor` with shape `[batch_size, height, width, channels]`
    * `depth`: The number of output channels this convolution should create. Scalar number.
    * `ksize`: 2D list of integers. The dimensions of convolutional kernel (ie. [3,3], [5,5], etc)
    * `strides`: 2D list of integers. The strides of the convolution (defaults to `[1, 1]`)
    * `padding`: String, accepted values `'SAME'` or `'VALID'`. The type of padding to use. Defaults to `SAME`.
    * `bval`: Floating point number. The initial values for biases
    * `activation_fn`: Lambda function. The activation function to use. defaults to `tf.nn.relu`
    * `scope`: The name to use for the variable scope.
    
* **Returns**
    * A 4D `Tensor` after the convolution operation.

In [38]:
def conv(inputs, depth, ksize, strides=[1, 1], padding='SAME',
         bval=0.01, activation_fn=tf.nn.relu, scope=None):
    """Create a 2D convolutional layer followed by an optional activation.

    Args:
        inputs: 4D `Tensor` whose last (channel) dimension is statically known.
        depth: Number of output channels.
        ksize: Two integers, the kernel height and width (e.g. `[3, 3]`).
        strides: Two integers, the vertical and horizontal stride.
        padding: `'SAME'` or `'VALID'`.
        bval: Initial value for every bias element.
        activation_fn: Callable applied to the biased convolution, or `None`
            to return the linear output.
        scope: Name for the variable scope (defaults to `'conv_layer'`).

    Returns:
        The 4D `Tensor` produced by the convolution, bias add and activation.
    """
    prev_depth = inputs.get_shape().as_list()[-1]
    ksize = list(ksize)
    kshape = ksize + [prev_depth, depth]
    strides = [1] + list(strides) + [1]
    # The fan-in of a convolution is the number of inputs feeding ONE output
    # unit: kernel_height * kernel_width * input_channels. Using the whole
    # input feature map (height * width * channels) instead makes the initial
    # weights orders of magnitude too small.
    fan_in = np.prod(ksize + [prev_depth], dtype=np.float32)
    with tf.variable_scope(scope, 'conv_layer'):
        xavier_stddev = tf.sqrt(tf.constant(2.0, dtype=tf.float32) / fan_in, name='xavier_stddev')
        w = tf.Variable(tf.truncated_normal(kshape, stddev=xavier_stddev), name='kernel')
        b = tf.Variable(tf.constant(bval, shape=[depth]), name='bias')
        conv = tf.nn.conv2d(inputs, w, strides, padding, name='conv')
        z = tf.nn.bias_add(conv, b)
        return z if activation_fn is None else activation_fn(z)

#### `fully_connected_layer()`

Creates a 2D fully connected layer with Xavier-initialized weights. Automatically detects depth from previous layer.

* **Arguments**
    * `inputs`: 2D `Tensor` with shape `[batch_size, depth]`
    * `depth`: Scalar. The number of neurons in this layer
    * `bval`: Floating point number. The initial values for biases
    * `activation_fn`: Lambda function. The activation function to use. defaults to `tf.nn.relu`
    * `keep_prob`: Scalar float indicating the keep probability for dropout (if any)
    * `scope`: The name to use for the variable scope.
    
* **Returns**
    * A 2D `Tensor` 

In [39]:
def fully_connected_layer(inputs, depth, bval=0.01, activation_fn=tf.nn.relu,
                          keep_prob=None, scope=None):
    """Create a fully connected layer with optional activation and dropout.

    Args:
        inputs: 2D tensor (or value convertible to one) whose last dimension
            is statically known.
        depth: Number of neurons in the layer.
        bval: Initial value for every bias element.
        activation_fn: Callable applied to `inputs @ W + b`, or `None`.
        keep_prob: Keep probability passed to `tf.nn.dropout`, or `None` to
            skip dropout.
        scope: Name for the variable scope (defaults to `'fully_connected'`).

    Returns:
        The 2D output `Tensor`.
    """
    inputs = tf.convert_to_tensor(inputs)
    fan_in = inputs.get_shape().as_list()[-1]
    with tf.variable_scope(scope, 'fully_connected'):
        xavier_stddev = tf.sqrt(tf.constant(2.0, dtype=tf.float32) / fan_in, name='xavier_stddev')
        w = tf.Variable(tf.truncated_normal([fan_in, depth], stddev=xavier_stddev), name='W')
        b = tf.Variable(tf.constant(bval, shape=[depth]), name='bias')
        out = tf.matmul(inputs, w) + b
        if activation_fn is not None:
            out = activation_fn(out)
        if keep_prob is not None:
            out = tf.nn.dropout(out, keep_prob)
        return out

#### `avgpool()` and `maxpool()`

Performs average pooling and max pooling, respectively

* **Arguments**
    * `inputs`: 4D `Tensor` with shape `[batch_size, height, width, channels]`
    * `ksize`: 2D list of integers. The dimensions of pooling kernel (ie. [2,2] etc)
    * `strides`: 2D list of integers. The strides of the pooling kernel.
    * `padding`: String, accepted values `'SAME'` or `'VALID'`. The type of padding to use. Defaults to `VALID`.
    * `name`: The name to use for the name scope.
    
* **Returns**
    * A 4D `Tensor` after the pooling operation.

In [11]:
def avgpool(inputs, ksize, strides, padding='VALID', name=None):
    """Average-pool a 4D tensor.

    `ksize` and `strides` are two-element lists; a leading and trailing 1 are
    added before they are handed to `tf.nn.avg_pool`.
    """
    window = [1] + ksize + [1]
    steps = [1] + strides + [1]
    with tf.name_scope(name, 'avgpool'):
        return tf.nn.avg_pool(inputs, window, steps, padding)

    
def maxpool(inputs, ksize, strides, padding='VALID', name=None):
    """Max-pool a 4D tensor.

    `ksize` and `strides` are two-element lists; a leading and trailing 1 are
    added before they are handed to `tf.nn.max_pool`.
    """
    window = [1] + ksize + [1]
    steps = [1] + strides + [1]
    with tf.name_scope(name, 'maxpool'):
        return tf.nn.max_pool(inputs, window, steps, padding)

#### `flatten()`

Flattens an N dimensional `Tensor` into a 2D `Tensor` (ie from shape `[batch_size, a, b, c]` to shape `[batch_size, a*b*c]`).

* **Arguments**
    * `inputs`: The input `Tensor` to flatten
    * `name`: The name to use for the name scope.
    
* **Returns**
    * A 2D flattened `Tensor`

In [None]:
def flatten(inputs, name=None):
    """Reshape `inputs` to 2D, keeping the first dimension.

    All dimensions after the first are multiplied together from the static
    shape to form the size of the second dimension.
    """
    trailing_dims = inputs.get_shape().as_list()[1:]
    flat_size = np.prod(trailing_dims)
    with tf.name_scope(name, 'flatten'):
        return tf.reshape(inputs, [-1, flat_size])

## VGGNet

[VGGNet paper on arXiv.org](https://arxiv.org/abs/1409.1556)

![](images/vggtable.png)

Use the above layer functions to recreate the 19 layer VGGNet from the above table (column E). Your model function should expect two parameter inputs:

* `inputs`: a 4D tensor with dtype `float32` and shape `[batch_size, 224, 224, 3]`
* `keep_prob`: A scalar `Tensor` with dtype `float32` representing the keep_probability for dropout

In [None]:
def vggnet(inputs, keep_prob):
    """Exercise stub for the 19-layer VGGNet; currently returns `None`.

    Args:
        inputs: Image batch; the exercise text specifies a float32 tensor of
            shape `[batch_size, 224, 224, 3]`.
        keep_prob: Scalar keep probability for dropout.
    """
    ### YOUR CODE HERE
    pass
    ###

In [None]:
# Test module: Run once you're ready to check your work
# Builds `vggnet` in a fresh graph on a random batch of 10 images and writes
# the graph to 'tbout/vggnet' so it can be inspected in TensorBoard.
graph = tf.Graph()
with graph.as_default():
    inputs = tf.random_normal([10, 224, 224, 3])
    keep_prob = tf.placeholder(tf.float32)
    output = vggnet(inputs, keep_prob)
    writer = tf.summary.FileWriter('tbout/vggnet', graph=graph)
    # Nothing else is written, so the writer is closed right away.
    writer.close()

# Variable Scope [from old NB 11]

## `tf.variable_scope()`

Throughout the class, we've created our variables directly, using `tf.Variable()`. This is the simplest way to use Variables, as it doesn't involve any programming "magic". However, TensorFlow includes another way to create Variables so that it is easier to access previously created Variables. It also forces you to be more precise with how you use Variables, and allows you to assign "presets" for various parameters in your Variables, such as the initialization values.

Let's look at an example:

In [67]:
# Create two variables inside the 'my_var_scope' variable scope of a new graph.
var_graph = tf.Graph()
with var_graph.as_default():
    with tf.variable_scope('my_var_scope') as scope:
        # Initializers carry no shape; the shape is given to get_variable().
        w_init = tf.truncated_normal_initializer()
        b_init = tf.zeros_initializer()
        w = tf.get_variable('w', shape=[10, 10], initializer=w_init)
        b = tf.get_variable('b', shape=[10], initializer=b_init)

The above code creates two variables, `w` and `b`, using the [`tf.get_variable()`](https://www.tensorflow.org/api_docs/python/tf/get_variable) method. The primary parameter is the string `name` of the `Variable` you'd like to retrieve. If a `Variable` in the scope already has that name, it will retrieve that `Variable` object. Otherwise, it will create that `Variable`. Because neither `w` nor `b` were created before, it creates them from scratch.

### Reusing variables

Now if we want to reuse them at a later time, we can access them by calling the `variable_scope` again, and setting its `reuse` parameter to `True`:

In [70]:
with var_graph.as_default():
    # Re-enter the same scope with reuse=True and look the variables up by name.
    with tf.variable_scope('my_var_scope', reuse=True) as scope:
        w_again = tf.get_variable('w')
        b_again = tf.get_variable('b')
        # The lookups are expected to hand back the objects created earlier.
        assert w == w_again
        assert b == b_again

Note that we _must_ set `reuse` to `True`. If we don't, TensorFlow will complain at us:

In [79]:
with var_graph.as_default():
    # Same scope, but without reuse: asking for an existing name is an error.
    with tf.variable_scope('my_var_scope') as scope:
        try:
            w_again = tf.get_variable('w')
            b_again = tf.get_variable('b')
        except ValueError as e:
            # Print the message instead of letting the cell fail.
            print(e)

Variable my_var_scope/w already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:

  File "<ipython-input-67-fb67dcccc716>", line 6, in <module>
    w = tf.get_variable('w', shape=[10, 10], initializer=w_init)
  File "/Users/Sam/anaconda/envs/tensorflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/Users/Sam/anaconda/envs/tensorflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):



As an alternative to passing `reuse` into the `variable_scope` parameter, we can set it after the fact by calling the scope's `reuse_variables()` method:

In [86]:
with var_graph.as_default():
    with tf.variable_scope('my_var_scope') as scope:
        # Switch the already-open scope into reuse mode.
        scope.reuse_variables()
        w_again = tf.get_variable('w')
        b_again = tf.get_variable('b')
        assert w == w_again
        assert b == b_again

You can get the current variable scope with `tf.get_variable_scope()`; similar to `tf.get_default_graph()`:

In [87]:
with var_graph.as_default():
    with tf.variable_scope('my_var_scope'):
        # Fetch the scope object from TensorFlow instead of binding it with `as`.
        scope = tf.get_variable_scope()
        scope.reuse_variables()
        w_again = tf.get_variable('w')
        b_again = tf.get_variable('b')
        assert w == w_again
        assert b == b_again

### Variable initializers

Notice that we used two Operations we've never seen before: `truncated_normal_initializer` and `zeros_initializer`. They are similar to what we've used in the past to initialize Variables: `truncated_normal` and `zeros`. The `initializer` Operations are designed to be used with `tf.get_variable()`- they define a way to create an arbitrary initial value inside a Tensor, regardless of shape.

Notice how we don't specify the shape of the `Variable` until we call `tf.get_variable()`. This separation allows us to reuse the same initialization `Operation` for multiple Variables:

In [None]:
init_graph = tf.Graph()
with init_graph.as_default():
    with tf.variable_scope('my_var_scope') as scope:
        # One initializer object is shared by three variables of different shapes.
        w_init = tf.truncated_normal_initializer()
        w1 = tf.get_variable('w1', shape=[10, 10], initializer=w_init)
        w2 = tf.get_variable('w2', shape=[200], initializer=w_init)
        w3 = tf.get_variable('w3', shape=[300,10,10], initializer=w_init)

### Setting default parameters

Above, we use the line `initializer=w_init` over and over again. It would be nice if we could have that automatically be done for us. Luckily, we can! The `variable_scope()` function includes several options that we can provide as a default for any `Variables` we create inside that scope. To set `w_init` as our default initializer, we simply pass in `initializer=w_init` inside of the call to `variable_scope`. Then, we can leave the `initializer=` portion out of `get_variable()`:

In [98]:
default_init_graph = tf.Graph()
with default_init_graph.as_default():
    w_init = tf.truncated_normal_initializer()
    # The initializer is given to the scope, so get_variable() calls omit it.
    with tf.variable_scope('my_var_scope', initializer=w_init) as scope:
        w1 = tf.get_variable('w1', shape=[10, 10])
        w2 = tf.get_variable('w2', shape=[200])
        w3 = tf.get_variable('w3', shape=[300,10,10])

These default parameters can be nested, too.

In [100]:
nested_init_graph = tf.Graph()
with nested_init_graph.as_default():
    init1 = tf.truncated_normal_initializer()
    init2 = tf.zeros_initializer()
    # Outer scope is opened with init1 as its default initializer.
    with tf.variable_scope('var_scope_1', initializer=init1) as scope:
        w1 = tf.get_variable('w1', shape=[10, 10])
        # Inner scope is opened with init2 as its default initializer.
        with tf.variable_scope('var_scope_2', initializer=init2):
            w2 = tf.get_variable('w2', shape=[200])
            w3 = tf.get_variable('w3', shape=[300,10,10])

# VGG, Devices, etc. [from old NB 12]

In [1]:
import bz2
import collections
import os.path
import re
import random
import zipfile

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from lxml import etree

import helpers

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# TFRecords

The below code downloads the Tiny ImageNet dataset, which is a miniaturized and simplified version of the ILSVRC dataset. There are only 200 classes as opposed to 1000, and each of the files has been scaled to 64x64 pixels. We're mainly using it due to the fact that it is a smaller dataset!

In [None]:
# Download the Tiny ImageNet archive, unpack it under data/12/tinyimagenet,
# then delete the archive.
url = 'http://cs231n.stanford.edu/tiny-imagenet-200.zip'
helpers.mkdir('data')
helpers.mkdir(os.path.join('data', '12'))
zip_path = helpers.download(url, os.path.join('data', '12', 'tinyimagenet.zip'))
data_path = os.path.join('data', '12', 'tinyimagenet')
# The context manager closes the archive even if extraction fails part-way.
with zipfile.ZipFile(zip_path, 'r') as zipped:
    zipped.extractall(data_path)
os.remove(zip_path)

We now create helper structures for converting from an integer id to a string synset label (and back)

In [None]:
# One synset id per line of wnids.txt; the line number is the integer class id.
with open(os.path.join(data_path, 'tiny-imagenet-200', 'wnids.txt'), 'r') as f:
    id_to_synset = [line.strip() for line in f]
# Inverse lookup: synset string -> integer class id.
synset_to_id = dict(
    (synset, i) for i, synset in enumerate(id_to_synset)
)
# Check that ids were assigned correctly
assert all(
    synset_to_id[synset] == i for i, synset in enumerate(id_to_synset)
)

In [None]:
# Common helper function pattern for creating Feature protos
def _int64_feature(value):
    """Wrap a single integer in a `tf.train.Feature` holding an `Int64List`."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)


def _bytes_feature(value):
    """Wrap a single bytes value in a `tf.train.Feature` holding a `BytesList`."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

In [None]:
# Train data
# Serialize every training JPEG (raw file bytes) together with its integer
# label into a single TFRecord file.
train_path = os.path.join('data', '12', 'tinyimagenet_train.tfrecords')
# Write to `train_path`; the previously used name `tfrecord_path` is never
# defined in this notebook.
writer = tf.python_io.TFRecordWriter(train_path)
try:
    for root, dirs, files in os.walk(os.path.join(data_path, 'tiny-imagenet-200', 'train')):
        for file in files:
            if file.endswith('.JPEG'):
                # The first 9 characters of the file name are used as the
                # synset id (presumably names look like 'n01443537_0.JPEG').
                synset = file[:9]
                label = synset_to_id[synset]
                with open(os.path.join(root, file), 'rb') as f:
                    image_bytes = f.read()
                example = tf.train.Example(features=tf.train.Features(feature={
                    'image_bytes': _bytes_feature(image_bytes),
                    'label': _int64_feature(label)
                }))
                writer.write(example.SerializeToString())
finally:
    # Always release the output file, even if a read or lookup fails.
    writer.close()

In [None]:
# Map each validation image file name to its integer class id, using the
# first two whitespace-separated columns of val_annotations.txt.
name_to_label = {}
with open(os.path.join(data_path, 'tiny-imagenet-200', 'val', 'val_annotations.txt')) as f:
    for line in f:
        columns = line.split()
        name_to_label[columns[0]] = synset_to_id[columns[1]]

In [None]:
# Validation data
# Serialize every validation JPEG (raw file bytes) together with its integer
# label into a single TFRecord file.
val_path = os.path.join('data', '12', 'tinyimagenet_val.tfrecords')
# Write to `val_path`; the previously used name `tfrecord_path` is never
# defined in this notebook.
writer = tf.python_io.TFRecordWriter(val_path)
try:
    for root, dirs, files in os.walk(os.path.join(data_path, 'tiny-imagenet-200', 'val')):
        for file in files:
            if file.endswith('.JPEG'):
                # Validation labels come from the annotation table, not the name.
                label = name_to_label[file]
                with open(os.path.join(root, file), 'rb') as f:
                    image_bytes = f.read()
                example = tf.train.Example(features=tf.train.Features(feature={
                    'image_bytes': _bytes_feature(image_bytes),
                    'label': _int64_feature(label)
                }))
                writer.write(example.SerializeToString())
finally:
    # Always release the output file, even if a read or lookup fails.
    writer.close()

# Queues

Using Queues will help us ensure that we're consistently filling the GPU ram on our system. They asynchronously pull in and preprocess data before putting them into queues.

In [None]:
# Should increase these for GPU systems
# Number of examples per batch produced by tf.train.shuffle_batch
BATCH_SIZE = 5
# Number of threads shuffle_batch uses to fill its queue
NUM_THREADS = 2
# Maximum number of examples held in the shuffle queue
CAPACITY = 100 + 3 * BATCH_SIZE

In [None]:
def image_label_queue():
    """Build the queue-based input pipeline over the training TFRecord file.

    Each record is parsed into a JPEG byte string and an integer label. The
    image is decoded, resized to 224x224, converted to float and rescaled
    with `(x - 127.5) / 127.5`, then batched with `tf.train.shuffle_batch`.

    Returns:
        A pair `(image_batch, label_batch)` holding `BATCH_SIZE` examples.
    """
    with tf.name_scope('input_queue'):
        # Step 1: create the list of filenames
        # TensorFlow has a helper for this: tf.train.match_filenames_once()
        # https://www.tensorflow.org/api_docs/python/tf/train/match_filenames_once
        # Only the training records are read by this pipeline.
        train_path = os.path.join('data', '12', 'tinyimagenet_train.tfrecords')

        # Step 2: create a Queue that goes through the list of filenames
        filename_queue = tf.train.string_input_producer([train_path])

        # Step 3: read one serialized Example at a time and parse its features
        reader = tf.TFRecordReader()
        _, example = reader.read(filename_queue)
        features = tf.parse_single_example(
            example,
            features={
                'image_bytes': tf.FixedLenFeature([], tf.string),
                'label': tf.FixedLenFeature([], tf.int64)
            }
        )

        image_bytes = features['image_bytes']
        label = tf.to_int32(features['label'])

        # PREPROCESS BEFORE ADDING TO QUEUE
        image = tf.image.decode_jpeg(image_bytes, channels=3)
        # resize_bilinear works on batches, so add a batch dimension of 1 ...
        image = tf.expand_dims(image, 0)
        image = tf.image.resize_bilinear(image, [224, 224])
        # ... and remove exactly that dimension again afterwards.
        image = tf.squeeze(image, [0])
        image = tf.to_float(image)
        image = (image - 127.5) / 127.5

        # Step 4: shuffle and batch the preprocessed examples
        image_batch, label_batch = tf.train.shuffle_batch(
            [image, label],
            batch_size=BATCH_SIZE,
            num_threads=NUM_THREADS,
            capacity=CAPACITY,
            min_after_dequeue=15
        )
        return image_batch, label_batch

# Configuration Proto

https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/protobuf/config.proto#L172-L250

When creating a `Session`, you can pass in a set of options in the form of a ConfigProto protocol buffer. You simply add the options you want to the `ConfigProto` initialization function. Here are a couple of the things you can do with the config proto:

* Quietly place Ops on different devices if you explicitly call `with tf.device()` using a device that doesn't exist
* Print where devices are placed as they are created
* Tell TensorFlow not to grab all of the GPU memory immediately, but rather to allocate it as necessary
* Set a timeout for Operations

In [None]:
# GPU-specific options, passed to the session config below.
gpu_config = tf.GPUOptions(
    allow_growth=True
)

# Session configuration with soft placement, device-placement logging and
# the GPU options from above.
config=tf.ConfigProto(
    allow_soft_placement=True,
    log_device_placement=True,
    gpu_options=gpu_config
)    
    
sess = tf.Session(config=config)

# Multi GPU TensorFlow

By default in TensorFlow, Operations are automatically placed on a CPU or GPU (if available). In general, an Operation will be automatically placed on a GPU unless there isn't a GPU implementation of that `Operation` (assuming you have TensorFlow installed for GPUs).

In [None]:
# Start from an empty default graph.
tf.reset_default_graph()

# The two random matrices are explicitly assigned to the CPU ...
with tf.device('/cpu:0'):
    a = tf.random_normal([100, 100])
    b = tf.random_normal([100, 100])

# ... and their product to the first GPU.
with tf.device('/gpu:0'):
    c = tf.matmul(a, b)
    
config=tf.ConfigProto(
    allow_soft_placement=True,
    log_device_placement=True
)    
    
with tf.Session(config=config) as sess:
    print(sess.run(c))

## Adjusting our layer functions

In order to properly utilize multiple GPUs, we're going to need to adjust our layer functions to place all Variables on CPU. This will ensure that we have proper sharing of our variables

In [None]:
def conv2d(inputs, depth, ksize, strides=[1, 1], padding='SAME',
         bval=0.01, activation=tf.nn.relu, scope=None):
    """Create a 2D convolutional layer whose variables are placed on the CPU.

    The kernel and bias are created with `tf.get_variable`, so they can be
    shared between model replicas that reuse the same variable scope.

    Args:
        inputs: 4D `Tensor` whose last (channel) dimension is statically known.
        depth: Number of output channels.
        ksize: Two integers, the kernel height and width (e.g. `[3, 3]`).
        strides: Two integers, the vertical and horizontal stride.
        padding: `'SAME'` or `'VALID'`.
        bval: Initial value for every bias element.
        activation: Callable applied to the biased convolution, or `None`
            to return the linear output.
        scope: Name for the variable scope (defaults to `'conv_layer'`).

    Returns:
        The 4D `Tensor` produced by the convolution, bias add and activation.
    """
    with tf.variable_scope(scope, 'conv_layer'):
        with tf.device('/cpu:0'):
            prev_depth = inputs.get_shape().as_list()[-1]
            ksize = list(ksize)
            kshape = ksize + [prev_depth, depth]
            strides = [1] + list(strides) + [1]
            # The fan-in of a convolution is the number of inputs feeding ONE
            # output unit: kernel_height * kernel_width * input_channels.
            # Using the whole input feature map instead makes the initial
            # weights orders of magnitude too small.
            fan_in = np.prod(ksize + [prev_depth], dtype=np.float32)
            xavier_stddev = tf.sqrt(tf.constant(2.0, dtype=tf.float32) / fan_in, name='xavier_stddev')
            w_init = tf.truncated_normal_initializer(stddev=xavier_stddev)
            w = tf.get_variable('kernel', shape=kshape, initializer=w_init)
            b = tf.get_variable('bias', shape=[depth], initializer=tf.constant_initializer(bval))
        # The convolution itself is created outside the CPU device block, so
        # it follows whatever device placement the caller has set up.
        conv = tf.nn.conv2d(inputs, w, strides, padding, name='conv')
        z = tf.nn.bias_add(conv, b)
        return z if activation is None else activation(z)
    
def fully_connected_layer(inputs, depth, bval=0.01, activation=tf.nn.relu,
                          keep_prob=None, scope=None):
    """Create a fully connected layer whose variables are placed on the CPU.

    Args:
        inputs: 2D tensor (or value convertible to one) whose last dimension
            is statically known.
        depth: Number of neurons in the layer.
        bval: Initial value for every bias element.
        activation: Callable applied to `inputs @ W + b`, or `None`.
        keep_prob: Keep probability passed to `tf.nn.dropout`, or `None` to
            skip dropout.
        scope: Name for the variable scope (defaults to `'fully_connected'`).

    Returns:
        The 2D output `Tensor`.
    """
    with tf.variable_scope(scope, 'fully_connected'):
        # Variable creation (and the initializer maths) is pinned to the CPU.
        with tf.device('/cpu:0'):
            inputs = tf.convert_to_tensor(inputs)
            fan_in = inputs.get_shape().as_list()[-1]
            xavier_stddev = tf.sqrt(tf.constant(2.0, dtype=tf.float32) / fan_in, name='xavier_stddev')
            w_init = tf.truncated_normal_initializer(stddev=xavier_stddev)
            w = tf.get_variable('weight', shape=[fan_in, depth], initializer=w_init)
            b = tf.get_variable('bias', shape=[depth], initializer=tf.constant_initializer(bval))
        out = tf.matmul(inputs, w) + b
        if activation is not None:
            out = activation(out)
        if keep_prob is not None:
            out = tf.nn.dropout(out, keep_prob)
        return out

def flatten(inputs, name=None):
    """Collapse every dimension after the first into one, giving a 2D tensor."""
    static_shape = inputs.get_shape().as_list()
    flat_size = np.prod(static_shape[1:])
    with tf.name_scope(name, 'flatten'):
        flat = tf.reshape(inputs, [-1, flat_size])
    return flat

def vgg_model(inputs, num_classes=1000):
    """Build the 19-layer VGG network and return its class logits.

    The network is five blocks of 3x3 convolutions (scopes 'c1' .. 'c16'),
    each followed by 2x2 max pooling, then two 4096-unit ReLU layers
    ('f17', 'f18') and a linear output layer ('f19').

    Args:
        inputs: 4D image batch with statically known height, width and
            channel dimensions (they are needed to size the first
            fully connected layer).
        num_classes: Width of the output layer. Defaults to 1000.

    Returns:
        2D `Tensor` of unnormalized class scores with `num_classes` columns.
    """
    # (output depth, number of conv layers) for each of the five conv blocks.
    block_specs = [(64, 2), (128, 2), (256, 4), (512, 4), (512, 4)]
    with tf.name_scope('vgg_net'):
        net = inputs
        layer_idx = 0
        for depth, num_convs in block_specs:
            for _ in range(num_convs):
                layer_idx += 1
                net = conv2d(net, depth, [3, 3], activation=tf.nn.relu,
                             scope='c{}'.format(layer_idx))
            net = tf.layers.max_pooling2d(net, [2, 2], [2, 2])
        flat = flatten(net)
        fc = fully_connected_layer(flat, 4096, activation=tf.nn.relu, scope='f17')
        fc = fully_connected_layer(fc, 4096, activation=tf.nn.relu, scope='f18')
        # Logits must stay linear: the layer's default ReLU would clip every
        # negative score to zero before the softmax in the loss.
        logits = fully_connected_layer(fc, num_classes, activation=None, scope='f19')
        return logits

def vgg_loss(logits, labels):
    """Mean softmax cross-entropy between `logits` and integer class `labels`.

    Args:
        logits: 2D tensor of unnormalized class scores, one row per example.
        labels: 1D integer tensor of class ids, one per example.

    Returns:
        Scalar `Tensor`: the cross-entropy averaged over the batch.
    """
    with tf.name_scope('loss'):
        # The sparse op takes class ids directly, so the number of classes is
        # taken from `logits` instead of being hard-coded in a one-hot depth.
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
        loss = tf.reduce_mean(loss)
    return loss

In [None]:
"""
https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py#L101-L136
"""

def average_gradients(tower_grads):
    """Calculate the average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
        tower_grads: List with one entry per tower. Each entry is the list of
            (gradient, variable) tuples computed on that tower, and every
            tower lists the variables in the same order.

    Returns:
        List of (gradient, variable) pairs where the gradient has been
        averaged across all towers. If no tower produced a gradient for a
        variable, the pair is `(None, variable)`.
    """
    with tf.name_scope('merge_losses'):
        average_grads = []
        # zip(*...) regroups the per-tower lists by variable:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        for grad_and_vars in zip(*tower_grads):
            # Keep in mind that the Variables are redundant because they are
            # shared across towers, so the first tower's pointer is enough.
            v = grad_and_vars[0][1]

            # A gradient is None when the loss does not depend on the variable.
            grads = [g for g, _ in grad_and_vars if g is not None]
            if not grads:
                average_grads.append((None, v))
                continue

            # Stack along a new leading 'tower' dimension and average over it.
            grad = tf.reduce_mean(tf.stack(grads, axis=0), 0)
            average_grads.append((grad, v))
        return average_grads

In [None]:
# Number of model replicas; replica i is placed on '/gpu:i'
NUM_GPU = 2
# Learning-rate schedule: multiply by DECAY_FACTOR every DECAY_STEPS steps
DECAY_STEPS = 100000
DECAY_FACTOR  = 0.998
# Directory that checkpoints are written to
CKPT_PATH = 'vggnet'
multi_graph = tf.Graph()
# Note how we're placing everything on the CPU by default
with multi_graph.as_default(), tf.device('/cpu:0'):
    
    # Integer step counter. A float counter cannot be passed as the
    # `global_step` of `saver.save`, which the training cell does.
    gs_init = tf.constant_initializer(0)
    global_step = tf.get_variable('global_step', [], dtype=tf.int64,
                                  initializer=gs_init, trainable=False)
    inc_step = tf.assign_add(global_step, 1, name='inc_step')
    
    lr = tf.train.exponential_decay(0.05, 
                                    global_step,
                                    DECAY_STEPS,
                                    DECAY_FACTOR,
                                    staircase=True)
    
    opt = tf.train.AdamOptimizer(lr)
    
    image_batch, label_batch = image_label_queue()
    
    # Here's where we loop through our various GPUs.
    # NOTE(review): every replica is built on the same `image_batch` /
    # `label_batch` tensors, so all replicas see the same examples in a step.
    grads = []
    with tf.variable_scope(tf.get_variable_scope()) as scope:
        for i in range(NUM_GPU):
            with tf.device('/gpu:{}'.format(i)):
                with tf.name_scope('replica_{}'.format(i)):
                    # Create model
                    logits = vgg_model(image_batch)
                    loss = vgg_loss(logits, label_batch)
                    # Every replica after the first reuses the variables
                    # created by the first one.
                    scope.reuse_variables()
                    replica_grad = opt.compute_gradients(loss)
                    grads.append(replica_grad)
                    tf.summary.scalar('loss', loss)
    # After the loop, `logits` and `loss` refer to the last replica.
       
    avg_grads = average_gradients(grads)            
    # `inc_step` is the single place where the step counter is advanced, so
    # the counter is deliberately NOT handed to apply_gradients as well;
    # doing both would count every training step twice.
    train = opt.apply_gradients(avg_grads)
    
    for grad, var in avg_grads:
        if grad is not None:
            tf.summary.histogram(var.op.name + '_gradients', grad)
    
    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)
    
    summary_op = tf.summary.merge_all()
        
    saver = tf.train.Saver()
    
    init = tf.global_variables_initializer()

In [None]:
config=tf.ConfigProto(
    allow_soft_placement=True,
    log_device_placement=True
)    

writer = tf.summary.FileWriter('tbout/multigpu', graph=multi_graph)

sess = tf.Session(config=config, graph=multi_graph)  
sess.run(init)

# Register the queue-runner threads with the coordinator so that
# `should_stop()`, `request_stop()` and `join()` actually govern them.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)

# Make sure the checkpoint directory exists before the first save.
os.makedirs(CKPT_PATH, exist_ok=True)

try:
    i = 0
    while i < 10000 and not coord.should_stop():
        if i % 100 == 0:
            # Every 100th iteration also evaluates and logs the summaries.
            err, step, summ, _ = sess.run([loss, inc_step, summary_op, train])
            step = int(step)
            writer.add_summary(summ, step)
            writer.flush()
        else:
            err, step, _ = sess.run([loss, inc_step, train])
            step = int(step)

        if step % 1000 == 0 or step == 1:
            saver.save(sess, os.path.join(CKPT_PATH, 'vgg.ckpt'), step)
        
        i += 1
        # Don't want to run a bunch of training steps in the notebook! Remove me in a real application
        break
except tf.errors.OutOfRangeError:
    print('done!')
finally:
    coord.request_stop()
    try:
        coord.join(threads)
    except RuntimeError:
        print('Threads not done.')
    # Release the event file and the session's resources.
    writer.close()
    sess.close()

RuntimeError

# Show and Tell

![](images/12/showandtell.png)

In [None]:
## VOCABULARY SETTINGS
# The number of words in our input vocabulary
IN_VOCAB = 10000
# The size of our embedded input word vectors
IN_EMBED_SIZE = 100
# The number of words in our output vocabulary
OUT_VOCAB = 10000
# The size of our embedded output word vectors. The embedded words are fed to
# the same decoder cell as the encoded image taken from VGG layer 'f18'.
OUT_EMBED_SIZE = 4096

## RNN SETTINGS
# The number of units/neurons in each layer of RNN cells. 
RNN_WIDTH = 1000
# The number of stacked RNN cells (used to build the decoder below)
RNN_DEPTH = 4
# The maximum sequence length for our inputs
MAX_LEN_INPUTS = 10
# The maximum sequence length for our outputs
MAX_LEN_LABELS = 10

## TRAINING SETTINGS
# The size of our training batch
# NOTE: this rebinds the BATCH_SIZE used by the queue-based input pipeline.
BATCH_SIZE = 32
# The number of negative samples to sample for our sampled softmax loss
NUM_SAMPLES=50

show_graph = tf.Graph()
with show_graph.as_default():
    inputs = tf.placeholder(tf.float32, [None, 224, 224, 3])
    labels = tf.placeholder(tf.int32, [None, MAX_LEN_LABELS])
    labels_length = tf.placeholder(tf.int32, [None])
    
    # Encoder
    # The image encoding is the activation of VGG's second fully connected
    # layer ('f18'), fetched from the graph by name.
    vgg_logits = vgg_model(inputs)
    encoded_image = show_graph.get_tensor_by_name('vgg_net/f18/Relu:0')
    
    # Decoder
    output_embedding = tf.Variable(tf.truncated_normal([OUT_VOCAB, OUT_EMBED_SIZE]))
    # Output projection from the RNN state to vocabulary scores
    w = tf.Variable(tf.truncated_normal([RNN_WIDTH, OUT_VOCAB]))
    b = tf.Variable(tf.zeros([OUT_VOCAB]))
    
    cell_base = tf.contrib.rnn.GRUCell
    dec_cells = [cell_base(RNN_WIDTH) for _ in range(RNN_DEPTH)]
    dec_cell = tf.contrib.rnn.MultiRNNCell(dec_cells)
    
    # Initial decoder state is zero, instead of encoder state (not an RNN encoder)
    dec_state = tuple(tf.unstack(tf.zeros([RNN_DEPTH,tf.shape(labels)[0], RNN_WIDTH])))
    # Initial input is the encoded image
    output, dec_state = dec_cell(encoded_image, dec_state)
    # Later calls to the cell must reuse the variables created by the first call
    tf.get_variable_scope().reuse_variables()
    
    # Unroll the decoder for MAX_LEN_LABELS steps. At step t, outputs[t] is
    # the RNN output used to predict word t and logits[t] its vocabulary
    # scores; the arg-max word is embedded and fed back as the next input.
    outputs = []
    logits = []
    prev = output
    for _ in range(MAX_LEN_LABELS):
        outputs.append(prev)
        logit = tf.matmul(prev, w) + b
        logits.append(logit)
        label_idx = tf.argmax(logit, 1)
        label_emb = tf.nn.embedding_lookup(output_embedding, label_idx)
        prev, dec_state = dec_cell(label_emb, dec_state)

    # mask[i, t] is 1.0 for t < labels_length[i] and 0.0 otherwise
    lengths_exp = tf.expand_dims(labels_length, 1)
    mask = tf.reshape(tf.tile(tf.range(MAX_LEN_LABELS), [tf.shape(labels)[0]]), [-1, MAX_LEN_LABELS], name='mask_reshape')
    mask = tf.to_float(tf.less(mask, lengths_exp))
    
    def s_loss(logits, labels):
        """Sampled-softmax loss on flattened RNN outputs.

        Despite its name, `logits` holds RNN outputs of width RNN_WIDTH; the
        projection to the vocabulary happens inside the sampled softmax.
        """
        logits = tf.reshape(logits, [-1, RNN_WIDTH], name='logit_s')
        labels = tf.reshape(labels, [-1, 1], name='label_s')
        return tf.nn.sampled_softmax_loss(
            weights=tf.transpose(w),
            biases=b,
            labels=labels,
            inputs=logits,
            num_sampled=NUM_SAMPLES,
            num_classes=OUT_VOCAB
        )
    # `outputs` is a list over time of [batch, RNN_WIDTH] tensors. Stacking on
    # axis 1 gives [batch, time, RNN_WIDTH]; reshaping the time-major list
    # instead would mix up the batch and time dimensions.
    outputs_tensor = tf.stack(outputs, axis=1, name='logit_reshape')
    labels_float = tf.to_float(labels)
    loss = tf.contrib.seq2seq.sequence_loss(outputs_tensor, labels_float, mask, softmax_loss_function=s_loss)
    
    learning_rate = tf.placeholder(tf.float32, [])
    train = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
    
    global_step = tf.Variable(0, trainable=False, name='global_step')
    inc_step = tf.assign_add(global_step, 1, name='inc_step')
    
    # Same layout fix as above: [batch, time, OUT_VOCAB]
    logits_tensor = tf.stack(logits, axis=1)
    softmax = tf.nn.softmax(logits_tensor)
    predictions = tf.to_int32(tf.argmax(softmax, 2))
    
    init = tf.global_variables_initializer()
    
    