# tf.data.Dataset

In [None]:
## Assorted examples, code doesn't actually run
import tensorflow as tf

### Example using TFRecords
filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
# The whole input pipeline as a single chained expression. `parser_fn` is not
# defined in this snippet; it stands for whatever turns one record into tensors.
dataset = (
    tf.data.TFRecordDataset(filenames)  # read records from the tfrecord files
    .map(parser_fn)                     # parse each record into tensors
    .shuffle(buffer_size=10000)         # shuffle through a 10000-element buffer
    .batch(32)                          # group consecutive elements into batches of 32
    .repeat()                           # repeat the dataset indefinitely
)

### Example using python generator
filenames = [..., ...]
def gen():
    """Yield one preprocessed (inputs, outputs) pair per entry of `filenames`.

    `read_file` and `preprocess` are not defined in this snippet; each is
    expected to return a pair that can be unpacked into two values.
    """
    for path in filenames:
        raw_inputs, raw_outputs = read_file(path) # load the arrays stored in this file
        features, labels = preprocess(raw_inputs, raw_outputs) # apply any preprocessing
        yield features, labels # one dataset element
        
# Build a dataset on top of the python generator `gen`.
# NOTE: num_workers, worker_index, num_examples_per_epoch, num_epochs and
# map_func are placeholders that this illustrative snippet does not define.
dataset = (
    tf.data.Dataset.from_generator(
        gen,
        output_types=(tf.float32, tf.float32),
        output_shapes=(tf.TensorShape([None]), tf.TensorShape([None])))
    .shard(num_workers, worker_index)  # keep only this worker's share of the elements
    .skip(5)                           # drop the first 5 elements
    .take(100)                         # keep at most the next 100
    # Each element is an (inputs, outputs) pair, so tf.data calls the predicate
    # with two arguments; a one-argument lambda would raise a TypeError.
    .filter(lambda inputs, outputs: True)
    .shuffle(num_examples_per_epoch * num_epochs)
    .batch(32)
    # Placed after batch(), so map_func sees whole batches -- very useful for
    # per-batch preprocessing. It also receives (inputs, outputs) as two arguments.
    .map(map_func)
)


### Using datasets
# `sess` and `model` are not defined in this snippet; it is illustrative only.
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next() # tensor corresponding to output of dataset
sess.run(next_element) # get one element from dataset

training_op = model(next_element) # define some tensorflow graph on the inputs
sess.run(training_op) # will pull the next_element from the dataset and then run it through the defined tf graph


### an alternative kind of iterator
# The dataset depends on a placeholder, so the iterator has to be initialized
# explicitly, feeding a value for that placeholder each time.
max_value = tf.placeholder(tf.int64, shape=[])
dataset = tf.data.Dataset.range(max_value) # create a dataset of 0, 1, 2, ..., max_value - 1
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

sess.run(iterator.initializer, feed_dict={max_value: 5})
for _ in range(5):
    sess.run(next_element) # gets 0, 1, 2, 3, 4 in order

# Re-running the initializer restarts the iterator with the new max_value.
sess.run(iterator.initializer, feed_dict={max_value: 200})
for _ in range(200):
    sess.run(next_element) # gets 0, 1, 2, ..., 199 in order

### ^ this kind of iterator is very useful for doing train/val splits and other such things

# Image Classification

## Download data from here: http://vision.stanford.edu/aditya86/ImageNetDogs/

In [None]:
from keras.layers import Dense, Flatten
from keras.applications import VGG16
from keras.objectives import categorical_crossentropy
import tensorflow as tf

def model(inputs, targets, num_classes):
    """Build a VGG16-based classifier on `inputs` and return its training op.

    Args:
        inputs: image tensor fed to VGG16 (presumably a batch of 224x224x3
            images, as produced by the input pipeline -- confirm with caller).
        targets: target tensor compared against the softmax output with
            categorical cross-entropy (presumably one-hot rows).
        num_classes: width of the final softmax layer.

    Returns:
        A `(training_op, loss)` tuple: the Adam minimization op and the
        scalar mean cross-entropy loss.
    """
    backbone = VGG16(weights='imagenet', include_top=False)
    features = backbone(inputs)
    flat_features = Flatten()(features)
    hidden = Dense(256, activation='relu')(flat_features)
    probabilities = Dense(num_classes, activation='softmax')(hidden)

    # Average the per-example cross-entropy into a single scalar.
    loss = tf.reduce_mean(categorical_crossentropy(targets, probabilities))

    optimizer = tf.train.AdamOptimizer()
    return optimizer.minimize(loss), loss

In [None]:
import os
import re
import random
import numpy as np

# Collect (image path, label) pairs from the sub-directories of 'Images'.
filenames_with_labels = []

for path in os.listdir('Images'):
    class_dir = os.path.join('Images', path)
    # Class folders are named "<id>-<label>". Split on the FIRST hyphen so that
    # labels which themselves contain hyphens stay intact: taking the text after
    # the last hyphen would turn both "flat-coated_retriever" and
    # "curly-coated_retriever" into the same label "coated_retriever".
    _, hyphen, label = path.partition('-')
    if not hyphen or not os.path.isdir(class_dir):
        # Skip entries without a label part and stray non-directory files.
        continue
    for fname in os.listdir(class_dir):
        filenames_with_labels.append((os.path.join(class_dir, fname), label))
random.shuffle(filenames_with_labels)
# np.unique returns the distinct labels in sorted order; a label's position in
# that array is its class index.
unique_labels =  np.unique([x[1] for x in filenames_with_labels])
label_to_ind = {label: i for i, label in enumerate(unique_labels)}

In [None]:
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
def generator():
    """Yield one example dict per entry of `filenames_with_labels`.

    Each yielded dict has two keys:
        'image':  the image loaded at 224x224, converted to an array and run
                  through the VGG16 `preprocess_input`.
        'target': the one-hot row for the entry's label, of length
                  `len(unique_labels)`.
    """
    # Row i of the identity matrix is the one-hot vector for class i. Build the
    # matrix once per pass instead of once per image.
    one_hot_rows = np.eye(len(unique_labels))
    for image_fname, label in filenames_with_labels:
        img = image.load_img(image_fname, target_size=(224, 224))
        img = image.img_to_array(img)
        img = preprocess_input(img)

        label_ind = label_to_ind[label]
        yield {'image': img, 'target': one_hot_rows[label_ind]}

In [None]:
num_epochs = 1
batch_size = 2
# dtype and shape of each entry in the dicts yielded by `generator`.
output_types = {name: tf.float32 for name in ('image', 'target')}
output_shapes = {
    'image': tf.TensorShape([224, 224, 3]),
    'target': tf.TensorShape([len(unique_labels)]),
}
def batch_process_map(tensor_dict):
    """Return a copy of `tensor_dict` with an extra 'mean' entry.

    'mean' is the mean of `tensor_dict['image']` over axes 1, 2 and 3, i.e. one
    value per leading-axis entry. The input dict is left untouched.
    """
    augmented = dict(tensor_dict)
    augmented['mean'] = tf.reduce_mean(tensor_dict['image'], axis=[1,2,3])
    return augmented
# Parenthesised chain: no line-continuation backslashes to forget (the missing
# one after .batch(...) used to make this cell a syntax error).
ds = (
    tf.data.Dataset.from_generator(generator, output_types, output_shapes)
    .shuffle(100)
    .repeat(num_epochs)      # make the num_epochs setting take effect
    .batch(batch_size)
    .map(batch_process_map)  # runs on whole batches because it follows batch()
)
iterator = ds.make_one_shot_iterator()
next_element = iterator.get_next()
training_op, loss = model(next_element['image'], next_element['target'], len(unique_labels))

In [None]:
import sys
# Run training steps until the dataset iterator is exhausted.
with tf.Session() as sess:
    # NOTE(review): VGG16(weights='imagenet') loads its weights through Keras'
    # own session; running global_variables_initializer() in this separate
    # session presumably re-initializes them -- confirm whether the pretrained
    # weights are actually in use here.
    # Every step consumes one batch, so an epoch lasts
    # ceil(num_examples / batch_size) steps, not num_examples steps.
    num_examples = len(filenames_with_labels)
    steps_per_epoch = max(1, -(-num_examples // batch_size))
    step = 0
    sess.run(tf.global_variables_initializer())
    while True:
        # Only the session call can signal the end of the data.
        try:
            _, l = sess.run([training_op, loss])
        except tf.errors.OutOfRangeError:
            print("Finished training")
            break
        print("Loss for step {}: {}".format(step, l))
        if step % steps_per_epoch == 0:
            # Integer division so the epoch prints as 0, 1, 2, ... not 0.0.
            print("Starting epoch: {}".format(step // steps_per_epoch))
        step += 1
        sys.stdout.flush()

# Tensorboard

In [None]:
from keras.layers import Dense, Flatten
from keras.applications import VGG16
from keras.objectives import categorical_crossentropy
import tensorflow as tf

def model(inputs, targets, num_classes):
    """Build the VGG16-based classifier and register TensorBoard summaries.

    Summaries recorded: the input images ("images"), histograms of the hidden
    layer ("fc_1") and of the softmax output ("activations"), and the scalar
    "loss".

    Args:
        inputs: image tensor fed to VGG16 and to the image summary.
        targets: target tensor compared against the softmax output with
            categorical cross-entropy (presumably one-hot rows).
        num_classes: width of the final softmax layer.

    Returns:
        A `(training_op, loss)` tuple: the Adam minimization op and the
        scalar mean cross-entropy loss.
    """
    tf.summary.image("images", inputs)

    backbone = VGG16(weights='imagenet', include_top=False)
    features = backbone(inputs)
    flat_features = Flatten()(features)

    hidden = Dense(256, activation='relu')(flat_features)
    tf.summary.histogram("fc_1", hidden)

    probabilities = Dense(num_classes, activation='softmax')(hidden)
    tf.summary.histogram("activations", probabilities)

    # Average the per-example cross-entropy into a single scalar.
    loss = tf.reduce_mean(categorical_crossentropy(targets, probabilities))
    tf.summary.scalar("loss", loss)

    optimizer = tf.train.AdamOptimizer()
    return optimizer.minimize(loss), loss

In [None]:
num_epochs = 1
# dtype and shape of each entry in the dicts yielded by `generator`.
output_types = {'image': tf.float32,
                'target': tf.float32}
output_shapes = {'image': tf.TensorShape([224, 224, 3]),
                 'target': tf.TensorShape([len(unique_labels)])}
# Parenthesised chain instead of fragile line-continuation backslashes.
ds = (
    tf.data.Dataset.from_generator(generator, output_types, output_shapes)
    .shuffle(100)
    .repeat(num_epochs)  # make the num_epochs setting take effect
    .batch(batch_size)
)
iterator = ds.make_one_shot_iterator()
next_element = iterator.get_next()
training_op, loss = model(next_element['image'], next_element['target'], len(unique_labels))

In [None]:
import sys
# Run training steps until the data runs out, logging summaries to 'logs'.
with tf.Session() as sess:
    # Build the merged-summary op exactly once. Calling merge_all() inside the
    # loop would add a new op to the graph on every step.
    merged = tf.summary.merge_all()
    file_writer = tf.summary.FileWriter('logs', sess.graph)
    # Every step consumes one batch, so an epoch lasts
    # ceil(num_examples / batch_size) steps, not num_examples steps.
    num_examples = len(filenames_with_labels)
    steps_per_epoch = max(1, -(-num_examples // batch_size))
    step = 0
    sess.run(tf.global_variables_initializer())
    try:
        while True:
            # Only the session call can signal the end of the data.
            try:
                summary, _, l = sess.run([merged, training_op, loss])
            except tf.errors.OutOfRangeError:
                print("Finished training")
                break
            file_writer.add_summary(summary, step)
            print("Loss for step {}: {}".format(step, l))
            if step % steps_per_epoch == 0:
                # Integer division so the epoch prints as 0, 1, 2, ... not 0.0.
                print("Starting epoch: {}".format(step // steps_per_epoch))
            step += 1
            sys.stdout.flush()
    finally:
        # Flush pending events to disk and release the file handle, even if
        # training stops with an error.
        file_writer.close()