In [52]:
# Shows an example usage of the tf.one_hot(..) and tf.squeeze(..) functions.
# When developing a NN, the one_hot function is useful when you only
# have the class number as the label (i.e. 0, 1, 2, 3, etc.). If you know
# the total number of classes (= number of units in the output layer), you can
# easily construct a matrix of vectors where each vector represents the label
# of a given train/dev/test example in the output layer. This can be
# needed when calculating either the forward prop or the cost of the model.


In [53]:
import tensorflow as tf

In [54]:
# UPDATE-1 IN EACH PROJECT (depending on default values for each column)
# One single-element list per CSV column, in column order. The element is the
# value substituted when that column is missing in a row, and its type
# (str / float / int) determines the type the column is decoded as.
record_defaults = [
    [""],     # first column: string, dropped by decode_csv (not a feature)
    [0.0],    # feature column 1
    [0.0],    # feature column 2
    [0.0],    # feature column 3
    [0.0],    # feature column 4
    [0.0],    # feature column 5
    [0],      # last column: integer class label
]

def decode_csv(line):
    """Parse one CSV line into a (features, label) tuple.

    The line is decoded column by column according to the module-level
    record_defaults. The first column is thrown away (it is assumed NOT to be
    a feature), the last column is used as the label, and all columns in
    between are stacked into one feature tensor.

    Args:
        line: a single line of CSV text (string tensor).

    Returns:
        Tuple (features, label): the stacked feature columns and the label
        column stacked on its own, so the label keeps a length-1 axis.
    """
    columns = tf.decode_csv(line, record_defaults)

    label_column = columns.pop()   # take the last column (the label) off the list
    columns.pop(0)                 # discard the first column: not a feature

    # What is left are only the feature columns; stack them so that forward
    # prop., etc. can later be vectorized.
    features = tf.stack(columns)
    # Stack the single label column as a one-element list so the label is a
    # length-1 vector instead of a scalar.
    label = tf.stack([label_column])

    return features, label


In [61]:
# READS DATA FROM the train data set CSV FILES in mini-batches and, for each batch, prints the
# labels, their one-hot encoding (tf.one_hot) and the one-hot encoding with the extra dimension
# removed (tf.squeeze). No normalization is applied here and nothing is written to a new file.

# ASSUMPTIONS: (Otherwise, decode_csv function needs update)
# 1) The first column is NOT a feature. (It is most probably a training example ID or similar)
# 2) The last column is always the label. And there is ONLY 1 column that represents the label.
#    If more than 1 column represents the label, decode_csv() function needs update 
# 3) The first row is assumed to include names of the data types (i.e. feature name, label, etc.) so it is skipped

def apply_one_hot_on_labels(train_input_paths, minibatch_size, num_classes):
    """Print every mini-batch of labels raw, one-hot encoded, and squeezed.

    Reads the given CSV files line by line (skipping the first row of each
    file), parses each line with decode_csv, groups the examples into
    mini-batches and, for every batch, prints:
      * the label array and its shape,
      * its tf.one_hot encoding and shape,
      * the encoding with axis 1 squeezed out and its shape.

    Args:
        train_input_paths: list of CSV file paths fed to the filenames
            placeholder.
        minibatch_size: number of examples per batch (the last batch may
            contain fewer examples).
        num_classes: depth passed to tf.one_hot, i.e. the length of each
            one-hot vector.

    Returns:
        None. Results are only printed.
    """
    with tf.name_scope("read_next_train_batch"):
        filenames = tf.placeholder(tf.string, shape=[None])
        dataset = tf.data.Dataset.from_tensor_slices(filenames)
        dataset = dataset.flat_map(
            lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
        dataset = dataset.batch(minibatch_size)
        iterator = dataset.make_initializable_iterator()
        # decode_csv yields (features, label); only the labels are used here.
        _, labels_op = iterator.get_next()

    # Build the one_hot / squeeze ops ONCE, before the loop. Creating them
    # inside the loop would add new nodes to the graph on every batch.
    with tf.name_scope("one_hot_labels"):
        one_hot_op = tf.one_hot(labels_op, depth=num_classes)
        # one_hot adds one more dimension. Remove that dimension if you don't need it
        squeezed_op = tf.squeeze(one_hot_op, axis=1)

    with tf.Session() as sess:
        sess.run(iterator.initializer, feed_dict={filenames: train_input_paths})
        while True:
            try:
                # Fetch all three tensors in a single run() call: each run()
                # that depends on the iterator output consumes the NEXT batch,
                # so separate calls would see different batches.
                labels, vectorized_labels, reduced_dim = sess.run(
                    [labels_op, one_hot_op, squeezed_op])
            except tf.errors.OutOfRangeError:
                # Raised by the iterator once all files are exhausted.
                print("All labels have been one-hot encoded and printed out")
                break

            print("labels:\n", labels)
            print("labels.shape: ", labels.shape, "\n")

            print("vectorized_labels:\n", vectorized_labels)
            print("vectorized_labels.shape: ", vectorized_labels.shape, "\n")

            print("reduced_dim:\n", reduced_dim)
            print("reduced_dim.shape: ", reduced_dim.shape, "\n")
                

In [63]:
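# Run the one-hot demo on the training CSV files.
#
# NOTE: the remark below is about input normalization and appears to be carried
# over from a related notebook; nothing in this cell computes mu or sigma_square.
# In this micro project, it is chosen to implement input normalization in a way
# so that we first calculate mu and sigma_square values based on the entire train
# set. The same mu and sigma_square values will be used in training as well as
# when validating the model on dev and test sets. In other words, you do not
# re-calculate mu and sigma_square for dev and test sets.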

# CSV files to read, in this order.
train_input_paths = ["train1.csv", "train2.csv"]

minibatch_size = 3   # number of examples per printed batch
num_classes = 2      # length of each one-hot vector (depth given to tf.one_hot)

apply_one_hot_on_labels(
    train_input_paths=train_input_paths,
    minibatch_size=minibatch_size,
    num_classes=num_classes,
)

labels:
 [[1]
 [1]
 [1]]
labels.shape:  (3, 1) 

vectorized_labels:
 [[[0. 1.]]

 [[0. 1.]]

 [[0. 1.]]]
vectorized_labels.shape:  (3, 1, 2) 

reduced_dim:
 [[0. 1.]
 [0. 1.]
 [0. 1.]]
reduced_dim.shape:  (3, 2) 

labels:
 [[1]
 [1]
 [1]]
labels.shape:  (3, 1) 

vectorized_labels:
 [[[0. 1.]]

 [[0. 1.]]

 [[0. 1.]]]
vectorized_labels.shape:  (3, 1, 2) 

reduced_dim:
 [[0. 1.]
 [0. 1.]
 [0. 1.]]
reduced_dim.shape:  (3, 2) 

labels:
 [[1]
 [1]
 [0]]
labels.shape:  (3, 1) 

vectorized_labels:
 [[[0. 1.]]

 [[0. 1.]]

 [[1. 0.]]]
vectorized_labels.shape:  (3, 1, 2) 

reduced_dim:
 [[0. 1.]
 [0. 1.]
 [1. 0.]]
reduced_dim.shape:  (3, 2) 

labels:
 [[0]
 [1]
 [0]]
labels.shape:  (3, 1) 

vectorized_labels:
 [[[1. 0.]]

 [[0. 1.]]

 [[1. 0.]]]
vectorized_labels.shape:  (3, 1, 2) 

reduced_dim:
 [[1. 0.]
 [0. 1.]
 [1. 0.]]
reduced_dim.shape:  (3, 2) 

labels:
 [[0]
 [0]
 [0]]
labels.shape:  (3, 1) 

vectorized_labels:
 [[[1. 0.]]

 [[1. 0.]]

 [[1. 0.]]]
vectorized_labels.shape:  (3, 1, 2) 

r