In [122]:
import tensorflow as tf


In [123]:
# UPDATE-1 IN EACH PROJECT (depending on default values for each column)
# Determine default values for each column in case data is missing
record_defaults = [[""], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]

def decode_csv(line):
    parsed_line = tf.decode_csv(line, record_defaults)
    label = parsed_line[-1:]          # last column is label
    del parsed_line[-1]               # delete the last element from the list   (label column)
    del parsed_line[0]                # even delete the first element bcz it is assumed NOT to be a feature
    features = tf.stack(parsed_line)  # Stack features so that you can later vectorize forward prop., etc.
    label = tf.stack(label)           # Needed bcz labels consist of 2 columns
    batch_to_return = features, label

    return batch_to_return


In [124]:
# READS DATA FROM train data set CSV FILES, calculates mean (mu) and variance (sigma_square) for all features

# ASSUMPTIONS: (Otherwise, decode_csv function needs update)
# 1) The first column is NOT a feature. (It is most probably a training example ID or similar)
# 2) The last column is always the label. And there is ONLY 1 column that represents the label.
#    If more than 1 column represents the label, decode_csv() function needs update 
# 3) The first row is assumed to include names of the data types (i.e. feature name, label, etc.) so it is skipped

def get_input_norm_params(train_input_paths, minibatch_size):

    with tf.name_scope("next_train_batch"):
        filenames = tf.placeholder(tf.string, shape=[None])
        dataset = tf.data.Dataset.from_tensor_slices(filenames)
        dataset = dataset.flat_map(lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
        dataset = dataset.batch(minibatch_size)
        iterator = dataset.make_initializable_iterator()
        next_element = iterator.get_next()

    num_examples = 0    # will keep total # train examples
    mu = 0              # will keep mean of all feature values
    sigma_square = 0    # keeps variance (to be used for scaling)
    
    with tf.Session() as sess:
        sess.run(iterator.initializer, feed_dict={filenames: train_input_paths})
        while True:
            try:
              features, labels = sess.run(next_element)
        
              num_examples += features.shape[0] #size of axis=0 gives # train examples in the current batch
            
              # mu = sum_i(features) / num_train_examples  (where i = 1, .., num_train_examples)
              mu += tf.reduce_sum(features, axis=[0], keepdims=True)
              # sigma_square = sum_i(features ** 2) / num_train_examples  (where i = 1, .., num_train_examples)
              sigma_square +=  tf.reduce_sum(tf.multiply(features, features), axis=[0], keepdims=True)
            
              #print(sess.run(mu))
              #print(sess.run(sigma_square))
        
            except tf.errors.OutOfRangeError:
              print("Input normalization completed on train set data.")
              break
                
        mu /= num_examples
        sigma_square /= num_examples
    
        print("mu: \n", sess.run(mu))
        print("sigma: \n", sess.run(sigma_square))
    
    return mu, sigma_square

In [131]:
# READS DATA FROM train data set CSV FILES, normalizes the data and writes normalized values in a new file.
# This preprocessing needs to be done first if input normalization is supposed to be applied prior to training
# the actual model.

# ASSUMPTIONS: (Otherwise, decode_csv function needs update)
# 1) The first column is NOT a feature. (It is most probably a training example ID or similar)
# 2) The last column is always the label. And there is ONLY 1 column that represents the label.
#    If more than 1 column represents the label, decode_csv() function needs update 
# 3) The first row is assumed to include names of the data types (i.e. feature name, label, etc.) so it is skipped

def normalize_train_data(train_input_paths, minibatch_size, mu, sigma_square):

    with tf.name_scope("read_next_train_batch"):
        filenames = tf.placeholder(tf.string, shape=[None])
        dataset = tf.data.Dataset.from_tensor_slices(filenames)
        dataset = dataset.flat_map(lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
        dataset = dataset.batch(minibatch_size)
        iterator = dataset.make_initializable_iterator()
        next_element = iterator.get_next()
    
    with tf.Session() as sess:
        sess.run(iterator.initializer, feed_dict={filenames: train_input_paths})
        while True:
            try:
              features, labels = sess.run(next_element)
              print("features:\n", features)
              #print(sess.run("features:\n", sess.run(features)))
            
              # Normalize as ((input_features - mu) / sigma_square)
              normalized_features = tf.divide(tf.subtract(features, mu), sigma_square)
              print("normalized features:\n", sess.run(normalized_features))
              
            except tf.errors.OutOfRangeError:
              print("All data has been normalized and printed out")
              break
                
    return

In [132]:
# In this micro project, it is chosen to implement input normalization in a way
# so that we first calculate mu and sigma_square values based on the entire train
# set. The same mu and sigma_square values will be used in training as well as 
# when validating the model on dev and test sets. With other words, you do not
# re-calculate mu and sigma_square for dev and test sets

train_input_paths = ["train1.csv", "train2.csv"]

minibatch_size = 10

mu, sigma_square = get_input_norm_params(train_input_paths, minibatch_size)

normalize_train_data(train_input_paths, minibatch_size, mu, sigma_square)

Input normalization completed on train set data.
mu: 
 [[3.409091  3.6818182 1.25      2.4772727 2.8863637]]
sigma: 
 [[65.86364  67.90909   7.068182 14.295455 25.704546]]
features:
 [[ 1.  2.  5.  9.  8.]
 [ 4.  5.  1.  1.  6.]
 [-4.  5.  1.  0. 12.]
 [ 1.  4.  7.  4.  2.]
 [ 2.  2.  4.  8.  2.]
 [18. 28.  0.  0.  1.]
 [40.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  9.]
 [ 4.  7.  3.  1. -8.]
 [ 2.  3.  1.  1.  4.]]
normalized features:
 [[-0.03657695 -0.02476573  0.5305466   0.45627978  0.19893898]
 [ 0.0089717   0.01941098 -0.03536977 -0.10333863  0.12113173]
 [-0.11249137  0.01941098 -0.03536977 -0.17329094  0.35455346]
 [-0.03657695  0.00468541  0.8135048   0.10651828 -0.03448276]
 [-0.02139406 -0.02476573  0.3890675   0.38632748 -0.03448276]
 [ 0.22153208  0.35809907 -0.17684887 -0.17329094 -0.07338639]
 [ 0.5555555  -0.05421687 -0.17684887 -0.17329094 -0.11229001]
 [-0.05175983 -0.05421687 -0.17684887 -0.17329094  0.2378426 ]
 [ 0.0089717   0.04886211  0.24758841 -0.10333863 -0.4235190