Exercise from TensorFlow for Machine Intelligence, by Ariel Scarpinelli; Danijar Hafner; Erik Erwitt; Sam Abrahams; Troy Mott

#### Features: 

Logistic Regression scaffold. 
Saving checkpoints.
Titanic dataset from Kaggle 
Categorization of data into relevant features. 

#### Useful snippets: 

tf.pack, tf.decode_csv, tf.train.shuffle_batch

#### Worth mentioning: 

tf.reduce_mean is another reduction op, alongside tf.reduce_sum and tf.reduce_prod

tf.cast


In [1]:
import tensorflow as tf


In [2]:
# Hyperparameters and dataset size
training_steps = 1000   # number of optimizer updates run by the training loop
learning_rate = 0.01    # step size handed to the gradient-descent optimizer
no_of_records = 800     # batch size requested from read_csv() by inputs()

# Model parameters, both starting at zero: a 5x1 weight matrix (one weight
# per input feature column) and a scalar bias
W = tf.Variable(tf.zeros([5, 1]), name="weights")
b = tf.Variable(0.0, name="biases")

In [None]:
# define the training loop operations

In [3]:
def read_csv(batch_size, file_name, record_defaults):
    """Build a shuffled-batch input pipeline over one CSV file.

    Args:
        batch_size: number of rows in each returned batch.
        file_name: path of the CSV file; its first line is skipped as a header.
        record_defaults: one single-element list per CSV column, giving the
            default value and thereby the data type of that column.

    Returns:
        The result of tf.train.shuffle_batch: one batched tensor per column.
    """
    files = tf.train.string_input_producer([file_name])

    line_reader = tf.TextLineReader(skip_header_lines=1)
    _, line = line_reader.read(files)  # the record key is not needed

    # Turn the raw text line into one tensor per column; the defaults also
    # determine each column's dtype.
    columns = tf.decode_csv(line, record_defaults=record_defaults)

    # Shuffle rows through a queue that holds up to 50 batches and keeps at
    # least one batch worth of rows buffered after each dequeue.
    shuffle_options = dict(
        batch_size=batch_size,
        capacity=batch_size * 50,
        min_after_dequeue=batch_size,
    )
    return tf.train.shuffle_batch(columns, **shuffle_options)

In [None]:
# SKIP - just a debug section 
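# Testing the new method: as written this does not work - it appears to run forever.
# The reason is that no queue runners are started, so the shuffle queue is never
# filled and sess.run() blocks waiting for data. Start them first with
# tf.train.start_queue_runners(sess=...), as the training cell further down does.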
#batch_size = 10
#file_name = "train.csv"
#record_defaults = [[0.0], [0.0], [0], [""], [""], [0.0], [0.0], [0.0], [""], [0.0], [""], [""]]

#data_batch = read_csv(batch_size, file_name, record_defaults)
#tf.Session().run(data_batch)

In [4]:
def inputs():
    """Read one batch of Titanic records and turn it into model inputs.

    Reads `no_of_records` rows from "train.csv" through read_csv().

    Returns:
        A (features, survived) pair. `features` has one example per row and
        five columns: first-class, second-class and third-class indicators,
        a female indicator, and age. `survived` is reshaped to
        [no_of_records, 1].
    """
    # One default per CSV column; each default also fixes the column's dtype.
    column_defaults = [[0.0], [0.0], [0], [""], [""], [0.0],
                       [0.0], [0.0], [""], [0.0], [""], [""]]
    (passenger_id, survived, pclass, name, sex, age,
     sibsp, parch, ticket, fare, cabin, embarked) = read_csv(
        no_of_records, "train.csv", column_defaults)

    # Encode the categorical columns as 0.0/1.0 indicators.
    class_flags = [tf.to_float(tf.equal(pclass, [rank])) for rank in (1, 2, 3)]
    is_female = tf.to_float(tf.equal(sex, ["female"]))

    # Packing stacks the five feature vectors as rows; transposing yields one
    # example per row and one feature per column.
    packed = tf.pack(class_flags + [is_female, age])
    features = tf.transpose(packed)
    labels = tf.reshape(survived, [no_of_records, 1])

    return features, labels

In [5]:
def combine_inputs(X):
    """Return the linear combination X * W + b (the logits before the sigmoid)."""
    weighted = tf.matmul(X, W)
    return weighted + b

def inference(X):
    """Return the model's prediction: the sigmoid of combine_inputs(X)."""
    logits = combine_inputs(X)
    return tf.sigmoid(logits)

In [6]:
def loss(X, Y):
    """Return the mean sigmoid cross-entropy between the logits for X and Y."""
    logits = combine_inputs(X)
    # The cross-entropy op takes the raw logits, not the sigmoid output.
    per_example = tf.nn.sigmoid_cross_entropy_with_logits(logits, Y)
    return tf.reduce_mean(per_example)

In [7]:
def train(total_loss):
    """Return an op that applies one gradient-descent update minimizing total_loss.

    The step size is the module-level `learning_rate`.
    """
    return tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)

In [8]:
def evaluate(sess, X, Y):
    """Print the model's accuracy on the inputs X with expected labels Y.

    A prediction counts as positive when inference(X) exceeds 0.5; accuracy is
    the mean of the element-wise matches between predictions and Y.

    Args:
        sess: the session used to run the accuracy computation.
        X: feature tensor passed to inference().
        Y: label tensor compared against the thresholded predictions.
    """
    predicted = tf.cast(inference(X) > 0.5, tf.float32)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), tf.float32))
    # Single-argument call form prints identically on Python 2 and Python 3,
    # whereas the bare `print x` statement is a SyntaxError on Python 3.
    print(sess.run(accuracy))
    

In [10]:
# Create a saver for writing training checkpoints.
saver = tf.train.Saver()

# Launch the graph in a session. The `with` block closes the session on exit,
# so no explicit sess.close() is needed.
with tf.Session() as sess:

    # Build the input pipeline, the loss and the training op.
    X, Y = inputs()

    total_loss = loss(X, Y)
    train_op = train(total_loss)

    # Initialize only after the whole graph is built, so that any variable
    # created while building it is covered as well.
    tf.initialize_all_variables().run()

    # The input queues are filled by background queue-runner threads. The
    # coordinator is what lets us ask those threads to stop and wait for them.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        # actual training loop
        for step in range(training_steps):
            sess.run([train_op])
            # For debugging and learning purposes: watch the loss decrease
            # during training. Formatted so the output matches the former
            # `print "loss: ", value` statement on both Python 2 and 3.
            if step % 100 == 0:
                print("loss:  %s" % (sess.run([total_loss]),))
            # Checkpoint every 1000 steps (this includes step 0).
            if step % 1000 == 0:
                saver.save(sess, 'my-model', global_step=step)

        evaluate(sess, X, Y)
        # Save the final checkpoint data.
        saver.save(sess, 'my-model', global_step=training_steps)
    finally:
        # Always stop and join the queue-runner threads, even if training
        # raised, so they do not outlive the session.
        coord.request_stop()
        coord.join(threads)

loss:  [0.71079481]
loss:  [0.66135389]
loss:  [0.64831328]
loss:  [0.61731249]
loss:  [0.59576046]
loss:  [0.57414263]
loss:  [0.57320136]
loss:  [0.5638603]
loss:  [0.56719285]
loss:  [0.53430879]
0.7575
