# Convolutional Neural Networks

In [1]:
import sys
import time
import datetime
from os import getcwd, makedirs
from os.path import join, dirname, realpath, curdir, exists

import numpy as np
import pandas as pd
import tensorflow as tf

sys.path.append(join(dirname(getcwd()), "src"))
from utils import (read_text_files_and_labels_with_vocab_processor,
                   TextCNN)

In [2]:
# Root of the project's data directory (a sibling of the current working
# directory) and the glob patterns that select the training / testing
# text files underneath it.
base_data_path = join(dirname(getcwd()), "data")
revised_data_dir = join(base_data_path, "test_data_revised")
training_data_path = join(revised_data_dir,
                          "PRAXIS_rapid_eval_MODEL_TRAINING_2015/*/*.txt")
test_data_path = join(revised_data_dir,
                      "PRAXIS_rapid_eval_TESTING_2015/*/*.txt")

In [3]:
# Locations of the CSV files holding the labels for the training and
# testing documents.
train_labels_path = join(base_data_path, "test_data_revised",
                         "training_macro.csv")
test_labels_path = join(base_data_path, "test_data_revised",
                        "testing_macro.csv")

# Stack both label tables, keep only the ID and `H1` columns and give them
# generic names. Built as one non-mutating chain so that re-running the
# cell is idempotent and no `inplace` rename is applied to a column slice.
df = (
    pd.concat([pd.read_csv(train_labels_path),
               pd.read_csv(test_labels_path)])
    .loc[:, ["appointment_id", "H1"]]
    .rename(columns={"appointment_id": "id", "H1": "label"})
)

# Every ID must map to exactly one label.
if len(df.id) != len(set(df.id)):
    raise ValueError("Duplicate IDs!")

# Build the ID -> label lookup in a single pass instead of filtering the
# whole frame once per ID (which was quadratic in the number of rows).
# IDs are unique at this point, so each key is written exactly once.
ids_to_labels_dict = dict(zip(df["id"], df["label"]))

In [4]:
def _id_from_text_file(text_file):
    """
    Parse the first 16 characters of `text_file` as an integer ID.

    :param text_file: value handed over by the reader helper (presumably
                      the name of a text file -- confirm against
                      `read_text_files_and_labels_with_vocab_processor`)
    :type text_file: str

    :returns: the ID encoded in the first 16 characters
    :rtype: int
    """

    return int(text_file[:16])


# Create the train/test/dev DataSet objects from the text files and the
# ID -> label lookup.
# NOTE(review): `vocab_processor` is referenced later in this notebook but
# is never bound here -- confirm whether this helper also exposes it.
(train_data, test_data, dev_data) = \
    read_text_files_and_labels_with_vocab_processor(
        ids_to_labels_dict,
        training_data_path,
        test_data_path,
        get_id_from_text_file_func=_id_from_text_file)

In [5]:
# Toggle for the optional data-inspection output printed by the next cell;
# set to False to suppress it.
show_data = True

In [6]:
# Optional sanity check: report the dimensions of the training and test
# feature matrices when `show_data` is enabled.
if show_data:
    shape_report = "Shape of data:\n\tTraining: {}\n\tTest: {}"
    print(shape_report.format(train_data._features.shape,
                              test_data._features.shape))

Shape of data:
	Training: (4000, 1255)
	Test: (2750, 1255)


In [7]:
# Peek at the training feature matrix; the recorded output shows int32 ids
# with trailing zeros in each row.
train_data._features

array([[   1,    2,    3, ...,    0,    0,    0],
       [   1,   89,  148, ...,    0,    0,    0],
       [  19,  203,  244, ...,    0,    0,    0],
       ..., 
       [  53,  942,  340, ...,    0,    0,    0],
       [  24,  157,   43, ...,    0,    0,    0],
       [ 456,   89, 3212, ...,    0,    0,    0]], dtype=int32)

In [8]:
# Session/device options and hyper-parameters consumed by the TextCNN
# training cell below.

# Allow soft device placement (passed to `tf.ConfigProto`)
allow_soft_placement = True

# Log placement of ops on devices (passed to `tf.ConfigProto`)
log_device_placement = False

# Number of checkpoints to store (`max_to_keep` of the `tf.train.Saver`)
NUM_CHECKPOINTS = 5

# Evaluate the model after this many steps. In the training cell the
# evaluation is run on `test_data`, and a checkpoint is written at the
# same interval.
EVALUATE_EVERY = 100

# Number of training epochs
# NOTE(review): not referenced by the training cell in this notebook, which
# loops until `train_data.next_batch` returns a falsy value.
NUM_EPOCHS = 200

# Batch size requested from `train_data.next_batch`
BATCH_SIZE = 64

# Use shape of `train_data._features` since it should be the same as for
# `test_data._features`/`dev_data._features`
NUM_FEATURES = train_data._features.shape[1]

# Dimensionality of the embedding (passed to `TextCNN` as `embedding_size`)
EMBEDDING_DIM = 128

# Filter sizes (passed to `TextCNN` as `filter_sizes`)
FILTER_SIZES = [3, 4, 5]

# Number of filters per filter size
NUM_FILTERS = 128

# L2 regularization lambda
L2_REG_LAMBDA = 0.0

# Dropout keep probability fed during training steps (evaluation steps
# feed 1.0 instead)
DROPOUT_KEEP_PROB = 0.5

In [9]:
# Build the TextCNN in a fresh graph, attach summaries and checkpointing,
# then train on batches drawn from `train_data`, periodically evaluating
# on `test_data`.
# NOTE(review): the error recorded under this cell
# ("Expected int32, got list containing Tensors ...") is what TF >= 1.0
# raises for a pre-1.0 style `tf.concat(dim, values)` call; no such call
# exists in this cell, so it presumably lives inside `TextCNN` -- confirm
# in `src/utils`.
with tf.Graph().as_default():
    session_conf = \
        tf.ConfigProto(allow_soft_placement=allow_soft_placement,
                       log_device_placement=log_device_placement)
    # `sess` stays open after this cell (as before) so that later cells can
    # still use it; call `sess.close()` once it is no longer needed.
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # The feature matrices hold token ids (the matrix displayed earlier
        # contains ids such as 3212) while `NUM_FEATURES` is only the
        # padded document length (1255), so it cannot serve as the
        # vocabulary size. Derive a size that covers every id fed to the
        # network in this cell.
        # Assumes `TextCNN` uses `vocab_size` as the number of embedding
        # rows and `sequence_length` as the width of `input_x` -- TODO
        # confirm in `src/utils`. If a fitted vocabulary object is
        # available, its length is the more robust choice.
        vocab_size = int(max(train_data._features.max(),
                             test_data._features.max())) + 1

        cnn = TextCNN(sequence_length=NUM_FEATURES,
                      num_classes=train_data.get_num_classes(),
                      vocab_size=vocab_size,
                      embedding_size=EMBEDDING_DIM,
                      filter_sizes=FILTER_SIZES,
                      num_filters=NUM_FILTERS,
                      l2_reg_lambda=L2_REG_LAMBDA)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = \
                    tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = \
                    tf.summary.scalar("{}/grad/sparsity".format(v.name),
                                      tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = realpath(join(curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary,
                                             acc_summary,
                                             grad_summaries_merged])
        train_summary_dir = join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir,
                                                     sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                   sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory
        # already exists so we need to create it
        checkpoint_dir = realpath(join(out_dir, "checkpoints"))
        checkpoint_prefix = join(checkpoint_dir, "model")
        makedirs(checkpoint_dir, exist_ok=True)
        saver = tf.train.Saver(tf.global_variables(),
                               max_to_keep=NUM_CHECKPOINTS)

        # Write vocabulary
        # NOTE(review): `vocab_processor` is not defined by any cell shown
        # in this notebook; it has to be bound before this line can run.
        vocab_processor.save(join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        def train_step(x_batch, y_batch):
            """
            A single training step: runs the optimizer on one batch (with
            dropout enabled), prints loss/accuracy and records the
            training summaries.

            :param x_batch: batch of training features
            :type x_batch: np.array
            :param y_batch: batch of labels
            :type y_batch: np.array

            :returns: None
            :rtype: None
            """

            feed_dict = {cnn.input_x: x_batch,
                         cnn.input_y: y_batch,
                         cnn.dropout_keep_prob: DROPOUT_KEEP_PROB}
            (_,
             step,
             summaries,
             loss,
             accuracy) = sess.run([train_op,
                                   global_step,
                                   train_summary_op,
                                   cnn.loss,
                                   cnn.accuracy],
                                  feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}"
                  .format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set. No training op is run and
            dropout is disabled (keep probability 1.0).

            :param x_batch: batch of test features
            :type x_batch: np.array
            :param y_batch: batch of labels
            :type y_batch: np.array
            :param writer: summary writer that receives the evaluation
                           summaries; nothing is written if it is None
            :type writer: tf.summary.FileWriter or None

            :returns: None
            :rtype: None
            """

            feed_dict = {cnn.input_x: x_batch,
                         cnn.input_y: y_batch,
                         cnn.dropout_keep_prob: 1.0}
            (step,
             summaries,
             loss,
             accuracy) = sess.run([global_step,
                                   dev_summary_op,
                                   cnn.loss,
                                   cnn.accuracy],
                                  feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}"
                  .format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)

        # Training loop. For each batch...
        # The writers are closed in `finally` so that buffered events reach
        # disk even if training is interrupted or fails.
        try:
            while True:
                batch = train_data.next_batch(BATCH_SIZE)
                # A falsy return value from `next_batch` ends training.
                if not batch:
                    break
                ids, features, labels = batch
                train_step(features, labels)
                current_step = tf.train.global_step(sess, global_step)
                # Evaluation and checkpointing share the same interval.
                if current_step % EVALUATE_EVERY == 0:
                    print("\nEvaluation:")
                    dev_step(test_data._features, test_data._labels,
                             writer=dev_summary_writer)
                    print("")
                    path = saver.save(sess, checkpoint_prefix,
                                      global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
        finally:
            train_summary_writer.close()
            dev_summary_writer.close()

TypeError: Expected int32, got list containing Tensors of type '_Message' instead.

In [None]:
# Check the shape of the training label array.
train_data._labels.shape

In [None]:
# Define some parameters for the experimental CNN cell below.
# NOTE(review): this cell rebinds `NUM_FEATURES` and `NUM_FILTERS`, which
# the earlier hyper-parameter cell also defines (there `NUM_FILTERS` is
# 128), so the values seen by any cell depend on execution order.

# Directory that receives the summaries and checkpoints
log_dir_path = join(getcwd(), "logs")
# Number of training steps
max_steps = 10000
# Optimizer selection; the cell below accepts "adam" or "gradient descent"
optimizer_type = "adam"
#optimizer_type = "gradient descent"
learning_rate = 0.01
# NOTE(review): `hidden1`/`hidden2`/`hidden3` are not referenced by the
# cells shown in this notebook.
hidden1 = 512
hidden2 = 128
hidden3 = 16
# Use shape of `train_data._features` since it should be the same as for
# `test_data._features`/`dev_data._features`
NUM_FEATURES = train_data._features.shape[1]
# Fixed batch size; the placeholders below are built with this exact size
batch_size = 10
NUM_CLASSES = 6
# Value fed for the `keep_prob` placeholder during training
dropout = 0.5
# Number of filters passed to each `conv_layer` call
NUM_FILTERS = 20
# Number of convolution branches that get concatenated (used when
# computing the flattened length)
FILTERS = 3

In [None]:
# Experimental CNN: three convolution branches over the same input are
# concatenated, flattened and fed to a fully connected network, which is
# then trained for `max_steps` steps with periodic summaries, checkpoints
# and evaluation.
# NOTE(review): `conv_layer`, `fully_connected_network`, `loss`,
# `training_adam`, `training_gradient_descent`, `evaluation`,
# `fill_feed_dict` and `do_eval_cnn` are not imported by the import cell of
# this notebook; they must be brought into scope before this cell can run.

# Tell TensorFlow that the model will be built into the default Graph.
with tf.Graph().as_default():

    # Generate placeholders for the input feature data and labels.
    inputs_placeholder = tf.placeholder(tf.float32, shape=(batch_size,
                                                           NUM_FEATURES))
    # `(batch_size,)` is a real one-element tuple (rank-1 shape).
    labels_placeholder = tf.placeholder(tf.int32, shape=(batch_size,))

    # Dropout keep probability, fed at run time.
    keep_prob = tf.placeholder(tf.float32)

    # Add height and channel axes of size 1 around the feature axis.
    x = tf.reshape(inputs_placeholder, shape=[-1, 1, NUM_FEATURES, 1])

    conv1 = conv_layer(x, 2, NUM_FILTERS, 2, 2)
    print(conv1.get_shape())
    conv2 = conv_layer(x, 3, NUM_FILTERS, 2, 2)
    print(conv2.get_shape())
    conv3 = conv_layer(x, 4, NUM_FILTERS, 2, 2)
    print(conv3.get_shape())

    # TensorFlow >= 1.0 expects `tf.concat(values, axis)`. The former
    # `tf.concat(2, [...])` ordering raises "TypeError: Expected int32, got
    # list containing Tensors ..." with the 1.x API used throughout this
    # notebook (`tf.summary.*`, `tf.global_variables_initializer`).
    conv = tf.concat([conv1, conv2, conv3], axis=2)
    print(conv.get_shape())

    # Flattened length per example; relies on all three branches having
    # the same size along axis 2 as `conv1` -- TODO confirm for
    # `conv_layer`.
    reshape_length = NUM_FILTERS*FILTERS*conv1.get_shape().as_list()[2]
    input_fc = tf.reshape(conv, [-1, reshape_length])

    logits = fully_connected_network(input_fc,
                                     [reshape_length, 14, 14],
                                     keep_prob,
                                     NUM_CLASSES)

    # Add to the Graph the Ops for loss calculation.
    loss_ = loss(logits, labels_placeholder)

    # Add to the Graph the Ops that calculate and apply gradients.
    if optimizer_type == "adam":
        train_op = training_adam(loss_, learning_rate)
    elif optimizer_type == "gradient descent":
        train_op = training_gradient_descent(loss_, learning_rate)
    else:
        raise ValueError("Choose either \"adam\" or \"gradient descent\" for "
                         "`optimizer_type`.")

    # Add the Op to compare the logits to the labels during evaluation.
    eval_correct = evaluation(logits, labels_placeholder)

    # Build the summary Tensor based on the TF collection of Summaries.
    summary = tf.summary.merge_all()

    # Add the variable initializer Op.
    init = tf.global_variables_initializer()

    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver()

    # Create a session for running Ops on the Graph.
    sess = tf.Session()

    # Instantiate a SummaryWriter to output summaries and the Graph.
    summary_writer = tf.summary.FileWriter(log_dir_path, sess.graph)

    # And then after everything is built:

    # Run the Op to initialize the variables.
    sess.run(init)

    # Start the training loop.
    for step in range(max_steps):
        start_time = time.time()

        # Fill a feed dictionary with the next batch of features and
        # labels for this particular training step.
        feed_dict = fill_feed_dict(train_data,
                                   inputs_placeholder,
                                   labels_placeholder,
                                   batch_size)

        feed_dict[keep_prob] = dropout
        # Run one step of the model.  The return values are the activations
        # from the `train_op` (which is discarded) and the `loss` Op.  To
        # inspect the values of your Ops or variables, you may include them
        # in the list passed to sess.run() and the value tensors will be
        # returned in the tuple from the call.
        _, loss_value = sess.run([train_op, loss_],
                                 feed_dict=feed_dict)

        duration = time.time() - start_time

        # Write the summaries and print an overview fairly often.
        if step % 100 == 0:

            # Print status to stdout.
            print('Step %d: loss = %.2f (%.3f sec)'
                  % (step, loss_value, duration))
            # Update the events file.
            summary_str = sess.run(summary, feed_dict=feed_dict)
            summary_writer.add_summary(summary_str, step)
            summary_writer.flush()

        # Save a checkpoint and evaluate the model periodically.
        if (step + 1) % 1000 == 0 or (step + 1) == max_steps:
            checkpoint_file = join(log_dir_path, 'model.ckpt')
            saver.save(sess, checkpoint_file, global_step=step)

            # NOTE(review): both evaluations below pass the training
            # `dropout` value (0.5) alongside `keep_prob`; if `do_eval_cnn`
            # feeds it as the keep probability, evaluation runs with
            # dropout active -- confirm whether 1.0 is intended here.

            # Evaluate against the training set.
            print('Train Data Eval:')
            do_eval_cnn(sess,
                        eval_correct,
                        inputs_placeholder,
                        labels_placeholder,
                        train_data,
                        logits,
                        batch_size,
                        keep_prob, dropout)

            # Evaluate against the test set.
            print('Test Data Eval:')
            do_eval_cnn(sess,
                        eval_correct,
                        inputs_placeholder,
                        labels_placeholder,
                        test_data,
                        logits,
                        batch_size,
                        keep_prob, dropout)