# Convolutional Neural Networks

In [23]:
import re
import sys
import math
import time
import random
from glob import glob
from itertools import chain
from os import getcwd, listdir
from os.path import join, dirname, join, splitext, basename

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.contrib import learn

sys.path.append(join(dirname(getcwd()), "src"))
from utils import (read_data, DataSet, inference, loss, training_adam,
                   training_gradient_descent, evaluation, fill_feed_dict,
                   do_eval_cnn, read_text_files_and_labels)

In [2]:
# All data is expected in a "data" directory located in the parent of the
# current working directory.
base_data_path = join(dirname(getcwd()), "data")

# Glob patterns for the text files of the training and test partitions
# (files sit one sub-directory below each partition directory).
(training_data_path,
 test_data_path) = [join(base_data_path, "test_data_revised", pattern)
                    for pattern in ("PRAXIS_rapid_eval_MODEL_TRAINING_2015/*/*.txt",
                                    "PRAXIS_rapid_eval_TESTING_2015/*/*.txt")]

In [3]:
# Load the label tables of both partitions and build a lookup that maps each
# ID (column "appointment_id") to its label (column "H1").
train_labels_path = join(base_data_path, "test_data_revised",
                         "training_macro.csv")
test_labels_path = join(base_data_path, "test_data_revised",
                        "testing_macro.csv")
df = pd.concat([pd.read_csv(train_labels_path),
                pd.read_csv(test_labels_path)])
# Non-inplace rename: avoids mutating a slice of the concatenated frame.
df = (df[["appointment_id", "H1"]]
      .rename(columns={"appointment_id": "id", "H1": "label"}))

# The lookup is only well defined if every ID occurs exactly once across
# both tables.
if not df.id.is_unique:
    raise ValueError("Duplicate IDs!")

# Single pass over the two columns instead of one boolean-mask scan of the
# whole frame per ID (which was quadratic in the number of rows).
ids_to_labels_dict = dict(zip(df.id, df.label.values))

In [4]:
# Create DataSet objects by using read_text_files_and_labels.
# The ID callback parses the first 16 characters of its argument as an
# integer (presumably the argument is the text file's base name -- confirm
# against utils.read_text_files_and_labels).
# NOTE: train_data and test_data are rebuilt further down from the
# VocabularyProcessor output; dev_data is not referenced again in the
# visible cells.
(train_data, test_data, dev_data) = \
    read_text_files_and_labels(ids_to_labels_dict,
                               training_data_path,
                               test_data_path,
                               get_id_from_text_file_func=lambda x: int(x[:16]))

In [5]:
# Toggle for the diagnostic cell that follows: when True, the shapes of the
# training and test feature matrices are printed.
# show_data = False
show_data = True

In [6]:
# Report the dimensions of the feature matrices held by the two DataSet
# objects (only when the show_data toggle is on).
if show_data:
    shape_report = "Shape of data:\n\tTraining: {}\n\tTest: {}"
    print(shape_report.format(train_data._features.shape,
                              test_data._features.shape))

Shape of data:
	Training: (4000, 1219)
	Test: (2750, 1219)


In [8]:
train_data._features

array([[18393, 31788, 11094, ...,    -1,    -1,    -1],
       [18393, 16906, 27596, ...,    -1,    -1,    -1],
       [18368, 56204,  1884, ...,    -1,    -1,    -1],
       ..., 
       [32218, 33461,  8588, ...,    -1,    -1,    -1],
       [10445, 23445, 35110, ...,    -1,    -1,    -1],
       [44186, 16906, 40001, ...,    -1,    -1,    -1]], dtype=int32)

In [44]:
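# Alternative way of loading the texts: read each file directly, clean it with
# `clean_str`, and (in the cells below) vectorize it with VocabularyProcessor
# instead of relying on read_text_files_and_labels.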

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.

    Steps, in order:
      1. Every character that is not a letter, a digit or one of
         ( ) , ! ? ' ` is replaced by a space.
      2. The clitics 's, 've, n't, 're, 'd and 'll are split off from the
         preceding word by a space.
      3. The punctuation marks , ! ( ) ? are surrounded by spaces so that
         they become separate tokens.
      4. Runs of whitespace are collapsed to a single space, and the result
         is stripped and lower-cased.

    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    Unlike that version, the parentheses and the question mark are emitted
    as-is: the original replacement strings carried a backslash into the
    output, which produced tokens made of a backslash plus the punctuation
    mark.

    :param string: raw text
    :returns: cleaned, lower-cased text with space-delimited tokens
    """
    string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)

    # Order matters and is kept from the original: each clitic is detached
    # from the token it is glued to.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)

    # Plain string replacement, so no regex escaping is involved and nothing
    # but the punctuation mark itself ends up in the output.
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

# Parses the first 16 characters of a file name as the integer ID.
get_id_from_text_file_func = lambda x: int(x[:16])

# NOTE: NON_ALPHA_RE and partitions are defined here but are not used by the
# loading loop below.
NON_ALPHA_RE = re.compile(r"[^a-z0-9\-']+")
data_paths = [training_data_path, test_data_path]
partitions = ["training", "test"]

# Three parallel lists per partition: cleaned text, ID and label of each file.
train_texts = []
train_ids = []
train_labels = []
test_texts = []
test_ids = []
test_labels = []
texts_list = [train_texts, test_texts]
ids_lists = [train_ids, test_ids]
labels_lists = [train_labels, test_labels]
for (texts,
     ids_list,
     labels_list,
     data_path) in zip(texts_list,
                       ids_lists,
                       labels_lists,
                       data_paths):
    # glob() returns matches in a filesystem-dependent order; sorting makes
    # the example order (and therefore previews and mini-batches) the same
    # on every machine and every run.
    file_paths = sorted(glob(data_path))
    if not file_paths:
        raise ValueError("glob('{}') resulted in no matching file paths!"
                         .format(data_path))
    for file_path in file_paths:
        id_ = get_id_from_text_file_func(basename(file_path))
        # Resolve the label and read the text before appending anything, so
        # that a missing ID or unreadable file cannot leave the three
        # parallel lists with different lengths.
        label = ids_to_labels_dict[id_]
        with open(file_path) as text_file:
            text = clean_str(text_file.read())
        ids_list.append(id_)
        labels_list.append(label)
        texts.append(text)

In [45]:
# Example texts (preview)
train_texts[0][:50]

'colleges should require all students , regardless '

In [46]:
test_texts[0][:50]

"in today 's society , the only real function of a "

In [47]:
# Build the vocabulary over the texts of both partitions. Documents are
# padded to the length (in space-separated tokens) of the longest text.
all_texts = train_texts + test_texts
max_document_length = max(len(text.split(" ")) for text in all_texts)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
vocab_processor.fit(all_texts)

<tensorflow.contrib.learn.python.learn.preprocessing.text.VocabularyProcessor at 0x1252afa58>

In [48]:
max_document_length

1255

In [49]:
# Map every text to a fixed-length row of vocabulary indices, training
# partition first, then test.
train_texts_vectorized, test_texts_vectorized = (
    np.array(list(vocab_processor.transform(partition_texts)))
    for partition_texts in (train_texts, test_texts)
)

In [50]:
train_texts_vectorized.shape

(4000, 1255)

In [51]:
# Note: The padding value used is 0 rather than the -1 that the
# read_text_files_and_labels function uses
train_texts_vectorized

array([[   1,    2,    3, ...,    0,    0,    0],
       [   1,   89,  148, ...,    0,    0,    0],
       [  19,  203,  244, ...,    0,    0,    0],
       ..., 
       [  53,  942,  340, ...,    0,    0,    0],
       [  24,  157,   43, ...,    0,    0,    0],
       [ 456,   89, 3212, ...,    0,    0,    0]])

In [53]:
test_texts_vectorized.shape

(2750, 1255)

In [54]:
test_texts_vectorized

array([[  53,  942,  340, ...,    0,    0,    0],
       [ 456,   89,  300, ...,    0,    0,    0],
       [  53,  942,  340, ...,    0,    0,    0],
       ..., 
       [ 172, 2208, 3563, ...,    0,    0,    0],
       [  53,  203,  169, ...,    0,    0,    0],
       [  24, 2152,  346, ...,    0,    0,    0]])

In [55]:
# Create new DataSet objects using the other method of reading in
# the data, etc.

def _ids_as_int32(ids):
    """
    Cast IDs to int32 and verify that the narrowing keeps them distinct.

    The IDs are first held as int64 and then narrowed, so values outside
    the int32 range wrap around instead of raising. If two different IDs
    wrap to the same int32 value, the number of unique values drops, which
    is what the check detects. (Comparing the list length with the array
    length, as before, can never detect this: both are always equal.)

    :raises ValueError: if narrowing to int32 makes distinct IDs collide
    """
    ids_wide = np.asarray(ids, dtype=np.int64)
    ids_narrow = ids_wide.astype(np.int32)
    if len(np.unique(ids_narrow)) != len(np.unique(ids_wide)):
        raise ValueError("Decrease in precision causes ID duplicates.")
    return ids_narrow


train_texts_vectorized = np.array(train_texts_vectorized, dtype=np.int32)
test_texts_vectorized = np.array(test_texts_vectorized, dtype=np.int32)

train_ids = _ids_as_int32(train_ids)
test_ids = _ids_as_int32(test_ids)

train_labels = np.array(train_labels, dtype=np.int32)
test_labels = np.array(test_labels, dtype=np.int32)
# Shift 1-based labels to 0-based. The decision is made once, from both
# partitions together, so training and test labels always end up on the
# same scale even if one partition happens to lack the lowest label.
if min(np.min(train_labels), np.min(test_labels)) == 1:
    train_labels = train_labels - 1
    test_labels = test_labels - 1

train_data = DataSet(train_ids, train_texts_vectorized, train_labels, random_=False)
test_data = DataSet(test_ids, test_texts_vectorized, test_labels, random_=False)

In [56]:
train_data.get_size()

4000

In [57]:
test_data.get_size()

2750

In [58]:
train_data._features

array([[   1,    2,    3, ...,    0,    0,    0],
       [   1,   89,  148, ...,    0,    0,    0],
       [  19,  203,  244, ...,    0,    0,    0],
       ..., 
       [  53,  942,  340, ...,    0,    0,    0],
       [  24,  157,   43, ...,    0,    0,    0],
       [ 456,   89, 3212, ...,    0,    0,    0]], dtype=int32)

In [59]:
# Define some parameters
# Directory that receives the summary event files and the model checkpoints
# written by the training cell.
log_dir_path = join(getcwd(), "logs")
# Number of iterations of the training loop (one batch per iteration).
max_steps = 10000
# Selects the optimizer in the training cell: "adam" or "gradient descent".
optimizer_type = "adam"
#optimizer_type = "gradient descent"
learning_rate = 0.01
# NOTE(review): hidden1/hidden2/hidden3 are not referenced by the visible
# graph-building cell, which passes [reshape_length, 14, 14] to
# fully_connected_network instead -- confirm whether they are still needed.
hidden1 = 512
hidden2 = 128
hidden3 = 16

# Use updated value of feature size since the processing method was different
# in this case
#NUM_FEATURES = 1219
NUM_FEATURES = max_document_length

# First dimension of the input and label placeholders.
batch_size = 10
# Width of the logits produced by fully_connected_network.
NUM_CLASSES = 6
# Value fed to the keep_prob placeholder during training, i.e. the
# probability of keeping a unit.
dropout = 0.5
# Number of output channels of each conv_layer call.
NUM_FILTERS = 20
# Number of conv_layer branches that are concatenated in the graph-building
# cell; used there to compute the flattened feature length, so it must match
# the number of conv_layer calls.
FILTERS = 3

In [60]:
def fully_connected_network(input_fc, vector_sizes, keep_prob, num_classes):
    """
    Stack two ReLU hidden layers (each followed by dropout) and a linear
    output layer on top of `input_fc`.

    :param input_fc: 2-D input tensor whose second dimension is
                     `vector_sizes[0]`
    :param vector_sizes: sequence of which the first three entries are used:
                         input width, width of the first hidden layer and
                         width of the second hidden layer
    :param keep_prob: keep probability passed to `tf.nn.dropout` after each
                      hidden layer
    :param num_classes: width of the returned logits

    :returns: logits tensor (no softmax applied)

    All weights and biases are drawn from `tf.random_normal` with its
    default parameters.
    """

    def _relu_dropout_layer(layer_input, n_in, n_out):
        # One hidden layer: affine map -> ReLU -> dropout
        layer_weights = tf.Variable(tf.random_normal([n_in, n_out]))
        layer_biases = tf.Variable(tf.random_normal([n_out]))
        pre_activation = tf.add(tf.matmul(layer_input, layer_weights),
                                layer_biases)
        return tf.nn.dropout(tf.nn.relu(pre_activation), keep_prob)

    # Hidden layers: vector_sizes[0] -> vector_sizes[1] -> vector_sizes[2]
    activations = input_fc
    for n_in, n_out in zip(vector_sizes[:2], vector_sizes[1:3]):
        activations = _relu_dropout_layer(activations, n_in, n_out)

    # Linear read-out layer
    out_weights = tf.Variable(tf.random_normal([vector_sizes[2], num_classes]))
    out_biases = tf.Variable(tf.random_normal([num_classes]))
    return tf.matmul(activations, out_weights) + out_biases

In [61]:
def conv_layer(input, filter_size, num_filter, max_pool_filter_size, max_pool_stride_size):
    """
    Convolution along the width axis followed by ReLU and max-pooling.

    :param input: 4-D tensor laid out as [batch, 1, width, 1]; the caller in
                  this notebook reshapes its inputs to
                  [-1, 1, NUM_FEATURES, 1], i.e. the sequence runs along the
                  width axis and there is a single input channel
    :param filter_size: number of neighbouring positions along the width
                        axis covered by each filter
    :param num_filter: number of filters (output channels)
    :param max_pool_filter_size: pooling window along the width axis
    :param max_pool_stride_size: pooling stride along the width axis

    :returns: 4-D tensor [batch, 1, pooled_width, num_filter]
    """

    # tf.nn.conv2d expects the filter as
    # [filter_height, filter_width, in_channels, out_channels]. The sequence
    # lies on the width axis (height is 1), so the filter extent has to go
    # into the width slot. With the extent in the height slot and 'SAME'
    # padding, every row of the filter but one only ever saw zero padding,
    # which reduced the layer to a 1x1 convolution regardless of filter_size.
    weight = tf.Variable(tf.random_normal([1, filter_size, 1, num_filter]))
    bias = tf.Variable(tf.random_normal([num_filter]))

    # Stride 1 with 'SAME' padding keeps the width unchanged.
    conv = tf.nn.conv2d(input, weight, strides=[1, 1, 1, 1], padding='SAME')
    conv = tf.nn.bias_add(conv, bias)
    conv = tf.nn.relu(conv, name="relu")

    # Pool along the width axis only.
    conv = tf.nn.max_pool(conv,
                          ksize=[1, 1, max_pool_filter_size, 1],
                          strides=[1, 1, max_pool_stride_size, 1],
                          padding='VALID')
    return conv

In [62]:
# Build the CNN graph, then train it for max_steps iterations, logging the
# loss every 100 steps and checkpointing/evaluating every 1000 steps.
# Tell TensorFlow that the model will be built into the default Graph.
with tf.Graph().as_default():

    # Generate placeholders for the input feature data and labels.
    # NOTE(review): the features built earlier are int32 vocabulary indices;
    # this placeholder is float32 and is fed straight into the convolution
    # (no embedding lookup is visible here) -- confirm this is intended.
    inputs_placeholder = tf.placeholder(tf.float32, shape=(batch_size,
                                                           NUM_FEATURES))
    labels_placeholder = tf.placeholder(tf.int32, shape=(batch_size))
    
    keep_prob = tf.placeholder(tf.float32) #dropout (keep probability)
    
    # Lay each example out as a 1 x NUM_FEATURES "image" with one channel.
    x = tf.reshape(inputs_placeholder, shape=[-1, 1, NUM_FEATURES, 1])

    # Three parallel branches over the same input with filter sizes 2, 3
    # and 4; each is max-pooled with window 2 and stride 2.
    conv1 = conv_layer(x, 2, NUM_FILTERS, 2, 2)
    print(conv1.get_shape())
    conv2 = conv_layer(x, 3, NUM_FILTERS, 2, 2)
    print(conv2.get_shape())
    conv3 = conv_layer(x, 4, NUM_FILTERS, 2, 2)
    print(conv3.get_shape())
    
    # Join the three branches along dimension 2.
    # NOTE(review): the (axis, values) argument order is the one used by
    # older TensorFlow releases; later releases expect
    # tf.concat(values, axis) -- confirm against the installed version.
    conv = tf.concat(2, [conv1, conv2, conv3])
    print(conv.get_shape())
    
    # Flattened size per example: channels x number of branches x pooled
    # width of one branch (taken from conv1; the printed shapes show the
    # three branches share it). FILTERS must equal the number of branches.
    reshape_length = NUM_FILTERS*FILTERS*conv1.get_shape().as_list()[2]
    input_fc = tf.reshape(conv, [-1, reshape_length])
    
    # Two hidden layers of width 14 each, then NUM_CLASSES logits.
    logits = fully_connected_network(input_fc, [reshape_length, 14, 14], keep_prob, NUM_CLASSES)
    
    # Add to the Graph the Ops for loss calculation.
    loss_ = loss(logits, labels_placeholder)
    
    # Add to the Graph the Ops that calculate and apply gradients.
    if optimizer_type == "adam":
        train_op = training_adam(loss_, learning_rate)
    elif optimizer_type == "gradient descent":
        train_op = training_gradient_descent(loss_, learning_rate)
    else:
        raise ValueError("Choose either \"adam\" or \"gradient descent\" for "
                         "`optimizer_type`.")

    # Add the Op to compare the logits to the labels during evaluation.
    eval_correct = evaluation(logits, labels_placeholder)

    # Build the summary Tensor based on the TF collection of Summaries.
    summary = tf.summary.merge_all()

    # Add the variable initializer Op.
    init = tf.global_variables_initializer()

    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver()

    # Create a session for running Ops on the Graph.
    # NOTE(review): neither the session nor the summary writer is closed in
    # this cell.
    sess = tf.Session()

    # Instantiate a SummaryWriter to output summaries and the Graph.
    summary_writer = tf.summary.FileWriter(log_dir_path, sess.graph)

    # And then after everything is built:

    # Run the Op to initialize the variables.
    sess.run(init)

    # Start the training loop.
    for step in range(max_steps):
        start_time = time.time()

        # Fill a feed dictionary with the actual set of features and labels
        # for this particular training step.
        feed_dict = fill_feed_dict(train_data,
                                   inputs_placeholder,
                                   labels_placeholder,
                                   batch_size)

        # Dropout is active during training: keep each unit with
        # probability `dropout`.
        feed_dict[keep_prob] = dropout
        # Run one step of the model.  The return values are the activations
        # from the `train_op` (which is discarded) and the `loss` Op.  To
        # inspect the values of your Ops or variables, you may include them
        # in the list passed to sess.run() and the value tensors will be
        # returned in the tuple from the call.
        _, loss_value = sess.run([train_op, loss_],
                                 feed_dict=feed_dict)

        duration = time.time() - start_time

        # Write the summaries and print an overview fairly often.
        if step % 100 == 0:

            # Print status to stdout.
            print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
            # Update the events file.
            summary_str = sess.run(summary, feed_dict=feed_dict)
            summary_writer.add_summary(summary_str, step)
            summary_writer.flush()
        
        # Save a checkpoint and evaluate the model periodically.
        if (step + 1) % 1000 == 0 or (step + 1) == max_steps:
            checkpoint_file = join(log_dir_path, 'model.ckpt')
            saver.save(sess, checkpoint_file, global_step=step)

            # Evaluate against the training set.
            # NOTE(review): `dropout` (0.5) is passed as the last argument.
            # If do_eval_cnn feeds it to keep_prob, evaluation runs with
            # dropout still active; a keep probability of 1.0 would be
            # expected for evaluation -- confirm in utils.do_eval_cnn.
            print('Train Data Eval:')
            do_eval_cnn(sess,
                        eval_correct,
                        inputs_placeholder,
                        labels_placeholder,
                        train_data,
                        logits,
                        batch_size,
                        keep_prob, dropout)

            # Evaluate against the test set.
            print('Test Data Eval:')
            do_eval_cnn(sess,
                        eval_correct,
                        inputs_placeholder,
                        labels_placeholder,
                        test_data,
                        logits,
                        batch_size,
                        keep_prob, dropout)

(10, 1, 627, 20)
(10, 1, 627, 20)
(10, 1, 627, 20)
(10, 1, 1881, 20)
Step 0: loss = 92481.52 (0.023 sec)
Step 100: loss = 1.79 (0.010 sec)
Step 200: loss = 1.01 (0.009 sec)
Step 300: loss = 1.43 (0.007 sec)
Step 400: loss = 1.31 (0.007 sec)
Step 500: loss = 1.35 (0.007 sec)
Step 600: loss = 0.97 (0.007 sec)
Step 700: loss = 1.30 (0.008 sec)
Step 800: loss = 1.20 (0.008 sec)
Step 900: loss = 1.31 (0.007 sec)
Train Data Eval:
  Num examples: 4000  Num correct: 2086  Accuracy @ 1: 0.5215
Test Data Eval:
  Num examples: 2750  Num correct: 1395  Accuracy @ 1: 0.5073
Step 1000: loss = 0.96 (0.009 sec)
Step 1100: loss = 1.41 (0.007 sec)
Step 1200: loss = 1.18 (0.007 sec)
Step 1300: loss = 1.28 (0.007 sec)
Step 1400: loss = 0.89 (0.007 sec)
Step 1500: loss = 1.33 (0.007 sec)
Step 1600: loss = 1.17 (0.008 sec)
Step 1700: loss = 1.30 (0.008 sec)
Step 1800: loss = 0.90 (0.014 sec)
Step 1900: loss = 1.35 (0.007 sec)
Train Data Eval:
  Num examples: 4000  Num correct: 2086  Accuracy @ 1: 0.5215
Tes