## Trying a simpler CNN classification model

### Importing and Massaging Data

http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/

In [1]:
# Import necessary libraries
import pickle
import numpy as np
from scipy import sparse
import collections
import itertools
import re

import os
import time
import datetime
from tensorflow.contrib import learn
import csv

from shared_lib import vocabulary


In [2]:
# Load and save data into liberal, conservative and neutral objects
[lib, con, neutral] = pickle.load(open('ibcData.pkl', 'rb'))

In [3]:
# Formatting data into workable arrays
liberal = np.array(lib)
conserv = np.array(con)
neut = np.array(neutral)

# Seprating data and labels
def separate_data_and_labels(label_class):
    labels = []
    data = []
    for i in range(len(label_class)):
        for node in label_class[i]:
            if hasattr(node, 'label'):
                data.append(node.get_words())
                labels.append(node.label)
    data = np.array(data)
    labels = np.array(labels)
    return data, labels

lib_data, lib_labs = separate_data_and_labels(liberal)
con_data, con_labs = separate_data_and_labels(conserv)
neut_data, neut_labs = separate_data_and_labels(neut)

In [4]:
# Combining into one dataset
data_all = np.concatenate((neut_data, lib_data, con_data), axis=0)
labs_all = np.concatenate((neut_labs, lib_labs, con_labs), axis=0)

print (data_all.shape)
print (labs_all.shape)

(22621,)
(22621,)


In [5]:
# Randomly mixing data&labels so that they can be split into test and train
def shuffle_in_unison(a, b):
    assert len(a) == len(b)
    shuffled_a = np.empty(a.shape, dtype=a.dtype)
    shuffled_b = np.empty(b.shape, dtype=b.dtype)
    permutation = np.random.permutation(len(a))
    for old_index, new_index in enumerate(permutation):
        shuffled_a[new_index] = a[old_index]
        shuffled_b[new_index] = b[old_index]
    return shuffled_a, shuffled_b

data_all, labs_all = shuffle_in_unison(data_all, labs_all)

In [6]:
# Split data into test (20%) and train (80%)
slice = int(.8*labs_all.shape[0])
data_train = data_all[:slice]
labs_train = labs_all[:slice]
data_test = data_all[slice:]
labs_test = labs_all[slice:]
print(labs_all.shape)
print(labs_test.shape)
print(labs_train.shape)

(22621,)
(4525,)
(18096,)


### Text pre-processing

In [7]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def clean_data_and_labels(sentences, labels):
    """
    Takes an array of sentences and their labels.
    Splits the data into words and generates labels. Returns clean sentences and labels.
    """
    # Array to list of sentences
    x_text = [s.strip() for s in sentences]
    # Clean words
    x_text = [clean_str(sent) for sent in x_text]
    
    # Generate labels
    # Map: Liberal --> (1,0,0), Neutral --> (0,1,0), Conservative --> (0,0,1)
    y = []
    for i in range(0, labels.shape[0]):
        if labels[i] == 'Liberal':
            y.append([1,0,0])
        elif labels[i] == 'Conservative':           
            y.append([0,0,1])
        else:
            y.append([0,1,0])
            
    return x_text, y


# Word processing functions
def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    word = word.lower()
    if digits:
        if (wordset != None) and (word in wordset): return word
        word = canonicalize_digits(word) # try to canonicalize numbers
    if (wordset == None) or (word in wordset): return word
    else: return "<unk>" # unknown token


def build_vocab(corpus, V=10000):
    words = []
    for i in range(0,corpus.shape[0]):
        words += corpus[i].split()
    token_feed = (canonicalize_word(w) for w in words)
    vocab = vocabulary.Vocabulary(token_feed, size=V)
    return vocab


In [24]:
vocab = build_vocab(data_all, V = 14836)

### A Working CNN!

In [None]:
import tensorflow as tf
import numpy as np

# Defining the graph
class CNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
    """
    def __init__(self, sequence_length, num_classes, vocab_size,embedding_size, filter_sizes, num_filters):
        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    
        with tf.device('/cpu:0'), tf.name_scope("Embedding_Layer"):
            W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),name="W")
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
            
        
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Max-pooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * 3 #len(filter_sizes)
        self.h_pool = tf.concat(3, pooled_outputs)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
        
        
        # Add dropout
        with tf.name_scope("Dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
            
            
        # Define outputs
        with tf.name_scope("Output_Layer"):
            W = tf.Variable(tf.truncated_normal([num_filters_total, num_classes], stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")
        
        
        # Calculate mean cross-entropy loss
        with tf.name_scope("Cost_Function"):
            losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, self.input_y)
            self.loss = tf.reduce_mean(losses)
            
        
        # Calculate Accuracy to compare to other models
        with tf.name_scope("Accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")


In [25]:
# Load data
x_raw, y = clean_data_and_labels(data_all, labs_all)

# Map data into vocabulary
max_sentence_len = len(max(data_all, key=len).split())
vocab_processor = learn.preprocessing.VocabularyProcessor(max_sentence_len)
x = np.array(list(vocab_processor.fit_transform(x_raw)))

# Split up vocabulary
split = int(0.9*x.shape[0])
x_train, x_dev = x[:split], x[split:]
y_train, y_dev = y[:split], y[split:]

In [None]:
# Building the graph

with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        cnn = CNN(
            sequence_length=x_train.shape[1],
            num_classes=3,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=128,
            filter_sizes=map(int, '3,4,5'.split(",")),
            num_filters=128)

        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-4)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)


        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpointing
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")

        # Tensorflow assumes this directory already exists so we need to create it
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables())

        sess.run(tf.global_variables_initializer())

In [34]:
# Defining an epoch
def train_epoch(x_batch, y_batch):
    """
    A single training epoch
    """
    feed_dict = {
      cnn.input_x: x_batch,
      cnn.input_y: y_batch,
      cnn.dropout_keep_prob: 0.5
    }
    _, step, summaries, loss, accuracy = sess.run(
        [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
        feed_dict)
    #time_str = datetime.datetime.now().isoformat()
    #print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
    train_summary_writer.add_summary(summaries, step)

def dev_epoch(x_batch, y_batch, writer=None):
    """
    Evaluates model on a dev set
    """
    feed_dict = {
      cnn.input_x: x_batch,
      cnn.input_y: y_batch,
      cnn.dropout_keep_prob: 1.0
    }
    step, summaries, loss, accuracy, predictions = sess.run(
        [global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.predictions],
        feed_dict)
    time_str = datetime.datetime.now().isoformat()
    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
    if writer:
        writer.add_summary(summaries, step)
    return predictions

In [35]:
# Function to generate batches
def batch_generator(data, labels, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset.
    """
    data_size = len(data)
    
    data = np.array(data)
    labels = np.array(labels)
    
    num_batches_per_epoch = int((data_size-1)/batch_size) + 1
    for epoch in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield data[start_index:end_index], labels[start_index:end_index]

In [None]:
# Generate batches
batches = batch_generator(x_train, y_train, batch_size = 64, num_epochs = 5)

# Training loop
for batch in batches:
    x_batch, y_batch = batch
    train_epoch(x_batch, y_batch)
    current_step = tf.train.global_step(sess, global_step)
    if current_step % 50 == 0: # evaluate every 50 steps
        print("\nEvaluation:")
        dev_epoch(x_dev, y_dev, writer=dev_summary_writer)
        print("")
    if current_step % 100 == 0: # checkpoint every 100 steps
        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
        print("Saved model checkpoint to {}\n".format(path))

## Showing Predictions

In [None]:
# Getting predictions for the dev set
predictions = dev_epoch(x_dev, y_dev, writer=dev_summary_writer)

In [None]:
# Printing out examples of predictions in the dev set
data_dev = data_all[split:]

print("Note:",'\n',"Map: Liberal --> 0, Neutral --> 1, Conservative --> 2",'\n')
for i in range(0, 10):
    correct_label = y_dev[i].index(max(y_dev[i]))
    if predictions[i] == correct_label:
        print("CORRECT:")
        print("Correct Label:", correct_label)
        print ("Predicted Label:", predictions[i])
        print(data_dev[i], '\n')
    else:
        print("WRONG:")
        print("Correct Label:", correct_label)
        print ("Predicted Label:", predictions[i])
        print(data_dev[i], '\n')


## CNN with Google Word Vectors

http://www.mattmahoney.net/dc/textdata

In [9]:
from gensim.models import word2vec

In [10]:
model = word2vec.Word2Vec.load_word2vec_format('/Users/megan/Downloads/GoogleNews-vectors-negative300.bin', binary=True)  

In [29]:
# Creating embeddings of pre-trained word vectors

# Initialize start, stop, and unk words randomly
start = np.random.rand(300,)
stop = np.random.rand(300,)
unk = np.random.rand(300,)
embeddings = np.vstack((start, stop, unk))

# Loop through words and pull initialized embeddings
for i in range(3, len(vocab.ordered_words())):
    try:
        vector = model.wv[vocab.ordered_words()[i]]
    except KeyError: # the word does not have a pre-initialized vector
        vector = np.random.rand(300,) #initialize randomly
    
    embeddings = np.vstack((embeddings,vector))

In [30]:
embeddings.shape

(14836, 300)

In [37]:
import tensorflow as tf

# Defining the graph
class initialized_CNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
    """
    def __init__(self, sequence_length, num_classes, 
                 vocab_size,embedding_size, filter_sizes, 
                 num_filters, embedding):
        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    
        with tf.device('/cpu:0'), tf.name_scope("Embedding_Layer"):
            W = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_size]),
                trainable=True, name="W")
            self.embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_size])
            self.embedding_init = W.assign(self.embedding_placeholder)
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
            
        
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Max-pooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * 3 #len(filter_sizes)
        self.h_pool = tf.concat(3, pooled_outputs)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
        
        
        # Add dropout
        with tf.name_scope("Dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
            
            
        # Define outputs
        with tf.name_scope("Output_Layer"):
            W = tf.Variable(tf.truncated_normal([num_filters_total, num_classes], stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")
        
        
        # Calculate mean cross-entropy loss
        with tf.name_scope("Cost_Function"):
            losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, self.input_y)
            self.loss = tf.reduce_mean(losses)
            
        
        # Calculate Accuracy to compare to other models
        with tf.name_scope("Accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")


In [38]:
# Building the graph

with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        cnn = initialized_CNN(
            sequence_length=x_train.shape[1],
            num_classes=3,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=300,
            filter_sizes=map(int, '3,4,5'.split(",")),
            num_filters=128,
            embedding = embeddings
        )

        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-4)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)


        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpointing
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")

        # Tensorflow assumes this directory already exists so we need to create it
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables())

        sess.run(cnn.embedding_init, feed_dict={cnn.embedding_placeholder: embeddings})
        sess.run(tf.global_variables_initializer())

Writing to /Users/megan/Documents/W266_final_project/runs/1493485591



In [39]:
# Generate batches
batches = batch_generator(x_train, y_train, batch_size = 64, num_epochs = 5)

# Training loop
for batch in batches:
    x_batch, y_batch = batch
    train_epoch(x_batch, y_batch)
    current_step = tf.train.global_step(sess, global_step)
    if current_step % 50 == 0: # evaluate every 50 steps
        print("\nEvaluation:")
        dev_epoch(x_dev, y_dev, writer=dev_summary_writer)
        print("")
    if current_step % 100 == 0: # checkpoint every 100 steps
        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
        print("Saved model checkpoint to {}\n".format(path))


Evaluation:
2017-04-29T10:07:23.973549: step 50, loss 1.04479, acc 0.542201


Evaluation:
2017-04-29T10:08:08.546536: step 100, loss 0.991615, acc 0.542643

Saved model checkpoint to /Users/megan/Documents/W266_final_project/runs/1493485591/checkpoints/model-100


Evaluation:
2017-04-29T10:08:53.087333: step 150, loss 0.950423, acc 0.539991


Evaluation:
2017-04-29T10:09:39.640344: step 200, loss 0.924513, acc 0.548829

Saved model checkpoint to /Users/megan/Documents/W266_final_project/runs/1493485591/checkpoints/model-200


Evaluation:
2017-04-29T10:10:27.950290: step 250, loss 0.905098, acc 0.571807


Evaluation:
2017-04-29T10:11:15.136827: step 300, loss 0.880652, acc 0.601414

Saved model checkpoint to /Users/megan/Documents/W266_final_project/runs/1493485591/checkpoints/model-300


Evaluation:
2017-04-29T10:12:03.063064: step 350, loss 0.858892, acc 0.655767


Evaluation:
2017-04-29T10:12:50.893534: step 400, loss 0.837894, acc 0.693327

Saved model checkpoint to /Users/megan/Do

In [40]:
# Getting predictions for the dev set
predictions = dev_epoch(x_dev, y_dev, writer=dev_summary_writer)

# Printing out examples of predictions in the dev set
data_dev = data_all[split:]

print("Note:",'\n',"Map: Liberal --> 0, Neutral --> 1, Conservative --> 2",'\n')
for i in range(0, 10):
    correct_label = y_dev[i].index(max(y_dev[i]))
    if predictions[i] == correct_label:
        print("CORRECT:")
        print("Correct Label:", correct_label)
        print ("Predicted Label:", predictions[i])
        print(data_dev[i], '\n')
    else:
        print("WRONG:")
        print("Correct Label:", correct_label)
        print ("Predicted Label:", predictions[i])
        print(data_dev[i], '\n')


2017-04-29T10:49:33.657669: step 1595, loss 0.461783, acc 0.836942
Note: 
 Map: Liberal --> 0, Neutral --> 1, Conservative --> 2 

CORRECT:
Correct Label: 0
Predicted Label: 0
a global agreement on a new ( strategic plan to halt biodiversity loss ) , the mobilisation of the finance needed to make it happen and a new legally-binding protocol on access and benefit sharing ( ABS ) 

CORRECT:
Correct Label: 0
Predicted Label: 0
a summer replacement sitcom created by Alan King , depicted impoverished Muscovites suffering deprivations and corny bread line jokes while living in a cramped one-bedroom apartment 

CORRECT:
Correct Label: 1
Predicted Label: 1
the housing market 

CORRECT:
Correct Label: 1
Predicted Label: 1
three days of oral arguments 

CORRECT:
Correct Label: 2
Predicted Label: 2
the Libertarian Party 's Bob Barr , who had an excellent immigration record as a Republican congressman and who has not totally capitulated to the culturally illiterate left-libertarianism that now dom