In [1]:
# LSTM Sample code
# from: https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/dynamic_rnn.py

In [2]:
from __future__ import print_function

import collections, itertools
import random
import re

import numpy as np
import tensorflow as tf

# Helper libraries
from shared_lib import vocabulary, utils

In [3]:
# Word processing functions
def canonicalize_digits(word):
    """Replace every digit of a purely non-alphabetic token with ``DG``.

    Args:
      word (str): a single token.

    Returns:
      str: ``word`` unchanged if it contains any alphabetic character;
      otherwise ``word`` with each digit replaced by ``"DG"``. If the result
      starts with ``"DG"``, commas (thousands separators) are removed too.
    """
    # Tokens that mix letters and digits (e.g. "b2b") are left untouched.
    if any(c.isalpha() for c in word):
        return word
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    word = re.sub(r"\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "")  # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    """Lower-case a token and map it onto a known vocabulary entry.

    Args:
      word (str): token to canonicalize.
      wordset: optional container supporting ``in``; when given, tokens that
        are not members are replaced by ``"<unk>"``.
      digits (bool): when true, tokens not already in ``wordset`` are passed
        through ``canonicalize_digits`` before the membership test.

    Returns:
      str: the canonical token, or ``"<unk>"`` if ``wordset`` is given and the
      canonical token is not in it.
    """
    word = word.lower()
    if digits:
        # Identity test against None: `!= None` would invoke the container's
        # own comparison operator, which fails for array-like wordsets.
        if wordset is not None and word in wordset:
            return word
        word = canonicalize_digits(word)  # try to canonicalize numbers
    if wordset is None or word in wordset:
        return word
    return "<unk>"  # unknown token

def canonicalize_words(words, **kw):
    """Apply ``canonicalize_word`` to each token, forwarding ``kw`` unchanged.

    Returns:
      list: the canonicalized tokens, in input order.
    """
    canonical = []
    for token in words:
        canonical.append(canonicalize_word(token, **kw))
    return canonical

def sents_to_tokens(sents, vocab):
    """Return a flattened array of the words in the sentences, with boundary padding.

    Each sentence is wrapped as ``<s> ... </s>`` and every token is
    canonicalized; tokens not in ``vocab.wordset`` become ``"<unk>"``.

    Args:
      sents: iterable of sentences, each an iterable of string tokens.
      vocab: object exposing a ``wordset`` container of known tokens.

    Returns:
      numpy array (dtype=object) holding all tokens of all sentences in order.
    """
    # list(s) lets tuples and other sequences be used as sentences, not only lists.
    padded_sentences = (["<s>"] + list(s) + ["</s>"] for s in sents)
    # Chain lazily so no intermediate flattened list is built.
    # This will canonicalize words, and replace anything not in vocab with <unk>
    return np.array([canonicalize_word(w, wordset=vocab.wordset)
                     for w in itertools.chain.from_iterable(padded_sentences)],
                    dtype=object)

def flatten(list_of_lists):
    """Concatenate the inner iterables of ``list_of_lists`` into one flat list."""
    return [item for inner in list_of_lists for item in inner]

def build_vocab(corpus, V=10000):
    """Build a ``vocabulary.Vocabulary`` from whitespace-split corpus text.

    Args:
      corpus: indexable collection of strings exposing ``shape[0]``
        (presumably a 1-D numpy array or pandas Series -- confirm with caller).
      V (int): passed to ``Vocabulary`` as ``size``.

    Returns:
      The ``Vocabulary`` built from the canonicalized tokens.
    """
    tokens = []
    for row in range(corpus.shape[0]):
        tokens.extend(corpus[row].split())
    # Canonicalize lazily; no wordset is given, so only lower-casing and digit
    # replacement are applied here.
    return vocabulary.Vocabulary((canonicalize_word(tok) for tok in tokens), size=V)


def preprocess_sentences(corpus, vocab):
    """Canonicalize each sentence of ``corpus`` and map its words to ids.

    Args:
      corpus: indexable collection of sentence strings exposing ``shape[0]``;
        each sentence is split on whitespace.
      vocab: initialized Vocabulary object providing ``word_to_id`` (used as
        the set of known words) and ``words_to_ids``.

    Returns:
      list: one entry per sentence, each being whatever
      ``vocab.words_to_ids`` returns for that sentence's canonicalized words.
      Sentences are NOT flattened and no boundary tokens are added.
    """
    sentences = []
    for i in range(corpus.shape[0]):
        # Words outside the vocabulary are replaced by "<unk>" before lookup.
        words = [canonicalize_word(w, wordset=vocab.word_to_id)
                 for w in corpus[i].split()]
        sentences.append(vocab.words_to_ids(words))
    return sentences

def process_data(data, V=10000):
    """Build a vocabulary from ``data`` and encode ``data`` with it.

    Args:
      data: corpus accepted by ``build_vocab`` and ``preprocess_sentences``.
      V (int): vocabulary size forwarded to ``build_vocab``.

    Returns:
      tuple: ``(vocab, sentences)`` where ``sentences`` is the output of
      ``preprocess_sentences(data, vocab)``.
    """
    vocab = build_vocab(data, V)
    return vocab, preprocess_sentences(data, vocab)


In [None]:
# Pre-process data
# The vocabulary is built from the training split only and reused for the test
# split, so a given word maps to the same id in both; test words unseen during
# training are canonicalized to <unk>.
vocab, processed_data_train = process_data(data_train)
processed_data_test = preprocess_sentences(data_test, vocab)

In [None]:
# Longest sentence (in tokens) over the test and train splits
max_sequence = max(max(len(sent) for sent in processed_data_test),
                   max(len(sent) for sent in processed_data_train))

In [None]:
class DataHandling(object):
    """Pads word-id sequences to a fixed length and serves them in mini-batches.

    Attributes:
      data: list of sequences; each sequence is a list of ``max_seq_len``
        one-element lists (one scalar per timestep), right-padded with ``[0.]``.
      labels: one one-hot list per label, ordered (Liberal, Neutral, Conservative).
      seqlen: number of real (un-padded) timesteps of each sequence.
      batch_id: index of the first sample of the next batch.
    """

    def __init__(self, data, labs, max_seq_len=None):
        """Wrap, pad and label the dataset.

        Args:
          data: iterable of word-id sequences. The input is not modified.
          labs: 1-D iterable of label strings aligned with ``data``.
          max_seq_len: length every sequence is padded to; longer sequences
            are truncated to it. When omitted, the notebook-level
            ``max_sequence`` is looked up at call time.
        """
        if max_seq_len is None:
            # Resolved here rather than in the signature so the value current
            # at call time is used, not the one at class-definition time.
            max_seq_len = max_sequence
        self.data = []
        self.labels = []
        self.seqlen = []
        self.batch_id = 0

        for sentence in data:
            # One scalar feature per timestep: wrap every id in its own list.
            steps = [[token] for token in list(sentence)[:max_seq_len]]
            # Record the true length before padding; batches expose it so the
            # consumer can ignore the padded timesteps.
            self.seqlen.append(len(steps))
            # Pad sequence for dimension consistency
            steps.extend([0.] for _ in range(max_seq_len - len(steps)))
            self.data.append(steps)

        # Map: Liberal --> (1,0,0), Neutral --> (0,1,0), Conservative --> (0,0,1)
        # Any label other than 'Liberal' / 'Conservative' is treated as Neutral.
        for lab in labs:
            if lab == 'Liberal':
                self.labels.append([1., 0., 0.])
            elif lab == 'Conservative':
                self.labels.append([0., 0., 1.])
            else:
                self.labels.append([0., 1., 0.])

    def next(self, batch_size):
        """Return the next batch as ``(data, labels, seqlen)`` lists.

        The final batch of a pass may hold fewer than ``batch_size`` samples;
        once the end of the dataset has been reached the next call starts
        over from the first sample.
        """
        if self.batch_id >= len(self.data):
            self.batch_id = 0
        start = self.batch_id
        end = min(start + batch_size, len(self.data))
        self.batch_id = end
        return self.data[start:end], self.labels[start:end], self.seqlen[start:end]


In [None]:
# Data to pass to LSTM
# Hand DataHandling its own copy of each list so the processed_data_* variables
# are never altered and this cell can be re-run safely.
trainset = DataHandling(list(processed_data_train), labs_train)
testset = DataHandling(list(processed_data_test), labs_test)

# Show that each split has as many label rows as data rows
print('Train data shape: ', len(trainset.data))
print('Train label shape: ', len(trainset.labels))
print('Test data shape: ', len(testset.data))
print('Test label shape: ', len(testset.labels))

In [None]:
# Training parameters
learning_rate = 0.01  # step size of the gradient-descent optimizer
training_iters = 1000000  # training stops once step * batch_size reaches this many samples
batch_size = 20  # samples requested from the dataset per training step
display_step = 10  # report loss/accuracy every this many steps

# Network Parameters
seq_max_len = max_sequence # Sequence max length (every input is padded to this)
n_hidden = 64 # hidden layer num of features
n_classes = 3 # one-hot classes: Liberal, Neutral, Conservative


# tf Graph input
# x: [batch, seq_max_len, 1] -- one scalar (the word id) per timestep
x = tf.placeholder("float", [None, seq_max_len, 1])
# y: [batch, n_classes] -- one-hot labels
y = tf.placeholder("float", [None, n_classes])

# A placeholder for indicating each sequence length
seqlen = tf.placeholder(tf.int32, [None])

# Output-layer parameters (randomly initialized): hidden state -> class logits
weights = {
    'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))
}
biases = {
    'out': tf.Variable(tf.random_normal([n_classes]))
}

In [None]:
def dynamicRNN(x, seqlen, weights, biases):
    """Build an LSTM classifier that reads each sequence up to its true length.

    Args:
      x: input tensor, unstacked along axis 1 into ``seq_max_len`` timesteps.
      seqlen: int tensor holding the real length of every sequence.
      weights: dict whose ``'out'`` entry is the output projection matrix.
      biases: dict whose ``'out'`` entry is the output bias.

    Returns:
      Tensor of un-normalized class scores (linear activation), one row per
      sequence in the batch.
    """
    # The rnn call expects a list of per-timestep tensors rather than a single
    # (batch, steps, features) tensor.
    timesteps = tf.unstack(x, seq_max_len, 1)

    cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)

    # Passing sequence_length makes the computation dynamic per sample.
    # NOTE(review): `tf.nn.rnn` looks like the pre-1.0 TensorFlow name for
    # this op -- confirm against the installed version.
    step_outputs, _ = tf.nn.rnn(cell, timesteps, dtype=tf.float32,
                                sequence_length=seqlen)

    # Stack the per-step outputs and make the result batch-major:
    # [n_steps, batch, n_hidden] -> [batch, n_steps, n_hidden].
    batch_major = tf.transpose(tf.stack(step_outputs), [1, 0, 2])

    # We need the output at each sample's last real timestep. Flatten to
    # [batch * n_steps, n_hidden] and gather row
    # (sample * seq_max_len + seqlen - 1) for every sample.
    n_samples = tf.shape(batch_major)[0]
    last_step_rows = tf.range(0, n_samples) * seq_max_len + (seqlen - 1)
    flat_outputs = tf.reshape(batch_major, [-1, n_hidden])
    last_outputs = tf.gather(flat_outputs, last_step_rows)

    # Linear activation on the last relevant output
    return tf.matmul(last_outputs, weights['out']) + biases['out']

In [None]:
# Class scores for every sequence in the batch
pred = dynamicRNN(x, seqlen, weights, biases)

# Loss: mean softmax cross-entropy; optimizer: plain gradient descent
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=pred)
)
optimizer = tf.train.GradientDescentOptimizer(
    learning_rate=learning_rate
).minimize(cost)

# Accuracy: fraction of samples whose highest-scoring class matches the label
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))


In [None]:
# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    step = 1
    # Keep training until reach max iterations
    while step * batch_size < training_iters:
        batch_x, batch_y, batch_seqlen = trainset.next(batch_size)
        feed = {x: batch_x, y: batch_y, seqlen: batch_seqlen}
        # Run optimization op (backprop)
        sess.run(optimizer, feed_dict=feed)
        if step % display_step == 0:
            # Fetch batch accuracy and loss together: one forward pass
            # instead of two on the same batch.
            acc, loss = sess.run([accuracy, cost], feed_dict=feed)
            print("Iter {}, Minibatch Loss= {:.6f}, Training Accuracy= {:.5f}".format(
                step * batch_size, loss, acc))
        step += 1
    print("Optimization Finished!")

    # Calculate accuracy on the whole test set in a single run
    test_feed = {x: testset.data, y: testset.labels, seqlen: testset.seqlen}
    print("Testing Accuracy:", sess.run(accuracy, feed_dict=test_feed))