In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
import os
import time
from datetime import datetime
from utils import *
import matplotlib.pyplot as plt
import re
from nltk.tokenize import TweetTokenizer
import tensorflow as tf
from tensorflow.contrib import rnn
import math

%matplotlib inline

# Load Dictionary

In [2]:
# Load the five word lists and build:
#   * one list per category (pos/neg/rev/inc/dec), and
#   * sent_words_dict, mapping each word to its category id
#     (0 = positive, 1 = negative, 2 = reverse, 3 = incremental, 4 = decremental).
# A word that appears in more than one file keeps the category of the first
# file that lists it, in the order given below.
pos_list = list()
neg_list = list()
rev_list = list()
inc_list = list()
dec_list = list()
sent_words_dict = dict()

# (path, category id, per-category list) -- the order here is the priority order.
lexicon_sources = [
    ('../dict/positive-words.txt', 0, pos_list),
    ('../dict/negative-words.txt', 1, neg_list),
    ('../dict/reverse-words.txt', 2, rev_list),
    ('../dict/incremental-words.txt', 3, inc_list),
    ('../dict/decremental-words.txt', 4, dec_list),
]

for lexicon_path, category_id, category_words in lexicon_sources:
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(lexicon_path, 'r') as lexicon_file:
        for line in lexicon_file:
            fields = line.split()
            if not fields:
                # Blank / whitespace-only line: nothing to register.
                continue
            word = fields[0]  # only the first whitespace-separated field is used
            if word not in sent_words_dict:
                sent_words_dict[word] = category_id
                category_words.append(word)

In [3]:
# Upper bound on the vocabulary; a later cell overwrites this with the actual
# vocabulary length once the word index has been built.
vocabulary_size = 20000

# Special marker tokens inserted into every sentence.
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"
unknown_token = "UNKNOWN_TOKEN"

# Accumulators that the training-set reader fills in.
sentences = []
X_train_labels = []

# Sentence-level label string -> class id. The three "neutral" spellings
# all collapse onto class 2.
sent_dict = {
    'positive': 0,
    'negative': 1,
    'neutral': 2,
    'objective': 2,
    'objective-OR-neutral': 2,
}

# Preprocess the training dataset

In [4]:
# Read the data and append SENTENCE_START and SENTENCE_END tokens
print "Reading CSV file..."
with open('../data/semeval/2013/b.dist.csv', 'rb') as f:
    reader = csv.reader(f, delimiter='\t')
    reader.next()
    # Split full comments into sentences
    for x in reader:
        tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
        sentences.append(re.sub(r"http\S+", "", x[3]).decode('utf-8').lower())
        # Append SENTENCE_START and SENTENCE_END
        X_train_labels.append(sent_dict[x[2]])
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]

f.close()
print "Parsed %d sentences." % (len(sentences))

# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

Reading CSV file...
Parsed 6087 sentences.


In [5]:
# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print "Found %d unique words tokens." % len(word_freq.items())
 
# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab if re.search(r'^[a-zA-Z]', x[0]) and len(x[0]) < 20 and len(x[0]) > 1]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])
word_to_index_reverse = {v: k for k, v in word_to_index.iteritems()}

vocabulary_size = len(word_to_index)

print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])

Found 16824 unique words tokens.
Using vocabulary size 15399.
The least frequent word in our vocabulary is 'alexisnews' and appeared 1 times.


In [6]:
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w for w in sent if w in word_to_index]
    
    if (len(tokenized_sentences[i]) < 40):
        while (len(tokenized_sentences[i]) < 40):
            tokenized_sentences[i].append(unknown_token)
    else:
        while (len(tokenized_sentences[i]) > 40):
            tokenized_sentences[i].pop()
    
print "\nExample sentence: '%s'" % sentences[0]
print "\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]

X_train = list()
X_train_sent_for_word = list()
X_train_mask = list()

# Create the training data
for sent in tokenized_sentences:
    temp_index = list()
    temp_sent = list()
    temp_mask = list()
    for w in sent[:-1]:
        temp_index.append(word_to_index[w])
        if (w in pos_list):
            temp_sent.append(0)
            temp_mask.append(1.)
        elif (w in neg_list):
            temp_sent.append(1)
            temp_mask.append(1.)
        elif (w in rev_list):
            temp_sent.append(2)
            temp_mask.append(1.)
        elif (w in inc_list):
            temp_sent.append(3)
            temp_mask.append(1.)
        elif (w in dec_list):
            temp_sent.append(4)
            temp_mask.append(1.)
        elif (w == unknown_token):
            temp_sent.append(5)
            temp_mask.append(0.)
        else:
            temp_sent.append(5)
            temp_mask.append(0.)
            
    X_train.append(temp_index)
    X_train_sent_for_word.append(temp_sent)
    X_train_mask.append(temp_mask)
    
#X_train = np.asarray(X_train)
#X_train_sent_for_word = np.asarray(X_train_sent_for_word)
#X_train_mask = np.asarray(X_train_mask)

y_train = [[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences]

# Print an training data example
x_example, y_example = X_train[0], y_train[0]
print "x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example)
print "\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example)


Example sentence: 'SENTENCE_START iranian general says israel's iron dome can't deal with their missiles (keep talking like that and we may end up finding out) SENTENCE_END'

Example sentence after Pre-processing: '[u'SENTENCE_START', u'iranian', u'general', u'says', u'israel', u'iron', u'dome', u'ca', u"n't", u'deal', u'with', u'their', u'missiles', u'keep', u'talking', u'like', u'that', u'and', u'we', u'may', u'end', u'up', u'finding', u'out', u'SENTENCE_END', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN']'




x:
SENTENCE_START iranian general says israel iron dome ca n't deal with their missiles keep talking like that and we may end up finding out SENTENCE_END UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN
[0, 7457, 2283, 214, 804, 4654, 4323, 93, 23, 659, 11, 131, 7047, 343, 924, 53, 16, 6, 28, 24, 220, 40, 4051, 27, 1, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398]

y:
iranian general says israel iron dome ca n't deal with their missiles keep talking like that and we may end up finding out SENTENCE_END UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN
[7457, 2283, 214, 804, 4654, 4323, 93, 23, 659, 11, 131, 7047, 343, 924, 53, 16, 6, 2

# Preprocess the test dataset

In [7]:
sentences = list()
X_test_labels = list()
with open('../data/semeval/2013/b.test.dist.csv', 'rb') as f:
    reader = csv.reader(f, delimiter='\t')
    reader.next()
    # Split full comments into sentences
    for x in reader:
        tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
        sentences.append(re.sub(r"http\S+", "", x[3]).decode('utf-8').lower())
        # Append SENTENCE_START and SENTENCE_END
        X_test_labels.append(sent_dict[x[2]])
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
    
print "Parsed %d sentences." % (len(sentences))
f.close()
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w for w in sent if w in word_to_index]
    if (len(tokenized_sentences[i]) < 40):
        while (len(tokenized_sentences[i]) < 40):
            tokenized_sentences[i].append(unknown_token)
    else:
        while (len(tokenized_sentences[i]) > 40):
            tokenized_sentences[i].pop()
print "\nExample sentence: '%s'" % sentences[0]
print "\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]

X_test = list()
X_test_sent_for_word = list()
X_test_mask = list()

# Create the training data
for sent in tokenized_sentences:
    temp_index = list()
    temp_sent = list()
    temp_mask = list()
    for w in sent[:-1]:
        temp_index.append(word_to_index[w])
        if (w in pos_list):
            temp_sent.append(0)
            temp_mask.append(1.)
        elif (w in neg_list):
            temp_sent.append(1)
            temp_mask.append(1.)
        elif (w in rev_list):
            temp_sent.append(2)
            temp_mask.append(1.)
        elif (w in inc_list):
            temp_sent.append(3)
            temp_mask.append(1.)
        elif (w in dec_list):
            temp_sent.append(4)
            temp_mask.append(1.)
        elif (w == unknown_token):
            temp_sent.append(5)
            temp_mask.append(0.)
        else:
            temp_sent.append(5)
            temp_mask.append(0.)
            
    X_test.append(temp_index)
    X_test_sent_for_word.append(temp_sent)
    X_test_mask.append(temp_mask)
    
#X_test = np.asarray(X_test)
#X_test_sent_for_word = np.asarray(X_test_sent_for_word)
#X_test_mask = np.asarray(X_test_mask)

y_test = [[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences]

# Print an training data example
x_example, y_example = X_test[0], y_test[0]
print "x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example)
print "\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example)

Parsed 2345 sentences.

Example sentence: 'SENTENCE_START on radio786 100.4fm 7:10 fri oct 19 labour analyst shawn hattingh: cosatu's role in the context of unrest in the mining  SENTENCE_END'

Example sentence after Pre-processing: '[u'SENTENCE_START', u'on', u'fri', u'oct', u'labour', u'shawn', u'role', u'in', u'the', u'of', u'in', u'the', u'mining', u'SENTENCE_END', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN']'




x:
SENTENCE_START on fri oct labour shawn role in the of in the mining SENTENCE_END UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN
[0, 5, 1024, 174, 9783, 4309, 2961, 4, 2, 7, 4, 2, 4818, 1, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398, 15398]

y:
on fri oct labour shawn role in the of in the mining SENTENCE_END UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_T

# Modeling

In [11]:
# Build the TensorFlow graph: a stacked bidirectional LSTM whose outputs feed
# two heads -- a per-time-step next-token head ("reconstruction") and a
# sentence-level sentiment head -- combined into one weighted cost.
# NOTE(review): tf.concat(2, ...), tf.split(0, n, x), tf.pack and
# tf.nn.bidirectional_rnn look like pre-1.0 TensorFlow spellings -- confirm the
# installed version before upgrading.
batch_size = 100
seq_max_len = 39  # time steps per example (the preprocessing cells feed sent[:-1] / sent[1:] of 40-token sentences)
num_sentiment_label = 3  # sentence-level classes (width of the sentiment head)
num_sentiment_for_word = 6  # per-token category ids are one-hot encoded to this width
embedding_size = 256
num_linear_inside = 256  # not referenced anywhere else in this cell
num_ltsm_inside = 256  # hidden units per LSTM cell
layers = 2  # number of stacked LSTM layers per direction
alpha  = 0.01  # weight of the reconstruction loss in the total cost; (1 - alpha) goes to the sentiment loss
# Width of the per-token input: one-hot word id concatenated with one-hot category id.
dimension = vocabulary_size + num_sentiment_for_word

graph = tf.Graph()
with graph.as_default(), tf.device('/cpu:0'):
    # Inputs. The leading None dimension lets the same graph accept any batch size.
    tf_X_train = tf.placeholder(tf.int32, shape=[None, seq_max_len])
    tf_X_train_sent_for_word = tf.placeholder(tf.int32, shape=[None, seq_max_len])
    # NOTE(review): the mask placeholder is fed by the session cell but no op
    # in this cell reads it; its only use is in the commented-out block below.
    tf_X_train_mask = tf.placeholder(tf.float32, shape=[None, seq_max_len])
    tf_X_train_labels = tf.placeholder(tf.int32, shape=[None])
    tf_y_train = tf.placeholder(tf.int32, shape=[None, seq_max_len])
    keep_prob = tf.placeholder(tf.float32)
    
    # Dense projection applied to the concatenated one-hot input.
    embeddings_w = tf.Variable(tf.random_uniform([dimension, embedding_size], -1.0, 1.0))
    embeddings_b = tf.Variable(tf.zeros([embedding_size]))
    
    # Next-token head: maps the 2 * num_ltsm_inside wide RNN output to vocabulary logits.
    lm_w = tf.Variable(tf.truncated_normal([2 * num_ltsm_inside, vocabulary_size], 
                                           stddev=1.0 / math.sqrt(num_ltsm_inside)))
    lm_b = tf.Variable(tf.zeros([vocabulary_size]))
    
    # Sentiment head: maps the RNN output to num_sentiment_label logits.
    sent_w = tf.Variable(tf.truncated_normal([2 * num_ltsm_inside, num_sentiment_label],
                                             stddev=1.0 / math.sqrt(num_ltsm_inside)))
    sent_b = tf.Variable(tf.zeros([num_sentiment_label]))
    
    # One-hot encode word ids, per-token category ids, sentence labels and targets.
    X_data = tf.one_hot(tf_X_train, vocabulary_size,
                        on_value = 1.0,
                        off_value = 0.0,
                        axis = -1)
    X_sent_for_word = tf.one_hot(tf_X_train_sent_for_word, num_sentiment_for_word,
                                 on_value = 1.0,
                                 off_value = 0.0,
                                 axis = -1)
    X_labels = tf.one_hot(tf_X_train_labels, num_sentiment_label,
                          on_value = 1.0,
                          off_value = 0.0,
                          axis = -1)
    y_labels = tf.one_hot(tf_y_train, vocabulary_size,
                          on_value = 1.0,
                          off_value = 0.0,
                          axis = -1)
    # Join word and category one-hots along axis 2, giving `dimension` features per token.
    X_data = tf.concat(2, [X_data, X_sent_for_word])
    

    # Swap the first two axes so time comes first.
    X_data = tf.transpose(X_data, [1, 0, 2])
    # Reshaping to (n_steps*batch_size, n_input)
    X_data = tf.reshape(X_data, [-1, dimension])
    # Linear projection + ReLU, then cut into seq_max_len pieces along axis 0
    # (one tensor per time step) as input for the RNN.
    X_data = tf.add(tf.matmul(X_data, embeddings_w), embeddings_b)
    X_data = tf.nn.relu(X_data)
    X_data = tf.split(0, seq_max_len, X_data)
    

    # Creating the forward and backwards cells
    lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_ltsm_inside, forget_bias=1.0)
    lstm_bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_ltsm_inside, forget_bias=1.0)
    # Pass lstm_fw_cell / lstm_bw_cell directly to tf.nn.bidirectional_rnn
    # if only a single layer is needed
    # NOTE(review): [cell]*layers repeats the same cell object in the list --
    # confirm this is the intended way to stack layers for the TF version in use.
    lstm_fw_multicell = tf.nn.rnn_cell.MultiRNNCell([lstm_fw_cell]*layers)
    lstm_bw_multicell = tf.nn.rnn_cell.MultiRNNCell([lstm_bw_cell]*layers)
    # Get lstm cell output
        
    # Only the first of the three returned values is used (presumably the
    # per-step outputs; the other two are discarded).
    outputs, _, _ = tf.nn.bidirectional_rnn(lstm_fw_multicell,
                                            lstm_bw_multicell,
                                            X_data,
                                            dtype='float32')
    # outputs = tf.pack(outputs)
    
    # Next-token head: flatten the outputs to rows of width 2 * num_ltsm_inside,
    # project to vocabulary logits, then rebuild a [batch, step, vocab] tensor.
    reconstruction = tf.reshape(outputs, [-1, 2 * num_ltsm_inside])    
    reconstruction = tf.add(tf.matmul(reconstruction, lm_w), lm_b)
    reconstruction = tf.split(0, seq_max_len, reconstruction)
    # change back dimension to [batch_size, n_step, n_input]
    reconstruction = tf.pack(reconstruction)
    reconstruction = tf.transpose(reconstruction, [1, 0, 2]) 
    # From here on `reconstruction` is a scalar: the mean cross-entropy against the one-hot targets.
    reconstruction = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(reconstruction,
                                                                            y_labels))
    
    
    # Disabled alternative: masked mean over time steps instead of the last step.
    # sentiment = tf.transpose(outputs, [1, 0, 2])
    # mask = tf.expand_dims(tf_X_train_mask, 2)
    # sentiment = tf.multiply(sentiment, mask)
    # sentiment = tf.reduce_mean(sentiment, reduction_indices=1)
    # Sentiment head reads only the last element of `outputs`.
    # NOTE(review): the preprocessing cells pad short sentences with the unknown
    # token, so this element is often computed on padding -- confirm this is intended.
    sentiment = outputs[-1]
    sentiment = tf.nn.dropout(sentiment, keep_prob)
    sentiment = tf.add(tf.matmul(sentiment, sent_w), sent_b)
    prediction = tf.argmax(tf.nn.softmax(sentiment), 1)
    # accuracy is the fraction of correct predictions in the fed batch;
    # correct_prediction is the count, so callers can sum it across batches.
    accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, tf.argmax(X_labels, 1)), tf.float32))
    correct_prediction = tf.reduce_sum(tf.cast(tf.equal(prediction, tf.argmax(X_labels, 1)), tf.float32))
    # From here on `sentiment` is a scalar: the mean cross-entropy against the one-hot sentence labels.
    sentiment = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(sentiment, X_labels))
    
    # Total objective: weighted sum of the two losses.
    cost = alpha * reconstruction + (1 - alpha) * sentiment

    # Plain gradient descent; the learning rate starts at 0.01 and is decayed
    # by 0.65 with decay_steps=1000 in staircase mode, driven by global_step.
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(0.01, global_step, 1000, 0.65, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost,
                                                                          global_step=global_step)
    
    saver = tf.train.Saver()
    

(?, 39, 15405)
(39, ?, 512)


In [16]:
def make_feed_dict(inputs, sent_for_word, mask, labels, targets, start, end, keep):
    """Build the feed_dict for rows [start:end] of one data split.

    inputs, sent_for_word, mask, labels and targets are the parallel
    per-example lists of one split (train or test); each is sliced with the
    same bounds. keep is the value fed to the dropout keep_prob placeholder.
    """
    return {tf_X_train: inputs[start:end],
            tf_X_train_sent_for_word: sent_for_word[start:end],
            tf_X_train_mask: mask[start:end],
            tf_X_train_labels: labels[start:end],
            tf_y_train: targets[start:end],
            keep_prob: keep}


# Train for a fixed number of steps; every 5th step evaluate on the whole
# test set and write a checkpoint.
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    # saver.restore(session, "se-v002.ckpt")
    for i in range(10000):
        # Cycle through the training set; when a batch would run past the end,
        # take the last batch_size examples instead so the batch stays full.
        s_start = (i * batch_size) % len(X_train)
        if s_start + batch_size >= len(X_train):
            s_start = max(0, len(X_train) - batch_size)

        s_end = s_start + batch_size

        _, c, r, s, a = session.run([optimizer, cost, reconstruction, sentiment, accuracy],
                                    feed_dict=make_feed_dict(X_train,
                                                             X_train_sent_for_word,
                                                             X_train_mask,
                                                             X_train_labels,
                                                             y_train,
                                                             s_start, s_end, 0.5))

        print("Cost at step %d is %.4f, cost lm is %.4f, cost sent is %.4f, accuracy is %.4f" %(i, c, r, s, a))
        if (i % 5 == 0):
            true_prediction = 0
            # Walk the test set in consecutive, non-overlapping slices so each
            # example is counted exactly once; only the final slice may be
            # shorter than batch_size. (Stepping back by one batch before the
            # final slice would count the last full batch twice.)
            for step in range(0, len(X_test), batch_size):
                stop = min(step + batch_size, len(X_test))
                correct_pred = session.run(correct_prediction,
                                           feed_dict=make_feed_dict(X_test,
                                                                    X_test_sent_for_word,
                                                                    X_test_mask,
                                                                    X_test_labels,
                                                                    y_test,
                                                                    step, stop, 1.0))
                print ("Correct prediction: {0} - {1}/ batch_size = {2}".format(step,
                                                                                correct_pred,
                                                                                stop - step))
                true_prediction += correct_pred

            print(true_prediction)
            print ("Test accuracy: {}".format(true_prediction/len(X_test)))

            saver.save(session, "se-v002.ckpt")

Initialized
Cost at step 0 is 1.1704, cost lm is 9.7716, cost sent is 1.0835, accuracy is 0.3300
Correct prediction: 0 - 44.0/ batch_size = 100
Correct prediction: 100 - 48.0/ batch_size = 100
Correct prediction: 200 - 35.0/ batch_size = 100
Correct prediction: 300 - 43.0/ batch_size = 100
Correct prediction: 400 - 54.0/ batch_size = 100
Correct prediction: 500 - 38.0/ batch_size = 100
Correct prediction: 600 - 38.0/ batch_size = 100
Correct prediction: 700 - 39.0/ batch_size = 100
Correct prediction: 800 - 40.0/ batch_size = 100
Correct prediction: 900 - 31.0/ batch_size = 100
Correct prediction: 1000 - 50.0/ batch_size = 100
Correct prediction: 1100 - 51.0/ batch_size = 100
Correct prediction: 1200 - 54.0/ batch_size = 100
Correct prediction: 1300 - 44.0/ batch_size = 100
Correct prediction: 1400 - 52.0/ batch_size = 100
Correct prediction: 1500 - 59.0/ batch_size = 100
Correct prediction: 1600 - 43.0/ batch_size = 100
Correct prediction: 1700 - 58.0/ batch_size = 100
Correct predict

KeyboardInterrupt: 