In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import HTMLParser as htm
import string
import re
import time

# SK-learn library for splitting data
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

## Read in data

In [4]:
data = pd.read_csv("tweet_data_1.csv",sep='\t',quoting=3)
data["escape"] = data.apply(lambda row: htm.HTMLParser().unescape(row[1].decode("utf-8")),axis=1)

data.head()

Unnamed: 0,Id,Tweet,Emotion,Positive,escape
0,138881940341260288:,I got a surprise for all you bitches...pull th...,:: surprise,0,I got a surprise for all you bitches...pull th...
1,144479819843911683:,If I was a thief.. The first thing I would ste...,:: joy,1,If I was a thief.. The first thing I would ste...
2,139110849120972800:,"""&quot;@RevRunWisdom: not afraid of tomorrow, ...",:: fear,0,"""""@RevRunWisdom: not afraid of tomorrow, for I..."
3,141532076791971840:,"""Extreme can neither fight nor fly.&#xA;-- Wil...",:: fear,0,"""Extreme can neither fight nor fly.\n-- Willia..."
4,145353048817012736:,Thinks that @melbahughes had a great 50th birt...,:: surprise,0,Thinks that @melbahughes had a great 50th birt...


## Split into train & test sets

In [5]:
# Train and test data frames
train, test = train_test_split(data, test_size = 0.2)

# Train and test target labels for polarity
train_pol_y = train.ix[:,3].tolist()
test_pol_y = test.ix[:,3].tolist()

# Binarize labels for sub-emotion classifier
train_emo = train.ix[:,2].tolist()
test_emo = test.ix[:,2].tolist()
emo_bin = preprocessing.LabelBinarizer()

# Labels for sub-emotion classifier
train_emo_y = emo_bin.fit_transform(train_emo)
tests_emo_y = emo_bin.transform(test_emo)

# Train and test inputs
train_pol_x = train.ix[:, 4].tolist()
test_pol_x = test.ix[:, 4].tolist()

In [29]:
train_emo_y[0]

array([0, 0, 0, 0, 0, 1])

## Get matrix ids

In [6]:
# Matrix ids for each tweet were built using GloVe word embeddings
# Because construction of matrix ids is computationally expensive,
# matrix ids were saved and will simply be reloaded
d = np.load('ids.npz')
train_ids = d['train_ids']
test_ids = d['test_ids']

## Pull in GloVe embeddings

In [7]:
# Pull in word list & vectors
wordsList = np.load('wordsList.npy')
print('Loaded the word list!')
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load('wordVectors.npy')
print ('Loaded the word vectors!')

Loaded the word list!
Loaded the word vectors!


## Helper functions for training

In [16]:
from random import randint

# For Polarity Classifier
def getTrainBatch(train_data, train_labels, train_ids):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    # iterate through batch size
    for i in range(batchSize):
        num = randint(1, (len(train_data)-1))
        if train_labels[num-1] == 1:
            labels.append([1,0])
        else:
            labels.append([0,1])
            
        arr[i] = train_ids[num-1:num]
        
    return arr.astype(int), labels

def getTestBatch(test_data, test_labels, test_ids):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(1,(len(test_data)-1))
        
        if test_labels[num-1] == 1:
            labels.append([1,0])
        else:
            labels.append([0,1])
            
        arr[i] = test_ids[num-1:num]
        
    return arr.astype(int), labels

# For sub-emotion classifier
def getTrainBatch_subEmo(train_data, train_labels, train_ids, batchSize, maxSeqLength):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    # iterate through batch size
    for i in range(batchSize):
        num = randint(1, (len(train_data)-1))
        labels.append(train_labels[num-1])
            
        arr[i] = train_ids[num-1:num]
        
    return arr.astype(int), labels


def getTestBatch_subEmo(test_data, test_labels, test_ids, batchSize, maxSeqLength):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(1,(len(test_data)-1))
        labels.append(test_labels[num-1])
            
        arr[i] = test_ids[num-1:num]
        
    return arr.astype(int), labels

# From w266 a3
def matmul3d(X, W):
    """Wrapper for tf.matmul to handle a 3D input tensor X.
    Will perform multiplication along the last dimension.
    Args:
      X: [m,n,k]
      W: [k,l]
    Returns:
      XW: [m,n,l]
    """
    Xr = tf.reshape(X, [-1, tf.shape(X)[2]])
    XWr = tf.matmul(Xr, W)
    newshape = [tf.shape(X)[0], tf.shape(X)[1], tf.shape(W)[1]]
    return tf.reshape(XWr, newshape)

# Sub-emotion Classifier without polarity

## RNN Model

In [44]:
# Specify parameters
maxSeqLength = max([len(elem.split()) for elem in data.ix[:, 4]]) #Maximum number of words in a tweet
batchSize = 100
lstmUnits = 2
numClasses = 6
numDimensions = 50
learningRate = 0.001

iterations = 10000

# Reset graph & create placeholders
tf.reset_default_graph()
labels = tf.placeholder(tf.int32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

# Lookup word vectors
data_vec = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
data_vec = tf.nn.embedding_lookup(wordVectors,input_data)
print "Embedding layer:", data_vec.shape


# Feed RNN cell
with tf.name_scope("RNN_Cell"):
    lstmCell = tf.contrib.rnn.BasicLSTMCell(numDimensions, forget_bias=0.0)
    lstmCell_dropout = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
    cell = tf.contrib.rnn.MultiRNNCell([lstmCell_dropout] * lstmUnits)
    value, _ = tf.nn.dynamic_rnn(cell, data_vec, dtype=tf.float32)

# lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
# lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
# value, _ = tf.nn.dynamic_rnn(lstmCell, data_vec, dtype=tf.float32)
# print "Output of RNN cell", value.shape

# Get final output
weight = tf.Variable(tf.truncated_normal([numDimensions, batchSize]))
print "Weight", weight.shape
bias = tf.zeros(batchSize, tf.float32)
print "Bias", bias.shape
multiplier = matmul3d(value, weight)
print "Multiplier", multiplier.shape
prediction = tf.add(multiplier, bias)
print "Prediction", prediction.shape

# weight = tf.Variable(tf.truncated_normal([numDimensions, batchSize]))
# print "Weight", weight.shape
# bias = tf.Variable(tf.constant(0.1, shape=[batchSize]))
# value = tf.transpose(value, [1, 0, 2])
# print "Transposed output of RNN", value.shape
# last = tf.gather(value, int(value.get_shape()[0]) - 1)
# print "Last", last.shape
# prediction = (tf.matmul(last, weight) + bias)


# Define correct predictions and accuracy
# loss_step_one = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=prediction)
# accuracy = tf.reduce_mean(loss_step_one)

comparison = tf.argmax(prediction,1)
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

# Define loss & optimizer
# loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
# optimizer = tf.train.AdamOptimizer().minimize(loss)
loss_one_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=prediction)
loss_ = tf.reduce_mean(loss_one_)
optimizer = tf.train.AdagradOptimizer(learning_rate=learningRate).minimize(loss_)

# For training
# loss_step_one_ = tf.nn.sampled_softmax_loss(weights=tf.transpose(weight), 
#                                             biases=bias, 
#                                             labels=tf.reshape(labels, [-1, 1]), 
#                                             inputs=tf.reshape(value, [-1, numDimensions]), 
#                                             num_sampled=200, num_classes=numClasses)

# train_loss_ = tf.reduce_mean(self.loss_step_one_)



Embedding layer: (100, 34, 50)


ValueError: Attempt to reuse RNNCell <tensorflow.contrib.rnn.python.ops.core_rnn_cell_impl.BasicLSTMCell object at 0x7fc00643fdd0> with a different variable scope than its first use.  First use of cell was with scope 'rnn/multi_rnn_cell/cell_0/basic_lstm_cell', this attempt is with scope 'rnn/multi_rnn_cell/cell_1/basic_lstm_cell'.  Please create a new instance of the cell if you would like it to use a different set of weights.  If before you were using: MultiRNNCell([BasicLSTMCell(...)] * num_layers), change to: MultiRNNCell([BasicLSTMCell(...) for _ in range(num_layers)]).  If before you were using the same cell instance as both the forward and reverse cell of a bidirectional RNN, simply create two instances (one for forward, one for reverse).  In May 2017, we will start transitioning this cell's behavior to use existing stored weights, if any, when it is called with scope=None (which can lead to silent model degradation, so this error will remain until then.)

## For Tensorboard

To see tensorboard, enter the following command in your terminal & then go to http://localhost:6006/

$ tensorboard --logdir=

In [31]:
import datetime

tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

## For Training

In [35]:
sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

for i in range(iterations):
    # Next Batch of reviews
    nextBatch, nextBatchLabels = getTrainBatch_subEmo(train_pol_x, train_emo_y, train_ids, batchSize, maxSeqLength);
    sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
   
    # Write summary to Tensorboard
#     if (i % 50 == 0):
#         summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
#         writer.add_summary(summary, i)

#     # Save the network every 10,000 training iterations
#     if (i % 10000 == 0 and i != 0):
#         save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
#         print("saved to %s" % save_path)
# writer.close()

InvalidArgumentError: Matrix size-incompatible: In[0]: [3400,2], In[1]: [100,50]
	 [[Node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/cpu:0"](Reshape, Variable_1/read)]]

Caused by op u'MatMul', defined at:
  File "/home/melanie_costello/anaconda2/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/home/melanie_costello/anaconda2/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-34-7fc75dfe1ae1>", line 33, in <module>
    multiplier = matmul3d(value, weight)
  File "<ipython-input-16-185fdd420a02>", line 70, in matmul3d
    XWr = tf.matmul(Xr, W)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/math_ops.py", line 1801, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/gen_math_ops.py", line 1263, in _mat_mul
    transpose_b=transpose_b, name=name)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/melanie_costello/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): Matrix size-incompatible: In[0]: [3400,2], In[1]: [100,50]
	 [[Node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/cpu:0"](Reshape, Variable_1/read)]]


In [36]:
iterations = 10
for i in range(iterations):
    nextBatch, nextBatchLabels = getTestBatch_subEmo(test_pol_x, tests_emo_y, test_ids, batchSize, maxSeqLength);
    print("Accuracy for this batch:", (sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels})) * 100)

('Accuracy for this batch:', 31.999999284744263)
('Accuracy for this batch:', 37.999999523162842)
('Accuracy for this batch:', 33.000001311302185)
('Accuracy for this batch:', 37.000000476837158)
('Accuracy for this batch:', 30.000001192092896)
('Accuracy for this batch:', 40.000000596046448)
('Accuracy for this batch:', 33.000001311302185)
('Accuracy for this batch:', 40.999999642372131)
('Accuracy for this batch:', 40.000000596046448)
('Accuracy for this batch:', 43.000000715255737)


In [37]:
# Show index of predicted class
print("Compare:", (sess.run(comparison, {input_data: nextBatch, labels: nextBatchLabels})))

('Compare:', array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3]))


In [38]:
# Show actual predicted values
print("Preds:", (sess.run(prediction, {input_data: nextBatch, labels: nextBatchLabels})))

('Preds:', array([[ -6.18120849e-01,  -1.29292178e+00,  -5.04282787e-02,
          1.05776560e+00,   2.99769282e-01,   3.20789069e-01],
       [ -8.26627970e-01,  -2.60414886e+00,  -5.32003403e-01,
          1.46117842e+00,   1.84977233e-01,   2.76954383e-01],
       [ -5.97185552e-01,  -1.14407337e+00,   1.81038119e-02,
          9.67402816e-01,   2.95893252e-01,   3.05981725e-01],
       [ -6.17704809e-01,  -1.28954434e+00,  -4.85741980e-02,
          1.05475390e+00,   2.99316645e-01,   3.20026368e-01],
       [ -6.36788130e-01,  -1.38501096e+00,  -6.38464093e-02,
          1.02050960e+00,   2.66826183e-01,   2.88596451e-01],
       [ -5.90257168e-01,  -1.06673324e+00,   7.37397522e-02,
          8.56072247e-01,   2.69456863e-01,   2.69710690e-01],
       [ -6.06284201e-01,  -1.22769511e+00,  -3.38991433e-02,
          1.06157136e+00,   3.14536065e-01,   3.33566844e-01],
       [ -6.40986443e-01,  -1.34031689e+00,   9.89777967e-03,
          8.22470844e-01,   2.00828448e-01,   2.0828

In [51]:
d.close()

In [23]:
tests_emo_y[-1]

array([0, 0, 1, 0, 0, 0])

In [31]:
# Specify parameters
maxSeqLength = max([len(elem.split()) for elem in data.ix[:, 4]]) #Maximum number of words in a tweet
batchSize = 100
lstmUnits = 2
numClasses = 6
numDimensions = 50

iterations = 10000

# Reset graph & create placeholders
tf.reset_default_graph()
labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

# Lookup word vectors
data_vec = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
data_vec = tf.nn.embedding_lookup(wordVectors,input_data)

# Feed RNN cell
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data_vec, dtype=tf.float32)

# Get final output
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

# Define correct predictions and accuracy
comparison = tf.argmax(prediction,1)
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

# Define loss & optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)