In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import HTMLParser as htm
import string
import re
import time

# SK-learn library for splitting data
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

## Read in data

In [2]:
data = pd.read_csv("tweet_data_1.csv",sep='\t',quoting=3)
data["escape"] = data.apply(lambda row: htm.HTMLParser().unescape(row[1].decode("utf-8")),axis=1)

data.head()

Unnamed: 0,Id,Tweet,Emotion,Positive,escape
0,138881940341260288:,I got a surprise for all you bitches...pull th...,:: surprise,0,I got a surprise for all you bitches...pull th...
1,144479819843911683:,If I was a thief.. The first thing I would ste...,:: joy,1,If I was a thief.. The first thing I would ste...
2,139110849120972800:,"""&quot;@RevRunWisdom: not afraid of tomorrow, ...",:: fear,0,"""""@RevRunWisdom: not afraid of tomorrow, for I..."
3,141532076791971840:,"""Extreme can neither fight nor fly.&#xA;-- Wil...",:: fear,0,"""Extreme can neither fight nor fly.\n-- Willia..."
4,145353048817012736:,Thinks that @melbahughes had a great 50th birt...,:: surprise,0,Thinks that @melbahughes had a great 50th birt...


## Split into train & test sets

In [3]:
# Train and test data frames
train, test = train_test_split(data, test_size = 0.2)

# Train and test target labels for polarity
train_pol_y = train.ix[:,3].tolist()
test_pol_y = test.ix[:,3].tolist()

# Binarize labels for sub-emotion classifier
train_emo = train.ix[:,2].tolist()
test_emo = test.ix[:,2].tolist()
emo_bin = preprocessing.LabelBinarizer()

# Labels for sub-emotion classifier
train_emo_y = emo_bin.fit_transform(train_emo)
tests_emo_y = emo_bin.transform(test_emo)

# Train and test inputs
train_pol_x = train.ix[:, 4].tolist()
test_pol_x = test.ix[:, 4].tolist()

In [4]:
train_emo_y[0]

array([0, 0, 1, 0, 0, 0])

## Get matrix ids

In [5]:
# Matrix ids for each tweet were built using GloVe word embeddings
# Because construction of matrix ids is computationally expensive,
# matrix ids were saved and will simply be reloaded
d = np.load('ids.npz')
train_ids = d['train_ids']
test_ids = d['test_ids']

## Pull in GloVe embeddings

In [6]:
# Pull in word list & vectors
wordsList = np.load('wordsList.npy')
print('Loaded the word list!')
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load('wordVectors.npy')
print ('Loaded the word vectors!')

Loaded the word list!
Loaded the word vectors!


## Helper functions for training

In [7]:
from random import randint

# For Polarity Classifier
def getTrainBatch(train_data, train_labels, train_ids):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    # iterate through batch size
    for i in range(batchSize):
        num = randint(1, (len(train_data)-1))
        if train_labels[num-1] == 1:
            labels.append([1,0])
        else:
            labels.append([0,1])
            
        arr[i] = train_ids[num-1:num]
        
    return arr.astype(int), labels

def getTestBatch(test_data, test_labels, test_ids):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(1,(len(test_data)-1))
        
        if test_labels[num-1] == 1:
            labels.append([1,0])
        else:
            labels.append([0,1])
            
        arr[i] = test_ids[num-1:num]
        
    return arr.astype(int), labels

# For sub-emotion classifier
def getTrainBatch_subEmo(train_data, train_labels, train_ids, batchSize, maxSeqLength):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    # iterate through batch size
    for i in range(batchSize):
        num = randint(1, (len(train_data)-1))
        labels.append(train_labels[num-1])
            
        arr[i] = train_ids[num-1:num]
        
    return arr.astype(int), labels


def getTestBatch_subEmo(test_data, test_labels, test_ids, batchSize, maxSeqLength):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(1,(len(test_data)-1))
        labels.append(test_labels[num-1])
            
        arr[i] = test_ids[num-1:num]
        
    return arr.astype(int), labels

def matmul3d(X, W):
    """Wrapper for tf.matmul to handle a 3D input tensor X.
    Will perform multiplication along the last dimension.
    Args:
      X: [m,n,k]
      W: [k,l]
    Returns:
      XW: [m,n,l]
    """
    Xr = tf.reshape(X, [-1, tf.shape(X)[2]])
    XWr = tf.matmul(Xr, W)
    newshape = [tf.shape(X)[0], tf.shape(X)[1], tf.shape(W)[1]]
    return tf.reshape(XWr, newshape)


def MakeFancyRNNCell(H, keep_prob, num_layers=1):
    """Make a fancy RNN cell.
    Use tf.nn.rnn_cell functions to construct an LSTM cell.
    Initialize forget_bias=0.0 for better training.
    Args:
      H: hidden state size
      keep_prob: dropout keep prob (same for input and output)
      num_layers: number of cell layers
    Returns:
      (tf.nn.rnn_cell.RNNCell) multi-layer LSTM cell with dropout
    """
    cell = tf.contrib.rnn.BasicLSTMCell(H, forget_bias=0.0)
    cell = tf.contrib.rnn.DropoutWrapper(
        cell, input_keep_prob=keep_prob, output_keep_prob=keep_prob)
#     cell = tf.contrib.rnn.MultiRNNCell([cell for _ in range(num_layers)])
    cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers)
    return cell

# Sub-emotion Classifier without polarity

## RNN Model

Changes: 
- adding forget_bias to the LSTM Cell
- adding keep_prob

Check on:
- use of tf.nn.dynamic_rnn cell vs MultiRNN

In [14]:
# Specify parameters
# maxSeqLength = max([len(elem.split()) for elem in data.ix[:, 4]])
maxSeqLength = 31
#Maximum number of words in a tweet
batchSize = 75
hiddenStateSize = 1
# lstmUnits = 2
numClasses = 6
numDimensions = 50
keepProb = 0.5
learningRate = 0.001

iterations = 250

# Reset graph & create placeholders
tf.reset_default_graph()
labels = tf.placeholder(tf.int32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

##ADD NS on 8/2
ns = tf.tile([maxSeqLength], [batchSize, ])

# Lookup word vectors
with tf.name_scope("Embedding_Layer"):
    data_vec = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
    data_vec = tf.nn.embedding_lookup(wordVectors,input_data)
#     print "Embedding Layer shape", data_vec.shape

# Construct RNN/LSTM cell and recurrent layer.
with tf.name_scope("Cell_RNN_Layer"):
    cells=[]
    for _ in range(hiddenStateSize):
        lstmCell = tf.contrib.rnn.BasicLSTMCell(numDimensions, forget_bias=0.0)
        lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, input_keep_prob=keepProb, output_keep_prob=keepProb)        
        cells.append(lstmCell)
        multicell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
    value, _ = tf.nn.dynamic_rnn(multicell, data_vec, sequence_length=ns, dtype=tf.float32)
# with tf.name_scope("Cell_RNN_Layer"):
#     lstmCell = tf.contrib.rnn.BasicLSTMCell(numDimensions, forget_bias=0.0)
#     lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, input_keep_prob=keepProb, output_keep_prob=keepProb)            
#     lstmCell = tf.contrib.rnn.MultiRNNCell([lstmCell] * hiddenStateSize)
#     value, _ = tf.nn.dynamic_rnn(lstmCell, data_vec, dtype=tf.float32)
#     print "Output of RNN shape", value.shape
    
with tf.name_scope("Output_Layer"):
    weight = tf.Variable(tf.random_uniform([numDimensions, numClasses], -1.0, 1.0))
    bias = tf.Variable(tf.zeros(numClasses, tf.float32))
    value = tf.transpose(value, [1, 0, 2])
    last = tf.gather(value, int(value.get_shape()[0]) - 1)
    multiplier = tf.matmul(last, weight)
    prediction = tf.add(multiplier, bias)
    print prediction
 
#     print "Weights shape", weight.shape
#     print "Bias shape", bias.shape
#     print "New shape for value", value.shape
#     print "last shape", last.shape
#     print "multiplier shape", multiplier.shape
#     print "Output shape", prediction.shape

    
# From A3
#     multiplier = matmul3d(value, weight)
#     print "Multiplier shape", multiplier.shape
#     prediction = tf.add(multiplier, bias)
#     print "Logits shape", prediction.shape
    
with tf.name_scope("Prediction_Layer"):
    # Define correct predictions and accuracy
    comparison = tf.argmax(prediction,1)
    correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
    accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

    # Define loss & optimizer
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
    optimizer = tf.train.AdamOptimizer(learning_rate=learningRate).minimize(loss)



Tensor("Output_Layer/Add:0", shape=(75, 6), dtype=float32)


## Ignore - Scratch paper notes

In [15]:
# with tf.name_scope("Output_Layer"):
            
#     # W_out_ is the transpose of W_in_
#     self.W_out_ = tf.transpose(self.W_in_)
            
#     # Initialize b_out_
#     self.b_out_ = tf.zeros(self.V, tf.float32, name="b_out_")

#     # Logits will be of (batch size, max time, V)
#     self.logits_ = tf.add(matmul3d(self.output_, self.W_out_), self.b_out_)
            
            
#     # Loss computation (true loss, for prediction)
#     self.loss_one_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.target_y_, logits=self.logits_)
#     self.loss_ = tf.reduce_mean(self.loss_one_)
    
# with tf.name_scope("Training_Layer"):
#     # Loss computation (sampled, for training)
#     self.loss_step_one_ = tf.nn.sampled_softmax_loss(weights=tf.transpose(self.W_out_), biases=self.b_out_, 
#                                                      labels=tf.reshape(self.target_y_, [-1, 1]), 
#                                                      inputs=tf.reshape(self.output_, [-1, self.H]), 
#                                                      num_sampled=self.softmax_ns, num_classes=self.V)
#     self.train_loss_ = tf.reduce_mean(self.loss_step_one_)
            
#     # Define optimizer and training op
#     self.train_step_ = tf.train.AdagradOptimizer(learning_rate=self.learning_rate_).minimize(self.train_loss_)

In [16]:
# Specify parameters
maxSeqLength = max([len(elem.split()) for elem in data.ix[:, 4]]) #Maximum number of words in a tweet
batchSize = 150
hiddenStateSize = 1
# lstmUnits = 2
numClasses = 6
numDimensions = 50
keepProb = 0.5
learningRate = 0.001

iterations = 1500

# Reset graph & create placeholders
tf.reset_default_graph()
labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

# Lookup word vectors
data_vec = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
data_vec = tf.nn.embedding_lookup(wordVectors,input_data)

# Feed RNN cell
lstmCell = tf.contrib.rnn.BasicLSTMCell(hiddenStateSize, forget_bias=0.0)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, input_keep_prob=keepProb, output_keep_prob=keepProb)
value, _ = tf.nn.dynamic_rnn(lstmCell, data_vec, dtype=tf.float32)

# Get final output
weight = tf.Variable(tf.truncated_normal([hiddenStateSize, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

# Define correct predictions and accuracy
comparison = tf.argmax(prediction,1)
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

# Define loss & optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer(learning_rate=learningRate).minimize(loss)
# optimizer = tf.train.AdagradOptimizer(learning_rate=learningRate).minimize(loss)

## For Tensorboard

In [17]:
import datetime

sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

## For Training

In [18]:


train_inds = []
train_logits = []
train_labels = []
for i in range(iterations):
    # Next Batch of reviews
    nextBatch, nextBatchLabels,train_i = getTrainBatch_subEmo(train_pol_x, train_emo_y, train_ids, batchSize, maxSeqLength);
    train_inds.append(train_i)
    train_logs = sess.run([prediction,optimizer], {input_data: nextBatch, labels: nextBatchLabels})
    train_logits.append(train_logs[0])
    train_labels.append(nextBatchLabels)
    # Write summary to Tensorboard
    summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
    writer.add_summary(summary, i)


# for i in range(iterations):
#     # Next Batch of reviews
#     nextBatch, nextBatchLabels = getTrainBatch_subEmo(train_pol_x, train_emo_y, train_ids, batchSize, maxSeqLength);
#     sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
   
#     # Write summary to Tensorboard
#     summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
#     writer.add_summary(summary, i)

#     # Save the network every 10,000 training iterations
#     if (i % 10000 == 0 and i != 0):
#         save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
#         print("saved to %s" % save_path)
# writer.close()

ValueError: could not broadcast input array from shape (31) into shape (34)

In [None]:
iterations = 500
l_predictions = []
l_labels = []
l_logits = []
l_inds = []
for i in range(iterations):
    nextBatch, nextBatchLabels,test_i = getTestBatch_subEmo(test_pol_x, tests_emo_y, test_ids, batchSize, maxSeqLength)

    test_log,p,q= (sess.run([prediction,comparison,accuracy], {input_data: nextBatch, labels: nextBatchLabels}))
    l_predictions.append(p)
    l_labels.append(nextBatchLabels)
    l_logits.append(test_log)
    l_inds.append(test_i)
    #print("Accuracy for this batch:",q)

#     print("Accuracy for this batch:", (sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels})) * 100)

In [None]:
from sklearn.metrics import classification_report
from collections import OrderedDict
from operator import itemgetter

target_names = emo_bin.classes_.tolist()
def score(preds,labels,target_names, indexes):
    predictions = np.asarray(preds).ravel()
    labels = np.argmax(np.asarray(labels),2).ravel()
    indexes = np.asarray(indexes).ravel()
    
    print classification_report(labels,predictions,target_names=target_names)
    
    errors = dict()
    examples = dict()
    for i, p in enumerate(predictions):
        if p != labels[i]:
            if (p, labels[i]) not in errors:
                errors[(p, labels[i])] = 1
                examples[(p, labels[i])] = [indexes[i]]
            else:
                errors[(p, labels[i])] += 1 
                examples[(p, labels[i])].append(indexes[i])
                
    return OrderedDict(sorted(errors.items(), key=itemgetter(1))), examples
err, ex = score(l_predictions,l_labels,target_names,l_inds)

# See which pairs are getting confused most often
for key, val in err.iteritems():
    print key, val

In [42]:
# Show index of predicted class
print("Compare:", (sess.run(comparison, {input_data: nextBatch, labels: nextBatchLabels})))

('Compare:', array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3]))


In [43]:
# Show actual predicted values
print("Preds:", (sess.run(prediction, {input_data: nextBatch, labels: nextBatchLabels})))

('Preds:', array([[-0.014025  ,  0.06395618,  0.12048888,  0.38174605,  0.00953795,
         0.07971515],
       [-0.00301381,  0.08063177,  0.21235675,  0.44487906, -0.03570636,
         0.11071764],
       [ 0.02004176,  0.02027215,  0.12705526,  0.25619584,  0.09074374,
         0.15652873],
       [ 0.01999545,  0.02020205,  0.12666899,  0.25593045,  0.09093395,
         0.15639834],
       [ 0.02003685,  0.02026472,  0.12701431,  0.25616771,  0.0907639 ,
         0.1565149 ],
       [-0.01359886,  0.06284045,  0.11891198,  0.37825832,  0.01187535,
         0.08056186],
       [-0.00301309,  0.08062685,  0.21234521,  0.4448629 , -0.03569534,
         0.11071844],
       [ 0.00902759,  0.0035997 ,  0.03518479,  0.19307154,  0.13598254,
         0.12551938],
       [ 0.00902759,  0.0035997 ,  0.03518479,  0.19307154,  0.13598254,
         0.12551938],
       [-0.00301183,  0.08061852,  0.2123259 ,  0.44483566, -0.03567678,
         0.11071994],
       [-0.01395453,  0.06377167,  0.12

In [51]:
d.close()

In [23]:
tests_emo_y[-1]

array([0, 0, 1, 0, 0, 0])