In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import HTMLParser as htm
import string
import re
import time

# SK-learn library for splitting data
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

## Read in and preprocess/clean data

In [2]:
data = pd.read_csv("tweet_data_1.csv",sep='\t',quoting=3)
data["escape"] = data.apply(lambda row: htm.HTMLParser().unescape(row[1].decode("utf-8")),axis=1)

data.head()

Unnamed: 0,Id,Tweet,Emotion,Positive,escape
0,138881940341260288:,I got a surprise for all you bitches...pull th...,:: surprise,0,I got a surprise for all you bitches...pull th...
1,144479819843911683:,If I was a thief.. The first thing I would ste...,:: joy,1,If I was a thief.. The first thing I would ste...
2,139110849120972800:,"""&quot;@RevRunWisdom: not afraid of tomorrow, ...",:: fear,0,"""""@RevRunWisdom: not afraid of tomorrow, for I..."
3,141532076791971840:,"""Extreme can neither fight nor fly.&#xA;-- Wil...",:: fear,0,"""Extreme can neither fight nor fly.\n-- Willia..."
4,145353048817012736:,Thinks that @melbahughes had a great 50th birt...,:: surprise,0,Thinks that @melbahughes had a great 50th birt...


In [3]:
def process_data(data):
    """Converts to lowercase, strips out punctuation,
    removes excess whitespace within a string & leading & trailing whitespace"""
    new_list = []
    table = string.maketrans("","")
    for elem in data:
        elem = "".join(i for i in elem if ord(i)<128)
        elem = str(elem)        
        elem = elem.lower()
        # New addition to handle elipsis
#         elem = re.sub('\\.+', ' ', elem)
        elem = elem.translate(table, string.punctuation)
        elem = re.sub(' +',' ', elem)
        elem = elem.strip()
        
        new_list.append(elem)
    return new_list

#Clean entire data set at once
data.escape = process_data(data.escape)
data.head()

Unnamed: 0,Id,Tweet,Emotion,Positive,escape
0,138881940341260288:,I got a surprise for all you bitches...pull th...,:: surprise,0,i got a surprise for all you bitchespull theri...
1,144479819843911683:,If I was a thief.. The first thing I would ste...,:: joy,1,if i was a thief the first thing i would steal...
2,139110849120972800:,"""&quot;@RevRunWisdom: not afraid of tomorrow, ...",:: fear,0,revrunwisdom not afraid of tomorrow for i have...
3,141532076791971840:,"""Extreme can neither fight nor fly.&#xA;-- Wil...",:: fear,0,extreme can neither fight nor fly\n william sh...
4,145353048817012736:,Thinks that @melbahughes had a great 50th birt...,:: surprise,0,thinks that melbahughes had a great 50th birth...


## Pull in GloVe embeddings

In [4]:
# Pull in word list & vectors
wordsList = np.load('wordsList.npy')
print('Loaded the word list!')
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load('wordVectors.npy')
print ('Loaded the word vectors!')

Loaded the word list!
Loaded the word vectors!


## Load Train Data

In [5]:
#load all of train and test data
p = np.load('train_test.npz')
train_pol_y = p['train_pol_y']
test_pol_y = p['test_pol_y']
train_pol_x = p['train_pol_x']
test_pol_x = p['test_pol_x']
train_emo = p['train_emo']
test_emo = p['test_emo']
train_emo_y = p['train_emo_y']
tests_emo_y = p['tests_emo_y']

## Get matrix ids

In [6]:
# Matrix ids for each tweet were built using GloVe word embeddings
# Because construction of matrix ids is computationally expensive,
# matrix ids were saved and will simply be reloaded
d = np.load('ids.npz')
train_ids = d['train_ids']
test_ids = d['test_ids']

In [7]:
train_ids.shape

(16840, 31)

## Helper functions for training

In [22]:
from random import randint
import random

# For Polarity Classifier
def getTrainBatch(train_data, train_labels, train_ids):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    # iterate through batch size
    for i in range(batchSize):
        num = randint(1, (len(train_data)-1))
        if train_labels[num-1] == 1:
            labels.append([1,0])
        else:
            labels.append([0,1])
            
        arr[i] = train_ids[num-1:num]
        
    return arr.astype(int), labels

def getTestBatch(test_data, test_labels, test_ids):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(1,(len(test_data)-1))
        
        if test_labels[num-1] == 1:
            labels.append([1,0])
        else:
            labels.append([0,1])
            
        arr[i] = test_ids[num-1:num]
        
    return arr.astype(int), labels



def getTrainBatch_subEmo(train_data, train_labels, train_ids, batchSize, maxSeqLength):
    labels = []
    inds = []
    arr = np.zeros([batchSize, maxSeqLength])
    # iterate through batch size
    #for i in range(batchSize-10): 
        #num = randint(1, (len(train_data)-1))
        
    count = 0
    for num in random.sample(xrange(1,(len(train_data)-1)), batchSize-10):
        labels.append(train_labels[num-1])
            
        #arr[i]    
        arr[count] = train_ids[num-1:num]
        inds.append(num-1)
        count +=1
        
    disgust = []
    for m in range(len(train_labels)):
        if train_labels[m][1] == 1:
            disgust.append(m)
    
    #for mel in range(5):
        #num = randint(1, (len(disgust)-1))
    for num in random.sample(xrange(1,(len(disgust)-1)), 5):
        ind = disgust[num]
        labels.append(train_labels[ind])
        arr[count] = train_ids[ind]
        inds.append(ind)
        count +=1
        
    anger = []
    for p in range(len(train_labels)):
        if train_labels[p][0] == 1:
            anger.append(p)
    
    #for pri in range(5,10):
        #num = randint(1, (len(anger)-1))
    for num in random.sample(xrange(1,(len(anger)-1)), 5):
        ind = anger[num]
        labels.append(train_labels[ind])
        arr[count] = train_ids[ind]
        inds.append(ind)
        count +=1
    
    return arr.astype(int), labels,inds


def getTestBatch_subEmo(test_data, test_labels, test_ids, batchSize, maxSeqLength):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    inds=[]
    #for i in range(batchSize):
        #num = randint(1,(len(test_data)-1))
        
    count = 0
    for num in random.sample(xrange(1,(len(test_data)-1)), batchSize):
        labels.append(test_labels[num-1])
            
        arr[count] = test_ids[num-1:num]
        inds.append(num-1)
        count +=1
        
    return arr.astype(int), labels,inds




# Sub-emotion Classifier without polarity

## RNN Model

In [23]:
# Specify parameters

maxSeqLength = max([len(elem.split()) for elem in data.ix[:, 4]]) #Maximum number of words in a tweet
batchSize = 150
hiddenStateSize = 1
# lstmUnits = 2
numClasses = 6
numDimensions = 50
keepProb = 0.5
learningRate = 0.001

iterations = 1000

# Reset graph & create placeholders
tf.reset_default_graph()
labels = tf.placeholder(tf.int32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])
ns = tf.tile([maxSeqLength], [batchSize, ])

# Lookup word vectors
with tf.name_scope("Embedding_Layer"):
    data_vec = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
    data_vec = tf.nn.embedding_lookup(wordVectors,input_data)


# Construct RNN/LSTM cell and recurrent layer.
with tf.name_scope("Cell_RNN_Layer"):
    cells=[]
    for _ in range(hiddenStateSize):
        lstmCell = tf.contrib.rnn.BasicLSTMCell(numDimensions, forget_bias=1.0)
        lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, input_keep_prob=keepProb, output_keep_prob=keepProb)        
        cells.append(lstmCell)
        multicell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
    value, _ = tf.nn.dynamic_rnn(multicell, data_vec, sequence_length=ns, dtype=tf.float32)

    
with tf.name_scope("Output_Layer"):
    weight = tf.Variable(tf.random_uniform([numDimensions, numClasses], -1.0, 1.0))
    bias = tf.Variable(tf.random_uniform([numClasses], -1.0, 1.0))
    value = tf.transpose(value, [1, 0, 2])
    last = tf.gather(value, int(value.get_shape()[0]) - 1)
    multiplier = tf.matmul(last, weight)
    prediction = tf.add(multiplier, bias)

#     print "Embedding Layer shape", data_vec.shape
#     print "Output of RNN shape", value.shape
#     print "Weights shape", weight.shape
#     print "Bias shape", bias.shape
#     print "New shape for value", value.shape
#     print "last shape", last.shape
#     print "multiplier shape", multiplier.shape
#     print "Output shape", prediction.shape
    
with tf.name_scope("Prediction_Layer"):
    # Define correct predictions and accuracy
    comparison = tf.argmax(prediction,1)
    correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
    accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

    # Define loss & optimizer
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
    optimizer = tf.train.AdamOptimizer(learning_rate=learningRate).minimize(loss)

## For Tensorboard

In [24]:
import datetime

sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

## For Training

In [25]:
train_inds = []
train_logits = []
train_labels = []
for i in range(iterations):
    # Next Batch of reviews
    nextBatch, nextBatchLabels,train_i = getTrainBatch_subEmo(train_pol_x, train_emo_y, train_ids, batchSize, maxSeqLength)
    train_inds.append(train_i)
    train_logs = sess.run([prediction,optimizer], {input_data: nextBatch, labels: nextBatchLabels})
    train_logits.append(train_logs[0])
    train_labels.append(nextBatchLabels)
    # Write summary to Tensorboard
    summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
    writer.add_summary(summary, i)

In [27]:
iterations = 500
l_predictions = []
l_labels = []
l_logits = []
l_inds = []
for i in range(iterations):
    nextBatch, nextBatchLabels,test_i = getTestBatch_subEmo(test_pol_x, tests_emo_y, test_ids, batchSize, maxSeqLength)

    test_log,p,q= (sess.run([prediction,comparison,accuracy], {input_data: nextBatch, labels: nextBatchLabels}))
    l_predictions.append(p)
    l_labels.append(nextBatchLabels)
    l_logits.append(test_log)
    l_inds.append(test_i)

In [29]:
from sklearn.metrics import classification_report
from collections import OrderedDict
from operator import itemgetter

# target_names = emo_bin.classes_.tolist()
target_names = [':: anger', ':: disgust', ':: fear', ':: joy', ':: sadness', ':: surprise']
def score(preds,labels,target_names, indexes):
    predictions = np.asarray(preds).ravel()
    labels = np.argmax(np.asarray(labels),2).ravel()
    indexes = np.asarray(indexes).ravel()
    
    print classification_report(labels,predictions,target_names=target_names)
    
    errors = dict()
    examples = dict()
    for i, p in enumerate(predictions):
        if p != labels[i]:
            if (p, labels[i]) not in errors:
                errors[(p, labels[i])] = 1
                examples[(p, labels[i])] = [indexes[i]]
            else:
                errors[(p, labels[i])] += 1 
                examples[(p, labels[i])].append(indexes[i])
                
    return OrderedDict(sorted(errors.items(), key=itemgetter(1))), examples
err, ex = score(l_predictions,l_labels,target_names,l_inds)

# See which pairs are getting confused most often
for key, val in err.iteritems():
    print key, val

             precision    recall  f1-score   support

   :: anger       0.22      0.22      0.22      5205
 :: disgust       0.15      0.07      0.10      2594
    :: fear       0.46      0.39      0.42      9703
     :: joy       0.52      0.81      0.63     29165
 :: sadness       0.30      0.15      0.20     14546
:: surprise       0.49      0.27      0.34     13787

avg / total       0.43      0.46      0.42     75000

(5, 1) 101
(1, 2) 151
(1, 0) 167
(1, 5) 204
(5, 0) 245
(2, 1) 275
(1, 4) 279
(1, 3) 281
(4, 1) 451
(0, 1) 494
(0, 5) 538
(5, 2) 641
(2, 0) 681
(4, 0) 693
(2, 5) 737
(4, 2) 828
(0, 3) 917
(0, 4) 1020
(3, 1) 1080
(0, 2) 1211
(2, 4) 1313
(2, 3) 1347
(5, 3) 1439
(5, 4) 1456
(4, 5) 1571
(4, 3) 1689
(3, 0) 2263
(3, 2) 3127
(3, 5) 7063
(3, 4) 8265
