In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import HTMLParser as htm
import string
import re
import time

# SK-learn library for splitting data
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

## Read in and preprocess/clean data

In [2]:
data = pd.read_csv("tweet_data_1.csv",sep='\t',quoting=3)
data["escape"] = data.apply(lambda row: htm.HTMLParser().unescape(row[1].decode("utf-8")),axis=1)

data.head()

Unnamed: 0,Id,Tweet,Emotion,Positive,escape
0,138881940341260288:,I got a surprise for all you bitches...pull th...,:: surprise,0,I got a surprise for all you bitches...pull th...
1,144479819843911683:,If I was a thief.. The first thing I would ste...,:: joy,1,If I was a thief.. The first thing I would ste...
2,139110849120972800:,"""&quot;@RevRunWisdom: not afraid of tomorrow, ...",:: fear,0,"""""@RevRunWisdom: not afraid of tomorrow, for I..."
3,141532076791971840:,"""Extreme can neither fight nor fly.&#xA;-- Wil...",:: fear,0,"""Extreme can neither fight nor fly.\n-- Willia..."
4,145353048817012736:,Thinks that @melbahughes had a great 50th birt...,:: surprise,0,Thinks that @melbahughes had a great 50th birt...


In [3]:
def process_data(data):
    """Converts to lowercase, strips out punctuation,
    removes excess whitespace within a string & leading & trailing whitespace"""
    new_list = []
    table = string.maketrans("","")
    for elem in data:
        elem = "".join(i for i in elem if ord(i)<128)
        elem = str(elem)        
        elem = elem.lower()
        # New addition to handle elipsis
#         elem = re.sub('\\.+', ' ', elem)
        elem = elem.translate(table, string.punctuation)
        elem = re.sub(' +',' ', elem)
        elem = elem.strip()
        
        new_list.append(elem)
    return new_list

#Clean entire data set at once
data.escape = process_data(data.escape)
data.head()

Unnamed: 0,Id,Tweet,Emotion,Positive,escape
0,138881940341260288:,I got a surprise for all you bitches...pull th...,:: surprise,0,i got a surprise for all you bitchespull theri...
1,144479819843911683:,If I was a thief.. The first thing I would ste...,:: joy,1,if i was a thief the first thing i would steal...
2,139110849120972800:,"""&quot;@RevRunWisdom: not afraid of tomorrow, ...",:: fear,0,revrunwisdom not afraid of tomorrow for i have...
3,141532076791971840:,"""Extreme can neither fight nor fly.&#xA;-- Wil...",:: fear,0,extreme can neither fight nor fly\n william sh...
4,145353048817012736:,Thinks that @melbahughes had a great 50th birt...,:: surprise,0,thinks that melbahughes had a great 50th birth...


## Pull in GloVe embeddings

In [4]:
# Pull in word list & vectors
wordsList = np.load('wordsList.npy')
print('Loaded the word list!')
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load('wordVectors.npy')
print ('Loaded the word vectors!')

Loaded the word list!
Loaded the word vectors!


## Load Train Data

In [5]:
#load all of train and test data
p = np.load('train_test.npz')
train_pol_y = p['train_pol_y']
test_pol_y = p['test_pol_y']
train_pol_x = p['train_pol_x']
test_pol_x = p['test_pol_x']
train_emo = p['train_emo']
test_emo = p['test_emo']
train_emo_y = p['train_emo_y']
tests_emo_y = p['tests_emo_y']

## Get matrix ids

In [6]:
# Matrix ids for each tweet were built using GloVe word embeddings
# Because construction of matrix ids is computationally expensive,
# matrix ids were saved and will simply be reloaded
d = np.load('ids.npz')
train_ids = d['train_ids']
test_ids = d['test_ids']

In [7]:
train_ids.shape

(16840, 31)

## Helper functions for training

In [8]:
from random import randint

# For Polarity Classifier
def getTrainBatch(train_data, train_labels, train_ids):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    # iterate through batch size
    for i in range(batchSize):
        num = randint(1, (len(train_data)-1))
        if train_labels[num-1] == 1:
            labels.append([1,0])
        else:
            labels.append([0,1])
            
        arr[i] = train_ids[num-1:num]
        
    return arr.astype(int), labels

def getTestBatch(test_data, test_labels, test_ids):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(1,(len(test_data)-1))
        
        if test_labels[num-1] == 1:
            labels.append([1,0])
        else:
            labels.append([0,1])
            
        arr[i] = test_ids[num-1:num]
        
    return arr.astype(int), labels

# For sub-emotion classifier
# def getTrainBatch_subEmo(train_data, train_labels, train_ids, batchSize, maxSeqLength):
#     labels = []
#     arr = np.zeros([batchSize, maxSeqLength])
#     # iterate through batch size
#     for i in range(batchSize):
#         num = randint(1, (len(train_data)-1))
#         labels.append(train_labels[num-1])
            
#         arr[i] = train_ids[num-1:num]
        
#     return arr.astype(int), labels

def getTrainBatch_subEmo(train_data, train_labels, train_ids, batchSize, maxSeqLength):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    # iterate through batch size
    for i in range(batchSize-10): #took out -5
        num = randint(1, (len(train_data)-1))
        labels.append(train_labels[num-1])
            
        arr[i] = train_ids[num-1:num]
    
    disgust = []
    for m in range(len(train_labels)):
        if train_labels[m][1] == 1:
            disgust.append(m)
    
    for mel in range(5):
        num = randint(1, (len(disgust)-1))
        ind = disgust[num]
        labels.append(train_labels[ind])
        arr[batchSize-mel-1] = train_ids[ind]
        
    anger = []
    for p in range(len(train_labels)):
        if train_labels[p][0] == 1:
            anger.append(p)
    
    for pri in range(5,10):
        num = randint(1, (len(anger)-1))
        ind = anger[num]
        labels.append(train_labels[ind])
        arr[batchSize-pri-1] = train_ids[ind]
    
    return arr.astype(int), labels


def getTestBatch_subEmo(test_data, test_labels, test_ids, batchSize, maxSeqLength):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(1,(len(test_data)-1))
        labels.append(test_labels[num-1])
            
        arr[i] = test_ids[num-1:num]
        
    return arr.astype(int), labels


# Sub-emotion Classifier without polarity

## RNN Model

In [14]:
# Specify parameters

maxSeqLength = max([len(elem.split()) for elem in data.ix[:, 4]]) #Maximum number of words in a tweet
batchSize = 75
hiddenStateSize = 1
# lstmUnits = 2
numClasses = 6
numDimensions = 50
keepProb = 0.5
learningRate = 0.001

iterations = 1500

# Reset graph & create placeholders
tf.reset_default_graph()
labels = tf.placeholder(tf.int32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])
ns = tf.tile([maxSeqLength], [batchSize, ])

# Lookup word vectors
with tf.name_scope("Embedding_Layer"):
    data_vec = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
    data_vec = tf.nn.embedding_lookup(wordVectors,input_data)
    
    # I THINK WE HAVE TO EXTEND THE VECTOR RIGHT HERE


# Construct RNN/LSTM cell and recurrent layer.
with tf.name_scope("Cell_RNN_Layer"):
    lstmCell = tf.contrib.rnn.BasicLSTMCell(numDimensions, forget_bias=0.0)
    lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, input_keep_prob=keepProb, output_keep_prob=keepProb)            
    lstmCell = tf.contrib.rnn.MultiRNNCell([lstmCell] * hiddenStateSize)
    value, _ = tf.nn.dynamic_rnn(lstmCell, data_vec, sequence_length=ns, dtype=tf.float32)

    
with tf.name_scope("Output_Layer"):
    weight = tf.Variable(tf.random_uniform([numDimensions, numClasses], -1.0, 1.0))
    bias = tf.zeros(numClasses, tf.float32)
    value = tf.transpose(value, [1, 0, 2])
    last = tf.gather(value, int(value.get_shape()[0]) - 1)
    multiplier = tf.matmul(last, weight)
    prediction = tf.add(multiplier, bias)

    print "Embedding Layer shape", data_vec.shape
#     print "Output of RNN shape", value.shape
#     print "Weights shape", weight.shape
#     print "Bias shape", bias.shape
#     print "New shape for value", value.shape
#     print "last shape", last.shape
#     print "multiplier shape", multiplier.shape
#     print "Output shape", prediction.shape
    
with tf.name_scope("Prediction_Layer"):
    # Define correct predictions and accuracy
    comparison = tf.argmax(prediction,1)
    correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
    accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

    # Define loss & optimizer
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
    optimizer = tf.train.AdamOptimizer(learning_rate=learningRate).minimize(loss)



Embedding Layer shape (75, 31, 50)


## For Tensorboard

In [15]:
import datetime

sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

## For Training

In [16]:


for i in range(iterations):
    # Next Batch of reviews
    nextBatch, nextBatchLabels = getTrainBatch_subEmo(train_pol_x, train_emo_y, train_ids, batchSize, maxSeqLength);
    sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
   
    # Write summary to Tensorboard
    summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
    writer.add_summary(summary, i)

#     # Save the network every 10,000 training iterations
#     if (i % 10000 == 0 and i != 0):
#         save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
#         print("saved to %s" % save_path)
# writer.close()

In [17]:
iterations = 10
for i in range(iterations):
    nextBatch, nextBatchLabels = getTestBatch_subEmo(test_pol_x, tests_emo_y, test_ids, batchSize, maxSeqLength);
    print("Accuracy for this batch:", (sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels})) * 100)

('Accuracy for this batch:', 54.666668176651001)
('Accuracy for this batch:', 49.333333969116211)
('Accuracy for this batch:', 38.666665554046631)
('Accuracy for this batch:', 56.000000238418579)
('Accuracy for this batch:', 45.333334803581238)
('Accuracy for this batch:', 37.333333492279053)
('Accuracy for this batch:', 40.000000596046448)
('Accuracy for this batch:', 40.000000596046448)
('Accuracy for this batch:', 47.999998927116394)
('Accuracy for this batch:', 43.999999761581421)


In [18]:
print("Bias:", (sess.run(bias, {input_data: nextBatch, labels: nextBatchLabels})))

('Bias:', array([ 0.,  0.,  0.,  0.,  0.,  0.], dtype=float32))


In [23]:
# Show index of predicted class
print("Compare:", (sess.run(comparison, {input_data: nextBatch, labels: nextBatchLabels})))

('Compare:', array([3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3]))


In [59]:
# Show actual predicted values
print("Preds:", (sess.run(prediction, {input_data: nextBatch, labels: nextBatchLabels})))

('Preds:', array([[ -1.70764267e-01,  -1.18983972e+00,  -5.41362107e-01,
          6.36327565e-01,   3.38299751e-01,   1.79633245e-01],
       [ -1.19698989e+00,  -2.05328321e+00,  -2.01932818e-01,
          1.36098087e+00,   6.58866107e-01,   6.26864076e-01],
       [ -1.37206626e+00,  -2.97011209e+00,  -1.14057708e+00,
          2.07653046e+00,   1.22284031e+00,   8.62038255e-01],
       [ -4.96542275e-01,  -8.49670291e-01,   1.07648514e-01,
          7.06201136e-01,   2.07316279e-01,   2.47223362e-01],
       [ -1.56369805e+00,  -2.18205690e+00,  -5.20198703e-01,
          1.83059955e+00,   4.45347369e-01,   7.49708951e-01],
       [ -4.74655926e-01,  -1.14407694e+00,  -3.83297876e-02,
          7.92131722e-01,   3.42834473e-01,   3.81663144e-01],
       [ -6.71388924e-01,  -1.65920734e+00,  -7.78439105e-01,
          1.56935143e+00,   8.43184590e-01,   9.76065993e-01],
       [ -9.34223294e-01,  -1.18638587e+00,  -4.24825251e-01,
          1.16256618e+00,   1.75717831e-01,   2.9881

In [51]:
d.close()

In [14]:
tests_emo_y[-1]

array([0, 0, 0, 1, 0, 0])