In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import HTMLParser as htm
import string
import re
import time

# SK-learn library for splitting data
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


## Read in data

In [3]:
# Load the raw tab-separated tweets. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a tweet are kept as literal text.
data = pd.read_csv("tweet_data_1.csv", sep='\t', quoting=3)

# Build the parser once instead of constructing a new one for every row.
html_parser = htm.HTMLParser()

# The second column (position 1) is the tweet text: decode the UTF-8 bytes and
# turn HTML entities such as &quot; or &#xA; back into the characters they
# stand for. The result is stored in a new "escape" column.
data["escape"] = data.iloc[:, 1].map(
    lambda tweet: html_parser.unescape(tweet.decode("utf-8")))

data.head()

Unnamed: 0,Id,Tweet,Emotion,Positive,escape
0,138881940341260288:,I got a surprise for all you bitches...pull th...,:: surprise,0,I got a surprise for all you bitches...pull th...
1,144479819843911683:,If I was a thief.. The first thing I would ste...,:: joy,1,If I was a thief.. The first thing I would ste...
2,139110849120972800:,"""&quot;@RevRunWisdom: not afraid of tomorrow, ...",:: fear,0,"""""@RevRunWisdom: not afraid of tomorrow, for I..."
3,141532076791971840:,"""Extreme can neither fight nor fly.&#xA;-- Wil...",:: fear,0,"""Extreme can neither fight nor fly.\n-- Willia..."
4,145353048817012736:,Thinks that @melbahughes had a great 50th birt...,:: surprise,0,Thinks that @melbahughes had a great 50th birt...


## Clean Data

In [4]:
def process_data(data):
    """Normalize raw tweet strings ahead of tokenization.

    Each element is reduced to ASCII, lowercased, stripped of punctuation,
    and has every run of whitespace (spaces, tabs, newlines) collapsed to a
    single space, with leading and trailing whitespace removed.

    Args:
      data: iterable of strings, one tweet per element.
    Returns:
      list of cleaned strings, in the same order as the input.
    """
    # Compiled once per call rather than once per tweet.
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    whitespace = re.compile(r'\s+')

    new_list = []
    for elem in data:
        # Drop every non-ASCII character, then force a plain str.
        elem = str("".join(ch for ch in elem if ord(ch) < 128))
        elem = elem.lower()
        elem = punctuation.sub('', elem)
        # \s+ rather than ' +': unescaped entities such as &#xA; leave
        # newlines inside the text, which a space-only pattern never collapses.
        elem = whitespace.sub(' ', elem).strip()
        new_list.append(elem)
    return new_list

# Earlier approach, kept for reference: clean each split separately.
#train_pol_x = process_data(train_pol_x)
#test_pol_x = process_data(test_pol_x)

# Normalize the whole "escape" column in one pass.
data["escape"] = process_data(data["escape"])

## Split into train & test sets

In [5]:
# Train and test data frames.
# random_state pins the shuffle so the split, and everything derived from it,
# is reproducible across kernel restarts.
train, test = train_test_split(data, test_size=0.2, random_state=0)

# Columns are addressed by position with .iloc (.ix is deprecated and was
# removed in pandas 1.0). Going by the frame displayed after loading:
# 2 = Emotion, 3 = Positive, 4 = escape (the cleaned tweet text).

# Train and test target labels for polarity
train_pol_y = train.iloc[:, 3].tolist()
test_pol_y = test.iloc[:, 3].tolist()

# Raw emotion strings for the sub-emotion classifier
train_emo = train.iloc[:, 2].tolist()
test_emo = test.iloc[:, 2].tolist()
emo_bin = preprocessing.LabelBinarizer()

# One-hot labels for the sub-emotion classifier: the class order is fitted on
# the training split only and then reused to encode the test split.
train_emo_y = emo_bin.fit_transform(train_emo)
tests_emo_y = emo_bin.transform(test_emo)

# Train and test inputs
train_pol_x = train.iloc[:, 4].tolist()
test_pol_x = test.iloc[:, 4].tolist()


#save data to recall later
#np.savez('train_test.npz', train_pol_y=train_pol_y, test_pol_y=test_pol_y,train_pol_x=train_pol_x,\
        #test_pol_x=test_pol_x, train_emo=train_emo,test_emo=test_emo,train_emo_y=train_emo_y,\
        #tests_emo_y=tests_emo_y)


## Pull in GloVe embeddings

In [6]:
# Vocabulary: loaded as a numpy array whose entries are UTF-8 byte strings.
wordsList = np.load('wordsList.npy')
print('Loaded the word list!')
# Turn the array into a plain Python list and decode every entry from
# UTF-8 bytes into text.
wordsList = [entry.decode('UTF-8') for entry in wordsList.tolist()]

# Word vectors (presumably row i belongs to wordsList[i] -- confirm against
# how the .npy files were produced).
wordVectors = np.load('wordVectors.npy')
print('Loaded the word vectors!')

Loaded the word list!
Loaded the word vectors!


In [7]:
# Maximum number of whitespace-separated words in any cleaned tweet (column 4).
# .iloc replaces .ix, which is deprecated and was removed in pandas 1.0.
maxSeqLength = max(len(elem.split()) for elem in data.iloc[:, 4])
# numDimensions = 25 #Dimensions for each word vector

def get_matrix_ids(data, maxSeqLength, words=None, unknown_id=399999):
    """Map each tweet to a fixed-width row of word ids.

    Args:
      data: sequence of strings, one tweet each; tweets are split on
        whitespace.
      maxSeqLength: number of columns in the result. Longer tweets are
        truncated; shorter ones keep trailing zeros.
      words: vocabulary list whose positions are the word ids. Defaults to
        the module-level wordsList.
      unknown_id: id stored for words that are not in the vocabulary.
    Returns:
      int32 array of shape [len(data), maxSeqLength].
    """
    if words is None:
        words = wordsList

    # One dict lookup per word instead of a linear list.index() scan over the
    # whole vocabulary. setdefault keeps the FIRST position of a repeated
    # word, which is what list.index() returns.
    word_to_id = {}
    for position, vocab_word in enumerate(words):
        word_to_id.setdefault(vocab_word, position)

    numFiles = len(data)
    ids = np.zeros((numFiles, maxSeqLength), dtype='int32')

    # Started once, before the loop, so the reported time covers the whole
    # run and is defined even when data is empty.
    start = time.time()
    for fileCounter, tweet in enumerate(data):
        # Truncate before writing: storing into column maxSeqLength would
        # raise IndexError before any bounds check could run.
        for indexCounter, word in enumerate(tweet.split()[:maxSeqLength]):
            ids[fileCounter][indexCounter] = word_to_id.get(word, unknown_id)
        if fileCounter % 500 == 0:
            print("Tweet matrices completed: %d" % fileCounter)
    end = time.time()
    print("Time elapsed %s" % (end - start))
    return ids


#train_ids = get_matrix_ids(train_pol_x, maxSeqLength)
#test_ids = get_matrix_ids(test_pol_x, maxSeqLength)
#np.savez('ids.npz', train_ids=train_ids, test_ids=test_ids)


## Load Train Data

In [8]:
# Load all of the saved train and test data.
# np.load on an .npz returns a lazy archive handle that reads each array when
# it is indexed. The with-block closes that handle once every array has been
# pulled out, instead of leaving the file open for the rest of the session.
with np.load('train_test.npz') as p:
    train_pol_y = p['train_pol_y']
    test_pol_y = p['test_pol_y']
    train_pol_x = p['train_pol_x']
    test_pol_x = p['test_pol_x']
    train_emo = p['train_emo']
    test_emo = p['test_emo']
    train_emo_y = p['train_emo_y']
    tests_emo_y = p['tests_emo_y']

## Get matrix ids

In [9]:
# The per-tweet matrix ids come from the GloVe vocabulary. Building them is
# computationally expensive, so they were saved once and are reloaded here
# rather than recomputed.
d = np.load('ids.npz')
train_ids, test_ids = d['train_ids'], d['test_ids']

In [10]:
# Dimensions of the reloaded training id matrix (presumably one row per
# training tweet and one column per word slot -- see get_matrix_ids).
train_ids.shape

(16840, 31)

In [11]:
# Build train_ids_emo: train_ids plus one extra trailing column that holds
# each tweet's polarity label.

print(train_pol_y[0])
print(train_emo[0])
print(train_ids[0])

# Polarity labels as a single column, one row per training tweet.
pol_array = np.reshape(np.asarray(train_pol_y), (-1, 1))
train_ids_emo = np.concatenate((train_ids, pol_array), axis=1)
print(train_pol_x[0])
print(train_ids.shape)
print(train_ids_emo.shape)

0
:: sadness
[ 8254  3711     4  1147   373     4 74927     7  4249   539  1720     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0]
boyfriend flew to chicago today to appraise a comic book collection
(16840, 31)
(16840, 32)


## Helper functions for training

In [174]:
from random import randint

# For Polarity Classifier
def getTrainBatch(train_data, train_labels, train_ids, batch_size=None, max_seq_length=None):
    """Draw a random polarity training batch, sampled with replacement.

    Args:
      train_data: training examples; only its length is used, to bound the
        sampled indices.
      train_labels: polarity label per example; 1 is encoded as [1, 0] and
        anything else as [0, 1].
      train_ids: per-example id rows, each max_seq_length wide.
      batch_size: rows per batch. Defaults to the module-level batchSize.
      max_seq_length: columns per row. Defaults to the module-level
        maxSeqLength.
    Returns:
      (arr, labels): int array of shape [batch_size, max_seq_length] and a
      list of batch_size two-element labels, where labels[i] belongs to
      arr[i].
    """
    if batch_size is None:
        batch_size = batchSize
    if max_seq_length is None:
        max_seq_length = maxSeqLength

    labels = []
    arr = np.zeros([batch_size, max_seq_length])
    for i in range(batch_size):
        # randint is inclusive at both ends, so 0..len-1 reaches every
        # example, including the last one.
        num = randint(0, len(train_data) - 1)
        labels.append([1, 0] if train_labels[num] == 1 else [0, 1])
        arr[i] = train_ids[num]

    return arr.astype(int), labels

def getTestBatch(test_data, test_labels, test_ids, batch_size=None, max_seq_length=None):
    """Draw a random polarity test batch, sampled with replacement.

    Args:
      test_data: test examples; only its length is used, to bound the
        sampled indices.
      test_labels: polarity label per example; 1 is encoded as [1, 0] and
        anything else as [0, 1].
      test_ids: per-example id rows, each max_seq_length wide.
      batch_size: rows per batch. Defaults to the module-level batchSize.
      max_seq_length: columns per row. Defaults to the module-level
        maxSeqLength.
    Returns:
      (arr, labels): int array of shape [batch_size, max_seq_length] and a
      list of batch_size two-element labels, where labels[i] belongs to
      arr[i].
    """
    if batch_size is None:
        batch_size = batchSize
    if max_seq_length is None:
        max_seq_length = maxSeqLength

    labels = []
    arr = np.zeros([batch_size, max_seq_length])
    for i in range(batch_size):
        # randint is inclusive at both ends, so 0..len-1 reaches every
        # example, including the last one.
        num = randint(0, len(test_data) - 1)
        labels.append([1, 0] if test_labels[num] == 1 else [0, 1])
        arr[i] = test_ids[num]

    return arr.astype(int), labels

# For sub-emotion classifier
def getTrainBatch_subEmo(train_data, train_labels, train_ids, batchSize, maxSeqLength):
    """Draw a random sub-emotion training batch that oversamples two classes.

    The first batchSize - 10 rows are drawn uniformly, with replacement,
    from the whole training set. The final 10 rows are 5 examples whose
    one-hot label has column 1 set, followed by 5 whose label has column 0
    set (disgust and anger in the label order printed by the balance check).

    Args:
      train_data: training examples; only its length is used, to bound the
        uniformly sampled indices.
      train_labels: one-hot label rows, one per example.
      train_ids: per-example id rows, each maxSeqLength wide.
      batchSize: rows in the batch; must be at least 10.
      maxSeqLength: columns per row.
    Returns:
      (arr, labels): int array of shape [batchSize, maxSeqLength] and a list
      of batchSize label rows, where labels[i] belongs to arr[i].
    Raises:
      ValueError: if batchSize is below 10, or an oversampled class has no
        training examples.
    """
    extra_per_class = 5
    # One-hot columns that get guaranteed extra rows, in the order appended.
    oversampled_columns = [1, 0]  # disgust, then anger
    reserved = extra_per_class * len(oversampled_columns)
    if batchSize < reserved:
        raise ValueError("batchSize must be at least %d" % reserved)

    labels = []
    arr = np.zeros([batchSize, maxSeqLength])

    def add_example(index):
        # The row written is always the position of the label about to be
        # appended, so inputs and targets cannot drift out of alignment.
        arr[len(labels)] = train_ids[index]
        labels.append(train_labels[index])

    for _ in range(batchSize - reserved):
        # randint is inclusive at both ends: every example is reachable.
        add_example(randint(0, len(train_data) - 1))

    label_matrix = np.asarray(train_labels)
    for column in oversampled_columns:
        candidates = np.flatnonzero(label_matrix[:, column] == 1)
        if len(candidates) == 0:
            raise ValueError("no training examples with label column %d set" % column)
        for _ in range(extra_per_class):
            add_example(candidates[randint(0, len(candidates) - 1)])

    return arr.astype(int), labels


def getTestBatch_subEmo(test_data, test_labels, test_ids, batchSize, maxSeqLength):
    """Draw a random sub-emotion test batch, sampled with replacement.

    Args:
      test_data: test examples; only its length is used, to bound the
        sampled indices.
      test_labels: label rows, one per example; copied through unchanged.
      test_ids: per-example id rows, each maxSeqLength wide.
      batchSize: rows in the batch.
      maxSeqLength: columns per row.
    Returns:
      (arr, labels): int array of shape [batchSize, maxSeqLength] and a list
      of batchSize label rows, where labels[i] belongs to arr[i].
    """
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        # randint is inclusive at both ends, so 0..len-1 reaches every
        # example, including the last one.
        num = randint(0, len(test_data) - 1)
        labels.append(test_labels[num])
        arr[i] = test_ids[num]

    return arr.astype(int), labels

def matmul3d(X, W):
    """Multiply a rank-3 tensor by a matrix along its last dimension.

    X is flattened to two dimensions, multiplied by W with tf.matmul, and
    the product is reshaped so the two leading dimensions of X come back.
    Args:
      X: [m,n,k]
      W: [k,l]
    Returns:
      XW: [m,n,l]
    """
    flat_X = tf.reshape(X, [-1, tf.shape(X)[2]])
    flat_product = tf.matmul(flat_X, W)
    # Leading dimensions are read from X at run time, the last one from W.
    restored_shape = [tf.shape(X)[0], tf.shape(X)[1], tf.shape(W)[1]]
    return tf.reshape(flat_product, restored_shape)


def MakeFancyRNNCell(H, keep_prob, num_layers=1):
    """Make a fancy RNN cell.
    Use tf.contrib.rnn functions to construct an LSTM cell.
    Initialize forget_bias=0.0 for better training.
    Args:
      H: hidden state size
      keep_prob: dropout keep prob (same for input and output)
      num_layers: number of cell layers
    Returns:
      (tf.nn.rnn_cell.RNNCell) multi-layer LSTM cell with dropout
    """
    def make_layer():
        # A fresh LSTM cell wrapped in dropout on both its input and output.
        lstm = tf.contrib.rnn.BasicLSTMCell(H, forget_bias=0.0)
        return tf.contrib.rnn.DropoutWrapper(
            lstm, input_keep_prob=keep_prob, output_keep_prob=keep_prob)

    # Build one distinct cell object per layer. Repeating a single instance
    # ([cell] * num_layers) hands the SAME object to every layer, which for
    # num_layers > 1 either shares one set of weights across layers or is
    # rejected by TensorFlow, depending on the release.
    return tf.contrib.rnn.MultiRNNCell([make_layer() for _ in range(num_layers)])

In [169]:
# Collect the positions of every training label whose first one-hot column
# is set, then look at the first few of them.
anger = []
for p, one_hot in enumerate(train_emo_y):
    if one_hot[0] == 1:
        anger.append(p)

print(anger[:5])

# Spot-check one of the collected rows.
train_emo_y[55]

[11, 34, 55, 63, 88]


array([1, 0, 0, 0, 0, 0])

## Balance Check

In [175]:
# Class counts in the train and test splits.
check = np.unique(train_emo, return_counts=True)
check_test = np.unique(test_emo, return_counts=True)

# Pair every class name with its own count. Looking counts up by name means
# the result does not depend on both splits listing the same classes in the
# same order, nor on there being exactly six of them.
train_counts = dict(zip(check[0], check[1]))
test_counts = dict(zip(check_test[0], check_test[1]))

counts = {
    "TRAIN": train_counts,
    "TEST": test_counts,
    # Test-set count as a percentage of the train-set count, per class
    # (0 when a class is absent from the test split).
    "Perc": {label: float(test_counts.get(label, 0)) / float(n_train) * 100
             for label, n_train in train_counts.items()},
}
counts

{'Perc': {':: anger': 23.02215189873418,
  ':: disgust': 24.34640522875817,
  ':: fear': 24.217026907807675,
  ':: joy': 24.981040497497347,
  ':: sadness': 26.73726009265387,
  ':: surprise': 24.886437378325763},
 'TEST': {':: anger': 291,
  ':: disgust': 149,
  ':: fear': 549,
  ':: joy': 1647,
  ':: sadness': 808,
  ':: surprise': 767},
 'TRAIN': {':: anger': 1264,
  ':: disgust': 612,
  ':: fear': 2267,
  ':: joy': 6593,
  ':: sadness': 3022,
  ':: surprise': 3082}}

In [176]:
#check batch data
def check_bal(nextBatchLabels, binarizer=None):
    """Print how many examples of each class a batch contains.

    One line is printed per distinct label row: the one-hot row, the class
    name it decodes to, and its count.

    Args:
      nextBatchLabels: one-hot label rows for a single batch.
      binarizer: fitted object whose inverse_transform maps one-hot rows
        back to class names. Defaults to the module-level emo_bin.
    """
    if binarizer is None:
        binarizer = emo_bin
    x = np.asarray(nextBatchLabels)
    rows, row_counts = np.unique(x, return_counts=True, axis=0)
    labs = binarizer.inverse_transform(rows)
    # Iterate over the classes actually present: a batch that lacks a class
    # has fewer than six distinct rows, which a fixed range(6) would overrun.
    for row, lab, n in zip(rows, labs, row_counts):
        print("%s %s %s" % (row, lab, n))

In [221]:
# Draw one training batch for the sub-emotion classifier and print how many
# examples of each class it contains.
nextBatch, nextBatchLabels = getTrainBatch_subEmo(
    train_data=train_pol_x,
    train_labels=train_emo_y,
    train_ids=train_ids_emo,
    batchSize=batchSize,
    maxSeqLength=maxSeqLength)
check_bal(nextBatchLabels)

[0 0 0 0 0 1] :: surprise 28
[0 0 0 0 1 0] :: sadness 22
[0 0 0 1 0 0] :: joy 57
[0 0 1 0 0 0] :: fear 15
[0 1 0 0 0 0] :: disgust 13
[1 0 0 0 0 0] :: anger 15


## Polarity Classifier with scikit-learn

In [235]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

from sklearn.metrics import classification_report

In [289]:
# TF-IDF features: the vocabulary and IDF weights are fitted on the training
# tweets only, then applied unchanged to the test tweets.
tfidf_options = dict(
    min_df=2,
    max_df=.5,
    use_idf=True,
    stop_words='english',
    sublinear_tf=True,
)
vectorizer = TfidfVectorizer(**tfidf_options)
train_vectors = vectorizer.fit_transform(train_pol_x)
test_vectors = vectorizer.transform(test_pol_x)

# Linear SVM with hinge loss as the polarity baseline
# (alternative tried: svm.SVC(kernel='linear')).
base1 = svm.LinearSVC(loss="hinge").fit(train_vectors, train_pol_y)
predict_base1 = base1.predict(test_vectors)

target_names = ["Negative", "Positive"]
print(classification_report(test_pol_y, predict_base1, target_names=target_names))


             precision    recall  f1-score   support

   Negative       0.79      0.85      0.82      2564
   Positive       0.74      0.66      0.70      1647

avg / total       0.77      0.78      0.77      4211



In [290]:
#save as npz
#np.savez('pol_predictions.npz', sci_svm=predict_base1)

In [291]:
# Load the saved polarity predictions. The with-block closes the .npz archive
# as soon as the array has been read instead of leaving the file open.
with np.load('pol_predictions.npz') as m:
    predicted_svm = m['sci_svm']

# Append the predicted polarity to test_ids as one extra trailing column, so
# the test rows have the same width as the training rows that carry a
# polarity column.
predict_pol = predicted_svm.reshape(-1, 1)
test_ids_emo = np.append(test_ids, predict_pol, axis=1)

test_ids_emo.shape

(4211, 32)

## RNN Model

Changes: 
- adding forget_bias to the LSTM Cell
- adding keep_prob

Check on:
- use of tf.nn.dynamic_rnn cell vs MultiRNN

In [292]:
# Specify parameters

#7/30 added 1 to increase max length to add a polarity field
# (.iloc replaces .ix, which is deprecated and was removed in pandas 1.0)
maxSeqLength = max(len(elem.split()) for elem in data.iloc[:, 4]) + 1 #Maximum number of words in a tweet
batchSize = 150
hiddenStateSize = 1  # used below as the number of stacked LSTM layers
# lstmUnits = 2
numClasses = 6
numDimensions = 50
keepProb = 0.5
learningRate = 0.001

iterations = 1500

# Reset graph & create placeholders
tf.reset_default_graph()
labels = tf.placeholder(tf.int32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])
# Dropout keep probability as a feedable tensor. It defaults to keepProb, so
# any run that does not feed it behaves exactly as with a hard-coded value.
# NOTE(review): feed {dropoutKeepProb: 1.0} when evaluating, otherwise
# dropout stays active at test time.
dropoutKeepProb = tf.placeholder_with_default(keepProb, shape=[])

# Lookup word vectors
with tf.name_scope("Embedding_Layer"):
    # The lookup alone produces the [batchSize, maxSeqLength, numDimensions]
    # tensor; no variable needs to be allocated first.
    data_vec = tf.nn.embedding_lookup(wordVectors, input_data)
#     print "Embedding Layer shape", data_vec.shape

# Construct RNN/LSTM cell and recurrent layer.
with tf.name_scope("Cell_RNN_Layer"):
    # Same construction as before (BasicLSTMCell with forget_bias=0.0,
    # dropout on input and output, MultiRNNCell), via the shared helper.
    lstmCell = MakeFancyRNNCell(numDimensions, dropoutKeepProb, num_layers=hiddenStateSize)
    value, _ = tf.nn.dynamic_rnn(lstmCell, data_vec, dtype=tf.float32)
#     print "Output of RNN shape", value.shape

with tf.name_scope("Output_Layer"):
    weight = tf.Variable(tf.random_uniform([numDimensions, numClasses], -1.0, 1.0))
    # A Variable, so the optimizer can actually train it. A bare tf.zeros(...)
    # is a constant and would leave the bias fixed at zero forever.
    bias = tf.Variable(tf.zeros([numClasses], tf.float32))
    # Put time first, then take the output of the final time step.
    value = tf.transpose(value, [1, 0, 2])
    last = tf.gather(value, int(value.get_shape()[0]) - 1)
    multiplier = tf.matmul(last, weight)
    prediction = tf.add(multiplier, bias)
    print(prediction)

#     print "Weights shape", weight.shape
#     print "Bias shape", bias.shape
#     print "New shape for value", value.shape
#     print "last shape", last.shape
#     print "multiplier shape", multiplier.shape
#     print "Output shape", prediction.shape


# From A3
#     multiplier = matmul3d(value, weight)
#     print "Multiplier shape", multiplier.shape
#     prediction = tf.add(multiplier, bias)
#     print "Logits shape", prediction.shape

with tf.name_scope("Prediction_Layer"):
    # Define correct predictions and accuracy
    comparison = tf.argmax(prediction, 1)
    correctPred = tf.equal(comparison, tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

    # Define loss & optimizer
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
    optimizer = tf.train.AdamOptimizer(learning_rate=learningRate).minimize(loss)



Tensor("Output_Layer/Add:0", shape=(150, 6), dtype=float32)


## For Tensorboard

In [293]:
import datetime

# Scalars tracked in TensorBoard, merged into a single op to fetch per step.
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()

# One timestamped log directory per run so runs do not overwrite each other.
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"

# Use the default graph directly. The session is only created in the training
# cell that follows, so referring to sess.graph here fails on a fresh kernel.
writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

## For Training

In [294]:
sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

for i in range(iterations):
    # Next batch of tweets
    nextBatch, nextBatchLabels = getTrainBatch_subEmo(train_pol_x, train_emo_y, train_ids_emo, batchSize, maxSeqLength)
    feed = {input_data: nextBatch, labels: nextBatchLabels}

    # A single run call performs the update and collects the summaries,
    # instead of a second full forward pass over the same batch just for
    # TensorBoard.
    summary = sess.run([optimizer, merged], feed)[1]
    writer.add_summary(summary, i)

#     # Save the network every 10,000 training iterations
#     if (i % 10000 == 0 and i != 0):
#         save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
#         print("saved to %s" % save_path)
writer.close()

In [295]:
# Number of random test batches to score. Kept under its own name so the
# training hyperparameter `iterations` is not overwritten, which would
# silently shorten training if the training cell were re-run afterwards.
evalIterations = 400
l_predictions = []
l_labels = []
# NOTE(review): the graph applies dropout with keepProb unless a different
# keep probability is supplied, so these predictions are made with dropout
# still active.
for i in range(evalIterations):
    nextBatch, nextBatchLabels = getTestBatch_subEmo(test_pol_x, tests_emo_y, test_ids_emo, batchSize, maxSeqLength)

    # Predicted class index for every row of the batch.
    batchPredictions = sess.run(comparison, {input_data: nextBatch, labels: nextBatchLabels})
    l_predictions.append(batchPredictions)
    l_labels.append(nextBatchLabels)
    

In [296]:
from sklearn.metrics import classification_report

# Class names taken from the fitted label binarizer.
target_names = emo_bin.classes_.tolist()


def score(preds, labels, target_names):
    """Print a per-class classification report for batched predictions.

    Args:
      preds: sequence of per-batch arrays of predicted class indices.
      labels: sequence of per-batch one-hot label rows; the class index is
        recovered with an argmax over the third axis.
      target_names: display name for each class index.
    """
    flat_predictions = np.ravel(np.asarray(preds))
    # One-hot rows -> class indices, then flatten across batches.
    flat_truth = np.ravel(np.argmax(np.asarray(labels), 2))

    report = classification_report(flat_truth, flat_predictions, target_names=target_names)
    print(report)


score(l_predictions, l_labels, target_names)


             precision    recall  f1-score   support

   :: anger       0.18      0.26      0.21      4334
 :: disgust       0.05      0.00      0.01      2133
    :: fear       0.46      0.42      0.44      7730
     :: joy       0.73      0.65      0.69     23193
 :: sadness       0.30      0.43      0.35     11535
:: surprise       0.43      0.38      0.40     11075

avg / total       0.49      0.48      0.48     60000



In [202]:
# Close the handle held in `d` (assigned from np.load('ids.npz') when the
# matrix ids were loaded).
d.close()