In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import HTMLParser as htm
import string
import re
import time

# SK-learn library for splitting data
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


## Read in data

In [2]:
# Load the tab-separated tweet file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside tweets are kept as literal text instead of being parsed.
data = pd.read_csv("tweet_data_1.csv",sep='\t',quoting=3)
# Add an "escape" column holding each tweet with HTML entities decoded
# (e.g. &quot; and &#xA; in the preview below).
# row[1] is the second field of each row -- the Tweet column in the frame shown
# below -- and .decode("utf-8") turns the Python 2 byte string into unicode
# before it is unescaped.
data["escape"] = data.apply(lambda row: htm.HTMLParser().unescape(row[1].decode("utf-8")),axis=1)

# Preview the first rows to confirm the new column.
data.head()

Unnamed: 0,Id,Tweet,Emotion,Positive,escape
0,138881940341260288:,I got a surprise for all you bitches...pull th...,:: surprise,0,I got a surprise for all you bitches...pull th...
1,144479819843911683:,If I was a thief.. The first thing I would ste...,:: joy,1,If I was a thief.. The first thing I would ste...
2,139110849120972800:,"""&quot;@RevRunWisdom: not afraid of tomorrow, ...",:: fear,0,"""""@RevRunWisdom: not afraid of tomorrow, for I..."
3,141532076791971840:,"""Extreme can neither fight nor fly.&#xA;-- Wil...",:: fear,0,"""Extreme can neither fight nor fly.\n-- Willia..."
4,145353048817012736:,Thinks that @melbahughes had a great 50th birt...,:: surprise,0,Thinks that @melbahughes had a great 50th birt...


## Clean Data

In [3]:
def process_data(data):
    """Normalize an iterable of tweet strings for tokenization.

    Each element has its non-ASCII characters dropped, is lowercased, has all
    ASCII punctuation removed, and has every run of whitespace (spaces, tabs,
    newlines) collapsed to a single space. Leading and trailing whitespace is
    stripped.

    Args:
        data: iterable of text strings (e.g. a list or a pandas Series).

    Returns:
        list of cleaned ``str`` values, in the same order as ``data``.
    """
    # Built with re instead of string.maketrans / two-argument str.translate,
    # which exist only on Python 2. re.escape keeps ']', '\\', '^' and '-'
    # literal inside the character class.
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    # \s+ (not ' +') so tabs and newlines produced by HTML unescaping are
    # collapsed as well.
    whitespace = re.compile(r'\s+')

    cleaned = []
    for elem in data:
        # Drop everything outside 7-bit ASCII first so that str() cannot fail
        # on Python 2 unicode input.
        elem = str("".join(ch for ch in elem if ord(ch) < 128))
        elem = punctuation.sub('', elem.lower())
        elem = whitespace.sub(' ', elem).strip()
        cleaned.append(elem)
    return cleaned

#train_pol_x = process_data(train_pol_x)
#test_pol_x = process_data(test_pol_x)

#Clean entire data set at once
# Replaces the contents of the `escape` column with the cleaned strings returned
# by process_data, so the unescaped-but-uncleaned text is no longer kept.
data.escape = process_data(data.escape)

## Split into train & test sets

In [4]:
# Train and test data frames.
# random_state pins the shuffle so the same 80/20 split is reproduced on every
# run (the cached .npz artefacts are derived from this split).
train, test = train_test_split(data, test_size = 0.2, random_state = 0)

# Columns are selected by position with .iloc (.ix is deprecated and was
# removed in pandas 1.0): 2 = Emotion, 3 = Positive, 4 = escape.

# Train and test target labels for polarity
train_pol_y = train.iloc[:,3].tolist()
test_pol_y = test.iloc[:,3].tolist()

# Binarize labels for sub-emotion classifier
train_emo = train.iloc[:,2].tolist()
test_emo = test.iloc[:,2].tolist()
emo_bin = preprocessing.LabelBinarizer()

# Labels for sub-emotion classifier (binarizer is fit on train only, then
# applied unchanged to test)
train_emo_y = emo_bin.fit_transform(train_emo)
tests_emo_y = emo_bin.transform(test_emo)

# Train and test inputs
train_pol_x = train.iloc[:, 4].tolist()
test_pol_x = test.iloc[:, 4].tolist()


#save data to recall later
#np.savez('train_test.npz', train_pol_y=train_pol_y, test_pol_y=test_pol_y,train_pol_x=train_pol_x,\
        #test_pol_x=test_pol_x, train_emo=train_emo,test_emo=test_emo,train_emo_y=train_emo_y,\
        #tests_emo_y=tests_emo_y)


## Pull in GloVe embeddings

In [5]:
# Pull in word list & vectors
wordsList = np.load('wordsList.npy')
print('Loaded the word list!')
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Decode each word from UTF-8 bytes to text
# wordVectors is used later as the embedding table for tf.nn.embedding_lookup;
# presumably row i is the vector for wordsList[i] -- TODO confirm.
wordVectors = np.load('wordVectors.npy')
print ('Loaded the word vectors!')

Loaded the word list!
Loaded the word vectors!


In [6]:
# Column 4 (the cleaned `escape` text) is selected by position with .iloc;
# .ix is deprecated and was removed in pandas 1.0.
maxSeqLength = max([len(elem.split()) for elem in data.iloc[:, 4]]) #Maximum number of words in a tweet


def get_matrix_ids(data, maxSeqLength, words=None, unknown_id=399999):
    """Convert tweets to a zero-padded matrix of vocabulary indices.

    Args:
        data: sized iterable of whitespace-tokenizable strings (list or Series).
        maxSeqLength: number of columns in the result; tweets with more words
            are truncated to this length, shorter ones are padded with 0.
        words: vocabulary list; a word's id is its first position in the list.
            Defaults to the notebook-level ``wordsList``.
        unknown_id: id written for words that are not in ``words``.

    Returns:
        int32 ndarray of shape (len(data), maxSeqLength).
    """
    if words is None:
        words = wordsList

    # Build the word -> index map once; list.index() would rescan the whole
    # vocabulary for every word. setdefault keeps the first occurrence, which
    # is what list.index() returns.
    lookup = {}
    for position, word in enumerate(words):
        lookup.setdefault(word, position)

    ids = np.zeros((len(data), maxSeqLength), dtype='int32')
    for row, tweet in enumerate(data):
        # Slice so an over-long tweet cannot index past the last column.
        for col, word in enumerate(tweet.split()[:maxSeqLength]):
            ids[row, col] = lookup.get(word, unknown_id)  # unknown words share one id

    return ids


#train_ids = get_matrix_ids(train_pol_x, maxSeqLength)
#test_ids = get_matrix_ids(test_pol_x, maxSeqLength)
#np.savez('ids.npz', train_ids=train_ids, test_ids=test_ids)


## Load Train Data

In [7]:
#load all of train and test data
# NOTE: these assignments rebind the variables produced by the split cell above
# to the arrays stored in train_test.npz, so later cells use the cached split.
# Indexing an NpzFile reads the array into memory, so the archive can be closed
# as soon as the block ends; the context manager does that even on error.
with np.load('train_test.npz') as p:
    train_pol_y = p['train_pol_y']
    test_pol_y = p['test_pol_y']
    train_pol_x = p['train_pol_x']
    test_pol_x = p['test_pol_x']
    train_emo = p['train_emo']
    test_emo = p['test_emo']
    train_emo_y = p['train_emo_y']
    tests_emo_y = p['tests_emo_y']

## Get matrix ids

In [8]:
# Matrix ids for each tweet were built using GloVe word embeddings
# Because construction of matrix ids is computationally expensive,
# matrix ids were saved and will simply be reloaded
# `d` keeps the .npz archive open; the name is reused further down for a
# second archive (ids_pol.npz).
d = np.load('ids.npz')
train_ids = d['train_ids']
test_ids = d['test_ids']

In [9]:
# Sanity check on the cached id matrix: (rows, columns); presumably
# (number of training tweets, maxSeqLength) -- TODO confirm against the split.
train_ids.shape

(16840, 31)

In [10]:
#Add extra dimension to train_ids_emo for polarity

#print train_pol_y[0]
#print train_emo[0]
#print train_ids[0]

#pol_array = np.asarray(train_pol_y).reshape(-1,1)
#train_ids_emo = np.append(train_ids, pol_array,axis=1)
#print train_pol_x[0]
#print train_ids.shape
#print train_ids_emo.shape

## Helper functions for training

In [11]:
from random import randint
import random

# For Polarity Classifier
def getTrainBatch(train_data, train_labels, train_ids, batch_size=None, max_seq_length=None):
    """Draw a random training batch (with replacement) for the polarity classifier.

    Args:
        train_data: sequence of training examples; only its length is used.
        train_labels: per-example polarity labels; a value equal to 1 is
            encoded as [1, 0], anything else as [0, 1].
        train_ids: indexable collection of word-id rows, one per example.
        batch_size: rows per batch; defaults to the notebook-level ``batchSize``.
        max_seq_length: ids per row; defaults to the notebook-level
            ``maxSeqLength``.

    Returns:
        (ids, labels): an int array of shape (batch_size, max_seq_length) and a
        list of ``batch_size`` two-element one-hot labels.

    Raises:
        ValueError: if ``train_data`` is empty.
    """
    if batch_size is None:
        batch_size = batchSize
    if max_seq_length is None:
        max_seq_length = maxSeqLength

    n_examples = len(train_data)
    if n_examples == 0:
        raise ValueError("cannot draw a batch from an empty data set")

    labels = []
    arr = np.zeros([batch_size, max_seq_length])
    for i in range(batch_size):
        # randint is inclusive on both ends, so every index 0..n-1 can be
        # drawn (the previous 1..n-1 / num-1 scheme never reached the last row).
        idx = randint(0, n_examples - 1)
        labels.append([1, 0] if train_labels[idx] == 1 else [0, 1])
        arr[i] = train_ids[idx]

    return arr.astype(int), labels

def getTestBatch(test_data, test_labels, test_ids, batch_size=None, max_seq_length=None):
    """Draw a random test batch (with replacement) for the polarity classifier.

    Args:
        test_data: sequence of test examples; only its length is used.
        test_labels: per-example polarity labels; a value equal to 1 is
            encoded as [1, 0], anything else as [0, 1].
        test_ids: indexable collection of word-id rows, one per example.
        batch_size: rows per batch; defaults to the notebook-level ``batchSize``.
        max_seq_length: ids per row; defaults to the notebook-level
            ``maxSeqLength``.

    Returns:
        (ids, labels): an int array of shape (batch_size, max_seq_length) and a
        list of ``batch_size`` two-element one-hot labels.

    Raises:
        ValueError: if ``test_data`` is empty.
    """
    if batch_size is None:
        batch_size = batchSize
    if max_seq_length is None:
        max_seq_length = maxSeqLength

    n_examples = len(test_data)
    if n_examples == 0:
        raise ValueError("cannot draw a batch from an empty data set")

    labels = []
    arr = np.zeros([batch_size, max_seq_length])
    for i in range(batch_size):
        # Inclusive bounds 0..n-1 so the last example is reachable too.
        idx = randint(0, n_examples - 1)
        labels.append([1, 0] if test_labels[idx] == 1 else [0, 1])
        arr[i] = test_ids[idx]

    return arr.astype(int), labels

# For sub-emotion classifier
def getTrainBatch_subEmo(train_data, train_labels, train_ids, batchSize, maxSeqLength,
                         minority_columns=(1, 0), per_minority=5):
    """Draw a training batch for the sub-emotion classifier with oversampling.

    The batch is filled, in order, with
      1. ``batchSize - per_minority * len(minority_columns)`` distinct examples
         drawn from the whole training set, then
      2. for each label column in ``minority_columns``, ``per_minority``
         distinct examples whose one-hot label has a 1 in that column.
    With the defaults this is 5 examples from column 1 followed by 5 from
    column 0 (named ``disgust`` and ``anger`` in the original code). An example
    may appear in both the random part and an oversampled part.

    Args:
        train_data: sequence of training examples; only its length is used.
        train_labels: per-example one-hot label rows.
        train_ids: indexable collection of word-id rows, one per example.
        batchSize: total number of rows in the batch.
        maxSeqLength: ids per row.
        minority_columns: label columns to oversample, in fill order.
        per_minority: number of examples drawn for each oversampled column.

    Returns:
        (ids, labels, inds): int array of shape (batchSize, maxSeqLength), the
        list of label rows, and the list of training-set indices used.

    Raises:
        ValueError: from ``random.sample`` if the batch is too small for the
            oversampled part, or a population has fewer members than requested.
    """
    n_random = batchSize - per_minority * len(minority_columns)

    # Sample over every valid index 0..n-1. The previous xrange(1, n-1) with
    # num-1 could never pick the last two examples, and xrange is Python 2 only.
    chosen = random.sample(range(len(train_data)), n_random)

    for column in minority_columns:
        members = [i for i in range(len(train_labels)) if train_labels[i][column] == 1]
        # Sample positions over the full member list so its first and last
        # entries are eligible as well.
        chosen.extend(members[pos] for pos in random.sample(range(len(members)), per_minority))

    labels = []
    inds = []
    arr = np.zeros([batchSize, maxSeqLength])
    for row, idx in enumerate(chosen):
        arr[row] = train_ids[idx]
        labels.append(train_labels[idx])
        inds.append(idx)

    return arr.astype(int), labels, inds


def getTestBatch_subEmo(test_data, test_labels, test_ids, batchSize, maxSeqLength):
    """Draw a batch of distinct random test examples for the sub-emotion classifier.

    Args:
        test_data: sequence of test examples; only its length is used.
        test_labels: per-example label rows.
        test_ids: indexable collection of word-id rows, one per example.
        batchSize: number of rows in the batch.
        maxSeqLength: ids per row.

    Returns:
        (ids, labels, inds): int array of shape (batchSize, maxSeqLength), the
        list of label rows, and the list of test-set indices used.

    Raises:
        ValueError: from ``random.sample`` if ``batchSize`` exceeds
            ``len(test_data)``.
    """
    # Sample over every valid index 0..n-1. The previous xrange(1, n-1) with
    # num-1 could never pick the last two examples, and xrange is Python 2 only.
    inds = random.sample(range(len(test_data)), batchSize)

    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for row, idx in enumerate(inds):
        arr[row] = test_ids[idx]
        labels.append(test_labels[idx])

    return arr.astype(int), labels, inds


In [12]:
# Spot check: collect the row indices whose binarized label has a 1 in column 0
# (the loop variable `p` also rebinds the name used earlier for the npz handle).
anger = []
for p in range(len(train_emo_y)):
    if train_emo_y[p][0] == 1:
        anger.append(p)
        
# Show the first five matching indices ...
print anger[:5]

# ... and the label row of one of them (55 appears in the printed list below).
train_emo_y[55]

[11, 34, 55, 63, 88]


array([1, 0, 0, 0, 0, 0])

## Polarity Classifier w/ scikit-learn

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

from sklearn.metrics import classification_report

In [14]:
# TF-IDF features for the polarity baseline: terms must occur in at least 2
# documents (min_df) and in at most half of them (max_df); English stop words
# are dropped and term frequency is scaled sublinearly (sublinear_tf).
vectorizer = TfidfVectorizer(min_df = 2,
                            max_df=.5,
                            use_idf=True,
                            stop_words='english',
                            sublinear_tf=True
                             
                            )
# Vocabulary and IDF weights are learned from the training tweets only and then
# applied unchanged to the test tweets.
train_vectors = vectorizer.fit_transform(train_pol_x)
test_vectors = vectorizer.transform(test_pol_x)

# Linear SVM with hinge loss as the polarity baseline.
base1 = svm.LinearSVC(loss="hinge")#svm.SVC(kernel='linear')
base1.fit(train_vectors, train_pol_y)
predict_base1 = base1.predict(test_vectors)

# Display names for the two polarity labels; presumably label 0 -> "Negative"
# and 1 -> "Positive" (sorted label order) -- TODO confirm.
target_names = ["Negative","Positive"]
print classification_report(test_pol_y,predict_base1, target_names = target_names)


             precision    recall  f1-score   support

   Negative       0.79      0.85      0.82      2564
   Positive       0.74      0.66      0.70      1647

avg / total       0.77      0.78      0.77      4211



In [15]:
#save as npz
#np.savez('pol_predictions.npz', sci_svm=predict_base1)

In [16]:
# call in npz labels
# Reload the SVM polarity predictions from pol_predictions.npz (stored under
# the key 'sci_svm') instead of using predict_base1 from the current kernel.
m = np.load('pol_predictions.npz')
predicted_svm = m['sci_svm']
# predicted_svm

#add predicted labels as [32] into test_ids 
#predict_pol = predicted_svm.reshape(-1,1)
#test_ids_emo = np.append(test_ids, predict_pol,axis=1)

#test_ids_emo.shape
# test_ids.shape

## Append Polarity predictions as word in tweet

In [17]:
def concat_pol(df, pol_pred, tweet):
    """Prefix every tweet with its polarity written as a word.

    Args:
        df: DataFrame (or any mapping of column name -> sequence) that holds
            both columns.
        pol_pred: name of the column of polarity predictions (1 = positive).
        tweet: name of the column of tweet text.

    Returns:
        list of str, in row order: 'positive <tweet>' where the row's
        prediction equals 1, otherwise 'negative <tweet>'.
    """
    # Pair each prediction with its own tweet. The previous version tested an
    # unrelated name (`elem`) instead of the row's prediction, and indexed by
    # label, which only works with a default 0..n-1 index.
    return [
        ('positive ' if pred == 1 else 'negative ') + text
        for pred, text in zip(df[pol_pred], df[tweet])
    ]

In [18]:
# # Test Set
# # Create a data frame with the tweet words & polarity prediction
# test_df = pd.DataFrame({'test_tweet': test_pol_x, 'pol_pred': predicted_svm})
# test_df['concat'] = concat_pol(test_df, 'pol_pred', 'test_tweet')

# # Train Set
# train_df = pd.DataFrame({'train_tweet': train_pol_x, 'pol_pred': train_pol_y.astype(int)})
# train_df['concat'] = concat_pol(train_df, 'pol_pred', 'train_tweet')

# maxSeqLength = max([len(elem.split()) for elem in data.ix[:, 4]]) + 1 #Maximum number of words in a tweet

# train_ids_pol = get_matrix_ids(train_df.concat, maxSeqLength)
# test_ids_pol = get_matrix_ids(test_df.concat, maxSeqLength)
# np.savez('ids_pol.npz', train_ids_pol=train_ids_pol, test_ids_pol=test_ids_pol)

Unnamed: 0,pol_pred,test_tweet,concat
0,1,first case done chambers have now found more w...,negative first case done chambers have now fou...
1,0,robertlemke so rocking flow3 yeah have fun btw...,negative robertlemke so rocking flow3 yeah hav...
2,0,just got back from my last history class until...,negative just got back from my last history cl...
3,0,abi a4akir w rasi y3wrni w abi anam w abi atra...,negative abi a4akir w rasi y3wrni w abi anam w...
4,0,hearnesie71 sleep overs tuesday after hockey a...,negative hearnesie71 sleep overs tuesday after...


### Load matrix ids with polarity prediction

In [None]:
# Reload the cached id matrices that were built from the polarity-prefixed
# tweets (see the commented-out np.savez('ids_pol.npz', ...) call above).
# This rebinds `d`, which previously referred to the ids.npz archive.
d = np.load('ids_pol.npz')
train_ids_pol = d['train_ids_pol']
test_ids_pol = d['test_ids_pol']

# Sub-emotion Classifier without polarity

## RNN Model

Changes: 
- adding forget_bias to the LSTM Cell
- adding keep_prob

Check on:
- use of tf.nn.dynamic_rnn cell vs MultiRNN

In [26]:
# Specify parameters

#7/30 added 1 to increase max length to add a polarity field
# (.iloc selects column 4 by position; .ix is deprecated and removed in pandas 1.0)
maxSeqLength = max([len(elem.split()) for elem in data.iloc[:, 4]])+1 #Maximum number of words in a tweet
batchSize = 150
hiddenStateSize = 1   # number of stacked LSTM layers built below
# lstmUnits = 2
numClasses = 6
numDimensions = 50    # LSTM unit count; presumably also the GloVe vector width -- TODO confirm
keepProb = 0.5
learningRate = 0.001

iterations = 1500

# Reset graph & create placeholders
tf.reset_default_graph()
labels = tf.placeholder(tf.int32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

##ADD NS on 8/2
# Every sequence is declared to be maxSeqLength long, so the RNN also steps
# over the zero padding at the end of short tweets.
ns = tf.tile([maxSeqLength], [batchSize, ])

# Lookup word vectors
with tf.name_scope("Embedding_Layer"):
    # The zero-initialised tf.Variable that used to be created here was
    # overwritten on the next line and never used, so it has been dropped.
    data_vec = tf.nn.embedding_lookup(wordVectors,input_data)

# Construct RNN/LSTM cell and recurrent layer.
##NEW MULTILAYER added on 8/2
with tf.name_scope("Cell_RNN_Layer"):
    cells=[]
    for _ in range(hiddenStateSize):
        lstmCell = tf.contrib.rnn.BasicLSTMCell(numDimensions, forget_bias=0.0)
        # NOTE(review): keepProb is a Python constant baked into the graph, so
        # dropout is also applied when this graph is run for evaluation.
        lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, input_keep_prob=keepProb, output_keep_prob=keepProb)
        cells.append(lstmCell)
    # Stack the layers once, after all of them have been created.
    multicell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
    value, _ = tf.nn.dynamic_rnn(multicell, data_vec, sequence_length=ns, dtype=tf.float32)

with tf.name_scope("Output_Layer"):
    weight = tf.Variable(tf.random_uniform([numDimensions, numClasses], -1.0, 1.0))
    bias = tf.Variable(tf.zeros(numClasses, tf.float32))
    # Make time the leading axis, then take the output of the final time step.
    value = tf.transpose(value, [1, 0, 2])
    last = tf.gather(value, int(value.get_shape()[0]) - 1)
    multiplier = tf.matmul(last, weight)
    prediction = tf.add(multiplier, bias)   # unnormalised class logits


with tf.name_scope("Prediction_Layer"):
    # Define correct predictions and accuracy
    comparison = tf.argmax(prediction,1)
    correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
    accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

    # Define loss & optimizer
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
    optimizer = tf.train.AdamOptimizer(learning_rate=learningRate).minimize(loss)



## For Tensorboard

In [27]:
import datetime

# Open an interactive session and initialise every variable of the graph
# defined above. `saver` is created here but is only referenced by the
# commented-out checkpoint code in the training cell.
sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

# Register scalar summaries for loss and accuracy and merge them into a single
# op, so one sess.run(merged, ...) produces both.
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
# Each run writes to its own timestamped directory under tensorboard/.
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

## For Training

In [28]:


# Per-iteration bookkeeping: sampled training indices, logits returned by the
# training step, and the labels of each batch.
train_inds = []
train_logits = []
train_labels = []
for i in range(iterations):
    # Next batch of tweets (ids come from the polarity-prefixed matrix)
    nextBatch, nextBatchLabels,train_i = getTrainBatch_subEmo(train_pol_x, train_emo_y, train_ids_pol, batchSize, maxSeqLength);
    train_inds.append(train_i)
    # One optimisation step; train_logs[0] holds the logits, train_logs[1] the
    # (empty) result of the optimizer op.
    train_logs = sess.run([prediction,optimizer], {input_data: nextBatch, labels: nextBatchLabels})
    train_logits.append(train_logs[0])
    train_labels.append(nextBatchLabels)
    # Write summary to Tensorboard
    # NOTE(review): this is a second forward pass over the same batch, run after
    # the weights were updated, so the logged loss/accuracy are post-step values.
    summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
    writer.add_summary(summary, i)

#     # Save the network every 10,000 training iterations
#     if (i % 10000 == 0 and i != 0):
#         save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
#         print("saved to %s" % save_path)
writer.close()

In [29]:
# NOTE: rebinds `iterations` (1500 in the parameter cell), so re-running the
# training cell after this one would train for 500 steps instead.
iterations = 500
# Per-batch results: predicted class ids, label rows, logits, and the sampled
# test indices.
l_predictions = []
l_labels = []
l_logits = []
l_inds = []
for i in range(iterations):
    # Each batch is an independent random sample of the test set, so the same
    # test tweet can appear in several batches.
    nextBatch, nextBatchLabels,test_i = getTestBatch_subEmo(test_pol_x, tests_emo_y, test_ids_pol, batchSize, maxSeqLength)

    # test_log = logits, p = argmax class ids, q = batch accuracy (unused below).
    # NOTE(review): the graph's DropoutWrapper uses the fixed keepProb, so these
    # predictions are made with dropout still active.
    test_log,p,q= (sess.run([prediction,comparison,accuracy], {input_data: nextBatch, labels: nextBatchLabels}))
    l_predictions.append(p)
    l_labels.append(nextBatchLabels)
    l_logits.append(test_log)
    l_inds.append(test_i)
    #print("Accuracy for this batch:",q)

#     print("Accuracy for this batch:", (sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels})) * 100)
    

In [30]:
from sklearn.metrics import classification_report

# Class names in the column order learned by the LabelBinarizer; this rebinds
# `target_names`, which held the two polarity names in the SVM cell.
target_names = emo_bin.classes_.tolist()
def score(preds,labels,target_names):
    """Print a classification report over all evaluated batches.

    Args:
        preds: sequence of per-batch arrays of predicted class ids.
        labels: sequence of per-batch lists of one-hot label rows, i.e.
            indexable as [batch][example][class].
        target_names: display name for each class id, in id order.

    Returns:
        None; the report text is printed.
    """
    # Flatten the batches into one vector of predicted ids ...
    predictions = np.asarray(preds).ravel()
    # ... and turn the one-hot rows back into class ids along the class axis.
    labels = np.argmax(np.asarray(labels), axis=2).ravel()

    # Call form of print: same output on Python 2 for a single argument, and
    # consistent with the print(...) calls used when loading the embeddings.
    print(classification_report(labels,predictions,target_names=target_names))
    
# Report per-class precision/recall over every batch collected by the
# evaluation loop above.
score(l_predictions,l_labels,target_names)


             precision    recall  f1-score   support

   :: anger       0.17      0.08      0.11      5117
 :: disgust       0.13      0.01      0.02      2653
    :: fear       0.53      0.34      0.41      9789
     :: joy       0.55      0.76      0.63     29432
 :: sadness       0.27      0.23      0.25     14311
:: surprise       0.41      0.39      0.40     13698

avg / total       0.42      0.46      0.43     75000



In [28]:
# Close the .npz archive currently bound to `d`.
# NOTE(review): `d` is assigned twice in this notebook (ids.npz, then
# ids_pol.npz), so only the most recently loaded archive is closed here, and
# this cell's execution count (28) is out of sequence -- confirm the intent.
d.close()