In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import HTMLParser as htm
import string
import re
import time

# SK-learn library for splitting data
from sklearn.model_selection import train_test_split

## Read in data

In [2]:
data = pd.read_csv("tweet_data_1.csv",sep='\t',quoting=3)
data["escape"] = data.apply(lambda row: htm.HTMLParser().unescape(row[1].decode("utf-8")),axis=1)

data.head()

Unnamed: 0,Id,Tweet,Emotion,Positive,escape
0,138881940341260288:,I got a surprise for all you bitches...pull th...,:: surprise,0,I got a surprise for all you bitches...pull th...
1,144479819843911683:,If I was a thief.. The first thing I would ste...,:: joy,1,If I was a thief.. The first thing I would ste...
2,139110849120972800:,"""&quot;@RevRunWisdom: not afraid of tomorrow, ...",:: fear,0,"""""@RevRunWisdom: not afraid of tomorrow, for I..."
3,141532076791971840:,"""Extreme can neither fight nor fly.&#xA;-- Wil...",:: fear,0,"""Extreme can neither fight nor fly.\n-- Willia..."
4,145353048817012736:,Thinks that @melbahughes had a great 50th birt...,:: surprise,0,Thinks that @melbahughes had a great 50th birt...


## Split into train & test sets

In [3]:
# Train and test data frames
train, test = train_test_split(data, test_size = 0.2)

# Train and test target labels
train_pol_y = train.ix[:,3].tolist()
test_pol_y = test.ix[:,3].tolist()

# MC NEW CODE
# Train and test x
train_pol_x = train.ix[:, 4].tolist()
test_pol_x = test.ix[:, 4].tolist()

## Preprocess & clean data

In [4]:
def process_data(data):
    """Converts to lowercase, strips out punctuation,
    removes excess whitespace within a string & leading & trailing whitespace"""
    new_list = []
    table = string.maketrans("","")
    for elem in data:
        elem = "".join(i for i in elem if ord(i)<128)
        elem = str(elem)        
        elem = elem.lower()
        elem = elem.translate(table, string.punctuation)
        elem = re.sub(' +',' ', elem)
        elem = elem.strip()
        
        new_list.append(elem)
    return new_list

train_pol_x = process_data(train_pol_x)
test_pol_x = process_data(test_pol_x)

## Pull in GLOVE embeddings

In [5]:
# Pull in word list & vectors
wordsList = np.load('wordsList.npy')
print('Loaded the word list!')
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load('wordVectors.npy')
print ('Loaded the word vectors!')

Loaded the word list!
Loaded the word vectors!


## Get Matrix IDs for Training & Test

In [15]:
maxSeqLength

34

In [14]:
maxSeqLength = max([len(elem.split()) for elem in data.ix[:, 4]]) #Maximum number of words in a tweet
# numDimensions = 25 #Dimensions for each word vector

def get_matrix_ids(data, maxSeqLength):
    numFiles = len(data)
    ids = np.zeros((numFiles, maxSeqLength), dtype='int32')

    for fileCounter, tweet in enumerate(data):
        start = time.time()
        split = tweet.split()
        for indexCounter, word in enumerate(split):
            try:
                ids[fileCounter][indexCounter] = wordsList.index(word)
            except ValueError:
                ids[fileCounter][indexCounter] = 399999 #Vector for unkown words

            if indexCounter >= maxSeqLength:
                break
        if fileCounter % 500 == 0:
            print "Tweet matrices completed:", fileCounter
    end = time.time()
    print "Time elapsed", (end - start)
    return ids


In [7]:
# Takes 20+ minutes to run
#train_ids = get_matrix_ids(train_pol_x, maxSeqLength)

In [8]:
# Takes ~10 minutes to run
#test_ids = get_matrix_ids(test_pol_x, maxSeqLength)

In [9]:
#save ids into ids.npz to call later
#np.savez('ids.npz', train_ids=train_ids, test_ids=test_ids)

In [7]:
d = np.load('ids.npz')
train_ids = d['train_ids']
test_ids = d['test_ids']

In [8]:
print train_ids[0]
print test_ids.shape
train_pol_x[0]

[399999 399999    232  32027   2122    938     61     39   3050    285
    359    766     17      7   3687 399999 399999 399999 399999      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0]
(4211, 34)


'hot fudge sundae and ice cream cookie sandwich flavored poptarts'

## Helper functions for training

In [9]:
from random import randint

# def getTrainBatch(train_data, train_labels, train_ids):
#     labels = []
#     arr = np.zeros([batchSize, maxSeqLength])
#     # iterate through batch size
#     for i in range(batchSize):
#         num = randint(1, (len(train_data)-1))

#         if train_labels[num] == 1:
#             labels.append([1,0])
#         else:
#             labels.append([0,1])
            
#         arr[i] = train_ids[num-1:num]
        
#     return arr, labels

def getTrainBatch(train_data, train_labels, train_ids):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    # iterate through batch size
    for i in range(batchSize):
        num = randint(1, (len(train_data)-1))
        if train_labels[num-1] == 1:
            labels.append([1,0])
        else:
            labels.append([0,1])
            
        arr[i] = train_ids[num-1:num]
        
    return arr.astype(int), labels


# def getTestBatch(test_data, test_labels, test_ids):
#     labels = []
#     arr = np.zeros([batchSize, maxSeqLength])
#     for i in range(batchSize):
#         num = randint(1,(len(test_data)-1))
        
#         if test_labels[num] == 1:
#             labels.append([1,0])
#         else:
#             labels.append([0,1])
            
#         arr[i] = test_ids[num-1:num]
        
#     return arr, labels

def getTestBatch(test_data, test_labels, test_ids):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(1,(len(test_data)-1))
        
        if test_labels[num-1] == 1:
            labels.append([1,0])
        else:
            labels.append([0,1])
            
        arr[i] = test_ids[num-1:num]
        
    return arr.astype(int), labels

## RNN Model

# CHANGE batchSize, lstmUnits & iterations & RERUN

In [10]:
# Specify parameters
batchSize = 24
lstmUnits = 8
numClasses = 2
numDimensions = 25
maxSeqLength = max([len(elem.split()) for elem in data.ix[:, 4]]) #Maximum number of words in a tweet

iterations = 10000

# Reset graph & create placeholders
tf.reset_default_graph()
labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

# Lookup word vectors
data_vec = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
data_vec = tf.nn.embedding_lookup(wordVectors,input_data)

# Feed RNN cell
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data_vec, dtype=tf.float32)

# Get final output
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

# Define correct predictions and accuracy
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

# Define loss & optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

## For Tensorboard

In [11]:
import datetime

tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

NameError: name 'sess' is not defined

## For Training

In [13]:
sess1 = tf.InteractiveSession()
saver = tf.train.Saver()
sess1.run(tf.global_variables_initializer())
all_labels = []
for i in range(iterations):
    # Next Batch of reviews
    nextBatch, nextBatchLabels = getTrainBatch(train_pol_x, train_pol_y, train_ids);
    all_labels= all_labels + nextBatchLabels
    sess1.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})

    # Write summary to Tensorboard
    #if (i % 50 == 0):
        #summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        #writer.add_summary(summary, i)

    # Save the network every 10,000 training iterations
    if (i % 10000 == 0 and i != 0):
        save_path = saver.save(sess1, "models/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)
#writer.close()

KeyboardInterrupt: 

## For Testing

In [None]:
iterations = 10
all_labels = []
data_ = []
preds = []
c_ = []
for i in range(iterations):
    nextBatch, nextBatchLabels = getTestBatch(test_pol_x, test_pol_y, test_ids);
    #all_labels= all_labels + nextBatchLabels
    all_labels.append(nextBatchLabels)
    c,t,p = sess1.run([correctPred,prediction,accuracy], {input_data: nextBatch, labels: nextBatchLabels})
    print "Accuracy for this batch: %s" % (p*100)
    c_.append(c)
    preds.append(t)

In [101]:
print preds[0]
print np.argmax(preds[0],axis=1)
print all_labels[0]
print c_[0]

[[ 0.05793665  0.10472286]
 [ 0.06268698  0.10740621]
 [-0.00347716  0.18171999]
 [ 0.00554074  0.16998485]
 [-0.05850739  0.24341142]
 [ 0.02074572  0.1834622 ]
 [ 0.00629911  0.16334432]
 [ 0.02640686  0.1551061 ]
 [ 0.02542687  0.10466574]
 [ 0.01040077  0.17917639]
 [-0.06902809  0.23661855]
 [ 0.02018899  0.17193559]
 [-0.15605098  0.28844428]
 [ 0.05412221  0.08053753]
 [ 0.01709051  0.15470444]
 [ 0.03133547  0.14503402]
 [-0.04074945  0.18959814]
 [ 0.01015655  0.16458596]
 [-0.02533872  0.17217892]
 [-0.05069693  0.19471258]
 [ 0.04560164  0.15406677]
 [-0.00190119  0.16734415]
 [ 0.05308925  0.10919257]
 [ 0.01135278  0.15032867]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[[0, 1], [1, 0], [1, 0], [0, 1], [1, 0], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [1, 0]]
[ True False False  True False  True  True False  True  True False  True
  True  True  True  True False  Tru

In [None]:
def getTestBatch(test_data, test_labels, test_ids):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(1,(len(test_data)-1))
        
        if test_labels[num-1] == 1:
            labels.append([1,0])
        else:
            labels.append([0,1])
            
        arr[i] = test_ids[num-1:num]
        
    return arr.astype(int), labels