In [None]:
import numpy as np
import tensorflow as tf
import csv
import re
from tqdm import tqdm_notebook as tqdm
import nltk
import string
import random

In [None]:
# Tokenised tweets, one list of lower-cased words per tweet, split by sentiment label.
posFeatures = []
negFeatures = []
# Every kept token from every tweet, in order; used later to count word frequencies.
all_words = []


def process(words):
    """Clean one tweet's tokens and record them in the global ``all_words``.

    Tokens that are purely digits, or that appear as a substring of
    ``string.punctuation`` (e.g. a lone "'" or "_"), are dropped; the
    remaining tokens are lower-cased.

    Args:
        words: list of string tokens for a single tweet. It is not modified.

    Returns:
        A new list with the cleaned, lower-cased tokens.
    """
    # Build a new list instead of calling words.remove() while iterating:
    # removing during iteration skips the element that follows each removal,
    # which let unwanted tokens slip through.
    cleaned = [
        word.lower()
        for word in words
        if not (word.isdigit() or word in string.punctuation)
    ]
    all_words.extend(cleaned)
    return cleaned

# Route each row's tokens to the list that matches its sentiment label;
# rows with any other label are ignored.
buckets = {'1': posFeatures, '0': negFeatures}

with open('Sentiment Analysis Dataset.csv') as csvfile:
    print("start importing text data...")
    for row in tqdm(csv.DictReader(csvfile)):
        bucket = buckets.get(row['Sentiment'])
        if bucket is None:
            continue
        # Keep runs of letters, underscores and apostrophes as tokens.
        tokens = re.findall(r"[a-zA-Z_']+", row['SentimentText'].rstrip())
        bucket.append(process(tokens))
    print("posFeatures size: " + str(len(posFeatures)))
    print("negFeatures size: " + str(len(negFeatures)))

In [None]:
from collections import Counter
print('start counts')
# Frequency of every token collected while loading the data.
counts = Counter(all_words)

print('sorted!!?')
# Vocabulary ordered from most to least frequent token.
vocab = list(counts)
vocab.sort(key=counts.get, reverse=True)

print('create dict')
# Ids start at 1, which leaves 0 unused by any token.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

In [None]:
# Show the 100 most frequent tokens, then the vocabulary size.
print(vocab[:100])
print(len(vocab))

In [None]:
# Number of positive and negative examples, followed by a sample of
# the first ten tokenised positive examples.
print(len(posFeatures))
print(len(negFeatures))
print(posFeatures[:10])

In [None]:
# Replace every token with its integer id from vocab_to_int,
# one list of ids per example.
pos_int = [[vocab_to_int[word] for word in sent] for sent in tqdm(posFeatures)]
neg_int = [[vocab_to_int[word] for word in sent] for sent in tqdm(negFeatures)]

In [None]:
# Histogram of token counts per positive example: length -> number of examples.
pos_len = Counter(map(len, pos_int))
zero_length_count = pos_len[0]
longest_length = max(pos_len)  # largest key, i.e. the longest example
print("Zero-length reviews: {}".format(zero_length_count))
print("Maximum review length: {}".format(longest_length))

In [None]:
# Fixed number of token ids every example is padded or truncated to.
seq_len = 50


def length_normalize(feature, seq_len):
    """Return copies of the sequences in ``feature`` with exactly ``seq_len`` items.

    Shorter sequences are padded with 0 at the head, so the real tokens sit
    at the end of the sequence. Longer sequences keep only their first
    ``seq_len`` items. The input sequences are left untouched.

    Args:
        feature: iterable of lists of integer token ids.
        seq_len: target length of every returned sequence.

    Returns:
        A list of new lists, each of length ``seq_len``.
    """
    result = []
    for sent in tqdm(feature):
        kept = list(sent[:seq_len])
        # Pad at the head as the original comment intended; the previous
        # code extended the caller's list in place, which padded the tail
        # and silently modified the input data.
        padding = [0] * (seq_len - len(kept))
        result.append(padding + kept)
    return result

# Bring both classes to the common sequence length (positive first, then negative).
pos_input, neg_input = [
    length_normalize(encoded, seq_len) for encoded in (pos_int, neg_int)
]

In [70]:
# Fraction of each class used for training; the remainder is used for validation.
split_frac = 0.8
# Use split_frac rather than a repeated literal so changing it actually
# moves the split point for both classes.
pos_idx = int(len(pos_input) * split_frac)
neg_idx = int(len(neg_input) * split_frac)

def combine_pos_neg(pos, neg, seed=None):
    """Merge positive and negative examples into one shuffled, labelled set.

    Prints the number of features and the number of labels before shuffling.

    Args:
        pos: sequence of feature rows; each is labelled 1.
        neg: sequence of feature rows; each is labelled 0.
        seed: optional seed for a private ``random.Random`` so the shuffle is
            reproducible. With the default ``None`` the module-level
            ``random`` state is used.

    Returns:
        A tuple ``(features, labels)`` of NumPy arrays, both reordered by the
        same random permutation.
    """
    feature = list(pos) + list(neg)
    label = [1] * len(pos) + [0] * len(neg)

    print(len(feature))
    print(len(label))

    # Materialise the indices as a list: random.shuffle needs a mutable
    # sequence, and a bare range object is not one on Python 3.
    index_shuf = list(range(len(feature)))
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    shuffle(index_shuf)

    feature_shuf = []
    label_shuf = []
    for i in tqdm(index_shuf):
        feature_shuf.append(feature[i])
        label_shuf.append(label[i])

    return np.array(feature_shuf), np.array(label_shuf)

# Leading slice of each class is used for training, the rest for validation.
train_pos = pos_input[:pos_idx]
val_pos = pos_input[pos_idx:]
train_neg = neg_input[:neg_idx]
val_neg = neg_input[neg_idx:]

# Training set is built first, then the validation set.
train_x, train_y = combine_pos_neg(train_pos, train_neg)
val_x, val_y = combine_pos_neg(val_pos, val_neg)

1262890
1262890



315724
315724





In [71]:
# Model and training hyperparameters.
lstm_size = 256        # units passed to each BasicLSTMCell
lstm_layers = 1        # number of stacked LSTM layers
batch_size = 500       # examples per batch; also fixes the size of the initial RNN state
learning_rate = 1e-3   # step size passed to the Adam optimizer

In [72]:
# Number of rows the embedding matrix needs. Token ids run from 1 to
# len(vocab) and 0 is used for padding, so one extra row is required;
# with only len(vocab) rows the highest token id would be out of range.
n_words = len(vocab) + 1

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    # Batches of token-id sequences and their labels; both dimensions are left open.
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    # Dropout keep probability, fed at run time.
    keep_prob = tf.placeholder(tf.float32)

In [73]:
# Width of each embedding vector (number of units in the embedding layer).
embed_size = 300

with graph.as_default():
    # One trainable row per id, initialised uniformly in [-1, 1).
    initial_embeddings = tf.random_uniform((n_words, embed_size), minval=-1, maxval=1)
    embedding = tf.Variable(initial_embeddings)
    # Look up the embedding vector for every id in the input batch.
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [74]:
with graph.as_default():
    def _build_cell():
        """Create one LSTM layer wrapped with output dropout."""
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

    # Build a separate cell object for every layer. Repeating one object with
    # `[drop] * lstm_layers` makes all layers share a single cell, which
    # breaks as soon as lstm_layers is greater than 1.
    cell = tf.contrib.rnn.MultiRNNCell(
        [_build_cell() for _layer in range(lstm_layers)])

    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)

In [75]:
with graph.as_default():
    # Run the stacked cell over the embedded inputs, starting from initial_state.
    rnn_result = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
    outputs, final_state = rnn_result

In [76]:
with graph.as_default():
    # Only the output at the last time step feeds the classifier.
    last_step = outputs[:, -1]
    # Single sigmoid unit on top of the last RNN output.
    predictions = tf.contrib.layers.fully_connected(
        last_step, num_outputs=1, activation_fn=tf.sigmoid)
    # Mean squared error between the labels and the sigmoid output.
    cost = tf.losses.mean_squared_error(labels_, predictions)
    adam = tf.train.AdamOptimizer(learning_rate)
    optimizer = adam.minimize(cost)

In [77]:
with graph.as_default():
    # Round the sigmoid output to 0/1 and compare it with the labels.
    predicted_labels = tf.cast(tf.round(predictions), tf.int32)
    correct_pred = tf.equal(predicted_labels, labels_)
    # Fraction of matching predictions in the batch.
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [78]:
def get_batches(x, y, batch_size=100):
    """Yield consecutive ``(x, y)`` slices of exactly ``batch_size`` items.

    Only full batches are produced: trailing items that do not fill a
    complete batch are dropped.
    """
    usable = (len(x) // batch_size) * batch_size
    for start in range(0, usable, batch_size):
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

In [86]:
# Number of passes over the training set.
epochs = 10

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        # Start every epoch from the all-zero RNN state.
        state = sess.run(initial_state)

        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],  # add a trailing axis to match the 2-D labels placeholder
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            # Report the training loss every 5 iterations. A single formatted
            # string is used so the line is not printed as a tuple when
            # `print` is a statement.
            if iteration % 5 == 0:
                print("Epoch: {}/{} Iteration: {} Train loss: {:.3f}".format(
                    e, epochs, iteration, loss))

            # Run a full pass over the validation set every 25 iterations.
            if iteration % 25 == 0:
                val_acc = []
                # Reuse the existing zero-state tensors. Calling
                # cell.zero_state() here would add new ops to the graph on
                # every validation pass, so the graph would keep growing.
                val_state = sess.run(initial_state)
                # Separate names so the training batch (x, y) is not shadowed.
                for val_batch_x, val_batch_y in tqdm(get_batches(val_x, val_y, batch_size)):
                    feed = {inputs_: val_batch_x,
                            labels_: val_batch_y[:, None],
                            keep_prob: 1,  # no dropout during validation
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration += 1
    # Persist the trained variables once all epochs have finished.
    saver.save(sess, "checkpoints/sentiment.ckpt")

KeyboardInterrupt: 