In [136]:
import numpy as np
import tensorflow as tf

In [137]:
with open('reviews.txt', 'r') as f:
    reviews = f.read()
with open('labels.txt', 'r') as f:
    labels = f.read()

In [138]:
print(len(reviews))
print(len(labels))

33678267
225000


In [139]:
from string import punctuation
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [140]:
all_text = ''.join([c for c in reviews if c not in punctuation])
reviews = all_text.split('\n')

all_text = ' '.join(reviews)
words = all_text.split()

In [141]:
vocab_to_int = {word : index for index, word in enumerate(set(words))}

reviews_ints = [[vocab_to_int[word] for word in review.split()] for review in reviews]

In [142]:
labels = np.array([1 if label == 'positive' else 0 for label in labels.split()])

In [144]:
from collections import Counter
review_lens = Counter([len(review) for review in reviews_ints])
# print(review_lens)
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 1
Maximum review length: 2514


In [145]:
review_ints = [reviews_int for reviews_int in reviews_ints if len(reviews_int) > 0 ]

In [146]:
seq_len = 200
features = []
for review_int in review_ints:
    temp = []
    length = len(review_int)
    if length <= seq_len:
        temp = [0] * (seq_len - length)
        temp += review_int
    else:
        temp = review_int[:seq_len]
    features.append(temp)
features = np.array(features)

In [147]:
features[:10,:100]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 34001, 40247, 57501,
          781, 42282, 59513, 10873, 56054, 69233, 71774, 39424, 22228,
        48856, 46963, 16215, 22184, 24261, 12971, 48982, 14167, 48856,
        53513, 62473, 67372, 33548, 71774, 38030, 73479, 71816,   645,
        54390, 53015, 29956, 34001, 40247, 32020, 60762, 57501, 59104,
        40777],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     

In [150]:
size = features.shape[0] 
index1, index2 = int(size * 0.8), int(size * 0.9)
print(size, index1, index2)
train_x, val_x, test_x = features[:index1, :],features[index1:index2, :],features[index2:, :]
train_y, val_y, test_y = labels[:index1],labels[index1:index2],labels[index2:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

25000 20000 22500
			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


In [151]:
lstm_size = 256
lstm_layers = 1
batch_size = 500
learning_rate = 0.001

In [152]:
n_words = len(vocab_to_int)

graph = tf.Graph()
with graph.as_default():
    inputs_ = tf.placeholder(dtype = tf.int32, shape = [batch_size, seq_len], name='input')
    labels_ = tf.placeholder(dtype = tf.int32, shape = [batch_size, 1], name='label')
    keep_prob = tf.placeholder(dtype = tf.float32, name='drop_out') 

In [153]:
embed_size = 300 

with graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [154]:
with graph.as_default():    
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([drop] * lstm_layers)
    initial_state = cell.zero_state(batch_size, tf.float32)

In [155]:
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)

In [156]:
with graph.as_default():
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_, predictions)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [157]:
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [158]:
embed.shape

TensorShape([Dimension(500), Dimension(200), Dimension(300)])

In [159]:
def get_batches(x, y, batch_size=100):
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

In [160]:
epochs = 10

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/10 Iteration: 5 Train loss: 0.238
Epoch: 0/10 Iteration: 10 Train loss: 0.240
Epoch: 0/10 Iteration: 15 Train loss: 0.214
Epoch: 0/10 Iteration: 20 Train loss: 0.223
Epoch: 0/10 Iteration: 25 Train loss: 0.195
Val acc: 0.721
Epoch: 0/10 Iteration: 30 Train loss: 0.159
Epoch: 0/10 Iteration: 35 Train loss: 0.248
Epoch: 0/10 Iteration: 40 Train loss: 0.197
Epoch: 1/10 Iteration: 45 Train loss: 0.169
Epoch: 1/10 Iteration: 50 Train loss: 0.167
Val acc: 0.756
Epoch: 1/10 Iteration: 55 Train loss: 0.167
Epoch: 1/10 Iteration: 60 Train loss: 0.173
Epoch: 1/10 Iteration: 65 Train loss: 0.144
Epoch: 1/10 Iteration: 70 Train loss: 0.123
Epoch: 1/10 Iteration: 75 Train loss: 0.143
Val acc: 0.770
Epoch: 1/10 Iteration: 80 Train loss: 0.125
Epoch: 2/10 Iteration: 85 Train loss: 0.132
Epoch: 2/10 Iteration: 90 Train loss: 0.198
Epoch: 2/10 Iteration: 95 Train loss: 0.149
Epoch: 2/10 Iteration: 100 Train loss: 0.129
Val acc: 0.760
Epoch: 2/10 Iteration: 105 Train loss: 0.111
Epoch: 2/10 Ite

In [165]:
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

Test accuracy: 0.816
