In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [3]:
import collections
import math
import os
import random
import re

In [4]:
from six.moves import urllib

In [5]:
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf

In [6]:
# Record the library versions this notebook was executed with.
for _module in (np, mp, tf):
    print(_module.__version__)

1.13.3
2.0.2
1.4.1


In [7]:
# http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz

def get_reviews(path, positive=True):
    """Read one review per line from ``path`` and label every review.

    Args:
        path: Path of a text file holding one review per line.
        positive: When True every review is labelled 1, otherwise 0.

    Returns:
        A ``(reviews, labels)`` tuple of two equally long lists. Each review
        is kept exactly as ``readlines()`` returns it, i.e. including its
        trailing newline.
    """
    label = 1 if positive else 0

    with open(path, 'r') as f:
        reviews = f.readlines()

    # One identical label per review. This replaces the append loop whose
    # loop variable was never used, and the dead `reviews = []` initialisation.
    labels = [label] * len(reviews)

    return reviews, labels

In [8]:
def extract_labels_data(positive_path="rt-polarity.pos", negative_path="rt-polarity.neg"):
    """Load the positive and negative review files into one labelled dataset.

    The defaults assume that rt-polarity.pos and rt-polarity.neg have already
    been downloaded into the current working directory; pass explicit paths
    to read the files from elsewhere.

    Args:
        positive_path: File with one positive review per line.
        negative_path: File with one negative review per line.

    Returns:
        A ``(labels, data)`` tuple (labels first). All positive reviews come
        first (label 1), followed by all negative reviews (label 0).
    """
    positive_reviews, positive_labels = get_reviews(positive_path, positive=True)
    negative_reviews, negative_labels = get_reviews(negative_path, positive=False)

    data = positive_reviews + negative_reviews
    labels = positive_labels + negative_labels

    return labels, data

In [10]:
# Load the full corpus; note the return order is (labels, data).
labels, data = extract_labels_data()

In [11]:
# Peek at the first labels (positive reviews are loaded first, so these are all 1).
labels[:5]

[1, 1, 1, 1, 1]

In [12]:
# Peek at the first raw reviews; each string still carries its trailing newline.
data[:5]

['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . \n',
 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . \n',
 'effective but too-tepid biopic\n',
 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start . \n',
 "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . \n"]

In [13]:
# Sanity check: there must be exactly one label per review.
len(labels), len(data)

(10662, 10662)

In [14]:
# Longest review, counted in pieces obtained by splitting on single spaces.
max_document_length = max(len(review.split(" ")) for review in data)

In [15]:
# Show the longest review length found by the single-space split.
print(max_document_length)

61


In [16]:
# Fixed length of every encoded review. This is smaller than the longest review
# measured earlier in the notebook (61 space-separated pieces).
# NOTE(review): VocabularyProcessor presumably truncates longer reviews and
# zero-pads shorter ones to this length; confirm against the
# tf.contrib.learn documentation.
MAX_SEQUENCE_LENGTH = 50

vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_SEQUENCE_LENGTH)

In [17]:
# Fit the vocabulary on all reviews and encode them; the encoded documents are
# materialised into a single array with one row of integer word ids per review.
x_data = np.array(list(vocab_processor.fit_transform(data)))

In [18]:
# Labels as an array so they can be indexed with the same permutation as x_data.
y_output = np.array(labels)

In [19]:
# Size of the fitted vocabulary; later used as the row count of the embedding matrix.
vocabulary_size = len(vocab_processor.vocabulary_)
print(vocabulary_size)

21097


In [20]:
# Two raw reviews, for comparison with their encoded form in the next cell.
data[3:5]

['if you sometimes like to go to the movies to have fun , wasabi is a good place to start . \n',
 "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . \n"]

In [21]:
# The same two reviews as word ids; positions after the last word are filled with 0.
x_data[3:5]

array([[57, 58, 59, 60,  5, 61,  5,  1, 62,  5, 63, 64, 65,  3, 16, 66, 67,
         5, 68,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [69, 70, 71, 72, 73, 74, 75, 76, 36, 77, 11, 78, 79, 12, 80, 81, 82,
        60, 83,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [22]:
# First labels in array form (still in file order, i.e. unshuffled).
y_output[:5]

array([1, 1, 1, 1, 1])

In [23]:
# Seed NumPy's global generator so the shuffled order (and hence the split) is repeatable.
np.random.seed(22)
shuffle_indices = np.random.permutation(np.arange(len(x_data)))

# Index inputs and labels with the same permutation so each pair stays aligned.
x_shuffled, y_shuffled = x_data[shuffle_indices], y_output[shuffle_indices]

In [24]:
# The first TRAIN_DATA shuffled rows are used for training; the remaining rows
# up to TOTAL_DATA form the held-out test split.
TRAIN_DATA = 9000
TOTAL_DATA = len(labels)

train_data, test_data = x_shuffled[:TRAIN_DATA], x_shuffled[TRAIN_DATA:TOTAL_DATA]
train_target, test_target = y_shuffled[:TRAIN_DATA], y_shuffled[TRAIN_DATA:TOTAL_DATA]

In [25]:
# Start from an empty default graph before the model cells below add their ops.
tf.reset_default_graph()

In [26]:
# Graph inputs: x is fed batches of encoded reviews (MAX_SEQUENCE_LENGTH word ids
# per row), y is fed the matching integer class labels. The leading None leaves
# the batch size open.
x = tf.placeholder(tf.int32, [None, MAX_SEQUENCE_LENGTH])
y = tf.placeholder(tf.int32, [None])

In [27]:
# Model and training hyper-parameters.
batch_size = 25  # examples fed per optimisation step in the training loop
embedding_size = 50  # width of each word embedding; also reused as the LSTM unit count
max_label = 2  # number of output logits (labels are 0 or 1)

In [28]:
# Trainable embedding table: one row of embedding_size values per vocabulary id,
# initialised uniformly in [-1, 1).
embedding_matrix = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

In [29]:
# Replace every word id in x with its row of the embedding table.
embeddings = tf.nn.embedding_lookup(embedding_matrix, x)

In [30]:
# Inspect the lookup result: one embedding vector per word position of each review.
embeddings

<tf.Tensor 'embedding_lookup:0' shape=(?, 50, 50) dtype=float32>

In [31]:
# Inspect the embedding variable (vocabulary_size rows, embedding_size columns).
embedding_matrix

<tf.Variable 'Variable:0' shape=(21097, 50) dtype=float32_ref>

In [32]:
# LSTM cell; its unit count reuses embedding_size, so the hidden size equals the
# embedding width only because of this choice.
lstmCell = tf.contrib.rnn.BasicLSTMCell(embedding_size)

In [33]:
# Wrap the cell with dropout on its outputs (keep probability 0.75). The name
# lstmCell is rebound, so re-running only this cell wraps the wrapper again.
# NOTE(review): the keep probability is a constant, so dropout is also active
# whenever the graph is evaluated on the test split. In addition, the classifier
# is built from the RNN's final state rather than its outputs, so output dropout
# may not influence the logits at all — confirm against the DropoutWrapper docs
# (output_keep_prob vs state_keep_prob).
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)

In [34]:
# Run the LSTM over every review. The per-step outputs are discarded; only the
# first element of the final state tuple is kept as the review encoding.
# NOTE(review): for an LSTM the state tuple is presumably (c, h), which would make
# `encoding` the cell state c rather than the hidden output h — confirm this is
# the intended feature for the classifier.
_, (encoding, _) = tf.nn.dynamic_rnn(lstmCell, embeddings, dtype=tf.float32)

In [35]:
# Inspect the encoding tensor: one fixed-size vector per review.
encoding

<tf.Tensor 'rnn/while/Exit_2:0' shape=(?, 50) dtype=float32>

In [36]:
# Linear layer (no activation) mapping each encoding to max_label unnormalised class scores.
logits = tf.layers.dense(encoding, max_label, activation=None)

In [37]:
# Per-example cross-entropy computed from the raw logits and the integer class ids in y.
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)

In [38]:
# Scalar training objective: mean cross-entropy over the fed batch.
loss = tf.reduce_mean(cross_entropy)

In [39]:
# Despite its name, `prediction` is a boolean mask that is True where the
# highest-scoring class equals the label (y is cast to int64 to match argmax).
prediction = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))

In [40]:
# Fraction of correctly classified examples in the fed batch.
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))

In [41]:
# Adam optimiser constructed with 0.01 (its learning rate); train_step is the op
# that applies one update minimising `loss`.
optimizer = tf.train.AdamOptimizer(0.01)
train_step = optimizer.minimize(loss)

In [42]:
# Op that initialises every variable created so far; it is run at the start of the session.
init = tf.global_variables_initializer()

In [43]:
# Number of full passes over the training split.
num_epochs = 20

In [44]:
# Mini-batch training; the held-out split is evaluated once per epoch.

# Ceiling division. The previous `len(train_data) // batch_size + 1` scheduled
# one extra, zero-row batch whenever the training set size is an exact multiple
# of batch_size (as with 9000 examples and a batch size of 25). The count does
# not change between epochs, so it is computed once.
num_batches = (len(train_data) + batch_size - 1) // batch_size

# The evaluation feed is identical for every epoch.
test_dict = {x: test_data, y: test_target}

with tf.Session() as session:
    init.run()

    for epoch in range(num_epochs):

        for i in range(num_batches):

            # Slicing clamps at the end of the array, so the final batch is
            # simply shorter when the data does not divide evenly.
            min_ix = i * batch_size
            max_ix = min_ix + batch_size

            x_train_batch = train_data[min_ix:max_ix]
            y_train_batch = train_target[min_ix:max_ix]

            train_dict = {x: x_train_batch, y: y_train_batch}

            # Fetch the batch metrics in the same run call as the update
            # instead of paying for a second forward pass on every batch.
            _, train_loss, train_acc = session.run(
                [train_step, loss, accuracy], feed_dict=train_dict)

        test_loss, test_acc = session.run([loss, accuracy], feed_dict=test_dict)

        print('Epoch: {}, Test Loss: {:.2}, Test Acc: {:.5}'.format(epoch + 1, test_loss, test_acc))
            

Epoch: 1, Test Loss: 0.7, Test Acc: 0.50301
Epoch: 2, Test Loss: 0.69, Test Acc: 0.50301
Epoch: 3, Test Loss: 0.69, Test Acc: 0.50301
Epoch: 4, Test Loss: 0.69, Test Acc: 0.50301
Epoch: 5, Test Loss: 0.6, Test Acc: 0.67449
Epoch: 6, Test Loss: 0.64, Test Acc: 0.73646
Epoch: 7, Test Loss: 0.83, Test Acc: 0.75271
Epoch: 8, Test Loss: 1.2, Test Acc: 0.74007
Epoch: 9, Test Loss: 1.3, Test Acc: 0.74368
Epoch: 10, Test Loss: 1.2, Test Acc: 0.74789
Epoch: 11, Test Loss: 1.6, Test Acc: 0.74549
Epoch: 12, Test Loss: 1.6, Test Acc: 0.74729
Epoch: 13, Test Loss: 1.8, Test Acc: 0.75211
Epoch: 14, Test Loss: 1.9, Test Acc: 0.74308
Epoch: 15, Test Loss: 2.3, Test Acc: 0.73827
Epoch: 16, Test Loss: 2.5, Test Acc: 0.73105
Epoch: 17, Test Loss: 2.4, Test Acc: 0.73105
Epoch: 18, Test Loss: 2.3, Test Acc: 0.72984
Epoch: 19, Test Loss: 2.3, Test Acc: 0.73947
Epoch: 20, Test Loss: 2.5, Test Acc: 0.73646
