## Sentiment Analysis of Reviews using RNNs in TensorFlow

In [1]:
import collections
import math
import os
import random
import tarfile
import re

In [2]:
from six.moves import urllib

In [3]:
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [4]:
print(np.__version__)
print(mp.__version__)
print(tf.__version__)

1.14.2
2.2.2
1.6.0


#### Download, unzip and untar files in an automated way

In [5]:
DOWNLOADED_FILENAME = 'ImdbReviews.tar.gz'

def download_file(url_path):
    if not os.path.exists(DOWNLOADED_FILENAME):
        filename, _ = urllib.request.urlretrieve(url_path, DOWNLOADED_FILENAME)

    print('Found and verified file from this path: ', url_path)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

### Extract reviews and the corresponding positive and negative labels from the dataset

In [6]:
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")


def get_reviews(dirname, positive=True):
    label = 1 if positive else 0

    reviews = []
    labels = []
    for filename in os.listdir(dirname):
        if filename.endswith(".txt"):
            with open(dirname + filename, 'r+') as f:
                review = f.read()
                review = review.lower().replace("<br />", " ")
                review = re.sub(TOKEN_REGEX, '', review)
                
                reviews.append(review)
                labels.append(label)
    
    return reviews, labels           

def extract_labels_data():
    # If the file has not already been extracted
    if not os.path.exists('aclImdb'):
        with tarfile.open(DOWNLOADED_FILENAME) as tar:
            tar.extractall()
            tar.close()
        
    positive_reviews, positive_labels = get_reviews("aclImdb/train/pos/", positive=True)
    negative_reviews, negative_labels = get_reviews("aclImdb/train/neg/", positive=False)

    data = positive_reviews + negative_reviews
    labels = positive_labels + negative_labels

    return labels, data

In [7]:
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

download_file(URL_PATH)

Found and verified file from this path:  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Downloaded file:  ImdbReviews.tar.gz


In [8]:
labels, data = extract_labels_data()

In [9]:
labels[:5]

[1, 1, 1, 1, 1]

In [10]:
data[:5]

['excellent episode movie ala pulp fiction 7 days  7 suicides it doesnt get more depressing than this movie rating 810 music rating 1010',
 'ive just read the most recent remarks about this movie and i would like to respond youre probably not familiar with the original story of rap group nwa which dates back to the beginning in 1988 in 1989 ice cube left the band to go solo and ultimately in 1991 the band breaking up when drdre left which led to a lot of beef starting with the departure of ice cube and drdre in 1991 this story was somewhat based on that  further more this movie was a 90 minute laughing spree the way they explained the bootie juice song to be a political statement was hilarious not to mention the love song tasty was hooking up and when vanilla sherbert got his ass kicked just like the record company executive is also hilarious and having theyre managers getting shot every time too  people who didnt enjoy this movie probably didnt get it or were complete idiots my opinio

In [11]:
len(labels), len(data)

(25000, 25000)

In [12]:
max_document_length = max([len(x.split(" ")) for x in data])
print(max_document_length)

2470


### How many words to consider in each review?

Majority of the reviews fall under 250 words. This a number we've chosen based on some analysis of the data:

* Count the number of words in each file and divide by number of files to get an average i.e. **avg_words_per_file = total_words / num_files**
* Plot the words per file on matplot lib and try find a number which includes a majority of files

Word embeddings all have the same dimensionality which you can specify. A document is a vector of word embeddings (one dbpedia instance is a document in this case)

* Each document should be of the **same length**, documents longer than the MAX_SEQUENCE_LENGTH are truncated to this length
* The other documents will be **padded** by a special symbol to be the same max length

In [13]:
MAX_SEQUENCE_LENGTH = 250

### Vocabulary processor
 
http://tflearn.org/data_utils/
 
Library to map every word which occurs in our dataset to a unique identifer. If there are 10023 words each will be assigned a unique id from 1-10023

In [14]:
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_SEQUENCE_LENGTH)

#### Transform every word to a representation using unique ids

In [15]:
x_data = np.array(list(vocab_processor.fit_transform(data)))
y_output = np.array(labels)

vocabulary_size = len(vocab_processor.vocabulary_)
print(vocabulary_size)

111526


In [16]:
data[3:5]

['airwolf the movie a variation on the original 2 part pilot yet the movie although shorter does contain extra footage unseen in the 2 hour pilot the pilot is much more of a pilot than the movie where as a pilot movie is normally the same 2 parter combined but the movie is actually a different edit with extras here and cuts there  worth a look even if you have the season 1 dvd set id still pick up a copy of the movie its still in some shops like virgin woolworths and the likes of mixed media stores although it generally needs ordering but it saves needing to buy online as many of us still dont do or trust online shopping but if you look around airwolfs in stores  airwolf was truly 1 of the 80s most under rated shows  a full size airwolf is currently being rebuilt for a helicopter museum  info and work in progress pictures are over at httpairwolforg also with airwolf mods for flashpoint and flight sim games it seams shes finally here to stay ',
 'does anyone happen to know where this fi

In [17]:
x_data[3:5]

array([[182,  24,   3,  66, 183,  74,  24,  40, 184, 185, 186, 187,  24,
          3, 188, 189, 190, 191, 192, 193, 194,  50,  24, 184, 195, 186,
         24, 186, 104, 196,  13,  42,  66, 186,  15,  24,   3, 197, 150,
         66, 186,   3, 104, 198,  24, 199, 184, 200, 201, 202,  24,   3,
        104, 125,  66, 203, 204,  39, 205, 206,  29, 207, 137, 208,  66,
        209, 210, 211, 212, 213,  24, 214, 215, 216, 217, 218, 219, 220,
         62,  66, 221,  42,  24,   3, 222, 219,  50, 223, 224,  32, 225,
        226,  29,  24, 227,  42, 228, 229, 230, 188,  10, 231, 232, 233,
        202,  10, 234, 235,  33, 236, 237, 150, 238,  42, 239, 219, 240,
        241, 118, 242, 237, 243, 202, 211, 212, 209, 244, 245,  50, 230,
        182,  71, 246, 215,  42,  24, 247,  25, 248, 249, 250,  66, 251,
        252, 182, 104, 253, 254, 255, 256,  66, 257, 258, 259,  29, 260,
         50, 261, 262, 164, 180, 263, 264, 105,  39, 182, 265, 256, 266,
         29, 267, 268, 269,  10, 270, 271, 272, 206

In [18]:
x_data[:2]

array([[  1,   2,   3,   4,   5,   6,   7,   8,   7,   9,  10,  11,  12,
         13,  14,  15,  16,   3,  17,  18,  19,  17,  20,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [19]:
y_output[:5]

array([1, 1, 1, 1, 1])

#### Shuffle the data so the training instances are randomly fed to the RNN

In [20]:
np.random.seed(22)
shuffle_indices = np.random.permutation(np.arange(len(x_data)))

x_shuffled = x_data[shuffle_indices]
y_shuffled = y_output[shuffle_indices]

In [21]:
TRAIN_DATA = 5000
TOTAL_DATA = 6000

train_data = x_shuffled[:TRAIN_DATA]
train_target = y_shuffled[:TRAIN_DATA]

test_data = x_shuffled[TRAIN_DATA:TOTAL_DATA]
test_target = y_shuffled[TRAIN_DATA:TOTAL_DATA]

In [22]:
tf.reset_default_graph()

x = tf.placeholder(tf.int32, [None, MAX_SEQUENCE_LENGTH])
y = tf.placeholder(tf.int32, [None])

In [23]:
num_epochs = 20
batch_size = 25
embedding_size = 50
max_label = 2

### Embeddings to represent words

These embeddings are generated as a part of the training process of the RNN. The embeddings are trained using the reviews in the training dataset.

* *embedding_matrix* This is a matrix which holds the embeddings for every word in the vocabulary. The values are determined during the training process
* *embeddings* The embeddings for the words which are input as a part of one training batch

In [24]:
embedding_matrix = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
embeddings = tf.nn.embedding_lookup(embedding_matrix, x)

In [25]:
embedding_matrix

<tf.Variable 'Variable:0' shape=(111526, 50) dtype=float32_ref>

In [26]:
embeddings

<tf.Tensor 'embedding_lookup:0' shape=(?, 250, 50) dtype=float32>

In [27]:
lstmCell = tf.contrib.rnn.BasicLSTMCell(embedding_size)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)

### Results from an RNN of LSTM cells

(ouput, (**final_state**, other_state_info))

We're interested in the final state of this RNN because those are the encodings we feed into the prediction layer of our neural network

In [28]:
_, (encoding, _) = tf.nn.dynamic_rnn(lstmCell, embeddings, dtype=tf.float32)

In [29]:
encoding

<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 50) dtype=float32>

#### A densely connected prediction layer

* *activation=None* because the activation will be part of the tf.nn.sparse_softmax_cross_entropy_with_logits
* *cross_entropy* the loss function for probability distributions
* *max_label* the number of outputs of the prediction layer, here is 2, positive or negative

In [30]:
logits = tf.layers.dense(encoding, max_label, activation=None)

In [31]:
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
loss = tf.reduce_mean(cross_entropy)

#### Find the output with the highest probability and compare against the true label

In [32]:
prediction = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))

In [33]:
optimizer = tf.train.AdamOptimizer(0.01)
train_step = optimizer.minimize(loss)

In [34]:
init = tf.global_variables_initializer()

In [35]:
with tf.Session() as session:
    init.run()
    
    for epoch in range(num_epochs):
        
        num_batches = int(len(train_data) // batch_size) + 1
        
        for i in range(num_batches):
            # Select train data
            min_ix = i * batch_size
            max_ix = np.min([len(train_data), ((i+1) * batch_size)])

            x_train_batch = train_data[min_ix:max_ix]
            y_train_batch = train_target[min_ix:max_ix]
            
            train_dict = {x: x_train_batch, y: y_train_batch}
            session.run(train_step, feed_dict=train_dict)
            
            train_loss, train_acc = session.run([loss, accuracy], feed_dict=train_dict)

        test_dict = {x: test_data, y: test_target}
        test_loss, test_acc = session.run([loss, accuracy], feed_dict=test_dict)    
        print('Epoch: {}, Test Loss: {:.2}, Test Acc: {:.5}'.format(epoch + 1, test_loss, test_acc)) 

Epoch: 1, Test Loss: 0.69, Test Acc: 0.487
Epoch: 2, Test Loss: 1.3, Test Acc: 0.484
Epoch: 3, Test Loss: 0.92, Test Acc: 0.63
Epoch: 4, Test Loss: 0.8, Test Acc: 0.756
Epoch: 5, Test Loss: 0.82, Test Acc: 0.806
Epoch: 6, Test Loss: 1.1, Test Acc: 0.791
Epoch: 7, Test Loss: 1.2, Test Acc: 0.8
Epoch: 8, Test Loss: 1.2, Test Acc: 0.805
Epoch: 9, Test Loss: 1.3, Test Acc: 0.806
Epoch: 10, Test Loss: 1.3, Test Acc: 0.807
Epoch: 11, Test Loss: 1.3, Test Acc: 0.806
Epoch: 12, Test Loss: 1.4, Test Acc: 0.804
Epoch: 13, Test Loss: 1.4, Test Acc: 0.803
Epoch: 14, Test Loss: 1.4, Test Acc: 0.803
Epoch: 15, Test Loss: 1.4, Test Acc: 0.805
Epoch: 16, Test Loss: 1.4, Test Acc: 0.805
Epoch: 17, Test Loss: 1.5, Test Acc: 0.808
Epoch: 18, Test Loss: 1.5, Test Acc: 0.809
Epoch: 19, Test Loss: 1.5, Test Acc: 0.808
Epoch: 20, Test Loss: 1.5, Test Acc: 0.807
