## Sentiment Analysis of IMDB Reviews using Naive Bayes

This is a much larger and more complicated dataset as compared with the Rotten Tomatoes dataset

In [1]:
import os
import re
import tarfile

In [2]:
from six.moves import urllib

#### Import nltk, the Natural Language Processing Toolkit

This is one of the most popular packages for natural language processing on text data. It has APIs to access a large corpus of documents and other lexical resources

In [3]:
import numpy as np
import nltk

In [4]:
print(np.__version__)
print(nltk.__version__)

1.14.2
3.2.5


#### Automate the download, unzip and untar of the reviews dataset

The tarred and gzipped file is stored in the same directory as the code

In [5]:
DOWNLOADED_FILENAME = 'ImdbReviews.tar.gz'

def download_file(url_path):
    if not os.path.exists(DOWNLOADED_FILENAME):
        filename, _ = urllib.request.urlretrieve(url_path, DOWNLOADED_FILENAME)

    print('Found and verified file from this path: ', url_path)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

#### Clean up the reviews by removing special characters

In [6]:
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")


def get_reviews(dirname, positive=True):
    label = 1 if positive else 0

    reviews = []
    for filename in os.listdir(dirname):
        if filename.endswith(".txt"):
            with open(dirname + filename, 'r+') as f:
                review = f.read()
                review = review.lower().replace("<br />", " ")
                review = re.sub(TOKEN_REGEX, '', review)
                
                # Return a tuple of the review text and a label for whether it 
                # is a positive or negative review
                reviews.append((review, label))
    
    return reviews 

def extract_reviews():
    # If the file has not already been extracted
    if not os.path.exists('aclImdb'):
        with tarfile.open(DOWNLOADED_FILENAME) as tar:
            tar.extractall()
            tar.close()
        
    positive_reviews = get_reviews("aclImdb/train/pos/", positive=True)
    negative_reviews = get_reviews("aclImdb/train/neg/", positive=False)
    
    return positive_reviews, negative_reviews

In [7]:
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

download_file(URL_PATH)

Found and verified file from this path:  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Downloaded file:  ImdbReviews.tar.gz


In [8]:
positive_reviews, negative_reviews = extract_reviews()

In [9]:
positive_reviews[:2]

[('excellent episode movie ala pulp fiction 7 days  7 suicides it doesnt get more depressing than this movie rating 810 music rating 1010',
  1),
 ('ive just read the most recent remarks about this movie and i would like to respond youre probably not familiar with the original story of rap group nwa which dates back to the beginning in 1988 in 1989 ice cube left the band to go solo and ultimately in 1991 the band breaking up when drdre left which led to a lot of beef starting with the departure of ice cube and drdre in 1991 this story was somewhat based on that  further more this movie was a 90 minute laughing spree the way they explained the bootie juice song to be a political statement was hilarious not to mention the love song tasty was hooking up and when vanilla sherbert got his ass kicked just like the record company executive is also hilarious and having theyre managers getting shot every time too  people who didnt enjoy this movie probably didnt get it or were complete idiots m

In [10]:
len(positive_reviews)

12500

In [11]:
len(negative_reviews)

12500

In [12]:
TRAIN_DATA = 5000
TOTAL_DATA = 6000

train_reviews = positive_reviews[:TRAIN_DATA] + negative_reviews[:TRAIN_DATA]

test_positive_reviews = positive_reviews[TRAIN_DATA:TOTAL_DATA]
test_negative_reviews = negative_reviews[TRAIN_DATA:TOTAL_DATA]

#### Get a list of all the unque words in the dataset, the vocabulary

In [13]:
def get_vocabulary(train_reviews):
    words_set = set()
    
    for review in train_reviews:
        words_set.update(review[0].split())
    
    return list(words_set)

vocabulary = get_vocabulary(train_reviews)

In [14]:
len(vocabulary)

68539

In [15]:
vocabulary[:5]

['images', 'midsomer', 'sharmas', 'condone', 'iiis']

### Represent the words in the review as a feature vector

* *review_text* The review in text form

Each review is represented as a dictionary where keys are all words in the vocabulary. The values associated with each key is True if the word is present in the review.

In [16]:
def extract_features(review_text):
    # Split the review into words, and create a set of the words
    review_words = set(review_text.split())

    features = {}
    for word in vocabulary:
        features[word] = (word in review_words)
        
    return features    

#### Map feature vector to labels

* *extract_features* Function to extract the features in feature vector form
* *train_reviews* Training dataset, a list of tuples of the form (review_text, label)

In [17]:
train_features = nltk.classify.apply_features(extract_features, train_reviews)

#### Train the classifier on the training data

In [18]:
trained_classifier = nltk.NaiveBayesClassifier.train(train_features)

In [19]:
def sentiment_calculator(review_text):
    features = extract_features(review_text)
    return trained_classifier.classify(features)

In [20]:
sentiment_calculator("What an amazing movie!")

1

In [21]:
sentiment_calculator("What a terrible movie")

0

In [22]:
sentiment_calculator('Light travels faster than sound. This is why some people appear bright until they speak.')

0

In [23]:
sentiment_calculator('I don’t believe in plastic surgery, But in your case, Go ahead.')

0

In [27]:
sentiment_calculator('I am not young enough to know everything.')

0

#### Classify and measure the accuracy of the model on test data

In [25]:
def classify_test_reviews(test_positive_reviews, test_negative_reviews, sentiment_calculator):
    positive_results = [sentiment_calculator(review[0]) for review in test_positive_reviews]
    negative_results = [sentiment_calculator(review[0]) for review in test_negative_reviews]
    
    true_positives = sum(x > 0 for x in positive_results)
    true_negatives = sum(x == 0 for x in negative_results)
    
    percent_true_positive = float(true_positives) / len(positive_results)
    percent_true_negative = float(true_negatives) / len(negative_results)

    total_accurate = true_positives + true_negatives
    total = len(positive_results) + len(negative_results)

    print("Accuracy on positive reviews = " +"%.2f" % (percent_true_positive * 100) + "%")
    print("Accurance on negative reviews = " +"%.2f" % (percent_true_negative * 100) + "%")
    print("Overall accuracy = " + "%.2f" % (total_accurate * 100/ total) + "%")

In [26]:
classify_test_reviews(test_positive_reviews, test_negative_reviews, sentiment_calculator)

Accuracy on positive reviews = 81.10%
Accurance on negative reviews = 86.50%
Overall accuracy = 83.80%


 ** D O N E ! **