# Experiment: Applying Naive Bayes to Movie Reviews

In the age of data science, empirical application of theory is vital. In this section, we implement the naive Bayes algorithm based on page 7 of https://web.stanford.edu/~jurafsky/slp3/6.pdf. We apply the algorithm to a set of movie reviews that are already labeled as positive or negative. The data set "polarity_dataset v2.0", which contains 1000 positive and 1000 negative samples, can be found here: http://www.cs.cornell.edu/people/pabo/movie-review-data/.

We divide the data into training and test sets. The training set contains 800 positive and 800 negative samples, and the test set contains 200 positive and 200 negative samples. We train the naive Bayes algorithm on the training set. We use the popular NLTK tokenizer and throw away English stop words.

Even with this quite naive algorithm, the positive precision, positive recall, negative precision, and negative recall are all around 81%, as follows:

true positive count: 162
false positive count: 39
true negative count: 161
false negative count: 38
precision: 0.806
recall: 0.810
negative precision: 0.809
negative recall: 0.805
accuracy: 0.808

The test set is completely balanced, so achieving these ~81% metrics is significant compared to the 50% we would get by always guessing one class or by guessing purely at random. With this experiment, we empirically confirm that although the naive Bayes algorithm is very naive in theory, it can be quite useful in practice, with a quick and easy implementation.

Please note that all the below code is original.

In [30]:
import glob


def read_documents(pattern):
    """Return the contents of every file matching *pattern*, one string per file.

    Each file's lines (which keep their trailing newline) are joined with
    '. ', so every line break in a review also yields a '.' token boundary.
    Paths are visited in sorted order so the resulting list does not depend
    on the order in which the filesystem happens to enumerate the directory.
    """
    documents = []
    for file_path in sorted(glob.glob(pattern)):
        # The context manager closes the handle even if reading fails.
        with open(file_path, 'r') as review_file:
            documents.append('. '.join(review_file.readlines()))
    return documents


positive_documents = read_documents('movie_reviews_data/pos/*')
negative_documents = read_documents('movie_reviews_data/neg/*')

In [34]:
from random import shuffle

# Shuffle each class in place (positive first, then negative) so that the
# slices taken from these lists afterwards are random samples of each class.
for documents in (positive_documents, negative_documents):
    shuffle(documents)

In [114]:
# Split each class into train/test parts: for every `train_to_test_ratio`
# training documents, one document is held out for testing.
train_to_test_ratio = 4
ratio_denominator = float(train_to_test_ratio + 1.0)
num_train_positive = int(len(positive_documents) * train_to_test_ratio / ratio_denominator)
num_train_negative = int(len(negative_documents) * train_to_test_ratio / ratio_denominator)

for train_count in (num_train_positive, num_train_negative):
    print(train_count)

# The first `num_train_*` documents of each class train the model; the rest test it.
positive_documents_train, positive_documents_test = (
    positive_documents[:num_train_positive],
    positive_documents[num_train_positive:],
)
negative_documents_train, negative_documents_test = (
    negative_documents[:num_train_negative],
    negative_documents[num_train_negative:],
)

for subset in (positive_documents_train, negative_documents_train,
               positive_documents_test, negative_documents_test):
    print(len(subset))

800
800
800
800
200
200


In [136]:
import collections
import math

import nltk
from nltk.corpus import stopwords

# Tokens to discard: NLTK's English stop-word list plus the bare '.' token.
stop_words = set(stopwords.words('english')) | {'.'}

def get_tokens_from_document(document):
    """Tokenize *document* with NLTK and return the tokens not in `stop_words`.

    The membership test is an exact (case-sensitive) match against the
    module-level `stop_words` set; token order and duplicates are preserved.
    """
    return [token for token in nltk.word_tokenize(document) if token not in stop_words]

class NaiveBayesClassifier:
    """Two-class naive Bayes text classifier with add-one smoothing.

    Usage: construct with the training documents, call `initialize` once to
    train, then call `classify` on each document to label.

    Attributes set by `__init__`:
        positive_documents_train_, negative_documents_train_: training texts.
        positive_label_, negative_label_: keys used in `log_priors_`.
        stop_words_: English stop words plus '.'; kept for callers that read
            it, but tokenization itself is delegated to `tokenizer_`.
        tokenizer_: callable mapping a document to a list of tokens, or None
            to use the module-level `get_tokens_from_document`.

    Attributes set by `initialize`:
        log_priors_: label -> log P(label).
        vocab_: retained tokens, most frequent first.
        positive_vocab_counter_, negative_vocab_counter_: per-class counts of
            each vocabulary token.
        positive_loglikelihood_, negative_loglikelihood_: token ->
            log P(token | class) with add-one smoothing.
    """

    def __init__(self, positive_documents_train, negative_documents_train, tokenizer=None):
        """Store the training documents; no training happens until `initialize`.

        Args:
            positive_documents_train: sequence of positive-class documents.
            negative_documents_train: sequence of negative-class documents.
            tokenizer: optional callable ``document -> list of tokens``.
                Defaults to the module-level `get_tokens_from_document`.
        """
        self.positive_documents_train_ = positive_documents_train
        self.negative_documents_train_ = negative_documents_train
        self.positive_label_ = "POSITIVE"
        self.negative_label_ = "NEGATIVE"
        self.stop_words_ = set(stopwords.words('english'))
        self.stop_words_.add('.')
        self.tokenizer_ = tokenizer

    def _tokenize(self, document):
        """Split *document* into tokens with the configured tokenizer."""
        tokenizer = self.tokenizer_
        if tokenizer is None:
            tokenizer = get_tokens_from_document
        return tokenizer(document)

    def _count_tokens(self, documents):
        """Return a Counter of token occurrences over all *documents*."""
        counts = collections.Counter()
        for document in documents:
            counts.update(self._tokenize(document))
        return counts

    def _smoothed_loglikelihood(self, vocab_counter):
        """Return token -> log((count + 1) / (total count + vocabulary size))."""
        denominator = float(sum(vocab_counter.values()) + len(vocab_counter))
        # An empty vocabulary gives denominator 0, but then nothing is iterated.
        return dict(
            (word, math.log((count + 1) / denominator))
            for word, count in vocab_counter.items()
        )

    def _log_prior(self, num_class_documents, num_total_documents):
        """Return log P(class); -inf for a class with no training documents."""
        if num_class_documents == 0:
            return float('-inf')
        return math.log(num_class_documents / float(num_total_documents))

    def initialize_vocab(self, max_vocab_size):
        """Build the vocabulary and the per-class smoothed log-likelihoods.

        Args:
            max_vocab_size: maximum number of distinct tokens to keep. When
                the training data has more, the most frequent tokens are
                kept (ties are broken by token order for determinism).
        """
        positive_counts = self._count_tokens(self.positive_documents_train_)
        negative_counts = self._count_tokens(self.negative_documents_train_)
        total_counts = positive_counts + negative_counts

        # Rank by descending frequency so truncation drops the rarest tokens.
        ranked = sorted(total_counts.items(), key=lambda item: (-item[1], item[0]))
        self.vocab_ = [word for word, _ in ranked[:max(max_vocab_size, 0)]]

        # A Counter returns 0 for tokens never seen in that class.
        self.positive_vocab_counter_ = dict((word, positive_counts[word]) for word in self.vocab_)
        self.negative_vocab_counter_ = dict((word, negative_counts[word]) for word in self.vocab_)

        self.positive_loglikelihood_ = self._smoothed_loglikelihood(self.positive_vocab_counter_)
        self.negative_loglikelihood_ = self._smoothed_loglikelihood(self.negative_vocab_counter_)

    def initialize(self, max_vocab_size):
        """Train the classifier: compute log priors, then the vocabulary model.

        Args:
            max_vocab_size: forwarded to `initialize_vocab`.

        Raises:
            ValueError: if there are no training documents at all.
        """
        num_positive = len(self.positive_documents_train_)
        num_negative = len(self.negative_documents_train_)
        num_total_documents = num_positive + num_negative
        if num_total_documents == 0:
            raise ValueError("cannot train a classifier without any training documents")

        # Priors are stored in log space so they can be summed with the
        # log-likelihoods in `classify`.
        self.log_priors_ = {}
        self.log_priors_[self.positive_label_] = self._log_prior(num_positive, num_total_documents)
        self.log_priors_[self.negative_label_] = self._log_prior(num_negative, num_total_documents)

        self.initialize_vocab(max_vocab_size)

    def classify(self, document):
        """Return 1 if *document* scores at least as positive as negative, else -1.

        Each class score is its log prior plus the log-likelihood of every
        token occurrence; tokens outside the vocabulary contribute nothing.
        """
        positive_score = self.log_priors_[self.positive_label_]
        negative_score = self.log_priors_[self.negative_label_]

        for word in self._tokenize(document):
            positive_score += self.positive_loglikelihood_.get(word, 0.0)
            negative_score += self.negative_loglikelihood_.get(word, 0.0)

        return 1 if positive_score >= negative_score else -1
        

# Train on the training split. The vocabulary cap is passed as 1000000.
nbc = NaiveBayesClassifier(
    positive_documents_train,
    negative_documents_train,
)
nbc.initialize(max_vocab_size=1000000)

In [144]:
# Confusion-matrix counts on the held-out documents. `nbc.classify` returns 1
# for a positive prediction; any other return value is counted as negative.
true_positive_count = sum(1 for doc in positive_documents_test if nbc.classify(doc) == 1)
false_negative_count = len(positive_documents_test) - true_positive_count

false_positive_count = sum(1 for doc in negative_documents_test if nbc.classify(doc) == 1)
true_negative_count = len(negative_documents_test) - false_positive_count

In [152]:
def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when denominator is 0.

    A metric is undefined (rather than an error) when, for example, the
    classifier never predicts one of the classes.
    """
    return numerator / float(denominator) if denominator else float('nan')


# Single-argument print(...) calls behave the same under Python 2 and Python 3.
print('true positive count: {}'.format(true_positive_count))
print('false positive count: {}'.format(false_positive_count))
print('true negative count: {}'.format(true_negative_count))
print('false negative count: {}'.format(false_negative_count))

# Precision divides by what was predicted; recall divides by what was actual.
print('precision: {}'.format(_ratio(true_positive_count, true_positive_count + false_positive_count)))
print('recall: {}'.format(_ratio(true_positive_count, true_positive_count + false_negative_count)))
print('negative precision: {}'.format(_ratio(true_negative_count, true_negative_count + false_negative_count)))
print('negative recall: {}'.format(_ratio(true_negative_count, true_negative_count + false_positive_count)))

print('accuracy: {}'.format(_ratio(
    true_positive_count + true_negative_count,
    true_positive_count + true_negative_count + false_positive_count + false_negative_count)))

true positive count: 162
false positive count: 39
true negative count: 161
false negative count: 38
precision: 0.805970149254
recall: 0.81
negative precision: 0.809045226131
negative recall: 0.805
accuracy: 0.8075
