# Naive Bayes Classification

The dataset was obtained from https://www.kaggle.com/nltkdata/movie-review.

In [1]:
import numpy as np
import pandas as pd
import re
import math
import string
from nltk.corpus import stopwords
from collections import Counter

In [7]:
# read data and parse it
data = pd.read_csv('../datasets/movie_reviews_smaller.csv')
data = data[['fold_id', 'text', 'tag']]

In [8]:
# Splitting data 70 % train, 30 % test
train_df = data.loc[data['fold_id'] < 3]
test_df = data.loc[data['fold_id'] == 3]

In [9]:
def get_best_alpha(train_data):
    alpha_values = [x/100 for x in range(500, 600, 5)]
    accuracies = np.zeros([len(alpha_values), len(train_data.fold_id.values)])
    for fold in range(len(train_data.fold_id.values)):
        test = train_data.loc[train_data['fold_id'] == fold]
        train = train_data.loc[train_data['fold_id'] != fold]
        nbt = NaiveBayesText(train)
        if len(test['tag']) == 0:
            continue
        i = 0
        for a in alpha_values:
            y_hat = nbt.predict_batch(test['text'], alpha=a)
            accuracy = calculate_accuracy(y_hat, test['tag'])
            print("validation accuracy for fold = {}, alpha = {} is {:1.3f}"
                  .format(fold, a, accuracy))
            accuracies[i][fold] = accuracy
            i += 1
    return alpha_values[np.argmax(accuracies.mean(axis=1))]

def calculate_accuracy(y_hat, y):
    return sum(1 for a, b in zip(y_hat,y) if a == b) / len(y)

In [10]:
class NaiveBayesText:
    
    def __init__(self, train_df):
        """
        train_df (pandas.DataFrame) : Training dataset
        train_df must have a column called 'tag' representing the output class,
            and a column called 'text'
        """
        self.train_df = train_df
        # change documents to a bag of words representation
        if type(self.train_df.text.values[0]) == 'str':
            self.clean_documents()
        self.tags = list(set(train_df.tag.values))
        self.tag_vocabulary_counts = {tag : Counter(self.get_vocabulary(
            self.train_df.loc[self.train_df['tag'] == tag])) for tag in self.tags}
        self.total_vocabulary_size = len(self.get_vocabulary(self.train_df))
        
    def clean_documents(self):
        waste = '\d|\"|\'|!|\)|\(|\.|,|' + '\\b(' + \
            '|'.join(stopwords.words('english')) + ')\\W'
        self.train_df.loc[:, self.train_df.columns == 'text'] = self.train_df.text.apply(
            lambda doc : self.clean_document(doc, waste))

    def clean_document(self, doc, waste):
        """ Removes useless filler words/punctuations from text """
        return re.sub(waste, '', doc).split()
    
    def get_vocabulary(self, df):
        vocabulary = []
        for doc_list in df.text.values:
            for word in doc_list:
                vocabulary.append(word)
        return vocabulary
    
    def prob_tag(self, tag):
        return len(self.train_df.loc[self.train_df['tag'] == tag])/len(self.train_df)

    def prob_doc_given_tag(self, doc, tag, alpha):
        prob_words = np.array([self.prob_word_given_tag(word, tag, alpha) 
                               for word in doc.split()])
        return np.prod(prob_words) * self.prob_tag(tag)

    def prob_word_given_tag(self, word, tag, alpha):
        word_count = self.tag_vocabulary_counts[tag][word]
        return (word_count + alpha)/(len(self.tag_vocabulary_counts) + alpha * 
                                          self.total_vocabulary_size)

    def predict_tag(self, doc, alpha):
        prob_tags = np.array([self.prob_doc_given_tag(doc, tag, alpha) for tag in self.tags])
        return self.tags[np.argmax(prob_tags)]
    
    def predict_batch(self, docs, print_progress = False, alpha=1):
        predict_tags = []
        i = 0
        for doc in docs:
            predict_tags.append(self.predict_tag(doc, alpha))
            if print_progress and i % 2000 == 0:
                print("Predicted {} of {} documents".format(i, len(docs)))
            i += 1
        if print_progress:
            print("Predicted all documents\n")
        return predict_tags

In [11]:
best_alpha = get_best_alpha(train_df)
# Use the best parameter
nbt = NaiveBayesText(train_df)
y_hat = nbt.predict_batch(test_df['text'], print_progress=True, alpha=best_alpha)
test_acc = calculate_accuracy(test_df['tag'], y_hat)
print("Test set accuracy: {:1.3f}%".format(test_acc * 100))

validation accuracy for fold = 0, alpha = 5.0 is 0.530
validation accuracy for fold = 0, alpha = 5.05 is 0.530
validation accuracy for fold = 0, alpha = 5.1 is 0.530
validation accuracy for fold = 0, alpha = 5.15 is 0.530
validation accuracy for fold = 0, alpha = 5.2 is 0.530
validation accuracy for fold = 0, alpha = 5.25 is 0.530
validation accuracy for fold = 0, alpha = 5.3 is 0.530
validation accuracy for fold = 0, alpha = 5.35 is 0.530
validation accuracy for fold = 0, alpha = 5.4 is 0.530
validation accuracy for fold = 0, alpha = 5.45 is 0.530
validation accuracy for fold = 0, alpha = 5.5 is 0.530
validation accuracy for fold = 0, alpha = 5.55 is 0.530
validation accuracy for fold = 0, alpha = 5.6 is 0.530
validation accuracy for fold = 0, alpha = 5.65 is 0.530
validation accuracy for fold = 0, alpha = 5.7 is 0.530
validation accuracy for fold = 0, alpha = 5.75 is 0.530
validation accuracy for fold = 0, alpha = 5.8 is 0.530
validation accuracy for fold = 0, alpha = 5.85 is 0.530
v