# Naive Bayes Classification

The dataset was obtained from https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection.

In [3]:
import numpy as np
import pandas as pd
import re
import math
import string
from collections import Counter

In [6]:
# read data and parse it
data = pd.read_csv('../datasets/sms_spam.csv', names=['tag', 'text'])
# stripping punctuation
punct_stripper = str.maketrans(dict.fromkeys(string.punctuation))
# tokenizing data
data.text = [str(doc).translate(punct_stripper).split() for doc in data['text']]
print(data.info())
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 2 columns):
tag     5574 non-null object
text    5574 non-null object
dtypes: object(2)
memory usage: 87.2+ KB
None


Unnamed: 0,tag,text
0,ham,"[Go, until, jurong, point, crazy, Available, o..."
1,ham,"[Ok, lar, Joking, wif, u, oni]"
2,spam,"[Free, entry, in, 2, a, wkly, comp, to, win, F..."
3,ham,"[U, dun, say, so, early, hor, U, c, already, t..."
4,ham,"[Nah, I, dont, think, he, goes, to, usf, he, l..."


In [7]:
# Splitting data 80 % train, 20 % test
train_df = data.iloc[:4458, :]
test_df = data.iloc[4459:, :]

In [17]:
class NaiveBayesText:
    
    def __init__(self, train_df):
        """
        train_df (pandas.DataFrame) : Training dataset
        train_df must have a column called 'tag' representing the output class,
            and a column called 'text'
        """
        self.train_df = train_df
        self.tags = list(set(train_df.tag.values))
        self.tag_probs = {tag: len(train_df.loc[train_df['tag'] == tag])/len(train_df)
                         for tag in self.tags}
        self.total_vocabulary_size = len(set(self.get_vocabulary(self.train_df)))
        self.tag_word_counts = {tag : Counter(self.get_vocabulary(
            self.train_df.loc[self.train_df['tag'] == tag])) for tag in self.tags}
    
    def get_vocabulary(self, df):
        vocabulary = []
        for doc_list in df.text.values:
            for word in doc_list:
                vocabulary.append(word)
        return vocabulary

    def prob_doc_given_tag(self, tokenized_doc, tag, alpha):
        prob_words = np.array([self.posterior_word_prob(word, tag, alpha) 
                               for word in tokenized_doc])
        return np.sum(prob_words) + math.log(self.tag_probs[tag])

    def posterior_word_prob(self, word, tag, alpha):
        word_count = self.tag_word_counts[tag][word]
        return math.log((word_count + alpha)/(len(self.tag_word_counts[tag]) + alpha * 
                                          self.total_vocabulary_size))

    def predict_tag(self, doc, alpha):
        prob_tags = np.array([self.prob_doc_given_tag(doc, tag, alpha) for tag in self.tags])
        return self.tags[np.argmax(prob_tags)]
    
    def predict_batch(self, docs, alpha=1):
        predict_tags = []
        for doc in docs:
            predict_tags.append(self.predict_tag(doc, alpha))        
        return predict_tags

In [18]:
def calculate_accuracy(y_hat, y):
    return sum(1 for a, b in zip(y_hat,y) if a == b) / len(y)

In [19]:
%%time
nbt = NaiveBayesText(train_df)
y_hat = nbt.predict_batch(test_df['text'], alpha=0.08)
test_acc = calculate_accuracy(test_df['tag'], y_hat)
print("Test set accuracy: {:1.3f}%".format(test_acc * 100))

Test set accuracy: 96.951%
CPU times: user 2.65 s, sys: 3.74 ms, total: 2.65 s
Wall time: 2.65 s
