# Naive Bayes Classification

The dataset was obtained from https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection.

In [1]:
import numpy as np
import pandas as pd
import re
from collections import Counter

In [13]:
# read data and parse it
data = pd.read_csv('../datasets/sms_spam.csv', names=['tag', 'text'])
data.text = [str(doc) for doc in data['text']]
print(data.info())
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 2 columns):
tag     5574 non-null object
text    5574 non-null object
dtypes: object(2)
memory usage: 87.2+ KB
None


Unnamed: 0,tag,text
0,ham,Go until jurong point crazy.. Available only i...
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,Nah I don't think he goes to usf he lives arou...


In [3]:
# Splitting data 70 % train, 30 % test
train_df = data.iloc[:3000, :]
test_df = data.iloc[3001:, :]

In [10]:
class NaiveBayesText:
    
    def __init__(self, train_df):
        """
        train_df (pandas.DataFrame) : Training dataset
        train_df must have a column called 'tag' representing the output class,
            and a column called 'text'
        """
        self.train_df = train_df
        # pre-processing to remove punctuations
        self.train_df['text'] = [
            re.sub('\d|\"|\'|!|\)|\(|\.|,|\\b', '', doc).split() 
            for doc in train_df.text.values]
        self.tags = list(set(train_df.tag.values))
        self.total_vocabulary_size = len(self.get_vocabulary(self.train_df))
        self.tag_vocabulary_counts = {tag : Counter(self.get_vocabulary(
            self.train_df.loc[self.train_df['tag'] == tag])) for tag in self.tags}
    
    def get_vocabulary(self, df):
        vocabulary = []
        for doc_list in df.text.values:
            for word in doc_list:
                vocabulary.append(word)
        return vocabulary
    
    def prob_tag(self, tag):
        return len(self.train_df.loc[self.train_df['tag'] == tag])/len(self.train_df)

    def prob_doc_given_tag(self, doc, tag, alpha):
        prob_words = np.array([self.prob_word_given_tag(word, tag, alpha) 
                               for word in doc.split()])
        return np.prod(prob_words) * self.prob_tag(tag)

    def prob_word_given_tag(self, word, tag, alpha):
        word_count = self.tag_vocabulary_counts[tag][word]
        return (word_count + alpha)/(len(self.tag_vocabulary_counts) + alpha * 
                                          self.total_vocabulary_size)

    def predict_tag(self, doc, alpha):
        prob_tags = np.array([self.prob_doc_given_tag(doc, tag, alpha) for tag in self.tags])
        return self.tags[np.argmax(prob_tags)]
    
    def predict_batch(self, docs, print_progress = False, alpha=1):
        predict_tags = []
        i = 0
        for doc in docs:
            predict_tags.append(self.predict_tag(doc, alpha))
            if print_progress and i % len(docs)//10 == 0:
                print("Predicted {} of {} documents".format(i * len(docs)//10, len(docs)))
            i += 1
        if print_progress:
            print("Predicted all documents\n")
        return predict_tags

In [11]:
def calculate_accuracy(y_hat, y):
    return sum(1 for a, b in zip(y_hat,y) if a == b) / len(y)

In [12]:
# Use the best parameter
nbt = NaiveBayesText(train_df)
y_hat = nbt.predict_batch(test_df['text'], print_progress=True)
test_acc = calculate_accuracy(test_df['tag'], y_hat)
print("Test set accuracy: {:1.3f}%".format(test_acc * 100))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Predicted 0 of 2573 documents
Predicted 257 of 2573 documents
Predicted 514 of 2573 documents
Predicted 771 of 2573 documents
Predicted 1029 of 2573 documents
Predicted 1286 of 2573 documents
Predicted 1543 of 2573 documents
Predicted 1801 of 2573 documents
Predicted 2058 of 2573 documents
Predicted 2315 of 2573 documents
Predicted all documents

Test set accuracy: 90.983%
