#### Dataset: [SMS Spam Collection Data Set](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection)
#### Method: Naive Bayes with Smoothing

In [243]:
# import packages
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline
import re
from collections import Counter

### Function Definition

In [322]:
# get vocabulary in a list of string
def get_voc(df, text_name = 'message'):
    # get unique words
    voc = ' '.join(df[text_name].values).lower() # change to lower case
    voc = re.sub(r'[^\w]',' ',voc).split() # remove symbols
    voc = set([st.strip() for st in voc]) # unique words in the data set

    # set of unique words in the data set
    voc_ind = {v: k for k, v in enumerate(voc)} # word-index
    ind_voc = {k: v for k, v in enumerate(voc)} # index-word

    return voc

# count words in a string
def count_words(s):
    prep   = re.sub(r'[^\w]',' ',s.lower()).split()
    counts = Counter(prep)
    dicts  = {k: v for k, v in counts.items()}

    return dicts

# split data into train, test, with same class ratio
def split_data(df, class_name = 'label', test_size = 0.2):
    train_size = 1 - test_size
    num_class = df[class_name].unique()
    classes = [df[df[class_name] == c].reset_index(drop = True) for c in num_class]

    train_index = [np.random.choice(len(c), int(len(c) * train_size), replace = False) for c in classes]
    test_index = [list(set(range(len(classes[i]))) - set(train_index[i])) for i in range(len(train_index))]

    train = pd.concat([classes[i].iloc[train_index[i], :] for i in range(len(train_index))])
    test = pd.concat([classes[i].iloc[test_index[i], :] for i in range(len(test_index))])

    return train, test

def learn_nb(df, voc, class_name = 'label', text_name = 'message'):
    # vocabulary
    voc_counts = len(voc)

    # class counts
    num_class = df[class_name].unique()
    p_class = [len(df[df[class_name] == c]) / len(df) for c in num_class]

    # word counts
    p_words = []
    for c in num_class:
        texts = count_words(' '.join(df[df[class_name] == c][text_name].values))
        counts_c = sum(texts.values())

        p_word_c = {}
        
        for w in voc:
            if w in texts:
                p_word_c[w] = (texts[w] + 1 ) / (counts_c + voc_counts)
            else:
                p_word_c[w] = 1 / (counts_c + voc_counts)

        p_words.append(p_word_c)

    return num_class, p_class, p_words

def apply_nb(df, classes, p_class, p_words, class_name = 'label', text_name = 'message'):
    df['word_counts'] = df[text_name].apply(lambda x: count_words(x))
    p_likelihood = [df['word_counts'].apply(lambda x: np.log(p_class[i]) + np.sum([np.log(p_words[i][k]) * v for k, v in x.items()])).values for i in range(len(classes))]
    df['pred_label'] = np.argmax(p_likelihood, axis = 0)

    return df

def cal_acc(df, class_name = 'label', pred_class_name = 'pred_label'):
    classes = df[class_name].unique()
    overall_accuracy = len(df[df[class_name] == df[pred_class_name]]) / len(df)
    class_accuracy = [len(df[(df[class_name] == c) & (df[class_name] == df[pred_class_name])]) / len(df[df[class_name] == c]) for c in classes]
    
    return overall_accuracy, class_accuracy

### Test on the Data Set

In [323]:
# read data set 
df = pd.read_csv('SMSSpamCollection', delimiter = '\t', header=None, names = ['class', 'message'])
df['label'] = 0
df.loc[df['class'] == 'spam','label'] = 1

train, test = split_data(df)
voc = get_voc(df)
num_class, p_class, p_words = learn_nb(train, voc)

# training
pred_df = apply_nb(train, num_class, p_class, p_words)
acc, class_acc = cal_acc(pred_df)

print("Overall Train Accuracy: %.3f" % acc)
p = [print("Class %d Accuracy: %.3f" % (c, class_acc[c])) for c in range(len(class_acc))]

# test
pred_df = apply_nb(test, num_class, p_class, p_words)
acc, class_acc = cal_acc(pred_df)

print("Overall Test Accuracy: %.3f" % acc)
p = [print("Class %d Accuracy: %.3f" % (c, class_acc[c])) for c in range(len(class_acc))]

Overall Train Accuracy: 0.992
Class 0 Accuracy: 0.997
Class 1 Accuracy: 0.958
Overall Test Accuracy: 0.985
Class 0 Accuracy: 0.983
Class 1 Accuracy: 0.993


### Compare with sklearn

In [325]:
# sklearn Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

x = df.message
y = df.label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)
vectorizer = CountVectorizer()
word_counts = vectorizer.fit_transform(x_train.values)

classifer = MultinomialNB()
classifer.fit(word_counts, y_train.values)

test_count = vectorizer.transform(x_test.values)
preds = classifer.predict(test_count)

print("Test Accuracy: %.3f" % accuracy_score(y_test.values, preds))
print(classification_report(y_test.values, preds))

Test Accuracy: 0.987
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       959
           1       0.99      0.92      0.95       156

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

