## Implementation of a Multinomial Naive Bayes Classifier

In [1]:
import re
from math import log
from collections import Counter
import pandas as pd
import nltk
import random

In [2]:
# Read the raw spam/ham message file; the encoding is passed explicitly as latin-1
data = pd.read_csv(filepath_or_buffer='spam.csv', encoding='latin-1')

In [3]:
# Preview the first five rows of the raw frame
data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Keep only the label and text columns. Rebinding `data` (instead of using
# inplace=True) together with errors='ignore' keeps this cell idempotent:
# re-running it no longer raises KeyError for columns that are already gone.
data = data.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'], errors='ignore')

# Rename the two remaining columns (v1 -> class, v2 -> text)
data.columns = ['class','text']

In [6]:
# Number of messages per label
data.groupby(by='class').count()

Unnamed: 0_level_0,text
class,Unnamed: 1_level_1
ham,4825
spam,747


In [9]:
# Split data into a 10% test set (df_10) and a 90% training set (df_90).
# A dedicated, seeded generator makes the split (and every number derived
# from it) reproducible across kernel restarts without touching the global
# `random` state.
split_rng = random.Random(42)
rows = split_rng.sample(data.index.tolist(), int(data.shape[0]*0.1))
# `rows` holds index *labels*, so select with .loc; .iloc is positional and
# only agrees with the labels while the index is the default 0..n-1 range.
df_10 = data.loc[rows]
df_90 = data.drop(rows)
# Sanity check: the two parts together account for every row.
assert len(df_10) + len(df_90) == len(data)

In [10]:
# Training messages (90% split), separated by label
ham_doc = df_90.loc[df_90['class'] == 'ham', 'text'].tolist()
spam_doc = df_90.loc[df_90['class'] == 'spam', 'text'].tolist()
all_doc = spam_doc + ham_doc

# Held-out rows (10% split), separated by label
spam_test = df_10.loc[df_10['class'] == 'spam']
ham_test = df_10.loc[df_10['class'] == 'ham']

In [11]:
# Report how many training messages fall into each group
for heading, docs in (('Spam Doc', spam_doc), ('Ham Doc', ham_doc), ('All Doc', all_doc)):
    print(heading, len(docs))

Spam Doc 674
Ham Doc 4341
All Doc 5015


In [12]:
# Make a list of all terms after cleaning all texts: tokens are lower-cased,
# and only purely alphabetic tokens that are not English stop words are kept.
# The stop-word list is fetched once and stored in a set; the original called
# nltk.corpus.stopwords.words('english') for every token and scanned the
# returned list linearly each time.
stop_words = set(nltk.corpus.stopwords.words('english'))
terms_in_all = [
    term
    for fx in all_doc
    for term in (raw.lower() for raw in fx.split())
    if term.isalpha() and term not in stop_words
]

In [13]:
# Term frequencies over the whole training corpus
vocab = Counter()
vocab.update(terms_in_all)
print ("Total number of terms in all files :", len(terms_in_all))
print ("Size of vocab :", len(vocab))

Total number of terms in all files : 32563
Size of vocab : 5688


In [14]:
def _clean_terms(docs, stop_words):
    """Return the cleaned tokens of every message in `docs` as one flat list.

    docs: iterable of message strings; each one is split on whitespace.
    stop_words: container of lower-case words to discard.

    Tokens are lower-cased; only purely alphabetic tokens that are not in
    `stop_words` are kept. Order follows the input.
    """
    return [
        token
        for doc in docs
        for token in (raw.lower() for raw in doc.split())
        if token.isalpha() and token not in stop_words
    ]


# Fetch the stop-word list once and store it in a set; the original called
# nltk.corpus.stopwords.words('english') for every token and scanned the
# returned list linearly each time.
stop_words = set(nltk.corpus.stopwords.words('english'))

# Per-class token lists and term frequencies for the training messages
terms_in_spam = _clean_terms(spam_doc, stop_words)
counts_spam = Counter(terms_in_spam)
print ("Terms in spam docs :", len(terms_in_spam))
print ("Spam vocab :", len(counts_spam))

terms_in_ham = _clean_terms(ham_doc, stop_words)
counts_ham = Counter(terms_in_ham)
print ("Terms in ham docs :", len(terms_in_ham))
print ("Ham vocab :", len(counts_ham))

Terms in spam docs : 6911
Spam vocab : 1391
Terms in ham docs : 25652
Ham vocab : 5022


In [15]:
# Class priors: the fraction of training messages carrying each label
P_spam, P_ham = (len(docs)/float(len(all_doc)) for docs in (spam_doc, ham_doc))
print ("Spam prior probability : {:.2f}".format(P_spam))
print ("Ham prior probability : {:.2f}".format(P_ham))

Spam prior probability : 0.13
Ham prior probability : 0.87


In [16]:
# Conditional probabilities P(term | class) for every vocabulary term.
# Add-one (Laplace) smoothing: each count is incremented by 1 and the
# denominator is the class token total plus the vocabulary size, so no term
# ever gets probability zero. The logarithm is applied later, when a message
# is scored, to avoid underflow from multiplying many small probabilities.
# NOTE(review): score_spam / score_ham do not appear to be read by any other
# cell shown here; kept because they are module-level state.
score_spam = log(P_spam)
score_ham = log(P_ham)
spam_denominator = float(len(terms_in_spam)+len(vocab))
ham_denominator = float(len(terms_in_ham)+len(vocab))
cond_prob = {
    'spam': {word: (counts_spam[word]+1)/spam_denominator for word in vocab},
    'ham': {word: (counts_ham[word]+1)/ham_denominator for word in vocab},
}

In [17]:
class SpamClassifier:
    """Two-class (spam / ham) Naive Bayes scorer working in log space.

    Attributes (all start as None and must be assigned before classify()):
        prior_spam: starting score for the spam class (the caller assigns a
            log prior).
        prior_ham: starting score for the ham class.
        likelihood: mapping {'spam': {term: prob}, 'ham': {term: prob}} of
            per-class term probabilities (not logs).
    """

    def __init__(self):
        self.prior_spam = None
        self.prior_ham = None
        self.likelihood = None

    def _log_term_prob(self, label, word):
        """Return log P(word | label).

        When the lookup in the likelihood table raises KeyError, the
        probability of a zero-count term is used instead. That fallback reads
        the notebook-level names terms_in_spam / terms_in_ham and vocab.
        """
        try:
            prob = self.likelihood[label][word]
        except KeyError:
            class_terms = terms_in_spam if label == 'spam' else terms_in_ham
            prob = 1/float(len(class_terms)+len(vocab))
        return log(prob)

    def classify(self, message_terms):
        """Score a tokenised message and return 1 (spam) or 0 (ham).

        message_terms: iterable of already cleaned terms.
        Each class score starts at its prior and accumulates the log
        probability of every term; spam wins only when its score is strictly
        greater, so ties are reported as ham.
        """
        spam_total = self.prior_spam
        ham_total = self.prior_ham
        for word in message_terms:
            spam_total += self._log_term_prob('spam', word)
            ham_total += self._log_term_prob('ham', word)
        return 1 if spam_total > ham_total else 0

In [18]:
# Instantiate the classifier and plug in the trained parameters
clf = SpamClassifier()
clf.likelihood = cond_prob
clf.prior_spam, clf.prior_ham = log(P_spam), log(P_ham)

# Number of spam / ham rows in the held-out set
n_spam, n_ham = spam_test.shape[0], ham_test.shape[0]


In [20]:
# Confusion-matrix counters (spam is the positive class)
TP=0
FP=0
TN=0
FN=0

# Fetch the stop-word list once and store it in a set; the original called
# nltk.corpus.stopwords.words('english') for every token of every test
# message and scanned the returned list linearly each time.
stop_words = set(nltk.corpus.stopwords.words('english'))

# Make predictions. The label and text columns are iterated by name rather
# than through positional row[1] / row[2] tuple fields, so the loop does not
# depend on the column order of df_10.
for label, message in zip(df_10['class'], df_10['text']):
    # Same cleaning as for the training data: lower-case, alphabetic only,
    # stop words removed.
    text = [
        term
        for term in (raw.lower() for raw in message.split())
        if term.isalpha() and term not in stop_words
    ]
    result = clf.classify(text)
    if label == 'spam' and result == 1:   # message is spam and classified spam
        TP += 1
    elif label == 'ham' and result == 1:  # message is ham but classified spam
        FP += 1
    elif label == 'spam' and result == 0: # message is spam but classified ham
        FN += 1
    elif label == 'ham' and result == 0:  # message is ham and classified ham
        TN += 1

Total = TP+TN+FP+FN
TPR = TP/float(n_spam) # sensitivity
FPR = FP/float(n_ham) # 1-specificity

Number of Positives : 73  Number of Negatives  : 484
True Positives (TP) : 68  False Positives (FP) : 15
True Negatives (TN) : 469  False Negatives (FN) : 5
True Positive Rate (TPR) : 0.932  False Positive Rate (FPR) : 0.031


In [None]:
# Confusion matrix: class sizes, the four cell counts, and the derived rates
for fields in (
    ("Number of Positives :", n_spam, " Number of Negatives  :", n_ham),
    ("True Positives (TP) :", TP, " False Positives (FP) :", FP),
    ("True Negatives (TN) :", TN, " False Negatives (FN) :", FN),
    ("True Positive Rate (TPR) :", round(TPR, 3), " False Positive Rate (FPR) :", round(FPR, 3)),
):
    print(*fields)

In [21]:
# Summary metrics derived from the confusion-matrix counts.
# recall    = TP / (TP + FN)   share of spam that was caught
# precision = TP / (TP + FP)   share of spam predictions that were right
recall = TP/float(TP+FN)
precision = TP/float(TP+FP)
accuracy = (TP+TN)/float(Total)
error_rate = (FP+FN)/float(Total)
# F1 is the harmonic mean of precision and recall (both are already floats)
f1_score = (2*(precision*recall))/(precision+recall)

print ("Classifier Metrics : ")
print ("Accuracy  :", round(accuracy, 3), "   Error Rate :", round(error_rate, 3))
print ("Precision :", round(precision, 3), "   Recall     :", round(recall, 3))
print ("F1-score  :", round(f1_score, 3))

Classifier Metrics : 
Accuracy  : 0.964    Error Rate : 0.036
Precision : 0.819    Recall     : 0.932
F1-score  : 0.872
