# Naive Bayes Implentation

Naive Bayes is a family of probabilistic algorithms that take advantage of probability theory and Bayes’ Theorem to predict the tag of a text. 

They calculate the probability of each tag for a given text, and then output the tag with the highest one. 

The way they get these probabilities is by using Bayes’ Theorem, which describes the probability of a feature, based on prior knowledge of conditions that might be related to that feature.


P(A|B) = P(B|A)P(A)/P(B)

P(A|B) is called as posterior probability.

P(A) is called as the prior probability of the hypothesis.

P(B|A) is called as the likelihood.

P(B) is called as the prior probability of the predictor.


In [1]:

import numpy as np
import math 

def cleantext(text):
    text = text.lower().strip()
    for letters in text:
        if letters in """~!@#$%^&*()_-+|{}[]<>?/\=''""":
            text.replace(letters, " ")
    return text

def countwords(words, is_spam, counted):    
    for each_word in words:
        if each_word in counted:
            if is_spam == 1:
                counted[each_word][1] = counted[each_word][1] + 1
            else:
                counted[each_word][0] = counted[each_word][0] + 1
        else:
            if is_spam == 1:
                counted[each_word] = [0,1]
            else:
                counted[each_word] = [1,0]
    return counted

def make_percent_list(k, theCount, spams, hams):
    for each_key in theCount:
        theCount[each_key][0] = (theCount[each_key][0] + k )/(2*k+hams)
        theCount[each_key][1] = (theCount[each_key][1] + k )/(2*k+spams)
    return theCount

def StopWords():
    stopwords_list = []
    for line in stopwords:
        stopwords_list.append(line.strip())
    common = set(stopwords_list)
    common = common.difference('')
    return common

#Import Training File
dataset = input("Enter the Train file name \n")
train = open(dataset, "r", encoding = 'unicode-escape')

#Import Stop Words File
stopwords_file = input("Enter the Stopwords file name \n")
stopwords = open(stopwords_file, "r", encoding = 'unicode-escape')

spam = 0
ham = 0
k = 1
counted = dict()
line = train.readline()
while line != "":
    k = k + 1
    is_spam = int(line[:1])
    if is_spam == 1:
        spam = spam + 1
    else:
        ham = ham + 1
    line = cleantext(line[2:])
    words = line.split()
    words = set(words)
    common = StopWords()
    words = words.difference(common)
    counted = countwords(words, is_spam, counted)
    line = train.readline()
vocab = (make_percent_list(1, counted, spam, ham))

prob_spam = spam/(spam+ham)
prob_ham = ham/(spam+ham)

#Import Test File
fname = input("Enter the Test file name  \n")
test = open(fname, "r", encoding = 'unicode-escape')

line = test.readline()
spam_count = 0
ham_count = 0
count = 1
correct = 0
tp = 0
tn = 0
fn = 0
fp = 0
while line != "":
    count = count + 1
    is_spam = int(line[:1])
    if is_spam == 1:
        spam_count += 1
    else:
        ham_count += 1
    spam_prob = 1
    ham_prob = 1
    line = cleantext(line[2:])
    words = line.split()
    words = set(words)
    words = words.difference(common)
    for w in counted:
        if w in words:
            spam_prob +=np.log(vocab[w][1])
            ham_prob += np.log(vocab[w][0])
        else:
            spam_prob += np.log((1 - vocab[w][1]))
            ham_prob += np.log((1 - vocab[w][0]))
    spam_prob = math.exp(spam_prob)
    ham_prob = math.exp(ham_prob)
    prob_main  = (spam_prob*prob_spam)/((spam_prob*prob_spam)+(ham_prob*prob_ham))
    if prob_main >= 0.5:
        pred = 1
    else:
        pred = 0
    if pred == 1 and is_spam == 1:
        tp = tp + 1
    elif is_spam == 1 and pred == 0:
        fn = fn + 1
    elif is_spam == 0 and pred == 0:
        tn = tn + 1
    else:
        fp = fp + 1
    try:
        accuracy = (tp+tn)/(tp+tn+fp+fn)
    except ZeroDivisionError:
        accuracy = 0
    try:
        precision = tp / (tp+fp)
    except ZeroDivisionError:
        precision = 0
    try:
        recall = tp / (tp+fn)
    except ZeroDivisionError:
        recall = 0
    try:
        f1 = 2*(1/((1/precision)+(1/recall)))
    except ZeroDivisionError:
        f1 = 0
    line = test.readline()

#Results
print("\n Results \n")
print("Total Spam emails in Test set:", spam_count)
print("Total Ham emails in Test set: ", ham_count)
print("FP: ", fp)
print("TP: ",tp)
print("FN: ", fn)
print("TN: ", tn)
print("Accuracy: ",accuracy)
print("Precision: ",precision)
print("Recall: ",recall)
print("F1: ",f1)


Enter the Train file name 
SHTrain.txt
Enter the Stopwords file name 
StopWords.txt
Enter the Test file name  
SHTest.txt

 Results 

Total Spam emails in Test set: 240
Total Ham emails in Test set:  240
FP:  50
TP:  147
FN:  93
TN:  190
Accuracy:  0.7020833333333333
Precision:  0.7461928934010152
Recall:  0.6125
F1:  0.6727688787185355
