# Naive Bayes

In [33]:
from __future__ import division
from collections import Counter, defaultdict
import math, random, re, glob

In [34]:
def split_data(data, prob):
    """Randomly partition `data` into two lists.

    One ``random.random()`` draw is made per row, in iteration order. The row
    goes to the first list when the draw is below `prob`, otherwise to the
    second list.

    Returns a 2-tuple ``(first, second)`` of lists.
    """
    selected, remainder = [], []
    for row in data:
        if random.random() < prob:
            selected.append(row)
        else:
            remainder.append(row)
    return selected, remainder

In [35]:
def tokenize(message):
    """Return the set of distinct tokens in `message`.

    The message is lower-cased first; a token is then any maximal run of the
    characters ``a-z``, ``0-9`` and the apostrophe. Duplicates collapse
    because the result is a set.
    """
    return set(re.findall("[a-z0-9']+", message.lower()))

In [36]:
def count_words(training_set):
    """Count, per token, how many spam and non-spam messages contain it.

    `training_set` is an iterable of ``(message, is_spam)`` pairs.

    Returns a ``defaultdict`` mapping each token to a two-element list
    ``[spam_count, non_spam_count]``. Because ``tokenize`` yields a set, a
    token is counted at most once per message.
    """
    tallies = defaultdict(lambda: [0, 0])
    for text, spam_flag in training_set:
        # Index 0 holds the spam count, index 1 the non-spam count.
        column = 0 if spam_flag else 1
        for token in tokenize(text):
            tallies[token][column] += 1
    return tallies

In [59]:
def cal_prob(s, k, ts):
    """Return the smoothed ratio ``(s + k) / (2*k + ts)``.

    `s` is a count, `ts` the total it is taken out of, and `k` the
    pseudo-count added to the numerator (and twice to the denominator).
    """
    numerator = s + k
    denominator = 2 * k + ts
    return numerator / denominator
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Turn per-word counts into smoothed per-class probabilities.

    `counts` maps each word to a ``(spam_count, non_spam_count)`` pair.
    `total_spams` and `total_non_spams` are the totals passed to ``cal_prob``
    for the respective class, and `k` is its smoothing pseudo-count.

    Returns a list of ``(word, p_spam, p_non_spam)`` triples in the iteration
    order of `counts`.
    """
    triples = []
    for word, (spam_count, non_spam_count) in counts.items():
        p_spam = cal_prob(spam_count, k, total_spams)
        p_non_spam = cal_prob(non_spam_count, k, total_non_spams)
        triples.append((word, p_spam, p_non_spam))
    return triples

In [60]:
def spam_probability(word_probs, msg):
    """Return the probability that `msg` is spam given `word_probs`.

    `word_probs` is an iterable of ``(word, prob_if_spam, prob_if_non_spam)``
    triples. Every word in it contributes to both class scores: its
    probability when the word is in the message, one minus its probability
    when it is not. Scores are accumulated as sums of logs.

    The result equals ``p_spam / (p_spam + p_not_spam)``, but it is computed
    from the difference of the two log-sums. Exponentiating each log-sum
    separately underflows to 0.0 for a large vocabulary, which would make the
    final division ``0.0 / 0.0``.

    Raises ``ValueError`` (from ``math.log``) if any probability is exactly
    0 or 1.
    """
    messages_words = tokenize(msg)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    for word, prob_if_spam, prob_if_non_spam in word_probs:
        if word in messages_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_non_spam)
        else:
            log_prob_if_spam += math.log(1 - prob_if_spam)
            log_prob_if_not_spam += math.log(1 - prob_if_non_spam)

    # p_spam / (p_spam + p_not) == 1 / (1 + exp(log_not - log_spam)).
    # Only ever exponentiate a non-positive number so math.exp cannot
    # overflow; an underflow to 0.0 is harmless in either branch.
    log_odds_against = log_prob_if_not_spam - log_prob_if_spam
    if log_odds_against >= 0:
        ratio = math.exp(-log_odds_against)
        return ratio / (1.0 + ratio)
    return 1.0 / (1.0 + math.exp(log_odds_against))

In [61]:
# k_main 
class NaiveBayesClassifier:
    """Spam classifier built on the module-level Naive Bayes helpers.

    `k` is the smoothing pseudo-count forwarded to ``word_probabilities``.
    `wordprobs` holds the trained ``(word, p_spam, p_non_spam)`` triples and
    is empty until ``train`` is called.
    """

    def __init__(self, k=0.5):
        self.k = k
        self.wordprobs = []

    def train(self, training_set):
        """Fit the model on a sized collection of ``(message, is_spam)`` pairs."""
        spam_total = sum(1 for _, flagged in training_set if flagged)
        ham_total = len(training_set) - spam_total

        per_word_counts = count_words(training_set)
        self.wordprobs = word_probabilities(
            per_word_counts, spam_total, ham_total, self.k
        )

    def classify(self, msg):
        """Return the spam probability of `msg` under the trained model."""
        return spam_probability(self.wordprobs, msg)

## Now we need to test our model
For testing, we will use [the SpamAssassin public corpus data](https://spamassassin.apache.org/old/publiccorpus/).
Please download the three files below (they are already downloaded here, in the data folder):
>- [20021010_easy_ham](https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2)
>- [20021010_hard_ham](https://spamassassin.apache.org/old/publiccorpus/20021010_hard_ham.tar.bz2)
>- [20021010_spam](https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2)

In [62]:
# Every file one directory below data/public_corpus is treated as one e-mail.
# Forward slashes are used because glob accepts them on Windows as well as on
# POSIX systems, where a backslash pattern would match nothing.
path = "data/public_corpus/*/*"
data = []
for fileName in glob.glob(path):
    # The corpus archives are named easy_ham, hard_ham and spam, so any path
    # that does not contain "ham" is labelled as spam.
    is_spam = "ham" not in fileName
    with open(fileName, 'r') as file:
        try:
            for line in file:
                if line.startswith("Subject:"):
                    subject = re.sub(r"^Subject: ", "", line).strip()
                    data.append((subject, is_spam))
        except UnicodeDecodeError:
            # Best effort: a file that cannot be decoded with the default
            # encoding is abandoned at the offending line; subjects already
            # collected from it are kept. Other errors are no longer hidden.
            pass


In [63]:
# Seed the RNG so the random train/test partition is reproducible
# (any seed value works).
random.seed(0)
# Each row goes to the training list when its random draw is below 0.75,
# otherwise to the test list.
train_data, test_data = split_data(data, prob=0.75)


### Training

In [64]:
# Create a classifier with the default smoothing constant and fit it on the
# training split.
classifier = NaiveBayesClassifier()
classifier.train(training_set=train_data)

### Testing

In [67]:
# Score every held-out subject: (subject, actual label, predicted probability).
classified = [
    (text, label, classifier.classify(text))
    for text, label in test_data
]
# Tally (actual is_spam, predicted is_spam) pairs; a message is predicted
# spam when its probability exceeds 0.5.
counts = Counter(
    (label, score > 0.5)
    for _, label, score in classified
)
print(counts)

Counter({(False, False): 711, (True, True): 89, (True, False): 49, (False, True): 26})


In [76]:
# Confusion-matrix cells; `counts` is keyed by
# (actual is_spam, predicted is_spam).
TP = counts[(True, True)]
TN = counts[(False, False)]
FP = counts[(False, True)]
FN = counts[(True, False)]

# Each ratio falls back to 0.0 when its denominator is zero (for example, when
# no message was predicted as spam) instead of raising ZeroDivisionError.
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall = TP / (TP + FN) if (TP + FN) else 0.0
# Alias under the original misspelled name, in case other cells still use it.
precission = precision
f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0

In [78]:

# Show the raw confusion-matrix counts in the order TP, TN, FP, FN.
print(TP, TN, FP, FN)

89 711 26 49
