In [1]:
# https://github.com/joelgrus/data-science-from-scratch/blob/master/code/naive_bayes.py
# Data: https://spamassassin.apache.org/publiccorpus/

from naive_bayes import *

In [8]:
#   P(S|w) = [P(w|S) P(S)] / [P(w|S) P(S) + P(w|¬S) P(¬S)]
# Given the probability of seeing each word in spam messages and in non-spam messages,
# Bayes' theorem gives the probability that a message is spam given that it contains a particular word.

# Toy corpus: (subject line, is_spam) pairs. Order matters, because the
# seeded random split below depends on it.
data = [
    ("viagra is good", True),
    ("buy rolex", True),
    ("erection problems? why don´t you get some viagra", True),
    ("not spam", False),
    ("is this a good spam filter", False),
    ("viagra and rolex", True),
    ("nati is drunk", False),
    ("Is this the real life", False),
    ("Or is it just fantasy", False),
    ("Caught in a landslide", False),
    ("No escape from reality", False),
    ("Open your eyes, and buy some viagra", True),
    ("I`m just a poor boy", False),
    ("I have no money for a rolex", True),
]

# Seed the RNG first so the train/test split is the same on every run.
random.seed(0)
train_data, test_data = split_data(data, 0.75)

classifier = NaiveBayesClassifier()
classifier.train(train_data)

# One (subject, actual label, classifier output) triple per held-out message.
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

# Tally (actual, predicted) pairs; a message is predicted spam when the
# classifier's output exceeds 0.5.
counts = Counter((actual, score > 0.5)
                 for _, actual, score in classified)

print(counts)

# Order the triples from least to most spam-like (sorted() is stable, so ties
# keep their original relative order).
classified = sorted(classified, key=lambda triple: triple[2])

# Hams sit in ascending order, so the last five are the most spam-like ones;
# likewise the first five spams are the most ham-like ones.
spammiest_hams = list(triple for triple in classified if not triple[1])[-5:]
hammiest_spams = list(triple for triple in classified if triple[1])[:5]

print("spammiest_hams", spammiest_hams)
print("hammiest_spams", hammiest_spams)

# Rank the learned word entries by p_spam_given_word, ascending.
words = list(classifier.word_probs)
words.sort(key=p_spam_given_word)

spammiest_words = words[-5:]
hammiest_words = words[:5]

print("spammiest_words", spammiest_words)
print("hammiest_words", hammiest_words)

Counter({(True, False): 3, (False, False): 2})
('spammiest_hams', [('nati is drunk', False, 0.0011704359357386858), ('No escape from reality', False, 0.008135916138459133)])
('hammiest_spams', [('I have no money for a rolex', True, 0.004764857915140413), ('viagra is good', True, 0.05290129280980171), ('buy rolex', True, 0.33291055944947473)])
('spammiest_words', [('rolex', 0.375, 0.07142857142857142), ('t', 0.375, 0.07142857142857142), ('and', 0.625, 0.07142857142857142), ('some', 0.625, 0.07142857142857142), ('viagra', 0.875, 0.07142857142857142)])
('hammiest_words', [('is', 0.125, 0.5), ('a', 0.125, 0.5), ('just', 0.125, 0.35714285714285715), ('spam', 0.125, 0.35714285714285715), ('this', 0.125, 0.35714285714285715)])
