In [68]:
from __future__ import division

import math
import random
import re
from collections import defaultdict, Counter

In [1]:
def tokenize(message):
    """Return the set of distinct tokens found in *message*.

    The message is lowercased first; a token is then any maximal run of
    the characters a-z, 0-9 and the apostrophe. Because a set is returned,
    repeated words count once and ordering is not preserved.
    """
    lowered = message.lower()
    return {token for token in re.findall("[a-z0-9']+", lowered)}

In [3]:
# sanity check of the tokenizer's pattern: the apostrophe is part of the
# character class, so a trailing apostrophe stays attached to its word
re.findall("[a-z0-9']+", "paquito el chocolatero'")

['paquito', 'el', "chocolatero'"]

In [4]:
def count_words(training_set):
    """Tally, per word, how many spam and non-spam messages contain it.

    training_set is an iterable of (message, is_spam) pairs. The result
    maps each word to a two-element list [spam_count, non_spam_count];
    words never seen default to [0, 0]. Each message is tokenized with
    tokenize(), so a word is counted at most once per message.
    """
    tallies = defaultdict(lambda: [0, 0])
    for message, is_spam in training_set:
        # index 0 holds the spam tally, index 1 the non-spam tally
        slot = 0 if is_spam else 1
        for word in tokenize(message):
            tallies[word][slot] += 1
    return tallies


In [15]:
# scratch check: strip a leading "Subject:" header plus any following whitespace.
# NOTE(review): this expression is not the last one in the cell, so its value
# is presumably not displayed; subject_regex is not referenced again in the
# visible notebook (the loader uses its own literal pattern) -- confirm before removing.
subject_regex = re.compile(r"^Subject:\s+")
subject_regex.sub("", "Subject: Hello, pepe")

def split_data(data, prob):
    """Randomly partition *data* into two lists.

    Each row is independently placed in the first list when a fresh
    random.random() draw is below *prob*, otherwise in the second, so the
    expected fractions are [prob, 1 - prob]. Returns the pair
    (first_list, second_list); row order within each list follows *data*.
    """
    selected, remainder = [], []
    for row in data:
        if random.random() < prob:
            selected.append(row)
        else:
            remainder.append(row)
    return selected, remainder

In [6]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Turn word counts into smoothed per-class word probabilities.

    counts maps each word to a pair (spam_count, non_spam_count), as
    produced by count_words. total_spams and total_non_spams are the
    number of messages in each class, and k is the pseudocount added for
    smoothing.

    Returns a list of triplets (w, p(w | spam), p(w | ~spam)) where each
    probability is (count + k) / (class_total + 2 * k).
    """
    # .items() (rather than the Python-2-only .iteritems()) keeps this
    # working on both Python 2 and Python 3 with the same result
    return [(w,
             (spam + k) / (total_spams + 2 * k),
             (non_spam + k) / (total_non_spams + 2 * k))
            for w, (spam, non_spam) in counts.items()]

In [7]:
def spam_probability(word_probs, message):
    """Estimate the probability that *message* is spam.

    word_probs is an iterable of triplets (word, p(word | spam),
    p(word | ~spam)), as produced by word_probabilities. The message is
    tokenized with tokenize(); every vocabulary word contributes either
    the log probability of being present or of being absent, under each
    class. No class prior is applied, so the two classes are weighted
    equally.

    Returns a float in [0, 1]; 0.5 when word_probs is empty.
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    # iterate through each word in our vocabulary
    for word, prob_if_spam, prob_if_not_spam in word_probs:

        # if *word* appears in the message,
        # add the log probability of seeing it
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)

        # if *word* doesn't appear in the message
        # add the log probability of _not_ seeing it
        # which is log(1 - probability of seeing it)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    # p_spam / (p_spam + p_not_spam) == 1 / (1 + exp(log_not_spam - log_spam)).
    # Working with the difference avoids exponentiating each (possibly very
    # negative) log-likelihood on its own, which can underflow both to 0.0
    # and raise ZeroDivisionError for large vocabularies.
    log_ratio = log_prob_if_not_spam - log_prob_if_spam
    if log_ratio > 0.0:
        # exp(-log_ratio) lies in [0, 1), so it cannot overflow
        ratio = math.exp(-log_ratio)
        return ratio / (1.0 + ratio)
    # here exp(log_ratio) lies in (0, 1], so it cannot overflow either
    return 1.0 / (1.0 + math.exp(log_ratio))

In [8]:
class NaiveBayesClassifier:
    """Naive Bayes spam classifier built on the helper functions in this notebook.

    k is the smoothing pseudocount handed to word_probabilities;
    word_probs holds the (word, p(word | spam), p(word | ~spam)) triplets
    learned by train() and is empty until train() has been called.
    """

    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []

    def train(self, training_set):
        """Learn word probabilities from (message, is_spam) pairs.

        training_set must support len() and be iterable more than once,
        since it is scanned for class totals and again for word counts.
        """
        # class totals: one unit per message flagged as spam
        spam_total = sum(1 for _, is_spam in training_set if is_spam)
        non_spam_total = len(training_set) - spam_total

        # counts -> smoothed probabilities
        self.word_probs = word_probabilities(
            count_words(training_set), spam_total, non_spam_total, self.k)

    def classify(self, message):
        """Return spam_probability() of *message* under the learned word_probs."""
        return spam_probability(self.word_probs, message)

In [58]:
import glob, re

# modify the path with wherever you've put the files
path = r"./data/*/*"

# (subject, is_spam) pairs, one per "Subject:" line found
data = []

# glob.glob returns every filename that matches the wildcarded path
for fn in glob.glob(path):
    # a file is labelled spam unless the substring "ham" occurs anywhere in its path
    is_spam = "ham" not in fn
    # use a name that does not shadow the Python 2 builtin `file`
    with open(fn, 'r') as message_file:
        for line in message_file:
            # every line that starts with "Subject:" yields a sample,
            # not only the first one in the file
            if line.startswith("Subject:"):
                # remove the leading "Subject: " and keep what's left
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))

In [59]:
# number of (subject, is_spam) samples collected by the loader
len(data)

3423

In [60]:
# peek at the first ten (subject, is_spam) samples
data[:10]

[('[Lockergnome Digital Media]  Endorsed Compatibility', False),
 ('What you need for iPaq wireless connectivity (Tech Update)', False),
 ('Astrology.com: daily horoscope', False),
 ('[Lockergnome Tech Specialist]  Geothermal Caffeine', False),
 ('testing for taint.org, part 2', False),
 ('Take all your music on the road (ZDNET SHOPPER)', False),
 ('[Lockergnome Penguin Shell]  Retail Ready', False),
 ('JavaServer Pages updated', False),
 ("You're signed up for the Reich Report!", False),
 ('[Lockergnome Windows Daily]  Entrance Slouch', False)]

In [61]:
# seed the module-level RNG before splitting so the train/test split is repeatable;
# split_data draws from random.random(), so the seed must be set first
random.seed(0)      # just so you get the same answers as me
# each sample goes to train_data when its draw is below 0.75, else to test_data
train_data, test_data = split_data(data, 0.75)

# fit a classifier with the default smoothing constant on the training portion
classifier = NaiveBayesClassifier()
classifier.train(train_data)

In [62]:
# score every held-out sample: triplets of
# (subject, actual is_spam, predicted spam probability)
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

In [63]:
# treat a predicted probability above 0.5 as a "spam" prediction and
# tally each (actual is_spam, predicted is_spam) combination;
# the loop variables are named so they do not reuse the name of the
# spam_probability function
counts = Counter((actually_spam, predicted_prob > 0.5)
                 for _subject, actually_spam, predicted_prob in classified)

In [64]:
# confusion counts keyed by (actual is_spam, predicted is_spam)
counts

Counter({(False, False): 710,
         (False, True): 27,
         (True, False): 50,
         (True, True): 89})

In [69]:
# precision = true positives / (true positives + false positives);
# keys are (actual is_spam, predicted is_spam).
# On Python 2 this relies on `from __future__ import division` for a float result.
precision = counts[(True,True)]/(counts[(True,True)]+counts[(False, True)])
precision

0.7672413793103449

In [70]:
# recall = true positives / (true positives + false negatives);
# keys are (actual is_spam, predicted is_spam).
# On Python 2 this relies on `from __future__ import division` for a float result.
recall = counts[(True,True)]/(counts[(True,True)]+counts[(True, False)])
recall

0.6402877697841727

In [71]:
# sort by spam_probability from smallest to largest
# (in place: `classified` stays sorted for any later cell)
classified.sort(key=lambda row: row[2])

# List comprehensions are used instead of filter(...) because filter returns
# a non-subscriptable iterator on Python 3; on Python 2 the result is the same.

# the highest predicted spam probabilities among the non-spams
spammiest_hams = [row for row in classified if not row[1]][-5:]

# the lowest predicted spam probabilities among the actual spams
hammiest_spams = [row for row in classified if row[1]][:5]

In [72]:
# the five non-spam test subjects with the highest predicted spam probability
spammiest_hams

[('Attn programmers: support offered [FLOSS-Sarai Initiative]',
  False,
  0.971447877047375),
 ('Re: Adam dont job for no one, see.', False, 0.9807146024453691),
 ('Ray Ozzie: "How long before we see auto pingback generator spambots?"',
  False,
  0.9908585900710797),
 ('Adam dont job for no one, see.', False, 0.9984510399838439),
 ('=?iso-2022-jp?B?UmU6IBskQjswSSkyPTNYJSglcyU4JUslIiVqJXMlME1NJVcbKEI=?=',
  False,
  0.9993353660686523)]

In [73]:
# the five spam test subjects with the lowest predicted spam probability
hammiest_spams

[('Re: girls', True, 0.0007234593214204142),
 ('*****SPAM*****', True, 0.0017766969268745516),
 ('-> IN THE NEWS TODAY <----', True, 0.006564528980421035),
 ('Have tax problems?', True, 0.00798643088569293),
 ('Congratualtions zzzz8969 ! !!', True, 0.009093552937086429)]

In [76]:
# look up the learned (word, p(word | spam), p(word | ~spam)) triplet for 'needed';
# a list comprehension displays the matches on both Python 2 and Python 3,
# whereas filter(...) would only show a filter object on Python 3
[triplet for triplet in classifier.word_probs if triplet[0] == 'needed']

[('needed', 0.023287671232876714, 0.0011446886446886447)]

In [77]:
# ratio p('needed' | spam) / p('needed' | ~spam), using the two values
# copied by hand from the previous cell's output
0.023287671232876714/0.0011446886446886447

20.344109589041096

In [81]:
def p_spam_given_word(word_prob):
    """Use Bayes's theorem to compute p(spam | message contains word).

    word_prob is one of the (word, p(word | spam), p(word | ~spam))
    triplets produced by word_probabilities. No class prior appears in
    the formula, so spam and non-spam are weighted equally.
    """
    _, given_spam, given_not_spam = word_prob
    total = given_spam + given_not_spam
    return given_spam / total

# order the learned triplets by p_spam_given_word, ascending
words = sorted(classifier.word_probs, key=p_spam_given_word)

# last ten = highest scores, first ten = lowest scores
spammiest_words = words[-10:]
hammiest_words = words[:10]

In [82]:
# the ten triplets with the highest p_spam_given_word score
spammiest_words

[('norton', 0.023287671232876714, 0.00022893772893772894),
 ('clearance', 0.026027397260273973, 0.00022893772893772894),
 ('reps', 0.026027397260273973, 0.00022893772893772894),
 ('500', 0.026027397260273973, 0.00022893772893772894),
 ('zzzz', 0.028767123287671233, 0.00022893772893772894),
 ('systemworks', 0.028767123287671233, 0.00022893772893772894),
 ('sale', 0.028767123287671233, 0.00022893772893772894),
 ('rates', 0.031506849315068496, 0.00022893772893772894),
 ('money', 0.03424657534246575, 0.00022893772893772894),
 ('adv', 0.03972602739726028, 0.00022893772893772894)]

In [83]:
# the ten triplets with the lowest p_spam_given_word score
hammiest_words

[('spambayes', 0.0013698630136986301, 0.04601648351648352),
 ('users', 0.0013698630136986301, 0.04006410256410257),
 ('razor', 0.0013698630136986301, 0.03502747252747253),
 ('zzzzteana', 0.0013698630136986301, 0.03411172161172161),
 ('sadev', 0.0013698630136986301, 0.029532967032967032),
 ('apt', 0.0013698630136986301, 0.026327838827838828),
 ('ouch', 0.0013698630136986301, 0.022664835164835164),
 ('bliss', 0.0013698630136986301, 0.021749084249084248),
 ('selling', 0.0013698630136986301, 0.021749084249084248),
 ('wedded', 0.0013698630136986301, 0.021291208791208792)]