In [1]:
from typing import NamedTuple, List, Set, Tuple, Dict, Iterable
import re
import math
from collections import defaultdict

# Tokenizing data

In [2]:
def tokenize(text: str) -> Set[str]:
    text = text.lower()  # convert to lowercase
    all_words = re.findall("\w+", text)  # extract the words
    return set(all_words)

assert tokenize("Data Science is science") == {"data", "science", "is"}

In [3]:
class Message(NamedTuple):
    text: str
    is_spam: bool

# Naive Bayes Classifier

In [4]:
class NaiveBayesClassifier:

    def __init__(self, k: float = 0.5) -> None:
        self.k = k # smoothing factor
        self.tokens: Set[str] = set()
        self.tokens_spam_counts: Dict[str, int] = defaultdict(int)
        self.tokens_ham_counts: Dict[str, int] = defaultdict(int)
        self.spam_messages = self.ham_messages = 0

    def train(self, messages: Iterable[Message]) -> None:
        for message in messages:
            # Increment message counts
            if message.is_spam:
                self.spam_messages += 1
            else:
                self.ham_messages += 1
            # increment word counts
            for token in tokenize(message.text):
                self.tokens.add(token)
                if message.is_spam:
                    self.tokens_spam_counts[token] += 1
                else:
                    self.tokens_ham_counts[token] += 1

    def _probabilities(self, token: str) -> Tuple[float, float]:
        '''returns P(token | spam) and P(token | ham)'''
        spam = self.tokens_spam_counts[token]
        ham = self.tokens_ham_counts[token]

        p_token_spam = (spam + self.k) / (self.spam_messages + 2 * self.k)
        p_token_ham = (ham + self.k) / (self.ham_messages + 2 * self.k)

        return p_token_spam, p_token_ham


    def predict(self, text: str) -> float:
        text_tokens = tokenize(text)
        log_prob_if_spam = log_prob_if_ham = 0.0

        # Iterate through each word in our vocabulary
        for token in self.tokens:
            prob_if_spam, prob_if_ham = self._probabilities(token)

            # If *token* appears in the message,
            # add the log probabilty of seeing it.
            if token in text_tokens:
                log_prob_if_spam += math.log(prob_if_spam)
                log_prob_if_ham += math.log(prob_if_ham)

            # Otherwise add the log probability of _not_ seeing it, 
            # which is log(1 - probability of seeing it)

            else:
                log_prob_if_spam += math.log(1.0 - prob_if_spam)
                log_prob_if_ham += math.log(1.0 - prob_if_ham)
        
        prob_if_spam = math.exp(log_prob_if_spam)
        prob_if_ham = math.exp(log_prob_if_ham)
        return prob_if_spam/(prob_if_spam + prob_if_ham)

# Testing Our Model

In [5]:
message = [Message("spam rules", is_spam=True),
           Message("ham rules", is_spam=False),
           Message("hello ham", is_spam=False)]

In [6]:
model = NaiveBayesClassifier(k=0.5)
model.train(message)

In [7]:
assert model.tokens == {'spam', 'ham', 'rules', 'hello'}
assert model.spam_messages == 1
assert model.ham_messages == 2
assert model.tokens_spam_counts == {'spam': 1, 'rules': 1}
assert model.tokens_ham_counts == {'ham': 2, 'rules': 1, 'hello': 1}

In [8]:
text = 'hello spam'
model.predict(text)

0.8350515463917526

In [9]:
probs_if_spam = [(1 + 0.5)/(1+2*0.5), # "spam" (present)
                  1 - (0 + 0.5) / (1+2*0.5), # "ham" (not present)
                  1 - (1 + 0.5) / (1+2*0.5), # "rules" (not present)
                  (0+0.5)/(1+2*0.5) # "hello" (present)
                  ]

probs_if_ham = [(0+0.5)/(2+2*0.5), # spam (present)
               1-(2+0.5)/(2+2*0.5), # ham (not present)
               1-(1+0.5)/(2+2*0.5), # rules (not present)
               (1+0.5) / (2+2*0.5) # hello (present)
               ]

In [17]:
p_if_spam = math.exp(sum(math.log(p) for p in probs_if_spam))
p_if_ham = math.exp(sum(math.log(p) for p in probs_if_ham))

In [18]:
assert round(model.predict(text)) == round(p_if_spam / (p_if_spam+p_if_ham))

# Using Model

In [21]:
from io import BytesIO
import requests
import tarfile
import glob, re

In [7]:
BASE_URL = "https://spamassassin.apache.org/old/publiccorpus"
FILES = ["20021010_easy_ham.tar.bz2", "20021010_hard_ham.tar.bz2",              "20021010_spam.tar.bz2"]
Output_dir = "Spam_data"

In [24]:
path = 'spam_data/*/*'

In [25]:
data: List[Message] = []

In [26]:
for filename in glob.glob(path):
    is_spam = 'ham' not in filename
    with open(filename, errors='ignore') as email_file:
        for line in email_file:
            if line.startswith("Subject:"):
                subject = line.lstrip("Subject: ")
                data.append(Message(subject, is_spam))
                break

In [27]:
data[:4]

[Message(text='Re: New Sequences Window\n', is_spam=False),
 Message(text='[zzzzteana] RE: Alexander\n', is_spam=False),
 Message(text='[zzzzteana] Moscow bomber\n', is_spam=False),
 Message(text="[IRR] Klez: The Virus That  Won't Die\n", is_spam=False)]

In [28]:
import random
from machineLearning import split_data
from collections import Counter

In [29]:
random.seed(0)
train_messages, test_messagses = split_data(data, 0.75)

In [30]:
model = NaiveBayesClassifier()

In [31]:
model.train(train_messages)

In [32]:
predictions = [(message, model.predict(message.text))
                for message in test_messagses]

In [33]:
confusion_matrix = Counter((message.is_spam, spam_probability > 0.5)
                            for message, spam_probability in predictions)

In [34]:
print(confusion_matrix)

Counter({(False, False): 673, (True, True): 85, (True, False): 41, (False, True): 26})


In [35]:
from machineLearning import precision, recall, f1_score

In [36]:
print(f"precision is {precision(85, 26)}")
print(f"Recall is {recall(85, 41)}")
print(f"f1 score is {f1_score(85, 26, 673, 41)}")

precision is 0.7657657657657657
Recall is 0.6746031746031746
f1 score is 0.19562715765247413


In [37]:
def p_spam_given_token(token: str, model: NaiveBayesClassifier) -> float:
    # we probably shouldn't call private methods, but it's for a good cause
    probs_if_spam, probs_if_ham = model._probabilities(token)
    return probs_if_spam / (probs_if_spam + probs_if_ham)

In [39]:
words = sorted(model.tokens, key = lambda t: p_spam_given_token(t, model))

In [41]:
print("Spammiest_words", words[-10:])
print()
print("hammiest_words", words[:10])

Spammiest_words ['assistance', 'zzzz', '99', 'attn', '95', 'clearance', 'money', 'per', 'systemworks', 'adv']

hammiest_words ['spambayes', 'users', 'razor', 'zzzzteana', 'sadev', 'apt', 'perl', 'ouch', 'spamassassin', 'bliss']
