In [1]:
from typing import NamedTuple, List, Set, Tuple, Dict, Iterable
import re
import math
from collections import defaultdict

In [2]:
def tokenize(text: str) -> Set[str]:
    """Return the set of distinct lowercase words found in *text*.

    A "word" is any maximal run of ``\\w`` characters (letters, digits and
    underscore); everything else acts as a separator. Because the result is
    a set, a word that occurs several times is reported once.

    Args:
        text: The raw text to split into tokens.

    Returns:
        The unique lowercased tokens; an empty set if *text* has no words.
    """
    # Raw string: "\w" in a normal literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on modern Python versions.
    return set(re.findall(r"\w+", text.lower()))

# Smoke test: tokens are lowercased and a repeated word appears only once.
assert {"data", "science", "is"} == tokenize("Data Science is science")

In [3]:
class Message(NamedTuple):
    """An immutable record pairing a message's text with its spam label."""

    text: str  # raw message text
    is_spam: bool  # True when the message is labelled as spam

In [21]:
class NaiveBayesClassifier:
    """Collects the message and token counts needed for naive Bayes spam filtering.

    State kept on the instance:
        k: smoothing factor supplied at construction time.
        tokens: every distinct token seen during training.
        tokens_spam_counts: token -> number of spam messages containing it.
        tokens_ham_counts: token -> number of non-spam messages containing it.
        spam_messages / ham_messages: number of messages seen per label.
    """

    def __init__(self, k: float = 0.5) -> None:
        """Create an untrained classifier with all counters at zero.

        Args:
            k: Smoothing factor, stored unchanged on the instance as ``k``.
        """
        self.k = k  # smoothing factor
        self.tokens: Set[str] = set()
        self.tokens_spam_counts: Dict[str, int] = defaultdict(int)
        self.tokens_ham_counts: Dict[str, int] = defaultdict(int)
        self.spam_messages = self.ham_messages = 0

    def train(self, messages: Iterable[Message]) -> None:
        """Update the counters from an iterable of labelled messages.

        Counts accumulate across calls; nothing is reset. Each token is
        counted at most once per message because ``tokenize`` returns a set.

        Args:
            messages: Labelled messages; each must expose ``text`` and
                ``is_spam``.
        """
        for message in messages:
            # Use the label of the current message (not the whole iterable)
            # to bump the message count and pick the matching token table.
            if message.is_spam:
                self.spam_messages += 1
                token_counts = self.tokens_spam_counts
            else:
                self.ham_messages += 1
                token_counts = self.tokens_ham_counts

            # Record each distinct token of this message under its label.
            for token in tokenize(message.text):
                self.tokens.add(token)
                token_counts[token] += 1