# Naive Bayes Classifier

## Implementation of Naive Bayes

In [1]:
from collections import defaultdict
from typing import Set, Iterable, NamedTuple, Dict, Tuple
import re
import math

In [2]:
def tokenize(text: str) -> Set[str]:
    """Return the set of distinct lowercase words found in *text*.

    The text is lowercased first; a "word" is any maximal run of the
    characters a-z and 0-9. Everything else (punctuation, whitespace,
    apostrophes, ...) acts as a separator, and duplicates collapse because
    the result is a set.
    """
    return set(re.findall(r"[a-z0-9]+", text.lower()))

In [3]:
# Sample text for the tokenizer demo: mixed case, punctuation, a repeated word and a digit.
message = """This message contains the word "bitcoin" and- is a spam message of occurance of 0"""

# tokenize() lowercases the text and returns the set of unique [a-z0-9]+ words,
# so the repeated words "message" and "of" appear only once in the output.
all_words = tokenize(message)
print(all_words)

{'and', 'a', '0', 'of', 'message', 'the', 'contains', 'occurance', 'bitcoin', 'word', 'is', 'this', 'spam'}


In [4]:
class Message(NamedTuple):
    """A single labelled training example for the spam classifier."""

    # Raw message body; it is tokenized when the classifier is trained.
    text: str
    # Label: True for spam, False for ham (non-spam).
    is_spam: bool


# Next we count, tokenize and label the training data: spam_counts is the number of spam messages and ham_counts is the number of non-spam (ham) messages.
class NaiveBayesClassifier:
    """Naive Bayes spam classifier over the presence/absence of tokens.

    Every vocabulary token is treated as an independent binary feature
    ("does the message contain this token?"). Training counts, per class,
    how many messages contain each token; prediction combines the smoothed
    per-token likelihoods into P(spam | message).

    The class priors are not used: spam and ham are treated as equally
    likely before looking at the tokens.
    """

    def __init__(self, k: float = 0.5) -> None:
        """Create an untrained classifier.

        Args:
            k: Additive smoothing pseudo-count. It should be positive: with
                k == 0 a token probability can be exactly 0 or 1 (or the
                denominator can be 0), which makes ``predict`` raise.
        """
        self.k = k  # smoothing factor
        self.tokens: Set[str] = set()  # vocabulary seen during training
        # Number of spam / ham messages that contain each token.
        self.token_spam_counts: Dict[str, int] = defaultdict(int)
        self.token_ham_counts: Dict[str, int] = defaultdict(int)
        # Number of spam / ham messages seen during training.
        self.spam_counts = self.ham_counts = 0

    def train(self, messages: Iterable[Message]) -> None:
        """Update the counts from labelled messages.

        Each message is tokenized into a set, so a token is counted at most
        once per message. Calling ``train`` again adds to the existing counts.
        """
        for message in messages:
            # Pick the counters for this message's class and bump the
            # per-class message count.
            if message.is_spam:
                self.spam_counts += 1
                class_token_counts = self.token_spam_counts
            else:
                self.ham_counts += 1
                class_token_counts = self.token_ham_counts

            for token in tokenize(message.text):
                self.tokens.add(token)
                class_token_counts[token] += 1

    def _probabilities(self, token: str) -> Tuple[float, float]:
        """Return the smoothed pair (P(token | spam), P(token | ham)).

        Each probability is (count + k) / (class_message_count + 2k), which
        keeps it strictly between 0 and 1 whenever k > 0.
        """
        # .get() instead of indexing: the counters are defaultdicts, and
        # indexing would insert a zero entry for every token looked up,
        # silently mutating the training state during prediction.
        spam = self.token_spam_counts.get(token, 0)
        ham = self.token_ham_counts.get(token, 0)

        p_token_spam = (spam + self.k) / (self.spam_counts + 2 * self.k)
        p_token_ham = (ham + self.k) / (self.ham_counts + 2 * self.k)

        return p_token_spam, p_token_ham

    def predict(self, text: str) -> float:
        """Return P(spam | text) as a float in [0, 1].

        Only tokens in the training vocabulary contribute; tokens of *text*
        that were never seen in training are ignored. An untrained model
        (empty vocabulary) returns 0.5.
        """
        text_tokens = tokenize(text)
        log_prob_if_spam = log_prob_if_ham = 0.0

        # Sum log-likelihoods over the whole vocabulary; working in log space
        # avoids multiplying many small probabilities together.
        for token in self.tokens:
            prob_if_spam, prob_if_ham = self._probabilities(token)

            if token in text_tokens:
                # Token present: add the log probability of seeing it.
                log_prob_if_spam += math.log(prob_if_spam)
                log_prob_if_ham += math.log(prob_if_ham)
            else:
                # Token absent: add the log probability of NOT seeing it.
                log_prob_if_spam += math.log(1.0 - prob_if_spam)
                log_prob_if_ham += math.log(1.0 - prob_if_ham)

        # P(spam | text) = e^s / (e^s + e^h) = 1 / (1 + e^(h - s)).
        # Exponentiating s and h separately underflows to 0.0 for large
        # vocabularies and then divides by zero, so work with their
        # difference and only ever exponentiate a non-positive number.
        log_odds = log_prob_if_spam - log_prob_if_ham
        if log_odds >= 0:
            return 1.0 / (1.0 + math.exp(-log_odds))
        odds = math.exp(log_odds)
        return odds / (1.0 + odds)

## Testing our Naive Bayes classifying model

In [5]:
# Small hand-labelled training set: 8 spam and 4 ham (non-spam) messages.
training_messages = [
    Message("happy birthday. hope to see you next week", is_spam=False),
    Message("spam rules", is_spam=True),
    Message("be alert of spam messages", is_spam=False),
    Message("Claim your reward", is_spam=True),
    Message("Buy our hottest product", is_spam=True),
    Message("hurry offer exrires now", is_spam=True),
    Message("where is the report", is_spam=False),
    Message("you are invited to the event", is_spam=False),
    Message("your computer has a virus", is_spam=True),
    Message("we have detected a fault in your previous transaction", is_spam=True),
    Message("join our course to quickly make money from trading", is_spam=True),
    Message("if you buy our mattress, you will get a free coupon for the next purchase", is_spam=True),
]

# Build a classifier with smoothing factor k=0.5 and fit it on the messages above.
model = NaiveBayesClassifier(k=0.5)
model.train(training_messages)

In [6]:
# Inspect what train() recorded: the vocabulary, the number of messages per
# class, and how many messages of each class contain each token.
print(f"Words extracted (Tokens) from the messages are: {model.tokens}")
print(f"No. of spam messages are: {model.spam_counts}")
print(f"No. of ham messages are: {model.ham_counts}")
print(f"Word count in spam messages: {dict(model.token_spam_counts)}")
print(f"Word count in ham messages: {dict(model.token_ham_counts)}")

Words extracted (Tokens) from the messages are: {'week', 'a', 'computer', 'detected', 'hottest', 'you', 'see', 'happy', 'reward', 'your', 'now', 'if', 'has', 'quickly', 'in', 'purchase', 'hurry', 'mattress', 'where', 'fault', 'virus', 'claim', 'offer', 'have', 'our', 'the', 'we', 'exrires', 'trading', 'hope', 'transaction', 'event', 'money', 'are', 'free', 'report', 'invited', 'messages', 'coupon', 'alert', 'previous', 'join', 'course', 'buy', 'to', 'rules', 'be', 'get', 'for', 'will', 'birthday', 'from', 'next', 'product', 'of', 'is', 'make', 'spam'}
No. of spam messages are: 8
No. of ham messages are: 4
Word count in spam messages: {'rules': 1, 'spam': 1, 'reward': 1, 'your': 3, 'claim': 1, 'buy': 2, 'product': 1, 'hottest': 1, 'our': 3, 'offer': 1, 'hurry': 1, 'exrires': 1, 'now': 1, 'a': 3, 'virus': 1, 'has': 1, 'computer': 1, 'detected': 1, 'fault': 1, 'have': 1, 'we': 1, 'previous': 1, 'transaction': 1, 'in': 1, 'from': 1, 'join': 1, 'make': 1, 'quickly': 1, 'course': 1, 'trading

Now let's see how the prediction works.

In [7]:
# Unseen message to classify. predict() only looks at tokens in the training
# vocabulary, so a word such as "detection" that never appeared in training is ignored.
test_message = "you are invited to our spam detection course"

Calculating the probability of spam and ham using our **model**:

In [9]:
# predict() returns a probability between 0 and 1; scale it to a percentage
# rounded to one decimal place (despite the name, this is a percentage, not a percentile).
percentile_spam = round(100 * (model.predict(test_message)), 1)

print(f"The given message has a chance of {percentile_spam}% being a spam message based on training data above")

The given message has a chance of 3.8% being a spam message based on training data above
