# Naïve Bayes from Scratch
To implement Naïve Bayes manually we compute:
- **Class priors**: probability that a message is spam or normal.
- **Likelihoods**: probability of each word appearing in each class. We apply Laplace smoothing so unseen words do not zero out the probability.
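When classifying a new message we split it into tokens, sum the log-probabilities for each class, and choose the class with the larger total. We work in log space because multiplying many small token probabilities would underflow, and the "naïve" assumption that tokens are conditionally independent given the class, while rarely exactly true, works well for text in practice.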

In [None]:
import math
from collections import Counter
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Toy corpus: every entry pairs a message with its label (1 = spam, 0 = normal).
labelled_messages = [
    ("limited time offer claim your prize now", 1),
    ("meeting reminder for project update", 0),
    ("win cash by entering free lottery", 1),
    ("family dinner plans for saturday", 0),
    ("exclusive deal just for you click", 1),
    ("invoice attached for last month", 0),
    ("cheap meds available order today", 1),
    ("team outing scheduled at 5pm", 0),
    ("congratulations you have won a voucher", 1),
    ("please review the attached report", 0),
]
records = pd.DataFrame(labelled_messages, columns=["text", "label"])

# Seeded 70/30 split: sample the training rows, then keep every row that was
# not sampled (in its original order) as the test set.
train_df = records.sample(frac=0.7, random_state=42)
not_sampled = ~records.index.isin(train_df.index)
test_df = records[not_sampled]

print("Training samples:", len(train_df))
print("Testing samples:", len(test_df))

In [None]:
def tokenize(text):
    """Lower-case ``text`` and split it on runs of whitespace.

    Punctuation is not stripped; an empty or whitespace-only string yields
    an empty list.
    """
    lowered = text.lower()
    return lowered.split()

# Fit the model statistics from the training split.
classes = sorted(train_df["label"].unique())
word_counts = {cls: Counter() for cls in classes}
doc_counts = Counter()  # number of training messages seen per class

# Walk the label/text columns in lockstep, tallying documents and tokens.
for label, text in zip(train_df["label"], train_df["text"]):
    doc_counts[label] += 1
    word_counts[label].update(tokenize(text))

# Token totals per class (with repeats) are the sums of the per-word counts.
total_words = {cls: sum(counts.values()) for cls, counts in word_counts.items()}

total_docs = len(train_df)

# The vocabulary is every distinct token seen in any class.
vocab = set().union(*word_counts.values())
vocab_size = len(vocab)

# Priors are stored as logs so predict() can add them to log-likelihoods.
class_priors = {cls: math.log(doc_counts[cls] / total_docs) for cls in classes}

In [None]:
def predict(text):
    """Return the class with the highest log-posterior score for ``text``.

    Every class starts from its log prior in ``class_priors`` and adds one
    Laplace-smoothed log-likelihood term per token occurrence, so a token
    that appears twice in the message contributes twice. When scores tie,
    the class that comes first in ``classes`` is returned.
    """
    words = tokenize(text)
    log_posteriors = {}
    for cls in classes:
        counts = word_counts[cls]
        # Add-one smoothing: each vocabulary entry gets one pseudo-count.
        smoothed_total = total_words[cls] + vocab_size
        running = class_priors[cls]
        for word in words:
            running += math.log((counts[word] + 1) / smoothed_total)
        log_posteriors[cls] = running
    best_cls, _ = max(log_posteriors.items(), key=lambda item: item[1])
    return best_cls

# Classify every held-out message with the hand-written model.
predictions = test_df["text"].apply(predict)

# Pin the label order explicitly. Without `labels`, scikit-learn infers the
# classes from the values present in the test labels and predictions, so a
# split that happens to contain a single class would change the confusion
# matrix shape and make classification_report reject the two target names.
label_ids = [0, 1]
label_names = ["normal", "spam"]

print("Confusion matrix:")
print(confusion_matrix(test_df["label"], predictions, labels=label_ids))

print("\nDetailed report:")
print(
    classification_report(
        test_df["label"],
        predictions,
        labels=label_ids,
        target_names=label_names,
    )
)