# Exercise: Naïve Bayes Implementation

Melissa Marielle Valdez BSCS 3B - AI

In [10]:
import re
from collections import Counter, defaultdict
import math


## Task 1: Manual Naïve Bayes Implementation


### Training Data

In [11]:
# Labelled training corpus as (message text, class label) pairs; every label
# is either "SPAM" or "HAM". The order of the pairs is kept unchanged.
docs = [
    ("Free money now!!!", "SPAM"),
    ("Hi mom, how are you?", "HAM"),
    ("Lowest price for your meds", "SPAM"),
    ("Are we still on for dinner?", "HAM"),
    ("Win a free iPhone today", "SPAM"),
    ("Let's catch up tomorrow at the office", "HAM"),
    ("Meeting at 3 PM tomorrow", "HAM"),
    ("Get 50% off, limited time!", "SPAM"),
    ("Team meeting in the office", "HAM"),
    ("Click here for prizes!", "SPAM"),
    ("Can you send the report?", "HAM"),
]

def tokenize(text):
    """Split `text` into lowercase alphanumeric tokens.

    The text is lowercased, every character other than a-z, 0-9 or
    whitespace is deleted (not replaced by a space), and the remainder is
    split on runs of whitespace. Because punctuation is deleted in place,
    "Let's" becomes the single token "lets".

    Returns:
        A list of token strings; empty when nothing survives the filtering.
    """
    stripped = re.sub(r'[^a-z0-9\s]', '', text.lower())
    return stripped.split()


### Part 1.a: Bag of Words (Word Frequency)
Build the vocabulary and generate word frequency counts for each class.

In [12]:
# One pass over the training corpus collects the vocabulary, the per-class
# token counts and the per-class document counts.
vocab = set()
class_counts = defaultdict(Counter)   # label -> Counter(token -> occurrences)
doc_counts = defaultdict(int)         # label -> number of training documents
total_docs = len(docs)

for message, tag in docs:
    words = tokenize(message)
    vocab.update(words)
    class_counts[tag].update(words)
    doc_counts[tag] += 1

V = len(vocab)                                  # vocabulary size
N_spam = sum(class_counts['SPAM'].values())     # total SPAM tokens
N_ham = sum(class_counts['HAM'].values())       # total HAM tokens

# Display Bag of Words (Word Frequencies): one table per class, most frequent
# word first; words with equal counts keep the order in which they were first seen.
print("\n--- PART 1.a: BAG OF WORDS (Word Frequencies) ---")
for label in ('SPAM', 'HAM'):
    print(f"\n{label} - Word Frequencies:")
    print(f"{'Word':<15} | {'Count':>10}")
    print("-" * 28)
    for word, count in class_counts[label].most_common():
        print(f"{word:<15} | {count:>10}")



--- PART 1.a: BAG OF WORDS (Word Frequencies) ---

SPAM - Word Frequencies:
Word            |      Count
----------------------------
free            |          2
for             |          2
money           |          1
now             |          1
lowest          |          1
price           |          1
your            |          1
meds            |          1
win             |          1
a               |          1
iphone          |          1
today           |          1
get             |          1
50              |          1
off             |          1
limited         |          1
time            |          1
click           |          1
here            |          1
prizes          |          1

HAM - Word Frequencies:
Word            |      Count
----------------------------
the             |          3
are             |          2
you             |          2
tomorrow        |          2
at              |          2
office          |          2
meeting         |          2

### Part 1.b: Prior Probabilities for HAM and SPAM
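Calculate the prior probabilities P(SPAM) and P(HAM) from the training data, where each prior is the fraction of training documents that carry that label.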

In [13]:
# Class priors: the share of training documents that carry each label.
prior_spam = doc_counts['SPAM'] / total_docs
prior_ham = doc_counts['HAM'] / total_docs

print("\n--- PART 1.b: PRIOR PROBABILITIES ---")
print(f"\n{'Metric':<25} | {'SPAM':>15} | {'HAM':>15}")
print("-" * 60)

# Each row: (metric name, SPAM value, HAM value, numeric format suffix).
summary_rows = [
    ("Document Count", doc_counts['SPAM'], doc_counts['HAM'], ""),
    ("Token Count", N_spam, N_ham, ""),
    ("Prior Probability", prior_spam, prior_ham, ".4f"),
]
for metric, spam_value, ham_value, spec in summary_rows:
    print(f"{metric:<25} | {spam_value:>15{spec}} | {ham_value:>15{spec}}")

print(f"\nVocabulary size V = {V}")
print(f"Total documents = {total_docs}")


--- PART 1.b: PRIOR PROBABILITIES ---

Metric                    |            SPAM |             HAM
------------------------------------------------------------
Document Count            |               5 |               6
Token Count               |              22 |              33
Prior Probability         |          0.4545 |          0.5455

Vocabulary size V = 44
Total documents = 11


### Part 1.c: Likelihood of Tokens with Respect to Class
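Calculate $P(\text{token} \mid \text{class})$ for every token in the vocabulary using Laplace (add-one) smoothing: $P(t \mid c) = \dfrac{\operatorname{count}(t, c) + 1}{N_c + |V|}$, where $N_c$ is the total number of tokens observed in class $c$ and $|V|$ is the vocabulary size.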

In [14]:
def likelihood(token, label):
    """Return the Laplace-smoothed estimate of P(token | label).

    Computes (count(token, label) + 1) / (N_label + V), where N_label is the
    total number of training tokens counted for `label` and V is the
    vocabulary size.

    Args:
        token: The token whose likelihood is wanted. Tokens never seen with
            `label` have a count of 0 and therefore get 1 / (N_label + V).
        label: A class label that is present in `class_counts`.

    Returns:
        The smoothed likelihood as a float.

    Raises:
        KeyError: If `label` has no entry in `class_counts`.
    """
    # `class_counts` is a defaultdict: indexing it with an unseen label would
    # silently insert an empty Counter and return 1 / V. Fail loudly instead.
    if label not in class_counts:
        raise KeyError(f"unknown class label: {label!r}")
    token_counts = class_counts[label]
    N = sum(token_counts.values())
    return (token_counts[token] + 1) / (N + V)   # Laplace smoothing

# Print one row per vocabulary token (alphabetical order) showing its
# smoothed likelihood under each class.
print("\n--- PART 1.c: LIKELIHOOD OF TOKENS ---")
print("\nToken Likelihood Table:")
print(f"{'Token':<12} | {'P(token|SPAM)':>15} | {'P(token|HAM)':>15}")
print("-" * 48)
for token in sorted(vocab):
    spam_lik, ham_lik = (likelihood(token, cls) for cls in ('SPAM', 'HAM'))
    print(f"{token:<12} | {spam_lik:>15.4f} | {ham_lik:>15.4f}")



--- PART 1.c: LIKELIHOOD OF TOKENS ---

Token Likelihood Table:
Token        |   P(token|SPAM) |    P(token|HAM)
------------------------------------------------
3            |          0.0152 |          0.0260
50           |          0.0303 |          0.0130
a            |          0.0303 |          0.0130
are          |          0.0152 |          0.0390
at           |          0.0152 |          0.0390
can          |          0.0152 |          0.0260
catch        |          0.0152 |          0.0260
click        |          0.0303 |          0.0130
dinner       |          0.0152 |          0.0260
for          |          0.0455 |          0.0260
free         |          0.0455 |          0.0130
get          |          0.0303 |          0.0130
here         |          0.0303 |          0.0130
hi           |          0.0152 |          0.0260
how          |          0.0152 |          0.0260
in           |          0.0152 |          0.0260
iphone       |          0.0303 |          0.0130
lets

### Part 1.d: Classification of Test Sentences
Use the trained manual Naïve Bayes classifier to classify the following test sentences:
- i. "Limited offer, click here!"
- ii. "Meeting at 2 PM with the manager."

In [15]:
def classify(sentence, skip_unknown=False):
    """Classify `sentence` as 'SPAM' or 'HAM' with the manual Naïve Bayes model.

    Each class score is accumulated in log space: the log prior plus the sum
    of the log Laplace-smoothed likelihoods of the sentence's tokens.

    Args:
        sentence: Raw text; it is tokenized with `tokenize`.
        skip_unknown: When True, tokens that are not in the training
            vocabulary are dropped before scoring. When False (the default,
            which keeps the original behaviour) every such token contributes
            log(1 / (N_class + V)); that term is larger for the class with
            fewer training tokens, so unknown words shift the decision
            towards that class.

    Returns:
        A tuple (predicted_label, log_spam, log_ham). The two scores are
        unnormalised log scores; a tie is resolved as 'HAM'.
    """
    tokens = tokenize(sentence)
    if skip_unknown:
        tokens = [t for t in tokens if t in vocab]
    log_spam = math.log(prior_spam)
    log_ham  = math.log(prior_ham)
    for t in tokens:
        log_spam += math.log(likelihood(t, 'SPAM'))
        log_ham  += math.log(likelihood(t, 'HAM'))
    return ('SPAM' if log_spam > log_ham else 'HAM', log_spam, log_ham)

# Sentences to classify; reused by the scikit-learn comparison further down.
tests = ["Limited offer, click here!", "Meeting at 2 PM with the manager."]
print("\n--- PART 1.d: CLASSIFICATION OF TEST SENTENCES ---")
print()
for s in tests:
    predicted_class, log_spam, log_ham = classify(s)

    # Turn the two log scores into normalised posteriors. Subtracting the
    # larger score before exponentiating makes one of the two terms exactly
    # 1.0, so the denominator can never underflow to zero (which would raise
    # ZeroDivisionError for long sentences whose log scores are very negative).
    log_max = max(log_spam, log_ham)
    weight_spam = math.exp(log_spam - log_max)
    weight_ham = math.exp(log_ham - log_max)
    prob_spam = weight_spam / (weight_spam + weight_ham)
    prob_ham = weight_ham / (weight_spam + weight_ham)
    confidence = max(prob_spam, prob_ham) * 100

    print(f"Sentence: {s}")
    print(f"  Predicted: {predicted_class}")
    print(f"  P(SPAM) = {prob_spam:.4f}")
    print(f"  P(HAM)  = {prob_ham:.4f}")
    print(f"  logP(SPAM|s) = {log_spam:.3f}")
    print(f"  logP(HAM|s)  = {log_ham:.3f}")
    print(f"  Confidence: {confidence:.2f}%")
    print()



--- PART 1.d: CLASSIFICATION OF TEST SENTENCES ---

Sentence: Limited offer, click here!
  Predicted: SPAM
  P(SPAM) = 0.9251
  P(HAM)  = 0.0749
  logP(SPAM|s) = -15.468
  logP(HAM|s)  = -17.981
  Confidence: 92.51%

Sentence: Meeting at 2 PM with the manager.
  Predicted: HAM
  P(SPAM) = 0.0329
  P(HAM)  = 0.9671
  logP(SPAM|s) = -30.116
  logP(HAM|s)  = -26.736
  Confidence: 96.71%



## Task 2: Scikit-Learn Multinomial Naïve Bayes

Train a Multinomial Naïve Bayes classifier using scikit-learn and classify the same test sentences:
- i. "Limited offer, click here!"
- ii. "Meeting at 2 PM with the manager."

In [16]:
# Scikit-learn version
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Split the (text, label) pairs into two parallel lists.
train_texts, train_labels = (list(column) for column in zip(*docs))

# Bag-of-words features; this token pattern also keeps one-character tokens.
vec = CountVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b')
X = vec.fit_transform(train_texts)

clf = MultinomialNB(alpha=1.0)   # Laplace smoothing
clf.fit(X, train_labels)

X_test = vec.transform(tests)
preds = clf.predict(X_test)
probs = clf.predict_proba(X_test)

# Work out once which predict_proba column belongs to which class.
ham_col = 0 if clf.classes_[0] == 'HAM' else 1
spam_col = 1 - ham_col

print("\n--- TASK 2: SCIKIT-LEARN PREDICTIONS ---")
print()
for sentence, predicted_class, row in zip(tests, preds, probs):
    prob_ham = row[ham_col]
    prob_spam = row[spam_col]
    confidence = max(row) * 100

    print(f"Sentence: {sentence}")
    print(f"  Predicted: {predicted_class}")
    print(f"  P(HAM)  = {prob_ham:.4f}")
    print(f"  P(SPAM) = {prob_spam:.4f}")
    print(f"  Confidence: {confidence:.2f}%")
    print()



--- TASK 2: SCIKIT-LEARN PREDICTIONS ---

Sentence: Limited offer, click here!
  Predicted: SPAM
  P(HAM)  = 0.0838
  P(SPAM) = 0.9162
  Confidence: 91.62%

Sentence: Meeting at 2 PM with the manager.
  Predicted: HAM
  P(HAM)  = 0.9781
  P(SPAM) = 0.0219
  Confidence: 97.81%

