In [29]:
# Toy spam/ham corpus. Each message is written next to its label so the two
# parallel lists derived below always stay aligned.
labeled_examples = [
    ("win a free vacation now", "spam"),
    ("free lottery winner claim", "spam"),
    ("call mom for dinner", "ham"),
    ("schedule a meeting for tomorrow", "ham"),
    ("winner winner free prize", "spam"),
    ("are we still on for lunch", "ham"),
]
train_texts = [message for message, _ in labeled_examples]
train_labels = [tag for _, tag in labeled_examples]


In [30]:
import re

def tokenize(text):
    """Lower-case ``text`` and return its runs of ASCII letters, in order.

    Every character that is not an ASCII letter (digits, punctuation,
    whitespace, symbols) acts as a separator and is discarded, so
    ``"pri$e"`` yields ``["pri", "e"]``.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-zA-Z]+", lowered)]


In [31]:
# Sanity check: non-letter characters act as separators, so "pri$e" comes
# back as two tokens.
sample_tokens = tokenize("winner! winner free pri$e")
print(sample_tokens)

['winner', 'winner', 'free', 'pri', 'e']


In [32]:
from collections import Counter

def build_vocab(texts):
    """Map every distinct token found in ``texts`` to an integer index.

    Indices are assigned in order of first appearance, starting at 0, so the
    returned dict has exactly one entry per unique token.
    """
    token_to_index = {}
    for document in texts:
        for token in tokenize(document):
            # setdefault only inserts when the token is new, so a token that
            # was already seen keeps the index it was first given.
            token_to_index.setdefault(token, len(token_to_index))
    return token_to_index

# Build the token -> index mapping from the training texts and preview it.
vocab = build_vocab(train_texts)
first_entries = list(vocab.items())[:10]
print("vocab size:", len(vocab))
print("sample mapping:", dict(first_entries))


vocab size: 21
sample mapping: {'win': 0, 'a': 1, 'free': 2, 'vacation': 3, 'now': 4, 'lottery': 5, 'winner': 6, 'claim': 7, 'call': 8, 'mom': 9}


In [33]:
# Show the raw training texts (a bare last expression is rendered as the
# cell's output).
train_texts

['win a free vacation now',
 'free lottery winner claim',
 'call mom for dinner',
 'schedule a meeting for tomorrow',
 'winner winner free prize',
 'are we still on for lunch']

In [34]:
def vectorize(text, vocab):
    """Turn ``text`` into a bag-of-words count vector over ``vocab``.

    The result is a list of ``len(vocab)`` integers; position ``vocab[tok]``
    holds how many times ``tok`` occurs in ``text``. Tokens that are not in
    ``vocab`` are ignored.
    """
    bow = [0] * len(vocab)
    for word in tokenize(text):
        if word not in vocab:
            continue  # out-of-vocabulary tokens contribute nothing
        bow[vocab[word]] += 1
    return bow

# Vectorize every training text; y is the same list object as train_labels.
X = [vectorize(t, vocab) for t in train_texts]
y = train_labels

# Preview the first two rows together with their labels.
for example_idx in (0, 1):
    print("first example bow:", X[example_idx])
    print("label:", y[example_idx])



first example bow: [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
label: spam
first example bow: [0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
label: spam


In [35]:
# Show the raw training texts again (a bare last expression is rendered as
# the cell's output).
train_texts

['win a free vacation now',
 'free lottery winner claim',
 'call mom for dinner',
 'schedule a meeting for tomorrow',
 'winner winner free prize',
 'are we still on for lunch']

In [36]:
# Flatten the corpus into a single token list: tokenize each training text
# in turn and append its tokens to the running list.
all_tokens = []
for text in train_texts:
    tokens = tokenize(text)
    all_tokens += tokens

print("Total tokens across all texts:", len(all_tokens))
print("All tokens:", all_tokens)

Total tokens across all texts: 28
All tokens: ['win', 'a', 'free', 'vacation', 'now', 'free', 'lottery', 'winner', 'claim', 'call', 'mom', 'for', 'dinner', 'schedule', 'a', 'meeting', 'for', 'tomorrow', 'winner', 'winner', 'free', 'prize', 'are', 'we', 'still', 'on', 'for', 'lunch']


In [41]:
from collections import defaultdict

def count_words_by_class(texts, labels, vocab, verbose=True):
    """Accumulate per-class word totals and per-class document counts.

    Args:
        texts: Training documents, aligned one-to-one with ``labels``.
        labels: Class label of each document. It is iterated more than once,
            so it must be a re-iterable collection of hashable values.
        vocab: Token -> index mapping, passed through to ``vectorize``.
        verbose: When True (the default, matching the previous behaviour),
            print the set of labels and the zero-initialised count table.
            Pass False to use the function without any console output.

    Returns:
        A ``(word_counts, class_counts)`` tuple where
        ``word_counts[label][i]`` is the total count of the vocab word with
        index ``i`` over all documents of that class, and
        ``class_counts[label]`` (a ``defaultdict(int)``) is the number of
        documents carrying that label.
    """
    # dict.fromkeys keeps first-seen order, so the class order of the result
    # is the same on every run; iterating a set of strings is not.
    classes = list(dict.fromkeys(labels))
    word_counts = {c: [0] * len(vocab) for c in classes}
    if verbose:
        print(set(labels))
        print(word_counts)
    class_counts = defaultdict(int)

    for text, label in zip(texts, labels):
        counts = vectorize(text, vocab)
        # Element-wise addition of this document's counts onto the running
        # totals of its class.
        word_counts[label] = [wc + c for wc, c in zip(word_counts[label], counts)]
        class_counts[label] += 1

    return word_counts, class_counts

# Aggregate the training set per class, then report the document count and
# the total number of counted words for each class.
word_counts, class_counts = count_words_by_class(train_texts, train_labels, vocab)

spam_total = sum(word_counts["spam"])
ham_total = sum(word_counts["ham"])

print("class_counts:", dict(class_counts))
print("spam word total:", spam_total)
print("ham word total:", ham_total)


{'ham', 'spam'}
{'ham': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'spam': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
class_counts: {'spam': 3, 'ham': 3}
spam word total: 13
ham word total: 15


In [38]:
# Explainer cell: walks through the list-comprehension form of the per-class
# accumulation step in count_words_by_class, first on small made-up lists and
# then on the real data:
# word_counts[label] = [wc + c for wc, c in zip(word_counts[label], counts)]
# Reads the globals `vocab` and `word_counts`, so the cells that create them
# must have been run first.

print("Understanding the element-wise addition:")
print("=" * 50)

# Example with simple numbers (two lists of equal length)
existing_counts = [1, 0, 2, 0, 1]  # Current word counts for a class
new_counts =      [0, 1, 1, 0, 2]  # New document's word counts

print("Existing counts:", existing_counts)
print("New counts:     ", new_counts)

# The zip() function pairs up elements from both lists position by position;
# enumerate() only supplies the position number for the printout.
print("\nzip() pairs elements:")
for i, (wc, c) in enumerate(zip(existing_counts, new_counts)):
    print(f"Position {i}: existing={wc}, new={c}, sum={wc + c}")

# List comprehension does the addition: one sum per zipped pair, collected
# into a new list (the two input lists are left unchanged).
result = [wc + c for wc, c in zip(existing_counts, new_counts)]
print("\nResult after addition:", result)

print("\n" + "="*50)
print("In the context of our spam/ham classifier:")
print("- word_counts[label] = running total of word counts for that class")
print("- counts = word counts for the current document being processed")
print("- We're accumulating word frequencies across all documents in each class")

# Let's see this with actual vocab (only the first 5 entries are shown)
print(f"\nOur vocabulary has {len(vocab)} words:")
print("Sample vocab mapping:", dict(list(vocab.items())[:5]))

# Show actual word_counts structure: one list per class, with one slot per
# vocab word; only the first 10 slots of each class are printed.
print(f"\nword_counts['spam'] has {len(word_counts['spam'])} elements (one per vocab word)")
print("First 10 spam word counts:", word_counts['spam'][:10])
print("First 10 ham word counts:", word_counts['ham'][:10])

Understanding the element-wise addition:
Existing counts: [1, 0, 2, 0, 1]
New counts:      [0, 1, 1, 0, 2]

zip() pairs elements:
Position 0: existing=1, new=0, sum=1
Position 1: existing=0, new=1, sum=1
Position 2: existing=2, new=1, sum=3
Position 3: existing=0, new=0, sum=0
Position 4: existing=1, new=2, sum=3

Result after addition: [1, 1, 3, 0, 3]

In the context of our spam/ham classifier:
- word_counts[label] = running total of word counts for that class
- counts = word counts for the current document being processed
- We're accumulating word frequencies across all documents in each class

Our vocabulary has 21 words:
Sample vocab mapping: {'win': 0, 'a': 1, 'free': 2, 'vacation': 3, 'now': 4}

word_counts['spam'] has 21 elements (one per vocab word)
First 10 spam word counts: [1, 1, 3, 1, 1, 1, 3, 1, 0, 0]
First 10 ham word counts: [0, 1, 0, 0, 0, 0, 0, 0, 1, 1]


In [39]:
# Trace the word-counting process document by document.
# Reads the globals train_texts, train_labels, vocab, word_counts and
# class_counts, so the cells that create them must have been run first.
print("Tracing through the word counting process:")
print("=" * 60)

# Only the first 10 vocabulary entries are traced, to keep the output short.
vocab_items = list(vocab.items())[:10]  # First 10 words for demonstration
print("First 10 vocabulary words:")
for word, idx in vocab_items:
    print(f"  '{word}' -> index {idx}")

print(f"\nProcessing training texts for spam vs ham:")
print("-" * 40)

# Show what happens for each text
for i, (text, label) in enumerate(zip(train_texts, train_labels)):
    print(f"\nDocument {i+1}: '{text}' -> {label}")

    # Get the word vector for this document
    doc_vector = vectorize(text, vocab)

    # Show which words appear in this document
    tokens = tokenize(text)
    print(f"  Tokens: {tokens}")

    # Show the first few elements of the vector
    print(f"  Vector (first 10): {doc_vector[:10]}")

    # Show which specific words got counted (restricted to the 10 traced
    # vocabulary entries, so later words of a document are not listed)
    for word, idx in vocab_items:
        if doc_vector[idx] > 0:
            print(f"    Word '{word}' appears {doc_vector[idx]} time(s)")

print(f"\nFinal word counts by class:")
print(f"Spam class - first 10 word counts: {word_counts['spam'][:10]}")
print(f"Ham class  - first 10 word counts: {word_counts['ham'][:10]}")

# Look the index of 'free' up in the vocabulary instead of hard-coding it,
# so the message and the counts stay correct if the vocabulary order changes.
free_idx = vocab["free"]

print(f"\nWhat this means:")
print(f"- Total spam documents: {class_counts['spam']}")
print(f"- Total ham documents: {class_counts['ham']}")
print(f"- Word 'free' (index {free_idx}) appears {word_counts['spam'][free_idx]} times in spam docs")
print(f"- Word 'free' (index {free_idx}) appears {word_counts['ham'][free_idx]} times in ham docs")

Tracing through the word counting process:
First 10 vocabulary words:
  'win' -> index 0
  'a' -> index 1
  'free' -> index 2
  'vacation' -> index 3
  'now' -> index 4
  'lottery' -> index 5
  'winner' -> index 6
  'claim' -> index 7
  'call' -> index 8
  'mom' -> index 9

Processing training texts for spam vs ham:
----------------------------------------

Document 1: 'win a free vacation now' -> spam
  Tokens: ['win', 'a', 'free', 'vacation', 'now']
  Vector (first 10): [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    Word 'win' appears 1 time(s)
    Word 'a' appears 1 time(s)
    Word 'free' appears 1 time(s)
    Word 'vacation' appears 1 time(s)
    Word 'now' appears 1 time(s)

Document 2: 'free lottery winner claim' -> spam
  Tokens: ['free', 'lottery', 'winner', 'claim']
  Vector (first 10): [0, 0, 1, 0, 0, 1, 1, 1, 0, 0]
    Word 'free' appears 1 time(s)
    Word 'lottery' appears 1 time(s)
    Word 'winner' appears 1 time(s)
    Word 'claim' appears 1 time(s)

Document 3: 'call mom for dinn