In [9]:
import re
import math
from collections import defaultdict, Counter

class NaiveBayesTextClassifier:
    """Multinomial Naive Bayes text classifier with Laplace (add-one) smoothing.

    Documents are tokenised into lowercase alphabetic words.  Training stores
    per-class document counts and per-class word frequencies; prediction picks
    the class with the highest log posterior
    ``log P(class) + sum(log P(word | class))``.
    """

    def __init__(self):
        self.vocab = set()  # Unique words in training data
        self.class_counts = Counter()  # Number of documents per class
        self.word_counts = {}  # Word frequency per class: {label: {word: count}}
        self.total_docs = 0  # Total documents in training data

    def preprocess(self, text):
        """Lowercase ``text`` and return its purely alphabetic words as a list."""
        return re.findall(r'\b[a-z]+\b', text.lower())

    def fit(self, X_train, y_train):
        """Train the model from scratch using word frequencies.

        Args:
            X_train: Iterable of document strings.
            y_train: Iterable of class labels, one per document.

        Raises:
            ValueError: If ``X_train`` and ``y_train`` differ in length.
        """
        X_train = list(X_train)
        y_train = list(y_train)
        if len(X_train) != len(y_train):
            # zip() would silently drop the extras while the priors below
            # would still count every label, so fail loudly instead.
            raise ValueError(
                f"X_train and y_train must have the same length "
                f"(got {len(X_train)} and {len(y_train)})"
            )

        # Start from a clean slate so that calling fit() twice does not mix
        # fresh class counts with word statistics from the previous run.
        self.vocab = set()
        self.word_counts = {}
        self.total_docs = len(y_train)
        self.class_counts = Counter(y_train)

        for text, label in zip(X_train, y_train):
            words = self.preprocess(text)
            self.vocab.update(words)
            # Register the label even when the document has no usable tokens,
            # so predict() can always look the class up.
            label_counts = self.word_counts.setdefault(label, {})
            for word in words:
                label_counts[word] = label_counts.get(word, 0) + 1

        # Existing behaviour: echo the learned per-class word counts.
        print(self.word_counts)

    def predict(self, text):
        """Predict the class of a new document.

        Args:
            text: The document string to classify.

        Returns:
            The label with the highest log posterior.  Ties go to the label
            that appeared first in the training labels.

        Raises:
            ValueError: If the model has not been fitted on any documents.
        """
        if not self.class_counts:
            raise ValueError("predict() called before fit(): no classes learned")

        words = self.preprocess(text)
        vocab_size = len(self.vocab)  # Number of unique words in training data
        class_probs = {}  # Log posterior (up to a constant) for each class

        for label, doc_count in self.class_counts.items():
            # Log prior: P(y) = (# documents in class y) / (total documents)
            log_prob = math.log(doc_count / self.total_docs)

            label_counts = self.word_counts.get(label, {})
            # Laplace denominator: total word occurrences in class + |vocab|
            denominator = sum(label_counts.values()) + vocab_size

            # With an empty vocabulary there is no likelihood information
            # (and the denominator would be zero), so only the prior is used.
            if vocab_size:
                for word in words:
                    # +1 ensures unseen words never get zero probability.
                    word_prob = (label_counts.get(word, 0) + 1) / denominator
                    log_prob += math.log(word_prob)  # Sum log probabilities

            class_probs[label] = log_prob

        # Return the class with the highest (log) probability
        return max(class_probs, key=class_probs.get)

# === Example Usage ===
# Four tiny training sentences: the first two are positive, the last two negative.
X_train = ["I love coding", "Python is amazing", "I hate bugs", "Debugging is hard"]
y_train = ["positive"] * 2 + ["negative"] * 2

# Build the classifier and train it on the labelled sentences.
nb_classifier = NaiveBayesTextClassifier()
nb_classifier.fit(X_train, y_train)

# Classify two unseen sentences and print one predicted label per line.
print(
    nb_classifier.predict("I enjoy coding in Python"),  # Likely: "positive"
    nb_classifier.predict("I hate bugs its hard to fix"),  # Likely: "negative"
    sep="\n",
)

{'positive': {'i': 1, 'love': 1, 'coding': 1, 'python': 1, 'is': 1, 'amazing': 1}, 'negative': {'i': 1, 'hate': 1, 'bugs': 1, 'debugging': 1, 'is': 1, 'hard': 1}}
positive
negative


In [10]:
# Iterating a dict yields its keys, so this orders the class labels in
# descending alphabetical order; the scores themselves are not consulted.
class_probs = dict(tech=1, finance=1.2, health=1.3)
sorted_dict = sorted(class_probs.keys())[::-1]
sorted_dict

['tech', 'health', 'finance']

In [None]:
X_train = [
    "I love coding",
    "Python is amazing",
    "I hate bugs",
    "Debugging is hard"
]

y_train = ["positive", "positive", "negative", "negative"]

# After calling fit(), the model stores:
# 1. Class counts (how many examples per class)
# self.class_counts
{'positive': 2, 'negative': 2}

# 2. Word counts per class (preprocess() lowercases every token,
#    so the keys are lowercase even though the sentences are not)
# self.word_counts
{
    'positive': {'i': 1, 'love': 1, 'coding': 1, 'python': 1, 'is': 1, 'amazing': 1},
    'negative': {'i': 1, 'hate': 1, 'bugs': 1, 'debugging': 1, 'is': 1, 'hard': 1}
}
# 3. Vocabulary (unique lowercased words across all texts)
# self.vocab
{'i', 'love', 'coding', 'python', 'is', 'amazing', 'hate', 'bugs', 'debugging', 'hard'}

In [None]:
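# Worked example for the class 'positive' (notes, not executable code):
# self.word_counts['positive'] = {'i': 1, 'love': 1, 'coding': 1, 'python': 1, 'is': 1, 'amazing': 1}
# total_words_in_class = 6
# vocab_size = 10
#
# For "love":
#   word_freq = 1 + 1 = 2
#   word_prob = 2 / (6 + 10) = 2/16 = 0.125
#   log_prob += log(0.125), i.e. log_prob changes by about -2.08 (natural log)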

In [None]:
r''' How the likelihood is computed (with Laplace smoothing):
P(word | class) = \frac{\text{word count in class} + 1}{\text{total words in class} + \text{vocabulary size}}

If we calculate probabilities without smoothing, the probability of a word given a class is:

This version is easy to understand:
P(word | class) = \frac{\text{word count in class}}{\text{total words in class}}


However, if a word never appeared in the training data for a particular class, its count is zero, which results in:


P(word | class) = \frac{0}{\text{total words in class}} = 0


Since Naive Bayes multiplies probabilities, having even one zero probability makes the entire class probability zero. This can wrongly eliminate the class from consideration.

Laplace Smoothing fixes this by adding 1 to each word count.


Numerator:  \text{word count in class} + 1
	•	If a word appears  N  times in a class, we count it as  N+1 .
	•	If a word never appeared ( N = 0 ), we still give it a small probability ( 0 + 1 = 1 ).
	•	This prevents zero probabilities.

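Denominator:  \text{total words in class} + \text{vocabulary size}
	•	The total count of all words in the class is increased by the vocabulary size (the number of unique words in all training data).
	•	Reason: adding 1 to each of the V vocabulary words adds V extra counts in total, so the denominator must grow by V as well.
	•	This ensures that the probabilities remain valid: summed over the whole vocabulary they still equal 1.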


'''