In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


In [2]:
class MultinomialNaiveBayes:
    """Multinomial Naive Bayes classifier for non-negative count features.

    Each class ``c`` is modelled by a prior ``P(c)`` and a multinomial
    distribution over features ``P(feature | c)``, estimated from summed
    feature counts with additive (Laplace/Lidstone) smoothing ``alpha``.

    Attributes set by ``fit``:
        classes: sorted unique labels, shape ``(n_classes,)``.
        priors: class prior probabilities, shape ``(n_classes,)``.
        feature_prob: per-class feature probabilities,
            shape ``(n_classes, n_features)``.
    """

    def __init__(self, alpha=1.0):
        """Store the smoothing strength; the model itself is built by ``fit``.

        Args:
            alpha: additive smoothing added to every per-class feature count.
                Must be non-negative (checked in ``fit``).
        """
        self.alpha = alpha
        self.classes = None
        self.feature_prob = None
        self.priors = None

    @staticmethod
    def _as_2d(X):
        """Normalize ``X`` to a 2-D container of shape (n_samples, n_features).

        Objects exposing ``tocsr()`` (SciPy sparse containers) are converted
        to CSR and kept sparse; anything else goes through ``np.asarray``.

        Raises:
            ValueError: if the result is not two-dimensional.
        """
        if hasattr(X, "tocsr"):
            X = X.tocsr()
        else:
            X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                "X must be 2-dimensional (n_samples, n_features), got ndim=%d"
                % X.ndim
            )
        return X

    def fit(self, X, Y):
        """Estimate class priors and smoothed per-class feature probabilities.

        Args:
            X: count matrix of shape ``(n_samples, n_features)``.
            Y: labels of shape ``(n_samples,)``.

        Raises:
            ValueError: if ``alpha`` is negative, ``X`` is empty or not 2-D,
                or ``Y`` does not hold one label per row of ``X``.
        """
        if self.alpha < 0:
            raise ValueError("alpha must be non-negative, got %r" % (self.alpha,))

        X = self._as_2d(X)
        Y = np.asarray(Y)
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit on an empty dataset")
        if Y.ndim != 1 or Y.shape[0] != n_samples:
            raise ValueError(
                "Y must be 1-dimensional with one label per row of X "
                "(expected %d labels, got shape %r)" % (n_samples, Y.shape)
            )

        self.classes = np.unique(Y)
        n_classes = len(self.classes)
        self.feature_prob = np.zeros((n_classes, n_features))
        self.priors = np.zeros(n_classes)

        for idx, c in enumerate(self.classes):
            in_class = Y == c
            self.priors[idx] = np.count_nonzero(in_class) / float(n_samples)

            # Column sums over the class's rows. np.asarray(...).ravel() also
            # flattens the (1, n_features) result sparse containers return.
            counts = np.asarray(X[in_class].sum(axis=0), dtype=float).ravel()
            smoothed = counts + self.alpha
            total = smoothed.sum()

            # total can only be 0 when alpha == 0 and the class has no counts
            # at all; the row then stays all-zero instead of becoming 0/0 NaN.
            if total > 0:
                self.feature_prob[idx] = smoothed / total

    def _joint_log_likelihood(self, X):
        """Return ``log P(c) + sum_i x_i * log P(i | c)`` for every sample/class.

        Args:
            X: count matrix of shape ``(n_samples, n_features)``.

        Returns:
            ndarray of shape ``(n_samples, n_classes)``. An entry is ``-inf``
            when the sample contains a feature whose probability under that
            class is exactly 0 (only possible with ``alpha == 0``).

        Raises:
            RuntimeError: if the model has not been fitted.
            ValueError: if ``X`` is not 2-D or its feature count differs from
                the one seen in ``fit``.
        """
        if self.classes is None or self.feature_prob is None or self.priors is None:
            raise RuntimeError("fit must be called before predicting")

        X = self._as_2d(X)
        if X.shape[1] != self.feature_prob.shape[1]:
            raise ValueError(
                "X has %d features, but the model was fitted with %d"
                % (X.shape[1], self.feature_prob.shape[1])
            )

        # Logs are taken once per call rather than once per sample and class.
        with np.errstate(divide="ignore"):
            log_prob = np.log(self.feature_prob)
            log_prior = np.log(self.priors)

        possible = np.isfinite(log_prob)
        if possible.all():
            jll = np.asarray(X @ log_prob.T)
        else:
            # Use the convention 0 * log(0) = 0: zero-probability features
            # contribute nothing unless the sample actually contains them,
            # in which case the class is impossible (-inf). A plain product
            # would give 0 * -inf = NaN and poison argmax.
            jll = np.asarray(X @ np.where(possible, log_prob, 0.0).T)
            hits = np.asarray(X @ (~possible).astype(float).T)
            jll = np.where(hits > 0, -np.inf, jll)

        return jll + log_prior

    def predict(self, X):
        """Return the most probable class label for each row of ``X``.

        Ties are resolved in favour of the class that comes first in
        ``self.classes``.
        """
        scores = self._joint_log_likelihood(X)
        return self.classes[np.argmax(scores, axis=1)]

    def _predict_sample(self, x):
        """Predict the label of one sample given as a single feature vector."""
        row = x if hasattr(x, "tocsr") else np.asarray(x).reshape(1, -1)
        return self.predict(row)[0]

    def predict_probability(self, X):
        """Return posterior class probabilities for each row of ``X``.

        Returns:
            ndarray of shape ``(n_samples, n_classes)`` whose rows sum to 1.
            A row is NaN when every class has zero likelihood for that sample
            (only possible with ``alpha == 0``).
        """
        scores = self._joint_log_likelihood(X)
        with np.errstate(invalid="ignore"):
            # Subtract the row maximum before exponentiating so that very
            # negative log scores do not all underflow to 0.
            weights = np.exp(scores - scores.max(axis=1, keepdims=True))
            return weights / weights.sum(axis=1, keepdims=True)

In [None]:
# --- Data: posts from two newsgroups, with headers/footers/quotes removed ---
categories = ['sci.space', 'rec.sport.baseball']
data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# --- Features: bag-of-words term counts ---
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data.data)
Y = data.target

# --- Hold out 30% of the samples for evaluation (fixed seed) ---
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Dense copies of both splits, used by the hand-written classifier.
X_train_np, X_test_np = X_train.toarray(), X_test.toarray()

# --- Custom implementation ---
my_nb = MultinomialNaiveBayes(alpha=1.0)
my_nb.fit(X_train_np, Y_train)
my_preds = my_nb.predict(X_test_np)

# --- scikit-learn reference, fed the un-densified splits ---
sk_nb = MultinomialNB(alpha=1.0)
sk_nb.fit(X_train, Y_train)
sk_preds = sk_nb.predict(X_test)

# --- Side-by-side accuracy on the held-out split ---
my_accuracy = accuracy_score(Y_test, my_preds)
sk_accuracy = accuracy_score(Y_test, sk_preds)
print("Accuracy of your Naive Bayes:", my_accuracy)
print("Accuracy of sklearn Naive Bayes:", sk_accuracy)

Accuracy of your Naive Bayes: 0.9355742296918768
Accuracy of sklearn Naive Bayes: 0.9355742296918768
