Below is an implementation of the multinomial naive Bayes classifier together with a unit test on synthetic data and a performance comparison with the multinomial naive Bayes model in sklearn on the 20 Newsgroups dataset.  The performance of the model is essentially the same as that of the sklearn version; the small difference is most likely due to slight differences in preprocessing.

In [149]:
import collections # for Counter
import math        # for log

class MultinomialNaiveBayes(object):
    '''
    Multinomial naive Bayes classifier for documents given as lists of words.

    After ``train`` has been called the model holds one prior per class
    (``priors``) and a matrix ``theta_hat`` whose entry ``[c][i]`` is the
    smoothed estimate of the probability of word ``i`` in class ``c``.
    '''

    def __init__(self, alpha=1.0, uniform_priors=False):
        '''
        The parameters ``theta_hat`` are stored in a matrix after training.
        Therefore, we need to assign indices to the classes (and have the
        reverse mapping) and indices to the words -- hence the attributes
        ``class_to_index``, ``index_to_class``, and ``word_to_index``.

        The priors are stored in an array ``priors`` after training
        with the indices of the array matching the indices in
        ``class_to_index``.  ``alpha`` is the (non-negative) smoothing
        parameter and ``uniform_priors`` is a boolean that allows us to set
        the priors to uniform.

        Raises ``ValueError`` if ``alpha`` is negative.
        '''
        if alpha < 0:
            raise ValueError('alpha must be non-negative, got %r' % (alpha,))
        self.class_to_index = {}
        self.index_to_class = {}  # needed for prediction.
        # list of the priors
        self.priors = None
        self.word_to_index = {}
        # matrix with ci entry theta^hat_ci
        self.theta_hat = None
        self.alpha = alpha
        self.uniform_priors = uniform_priors

    def train(self, X_train, y_train):
        '''
        Fits the priors and the smoothed word probabilities.

        Input: ``X_train`` is a list of lists of strings and ``y_train`` is
        a list of the same length as ``X_train`` of corresponding classes.

        Classes and words are indexed in order of first appearance.

        Raises ``ValueError`` if the two inputs differ in length or are empty.
        '''
        # Explicit checks instead of ``assert`` so they survive ``python -O``.
        # ``len`` is used (not truthiness) so array-like labels work too.
        if len(X_train) != len(y_train):
            raise ValueError('X_train and y_train must have the same length')
        N = len(y_train)  # number of datapoints.
        if N == 0:
            raise ValueError('cannot train on an empty dataset')

        # assign indices to the classes.
        class_counts = collections.Counter(y_train)
        self.class_to_index = {c: i for i, c in enumerate(class_counts)}
        self.index_to_class = {i: c for c, i in self.class_to_index.items()}
        C = len(self.class_to_index)  # number of classes.

        # compute priors; ``class_counts`` iterates in class-index order.
        if self.uniform_priors:
            self.priors = [1 / C] * C
        else:
            self.priors = [count / N for count in class_counts.values()]

        # One pass over the corpus collects the per-class word counts
        # (N_ci in the paper), the per-class totals (N_c) and the vocabulary.
        # ``Counter.update`` is used rather than ``+=``: the latter rescans
        # the whole accumulated counter on every document.
        word_to_index = {}
        counts_per_class = [collections.Counter() for _ in range(C)]
        total_words_in_class = [0] * C
        for x, y in zip(X_train, y_train):
            c = self.class_to_index[y]
            counts_per_class[c].update(x)
            total_words_in_class[c] += len(x)
            for word in x:
                word_to_index.setdefault(word, len(word_to_index))
        self.word_to_index = word_to_index

        # compute theta hats:
        # theta^hat_ci = (N_ci + alpha) / (N_c + W * alpha).
        W = len(word_to_index)  # number of words.
        alpha = self.alpha
        theta_hat = []
        for c in range(C):
            row = [0.0] * W
            for word, count in counts_per_class[c].items():
                row[word_to_index[word]] += count
            denominator = total_words_in_class[c] + W * alpha
            # A zero denominator only happens when alpha == 0 and the class
            # has no words at all; the row is then left as all zeros.
            if denominator != 0:
                row = [(n_ci + alpha) / denominator for n_ci in row]
            theta_hat.append(row)
        self.theta_hat = theta_hat

    @staticmethod
    def _get_max_index(nums):
        '''Returns the index of ``nums`` with the max value (the first such
        index on ties).'''
        return max(range(len(nums)), key=nums.__getitem__)

    @staticmethod
    def _log(p):
        '''Returns ``log(p)``, or ``-inf`` when ``p`` is zero (which can
        happen for unsmoothed estimates, i.e. ``alpha == 0``).'''
        return math.log(p) if p > 0 else -math.inf

    def predict_log_proba(self, X_test):
        '''Returns, for each document in ``X_test``, a list with one
        unnormalized log posterior (log prior plus log likelihood) per class,
        indexed as in ``index_to_class``.  Words that were not seen during
        training are ignored.
        '''
        log_priors = [math.log(prior) for prior in self.priors]
        word_to_index = self.word_to_index
        theta_hat = self.theta_hat
        log = self._log

        res = []
        for x in X_test:
            log_likelihoods = list(log_priors)
            for word, count in collections.Counter(x).items():
                i = word_to_index.get(word)
                if i is None:
                    # skip the words that are new.
                    continue
                for c, row in enumerate(theta_hat):
                    log_likelihoods[c] += count * log(row[i])
            res.append(log_likelihoods)
        return res

    def predict(self, X_test):
        '''Returns the list of classes predicted by the trained model, one
        per document in ``X_test`` and in the same order.
        '''
        return [self.index_to_class[self._get_max_index(lp)]
                for lp in self.predict_log_proba(X_test)]


The following cells provide example usage for the ``MultinomialNaiveBayes`` class. In addition, we computed the results of this example by hand and confirm that they agree with the output of the classifier.

In [150]:
# Test case: a tiny corpus that is small enough to verify by hand.

# Three training documents (already tokenized) ...
X_train = [["this", "is", "an", "entry"], 
          ["this", "is", "too"], 
          ["so", "is", "this"]]

# ... and their classes, in the same order.
y_train = ["red",
          "blue",
          "red"]

# Documents to classify; "hello" does not occur in the training documents.
X_test = [["this", "is", "an", "entry"],
          ["so", "this", "is"],
          ["hello"],
          ["this", "is", "too"]]


clf = MultinomialNaiveBayes()
clf.train(X_train, y_train)
# Last expression of the cell: the predicted classes are displayed below.
clf.predict(X_test)


['red', 'red', 'red', 'blue']

In [151]:
# Show the class/word index assignments and the priors learned above.
print(f'{clf.class_to_index=}')
print(f'{clf.word_to_index=}')
print(f'{clf.priors=}')

# Row 0 of theta_hat followed by the hand-computed values; the two printed
# lists should agree.
print(f'{clf.theta_hat[0]=}')
print([3/13, 3/13, 2/13, 2/13, 1/13, 2/13])

# Row 1 of theta_hat followed by the hand-computed values; the two printed
# lists should agree.
print(f'{clf.theta_hat[1]=}')
print([2/9, 2/9, 1/9, 1/9, 2/9, 1/9])


clf.class_to_index={'red': 0, 'blue': 1}
clf.word_to_index={'this': 0, 'is': 1, 'an': 2, 'entry': 3, 'too': 4, 'so': 5}
clf.priors=[0.6666666666666666, 0.3333333333333333]
clf.theta_hat[0]=[0.23076923076923078, 0.23076923076923078, 0.15384615384615385, 0.15384615384615385, 0.07692307692307693, 0.15384615384615385]
[0.23076923076923078, 0.23076923076923078, 0.15384615384615385, 0.15384615384615385, 0.07692307692307693, 0.15384615384615385]
clf.theta_hat[1]=[0.2222222222222222, 0.2222222222222222, 0.1111111111111111, 0.1111111111111111, 0.2222222222222222, 0.1111111111111111]
[0.2222222222222222, 0.2222222222222222, 0.1111111111111111, 0.1111111111111111, 0.2222222222222222, 0.1111111111111111]


We now compare our ``MultinomialNaiveBayes`` classifier with the multinomial naive Bayes classifier built into [sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html).  We do this on the [20 Newsgroups](https://scikit-learn.org/0.19/modules/generated/sklearn.datasets.fetch_20newsgroups.html#sklearn.datasets.fetch_20newsgroups) text classification benchmark dataset (see also [here](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)).  We see that our ``MultinomialNaiveBayes`` classifier achieves $0.775$ classification accuracy on this dataset while the built-in sklearn ``MultinomialNB`` achieves an accuracy of $0.773$ (this difference is likely due to some difference in preprocessing).

For our use of the ``MultinomialNaiveBayes`` classifier, we roughly mimic the preprocessing of sklearn's [``CountVectorizer``](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) class.  In particular, we split words at punctuation and whitespace, remove length-one words (``text_preprocess`` below), and remove words that only occur once (``filter_min_count`` below).

In [177]:
from sklearn.datasets import fetch_20newsgroups

# Optional restriction to four categories; currently disabled (the
# ``categories`` argument below is commented out as well).
#categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']

# Load the training split of the 20 Newsgroups dataset.
twenty_train = fetch_20newsgroups(subset='train',
                                  #categories=categories, 
                                  shuffle=True, 
                                  random_state=42)  # fixed seed for the shuffle


In [178]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Baseline: sklearn's CountVectorizer feeding MultinomialNB, both
# constructed with their default arguments.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Fit on the raw training documents and their targets.
text_clf.fit(twenty_train.data, twenty_train.target)


In [179]:
import numpy as np
# Load the test split with the same shuffle settings as the training split.
twenty_test = fetch_20newsgroups(subset='test',
                                 #categories=categories, 
                                 shuffle=True, 
                                 random_state=42)
predicted = text_clf.predict(twenty_test.data)
# Accuracy of the sklearn baseline: the fraction of predictions equal to the
# test targets (displayed as the cell output).
np.mean(predicted == twenty_test.target)


0.7728359001593202

In [176]:
import string
import collections

def text_preprocess(s):
    '''
    Tokenizes the string ``s``.

    Every punctuation character (``string.punctuation``) is turned into a
    space, the text is lowercased and split at whitespace, and tokens of
    length one are dropped (as sklearn's ``CountVectorizer`` does).
    Returns the remaining tokens as a list of strings.
    '''
    # Translation table sending each punctuation character to a space.
    punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    # ``split()`` without arguments already ignores leading/trailing whitespace.
    tokens = s.translate(punctuation_to_space).lower().split()
    return [token for token in tokens if len(token) >= 2]

def filter_min_count(X, min_count):
    '''
    Removes rare tokens from a collection of tokenized documents.

    ``X`` is a list of lists of tokens.  Token occurrences are counted over
    all of ``X``, and every token whose total count is below ``min_count``
    is dropped from every document.  Returns a new list of lists with the
    same number of documents and the original token order; ``X`` itself is
    not modified.
    '''
    counts = collections.Counter()
    for doc in X:
        # ``update`` rather than ``+=``: the latter rescans the whole
        # accumulated counter on every document.
        counts.update(doc)
    return [[token for token in doc if counts[token] >= min_count]
            for doc in X]


In [180]:
# Tokenize the training documents and train our classifier on them.
X_train = [text_preprocess(x) for x in twenty_train.data]
y_train = twenty_train.target

my_clf = MultinomialNaiveBayes()
my_clf.train(X_train, y_train)
# NOTE(review): the min-count filter is applied to the *test* documents only,
# using counts computed from the test documents themselves; the training
# documents above are not filtered.  Confirm that this is intended.
X_test = filter_min_count([text_preprocess(doc) for doc in twenty_test.data], 2)
# Alternative without the min-count filter:
#X_test = [text_preprocess(doc) for doc in twenty_test.data]
y_test = twenty_test.target

predicted = my_clf.predict(X_test)
# Accuracy of our classifier: the fraction of predictions equal to the test
# targets (displayed as the cell output).
np.mean(predicted == y_test)


0.7754912373871482