In [3]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
import tensorflow_probability as tfp

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score

tfd = tfp.distributions

In [4]:
def get_news_data(categories=None, verbose=True, max_df=0.2):
    """Load the 20 Newsgroups train/test splits as binary bag-of-words features.

    Parameters
    ----------
    categories : list of str or None
        Forwarded to ``fetch_20newsgroups`` for both the train and test subsets.
    verbose : bool
        If True, print the split sizes, vocabulary size and number of classes.
    max_df : float or int
        Forwarded to ``CountVectorizer(max_df=...)``. With the default of 0.2,
        tokens occurring in more than 20% of the training documents are dropped.

    Returns
    -------
    Data : list
        ``[X_train, X_test, y_train, y_test]``; the X entries are the dense
        results of ``.todense()`` on the vectorizer output, the y entries are
        the ``target`` arrays of the two subsets.
    orig_Data : list
        ``[train, test]`` objects as returned by ``fetch_20newsgroups``.
    idx2word : dict
        Maps a feature column index back to its vocabulary token.
    class_names : list of str
        ``target_names`` of the training subset.
    """
    # Load the 20 newsgroups dataset
    newsgroup_data_train = fetch_20newsgroups(subset='train', categories=categories)
    newsgroup_data_test = fetch_20newsgroups(subset='test', categories=categories)
    orig_Data = [newsgroup_data_train, newsgroup_data_test]

    # Vectorize the data using the count vectorizer
    n_docs = len(newsgroup_data_train['data'])
    vectorizer = CountVectorizer(
        input='content',
        analyzer='word',
        stop_words='english',
        binary=True,  # record presence/absence of a word rather than its count
        max_df=max_df,  # honour the caller's value (was hard-coded to 0.2)
        # As a proportion this equals 1.01 documents, i.e. slightly more than
        # a single training document.
        min_df=1.01 / n_docs,
    )
    # The vocabulary is learned on the training split only and reused for test.
    X_train = vectorizer.fit_transform(newsgroup_data_train.data).todense()
    X_test = vectorizer.transform(newsgroup_data_test.data).todense()
    y_train = newsgroup_data_train.target
    y_test = newsgroup_data_test.target
    Data = [X_train, X_test, y_train, y_test]
    class_names = newsgroup_data_train.target_names

    # vocabulary_ maps token -> column index; invert it for decoding features.
    idx2word = {v: k for k, v in vectorizer.vocabulary_.items()}

    if verbose:
        print('Train data shape:', newsgroup_data_train.filenames.shape)
        print('Test data shape:', newsgroup_data_test.filenames.shape)
        print('Vocabulary size:', len(vectorizer.vocabulary_))
        print('Number of classes:', np.max(y_train) + 1)

    return Data, orig_Data, idx2word, class_names

In [5]:
# Newsgroup categories used for this experiment.
categs = [
    'sci.space',
    'rec.sport.baseball',
    'comp.graphics',
    'talk.politics.guns',
]

# Load the data once, then unpack the feature/label splits and the raw datasets.
data, orig_data, idx2word, class_names = get_news_data(categories=categs)
(X_train, X_test, y_train, y_test) = data
(newsgroup_data_train, newsgroup_data_test) = orig_data

Train data shape: (2320,)
Test data shape: (1544,)
Vocabulary size: 18619
Number of classes: 4


In [7]:
class BernoulliNaiveBayes():
    """Bernoulli naive Bayes classifier built on TensorFlow Probability.

    Each class is modelled as a product of independent Bernoulli distributions,
    one per feature. Class labels are assumed to be the integers
    ``0 .. n_classes - 1``: priors and conditionals are indexed by
    ``range(n_classes)``, and the prior sanity check fails if they are not.
    """

    def __init__(self, alpha=1.0):
        """
        alpha: float, Laplace smoothing parameter.
        """
        self.alpha = alpha
        self.class_priors = None
        self.class_conditionals = None
        self.n_classes = None
        self.n_features = None

    def fit(self, X, y):
        """Estimate class priors and per-class feature probabilities.

        X: 2-D array-like of shape (n_samples, n_features).
        y: 1-D array-like of integer class labels, one per row of X.
        """
        self.n_classes = len(np.unique(y))
        self.n_features = X.shape[1]
        self.class_priors = self.compute_class_priors(y)
        self.class_conditionals = self.compute_class_conditionals(X, y)

    def compute_class_priors(self, y):
        """Return the empirical class frequencies as a float32 tensor of shape (n_classes,)."""
        y = np.asarray(y)  # so that `y == i` is element-wise even for plain lists
        N = len(y)
        class_priors = np.zeros(self.n_classes)
        for i in range(self.n_classes):
            class_priors[i] = np.sum(y == i) / N
        # Compare with a tolerance: a sum of floating-point ratios is not
        # guaranteed to be exactly 1.0 even when the counts add up to N.
        assert np.isclose(np.sum(class_priors), 1.0), 'Class priors should sum to 1'
        return tf.cast(class_priors, tf.float32)

    # Backward-compatible alias for the original (misspelled) method name.
    compute_class_prioirs = compute_class_priors

    def compute_class_conditionals(self, X, y):
        """Return smoothed per-class feature probabilities.

        Entry [i, j] is (count of feature j in class i + alpha) /
        (number of class-i samples + 2 * alpha), as a float32 tensor of shape
        (n_classes, n_features).
        """
        y = np.asarray(y)
        class_conditionals = np.zeros((self.n_classes, self.n_features))
        for i in range(self.n_classes):
            X_class = X[y == i]
            n_class = X_class.shape[0]
            # Flatten so the row assignment works the same whether np.sum
            # returns a 1-D array or a (1, n_features) matrix.
            feature_counts = np.asarray(np.sum(X_class, axis=0)).ravel()
            class_conditionals[i] = (feature_counts + self.alpha) / (n_class + self.alpha * 2)
        return tf.cast(class_conditionals, tf.float32)

    def make_distribution(self):
        """Build the class-conditional distribution from the fitted probabilities.

        The Bernoulli batch is wrapped in ``tfd.Independent`` with
        ``reinterpreted_batch_ndims=1`` so that ``log_prob`` sums over the
        feature axis and yields one value per class.
        """
        bernoulli_batch = tfd.Bernoulli(probs=self.class_conditionals)
        bernoulli_ind = tfd.Independent(bernoulli_batch, reinterpreted_batch_ndims=1)
        return bernoulli_ind

    def _log_posterior(self, distribution, log_priors, X):
        """Normalized class log-probabilities for X under a prebuilt distribution."""
        cond_log_probs = distribution.log_prob(X)
        joint_log_probs = tf.add(log_priors, cond_log_probs)
        # log p(x): log-sum-exp of the joint over the class axis.
        log_evidence = tf.reduce_logsumexp(joint_log_probs, axis=-1, keepdims=True)
        return joint_log_probs - log_evidence

    def predict_logprob_single(self, X):
        """Return the normalized log-probability of each class for one sample X."""
        return self._log_posterior(
            self.make_distribution(), tf.math.log(self.class_priors), X
        )

    def predict(self, X, prob=False):
        """Score every row of X.

        Returns a NumPy array of shape (n_samples, n_classes) holding class
        log-probabilities, or, when ``prob`` is True, a tensor holding their
        exponentials (the result of ``tf.exp`` on that array).
        """
        # Build the distribution and log-priors once instead of once per row.
        distribution = self.make_distribution()
        log_priors = tf.math.log(self.class_priors)
        log_probs = np.zeros((X.shape[0], self.n_classes))
        for i in range(X.shape[0]):
            log_probs[i] = self._log_posterior(distribution, log_priors, X[i])
        if prob:
            return tf.exp(log_probs)
        else:
            return log_probs

    def predict_class(self, X):
        """Return the index of the highest-scoring class for each row of X."""
        return np.argmax(self.predict(X), axis=1)

In [8]:
# Fit the hand-written Bernoulli naive Bayes model on the training split,
# using alpha=1.0 as the smoothing parameter.
bnb = BernoulliNaiveBayes(alpha=1.0)
bnb.fit(X_train, y_train)

In [9]:
# Example prediction: normalized class log-probabilities (one entry per class)
# for the first test document.
bnb.predict_logprob_single(X_test[0])

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([  0.      , -27.061478, -50.856857, -71.56946 ], dtype=float32)>

In [10]:
# Test Performance
# Score the test set once and derive both the probabilities and the hard labels
# from the same log-posteriors; calling predict() and predict_class() separately
# would run the full per-document loop twice.
test_log_probs = bnb.predict(X_test)
test_probs = tf.exp(test_log_probs)
test_preds = np.argmax(test_log_probs, axis=1)

acc = accuracy_score(y_test, test_preds)
f1 = f1_score(y_test, test_preds, average='macro')

print(f'Accuracy: {acc:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.8944
F1 Score: 0.8970


In [11]:
# Show example news
# pandas is already imported in the first cell.
# pd.option_context() only has an effect inside a `with` block, so calling it
# bare left the column width untouched. Set the option for the session so the
# DataFrame displayed at the end of this cell is not truncated as aggressively.
pd.set_option('display.max_colwidth', 500)

n_sample = 5
n_charaxcters = 1000  # number of leading characters of each post to print

# Fixed seed so the same documents are shown on every run.
np.random.seed(73)
sample_idx = np.random.choice(X_test.shape[0], n_sample)
news = [newsgroup_data_test.data[i] for i in sample_idx]
news_labels = [class_names[y_test[i]] for i in sample_idx]
news_preds = [class_names[test_preds[i]] for i in sample_idx]

for i in range(n_sample):
    print(f"True label: {news_labels[i]}")
    print(f"Predicted label: {news_preds[i]}")
    print(f"\nContent:\n {news[i][:n_charaxcters]}")
    print('='*50)


# Map the active (== 1) feature columns of each sampled row back to words.
# Flattening the row first makes this work whether X_test is an np.matrix
# (rows stay 2-D) or a plain ndarray (rows are 1-D).
decoded_words = [
    ', '.join(idx2word[j] for j in np.flatnonzero(np.asarray(X_test[i]).ravel() == 1))
    for i in sample_idx
]

df = pd.DataFrame({'True Label': news_labels, 'Predicted Label': news_preds, 'Decoded Words': decoded_words})
df

True label: comp.graphics
Predicted label: comp.graphics

Content:
 From: carlos@carlos.jpr.com (Carlos Dominguez)
Subject: Re: Where did the hacker ethic go?
Reply-To: carlos@carlos.jpr.com
Organization: Private Helldiver/Usenet system, Brooklyn, NY, USA
Lines: 38
X-Newsreader: Helldiver 1.07 (Waffle 1.65)

In <1sp4qj$243@dorsai.dorsai.org> crawls@dorsai.dorsai.org (Charles Rawls) writes:

>The hacker ethic is ALIVE and WELL here.  I know of what you speak, and my
>only answer is "SCREW 'EM".  You have to do what make you feel right.

amen.. I too have learned by example, specifically yours. :)

>What can I say but keep the faith, there are others who do likewise.

.. but dorsai leads the way.. Unlike other services that are commercial
in nature, dorsai is a community based service. While others charge
monthly fees for access, dorsai accepts donations from those who can
afford to contribute.

   While other systems don't respond to user input, dorsai thrives on it.
Other systems sell 

Unnamed: 0,True Label,Predicted Label,Decoded Words
0,comp.graphics,comp.graphics,"07, 243, 38, 65, accepts, access, address, adm..."
1,rec.sport.baseball,rec.sport.baseball,"10, 1024, 12, 140, 14853, 32, 50, 75, 87, able..."
2,rec.sport.baseball,rec.sport.baseball,"10, 1960, 1980, 1993apr18, 1993apr19, 1b, 25, ..."
3,talk.politics.guns,talk.politics.guns,"1993, 1bfdqsj53kostz6hroshsdzlvul1, 202, 23, 3..."
4,comp.graphics,comp.graphics,"aron, berkeley, bonar, california, ced, conver..."


## Test against sklearn

In [None]:

from sklearn.naive_bayes import BernoulliNB

# Reference implementation: fit scikit-learn's Bernoulli naive Bayes with the
# same smoothing parameter on the same training data (as plain ndarrays).
bnb_sk = BernoulliNB(alpha=1.0).fit(np.asarray(X_train), y_train)

# Class probabilities for the first four test documents.
test_probs_sk = bnb_sk.predict_proba(np.asarray(X_test)[0:4])
test_probs_sk

array([[1.00000000e+000, 1.76704423e-012, 8.18625756e-023,
        8.27307091e-032],
       [9.23201264e-022, 2.40051991e-034, 1.00000000e+000,
        1.01861610e-010],
       [9.58339542e-010, 2.39660974e-009, 9.98979339e-001,
        1.02065778e-003],
       [1.95015711e-122, 4.86095747e-107, 1.25694169e-090,
        1.00000000e+000]])

In [23]:
# Re-create and re-fit the custom model, then compute class probabilities for
# the same four test documents that were passed to the sklearn model, so the
# two outputs can be compared side by side.
bnb = BernoulliNaiveBayes(alpha=1.0)
bnb.fit(X_train, y_train)
bnb.predict(np.asarray(X_test)[0:4], prob=True)

<tf.Tensor: shape=(4, 4), dtype=float64, numpy=
array([[1.00000000e+000, 1.76745994e-012, 8.18742826e-023,
        8.27521554e-032],
       [9.23305532e-022, 2.40163558e-034, 1.00000000e+000,
        1.01957024e-010],
       [9.58291465e-010, 2.39740975e-009, 9.98962940e-001,
        1.02132169e-003],
       [1.94974907e-122, 4.86016663e-107, 1.25595674e-090,
        1.00000000e+000]])>