In [2]:
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

In [10]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import f1_score
import numpy as np

In [11]:
# Making a function to get data which:
#   1) Fetches the 20 newsgroups dataset
#   2) Performs a word count on the articles and binarizes the result
#   3) Returns the data as a numpy matrix with the labels

def get_data(categories):
    """Load a 20 Newsgroups subset as binary bag-of-words matrices.

    The vocabulary is fitted on the training split only and then applied to
    the test split, so both feature matrices share the same columns.

    Args:
        categories: Newsgroup names forwarded to ``fetch_20newsgroups``.

    Returns:
        ``((train_features, train_labels), (test_features, test_labels))``.
        The feature matrices are dense 2-D ``numpy.ndarray`` objects holding
        0/1 word-presence indicators (one row per document); the labels are
        the ``target`` arrays returned by ``fetch_20newsgroups``.
    """
    newsgroups_train_data = fetch_20newsgroups(data_home='20_Newsgroup_Data/',
                                               subset='train', categories=categories)
    newsgroups_test_data = fetch_20newsgroups(data_home='20_Newsgroup_Data/',
                                              subset='test', categories=categories)

    # Record word presence (binary=True) rather than counts, dropping words
    # found in more than 25% of the training documents and words found in
    # only a single document. An integer min_df is an absolute document
    # count, so min_df=2 keeps exactly the words seen in >= 2 documents.
    count_vectorizer = CountVectorizer(input='content',
                                       binary=True,
                                       max_df=0.25,
                                       min_df=2)

    train_binary_bag_of_words = count_vectorizer.fit_transform(newsgroups_train_data['data'])
    test_binary_bag_of_words = count_vectorizer.transform(newsgroups_test_data['data'])

    # toarray() gives a plain ndarray; todense() would give the legacy
    # np.matrix type, which NumPy discourages and which recent scikit-learn
    # estimators refuse as input.
    train_split = (train_binary_bag_of_words.toarray(), newsgroups_train_data['target'])
    test_split = (test_binary_bag_of_words.toarray(), newsgroups_test_data['target'])
    return train_split, test_split

In [12]:
# Defining a function to conduct Laplace smoothing. This adds a base level of probability for a given feature
# to occur in every class

def laplace_smoothing(labels, binary_data, n_classes, alpha=1):
    """Estimate smoothed Bernoulli parameters P(word present | class).

    For each class ``c`` and word ``w`` the estimate is
    ``(n_cw + alpha) / (N_c + 2 * alpha)``, where ``n_cw`` is the number of
    class-``c`` documents containing ``w`` and ``N_c`` is the number of
    class-``c`` documents. The ``2`` reflects the two outcomes of a Bernoulli
    feature (present / absent).

    Args:
        labels: 1-D array-like of integer class ids, one per document.
        binary_data: 2-D array-like (documents x words) of 0/1 indicators.
        n_classes: Number of classes; class ids are ``0 .. n_classes - 1``.
        alpha: Smoothing strength. Defaults to 1 (add-one smoothing). With
            ``alpha=0`` a class that has no documents produces a division
            by zero.

    Returns:
        ``numpy.ndarray`` of shape ``(n_classes, n_words)`` with the smoothed
        probabilities. A class with no documents gets 0.5 for every word
        when ``alpha > 0``.
    """
    labels = np.asarray(labels)
    # np.asarray also turns a legacy np.matrix into a plain ndarray, so the
    # per-class column sums below are 1-D and fit a row of ``theta`` exactly.
    binary_data = np.asarray(binary_data)

    n_words = binary_data.shape[1]
    theta = np.zeros([n_classes, n_words])  # prob. of word given class
    for c_k in range(n_classes):
        class_mask = (labels == c_k)
        n_docs_in_class = class_mask.sum()
        docs_with_word = binary_data[class_mask, :].sum(axis=0)
        theta[c_k, :] = (docs_with_word + alpha) / (n_docs_in_class + 2 * alpha)

    return theta

In [13]:
# Getting a subset of the 20 newsgroups dataset

# Only len(categories) is used downstream as the number of classes.
# NOTE(review): the integer labels presumably follow the dataset's own
# ordering of the category names, not the order of this list -- confirm
# before mapping predictions back to names.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

(train_data, train_labels), (test_data, test_labels) = get_data(categories)

# Smoothed per-class probabilities of each word being present.
smoothed_counts = laplace_smoothing(
    labels=train_labels,
    binary_data=train_data,
    n_classes=len(categories),
)

To make our NB classifier we need to build three functions:

* Compute the class priors
* Build our class-conditional distributions
* Put it all together and classify our data

In [19]:
# Function which computes the prior probability of every class based on its frequency of occurrence in
# the dataset

def class_priors(n_classes, labels):
    """Compute empirical class priors from label frequencies.

    Args:
        n_classes: Number of classes; class ids are ``0 .. n_classes - 1``.
        labels: Array-like of integer class ids. Labels outside
            ``range(n_classes)`` are ignored.

    Returns:
        1-D ``numpy.ndarray`` of length ``n_classes`` whose entries sum to 1.

    Raises:
        ValueError: If no label falls in ``range(n_classes)``, since the
            priors would otherwise be NaN.

    Side effects:
        Prints the computed priors.
    """
    # Coerce first: with a plain Python list ``labels == c_k`` is a single
    # False rather than an element-wise mask, which would silently yield
    # all-zero counts and NaN priors.
    labels = np.asarray(labels).ravel()

    # Compare every label with every class id in one broadcasted step and
    # count the matches per class (column).
    counts = (labels[:, np.newaxis] == np.arange(n_classes)).sum(axis=0)
    total = counts.sum()
    if total == 0:
        raise ValueError('Cannot compute class priors: no labels fall in range(n_classes)')

    priors = counts / total
    print('The class priors are {}'.format(priors))
    return priors

In [20]:
# Run the function

# Empirical prior of each class, estimated from the training labels only.
priors = class_priors(len(categories), train_labels)

The class priors are [0.2359882  0.28711898 0.29154376 0.18534907]


In [30]:
# Next, a function that, given the smoothed per-class feature probabilities, returns a Bernoulli distribution with
# batch_shape = number of classes and event_shape = number of features

def make_distribution(probs):
    """Build the class-conditional feature distribution.

    Args:
        probs: Bernoulli success probabilities, passed straight to
            ``tfd.Bernoulli(probs=...)``.

    Returns:
        A ``tfd.Independent`` wrapping the Bernoulli batch. The last batch
        dimension is reinterpreted as the event dimension, so a
        ``(n_classes, n_features)`` input gives ``batch_shape=[n_classes]``
        and ``event_shape=[n_features]``.
    """
    # reinterpreted_batch_ndims=1 makes log_prob sum over the feature axis,
    # yielding one value per class for a whole feature vector.
    return tfd.Independent(
        tfd.Bernoulli(probs=probs),
        reinterpreted_batch_ndims=1,
    )

# One independent-Bernoulli distribution per class, parameterised by the
# smoothed word probabilities; the bare name below displays its shapes.
tf_dist = make_distribution(probs=smoothed_counts)
tf_dist

<tfp.distributions.Independent 'IndependentBernoulli' batch_shape=[4] event_shape=[17495] dtype=int32>

In [31]:
# The final function, predict_sample, which, given the distribution, a test sample, and the class priors:
#    1) Computes the class-conditional log-probabilities of the sample
#    2) Adds the log priors to form the joint log-likelihood
#    3) Normalizes the joint log-likelihood and returns the log-probability of each class

def predict_sample(dist, sample, priors):
    """Return the normalized log-probability of each class for ``sample``.

    Args:
        dist: Distribution whose ``log_prob(sample)`` yields one
            class-conditional log-likelihood per class.
        sample: Feature vector to classify.
        priors: Class prior probabilities (not logs), aligned with the
            classes of ``dist``.

    Returns:
        Tensor of log-probabilities over the last axis; exponentiating it
        gives values that sum to 1 along that axis.
    """
    # log P(sample | class) for every class
    log_likelihood = dist.log_prob(sample)
    # log P(class) + log P(sample | class)
    log_joint = tf.add(np.log(priors), log_likelihood)
    # Normalize in log space: subtract log sum_c exp(log_joint[c]).
    log_evidence = tf.math.reduce_logsumexp(log_joint, axis=-1, keepdims=True)
    return log_joint - log_evidence

#### Computing log_probs

In [32]:
# Predicting one example from our test data

# Class log-probabilities for the first test document only.
log_probs = predict_sample(dist=tf_dist, sample=test_data[0], priors=priors)
log_probs

<tf.Tensor: shape=(4,), dtype=float32, numpy=
array([-6.1736359e+01, -1.5258789e-05, -1.1619873e+01, -6.3327652e+01],
      dtype=float32)>

In [33]:
# Loop over our test data and classify

# Class probabilities for every test document, one row per document.
# The labels are not needed while predicting, so iterate over the
# feature rows alone.
probabilities = np.asarray([
    tf.exp(predict_sample(tf_dist, sample, priors))
    for sample in test_data
])

# The predicted class is the most probable one in each row.
predicted_classes = np.argmax(probabilities, axis=-1)
print('f1 ', f1_score(test_labels, predicted_classes, average='macro'))

f1  0.7848499112849504


In [38]:
# Make a Bernoulli Naive Bayes classifier using sklearn with the same level of alpha smoothing

# Reference implementation: alpha=1 matches the add-one smoothing used in
# laplace_smoothing, so the two macro-F1 scores should agree.
clf = BernoulliNB(alpha=1)

# np.asarray is a no-op for ndarrays and converts a legacy np.matrix into a
# plain ndarray; recent scikit-learn versions refuse np.matrix input.
clf.fit(np.asarray(train_data), train_labels)
pred = clf.predict(np.asarray(test_data))
print('f1 from sklearn', f1_score(test_labels, pred, average='macro'))

f1 from sklearn 0.7848499112849504
