In [82]:

import re
from sklearn.datasets import fetch_20newsgroups
from collections import Counter
from math import log
from tqdm import tqdm

In [96]:
train_data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
test_data = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

print("Classes: ", train_data.target_names)
print("Example text:\n", train_data.data[0])
print("Label: ", train_data.target_names[train_data.target[0]])

Classes:  ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
Example text:
 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.
Label:  rec.autos


In [93]:
class NaiveBayes():
    def __init__(self, train_data):
        self.train_data = train_data   
        self.train_class_priors = self.get_class_priors(train_data)
        self.train_vocab = self.get_vocabulary(train_data)
        self.get_conditionals(train_data)
        
    def get_class_priors(self, data, verbose = False):
        total_documents = len(data.target)
        class_counts = Counter(data.target)

        class_priors = {class_id: count / total_documents for class_id, count in class_counts.items()}

        if verbose:
            for class_id, prior in class_priors.items():
                class_name = data.target_names[class_id]
                print(f"Class: {class_name}, Prior: {prior:.4f}")
            
        return class_priors
            
    def get_word_list(self, doc):
        
        words = doc.lower()
        words = re.sub(r'[^a-z0-9\s]', '', words)
        words = words.split()
        
        return words

    def get_vocabulary(self, data, verbose = False):
        vocab = set()
        for doc in data.data:
            vocab.update(self.get_word_list(doc))
            
        if verbose:
            print(f"Vocabulary Size: {len(vocab)}")
        
        return vocab
    
    def get_conditionals(self, data):
        for i in range(0, 20):
            setattr(self, f"counts_{i}", Counter())
            
        for ind, doc in enumerate(data.data):
            doc_class = data.target[ind] 
            words = self.get_word_list(doc)
            getattr(self, f"counts_{doc_class}").update(words)
            
        for i in range(0, 20):
            setattr(self, f"total_{i}", sum(getattr(self, f"counts_{i}").values()))
        
    # returns log(P(w|c) if w is in vocab 0 otherwise
    def __getitem__(self, *args):
        if len(*args) != 2:
            raise TypeError("Two arguments required: word, class_index.")
        
        word, c = args[0]
        return log((getattr(self, f"counts_{c}")[word] + 1) / (len(self.train_vocab) + getattr(self, f"total_{c}"))) if (word in self.train_vocab) else 0

    def get_prob(self, doc, c):
        words = self.get_word_list(doc)
        return sum([self[w, c] for w in words]) + self.train_class_priors[c]
    
    def __call__(self, *args, verbose=False):
        if len(args) != 1:
            raise TypeError("One argument required: document.")
        x = {}
        
        for class_id in range(0, 20):
                class_name = self.train_data.target_names[class_id]
                class_prob = self.get_prob(*args, class_id)
                if verbose:
                    print(f"Class: {class_name}, Logits: {class_prob:.4f}")
                x[class_name] = class_prob
        return max(x, key=x.get)
            

In [94]:
def eval(model, data):
    corr = 0
    total = len(data.data)
    
    for ind, doc in tqdm(enumerate(data.data), total=total):
        y = data.target[ind]
        
        pred_class_name = model(doc, verbose=False)
        pred = data.target_names.index(pred_class_name)
        corr += (pred == y)
    
    return corr / total


In [102]:
model = NaiveBayes(train_data=train_data)


In [103]:
print(eval(model, train_data))
print(eval(model, test_data))

100%|██████████| 11314/11314 [00:46<00:00, 244.82it/s]


0.7526073890754817


100%|██████████| 7532/7532 [00:38<00:00, 194.29it/s]

0.5225703664365374



