In [1]:
import math
import os

import numpy as np

from spacy.tokenizer import Tokenizer
from spacy.lang.en import English


  from .autonotebook import tqdm as notebook_tqdm


NotFoundError: dlopen(/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.10/site-packages/tensorflow-plugins/libmetal_plugin.dylib, 0x0006): symbol not found in flat namespace '__ZN10tensorflow8internal10LogMessage16VmoduleActivatedEPKci'

In [None]:
# Instantiate the blank English pipeline; the bare class is not a usable pipeline object.
nlp = English()

In [None]:
# Read every message of the corpus together with its label.
# Each sub-directory of nlp/corpus1/ holds the messages of one class, so the
# directory name doubles as the class label.
data = []
classes = []

for label in ("spam", "ham"):
    folder = f"nlp/corpus1/{label}/"
    # Sorted so the document order does not depend on the filesystem.
    for file in sorted(os.listdir(folder)):
        with open(f"{folder}{file}", encoding="latin-1") as f:
            data.append(f.read())
        classes.append(label)

In [None]:
# Number of documents read from the corpus (one entry per file, both folders).
len(data)

## Principal Class 

To compute the inferred class we use naive Bayes:

$$\hat{c}=\arg\max_{c}\left[\log P(c)+\sum_{i=1}^{n}\log P(f_i|c)\right]$$

If a feature never occurs together with a class in the training data, its estimated probability is zero and $\log(0)$ is undefined. To avoid this, we use
Laplace smoothing:

$$P(f_i|c) = \frac {C(f_i, c) + 1} {C(c) + |V|}$$

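where $C(f_i, c)$ counts the occurrences of feature $f_i$ in class $c$, $C(c)$ is the total count of all features in class $c$, and $|V|$ is the size of the vocabulary.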

In [None]:
# Blank English pipeline; only its vocabulary is handed to the tokenizer below.
nlp = English()
# Tokenizer constructed from the vocabulary alone (no other arguments are passed here).
tokenizer = Tokenizer(nlp.vocab)

In [None]:
# Sanity check: print the token texts of the first document.
print([t.text for t in tokenizer(data[0])])

In [None]:
class NaiveBayesClassifier:
    """Naive Bayes text classifier with Laplace (add-one) smoothing.

    Documents are tokenized with ``self.tokenizer`` and lower-cased. For a
    document with distinct known words ``w_1 .. w_n`` the predicted class is

        argmax_c  log P(c) + sum_i log P(w_i | c)

    with ``P(w | c) = (C(w, c) + 1) / (C(c) + |V|)``, where ``C(w, c)`` is the
    number of occurrences of ``w`` in the training documents of class ``c``,
    ``C(c)`` is the total number of word occurrences in class ``c`` and
    ``|V|`` is the vocabulary size.

    Attributes set by ``fit``:
        unique_classes: set of the distinct labels.
        vocab: set of every word seen during training.
        class_count: label -> number of training documents.
        log_class_prior_prob: label -> log P(c).
        word_conditional_count: label -> {word -> C(w, c)}.
        class_word_total: label -> total number of word occurrences, C(c).
    """

    def __init__(self):
        self.nlp = English()
        self.tokenizer = Tokenizer(self.nlp.vocab)

    def tokenize(self, doc):
        """Return the lower-cased token texts of ``doc`` as a list."""
        return [t.text.lower() for t in self.tokenizer(doc)]

    def word_count(self, words):
        """Return a dict mapping each item of ``words`` to its number of occurrences."""
        word_count = {}
        for w in words:
            word_count[w] = word_count.get(w, 0) + 1
        return word_count

    def fit(self, data, classes):
        """Estimate the class priors and per-class word counts.

        Args:
            data: sequence of document strings.
            classes: sequence of labels, one per document in ``data``.

        Raises:
            ValueError: if ``data`` and ``classes`` differ in length.
        """
        if len(data) != len(classes):
            raise ValueError(
                f"data and classes must have the same length, got {len(data)} and {len(classes)}"
            )
        n = len(data)
        self.unique_classes = set(classes)
        self.vocab = set()
        self.class_count = {}  # number of documents per class
        self.log_class_prior_prob = {}  # log(P(c))
        self.word_conditional_count = {}  # C(w, c)
        self.class_word_total = {}  # C(c): word occurrences per class
        # Count the documents of each class.
        for c in classes:
            self.class_count[c] = self.class_count.get(c, 0) + 1
        # log P(c), plus empty per-class accumulators. Every class must be
        # initialised before the counting pass below touches it.
        for c, doc_count in self.class_count.items():
            self.log_class_prior_prob[c] = math.log(doc_count / n)
            self.word_conditional_count[c] = {}
            self.class_word_total[c] = 0
        # C(w, c): a single pass over the corpus, outside the per-class loop so
        # that each document is counted exactly once.
        for text, c in zip(data, classes):
            counts = self.word_count(self.tokenize(text))
            per_class = self.word_conditional_count[c]
            for word, count in counts.items():
                self.vocab.add(word)
                per_class[word] = per_class.get(word, 0) + count
                self.class_word_total[c] += count

    def predict(self, data):
        """Return the most probable label for every document in ``data``.

        Each distinct word of a document contributes once to the score; words
        that were not seen during ``fit`` are ignored. A document without any
        known word is assigned the class with the highest prior.

        Args:
            data: iterable of document strings.

        Returns:
            A list with one predicted label per document.

        Raises:
            ValueError: if the classifier has not been fitted on any document.
        """
        if not getattr(self, "log_class_prior_prob", None):
            raise ValueError("predict() requires a prior call to fit() with non-empty data")
        vocab_size = len(self.vocab)
        results = []
        for text in data:
            words = set(self.tokenize(text))
            # Every class starts from its log prior.
            score_probability = dict(self.log_class_prior_prob)
            for word in words:
                # We ignore the word if it is not in the vocab
                if word not in self.vocab:
                    continue
                # Laplace smoothing
                for c in score_probability:
                    score_probability[c] += math.log(
                        (self.word_conditional_count[c].get(word, 0) + 1)
                        / (self.class_word_total[c] + vocab_size)
                    )
            results.append(max(score_probability, key=score_probability.get))
        return results
