In [10]:
import math
import os

import numpy as np

from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
nlp = English

In [3]:
data = []
classes = []

# Spam files
for file in os.listdir("nlp/corpus1/spam/"):
    with open(f"nlp/corpus1/spam/{file}", encoding="latin-1") as f:
        data.append(f.read())
        classes.append("spam")

# Ham files
for file in os.listdir("nlp/corpus1/ham/"):
    with open(f"nlp/corpus1/ham/{file}", encoding="latin-1") as f:
        data.append(f.read())
        classes.append("spam")

In [4]:
len(data)

5172

## Principal Class 

To calculate the infered class we use naive Bayes:

$$\hat{c}=\arg\max_{(c)}=\log(P(c))+\sum_{i=1}^{n}{P(f_i|c)}$$

If a class has no values, then the probability is zero, and the $\log(0)$ cannot be calculated. To avoid this, we use
the Laplace smoothing:

$$P(f_i|c) = \frac {C(f_i, c) + 1} {C(c) + |V|}$$

where $|V|$ is the length of the vocabulary.

In [5]:
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

In [6]:
print([t.text for t in tokenizer(data[0])])

['Subject:', 'what', 'up', ',', ',', 'your', 'cam', 'babe', '\n', 'what', 'are', 'you', 'looking', 'for', '?', '\n', 'if', 'your', 'looking', 'for', 'a', 'companion', 'for', 'friendship', ',', 'love', ',', 'a', 'date', ',', 'or', 'just', 'good', 'ole', "'", '\n', 'fashioned', '*', '*', '*', '*', '*', '*', ',', 'then', 'try', 'our', 'brand', 'new', 'site', ';', 'it', 'was', 'developed', 'and', 'created', '\n', 'to', 'help', 'anyone', 'find', 'what', 'they', "'", 're', 'looking', 'for', '.', 'a', 'quick', 'bio', 'form', 'and', 'you', "'", 're', '\n', 'on', 'the', 'road', 'to', 'satisfaction', 'in', 'every', 'sense', 'of', 'the', 'word', '.', '.', '.', '.', 'no', 'matter', 'what', '\n', 'that', 'may', 'be', '!', '\n', 'try', 'it', 'out', 'and', 'youll', 'be', 'amazed', '.', '\n', 'have', 'a', 'terrific', 'time', 'this', 'evening', '\n', 'copy', 'and', 'pa', 'ste', 'the', 'add', '.', 'ress', 'you', 'see', 'on', 'the', 'line', 'below', 'into', 'your', 'browser', 'to', 'come', 'to', 'the', '

In [31]:
class NaiveBayesClassifier:
    def __init__(self):
        self.nlp = English()
        self.tokenizer = Tokenizer(self.nlp.vocab)

    def tokenize(self, doc):
        return [t.text.lower() for t in self.tokenizer(doc)]

    def word_count(self, words):
        word_count = {}
        for w in words:
            if w in word_count.keys():
                word_count[w] += 1
            else:
                word_count[w] = 1

        return word_count

    def fit(self, data, classes):
        n = len(data)
        self.unique_classes = set(classes)
        self.vocab = set()
        self.class_count = {}  # C(c)
        self.log_class_prior_prob = {}  # log(P(c))
        self.word_conditional_count = {}  # C(w|c)
        # Counting of the classes
        for c in classes:
            if c in self.class_count.keys():
                self.class_count[c] += 1
            else:
                self.class_count[c] = 1
        # Calculation of P(c)
        for c in self.class_count.keys():
            self.log_class_prior_prob[c] = math.log(self.class_count[c] / n)
            self.word_conditional_count[c] = {}
            # Calculation of C(w|c)
            for text, c in zip(data, classes):
                counts = self.word_count(self.tokenize(text))
                for word, count in counts.items():
                    if word not in self.vocab:
                        self.vocab.add(word)
                    if word not in self.word_conditional_count[c]:
                        self.word_conditional_count[c][word] = 0.0
                    self.word_conditional_count[c][word] += count

    def predict(self, data):
        results = []
        for text in data:
            words = set(self.tokenize(text))
            score_probability = {}
            for word in words:
                # We ignore if the word is not in the vocab
                if word not in self.vocab:
                    continue
                # Laplace Smoothing
                for c in self.unique_classes:
                    log_word_class_prob = math.log(
                        (self.word_conditional_count.get(word, 0.0) + 1)
                        / (self.class_count[c] + len(self.vocab))
                    )
                    score_probability[c] = (
                        score_probability.get(c, self.log_class_prior_prob[c])
                        + log_word_class_prob
                    )

            arg_max_prob = np.argmax(np.array(list(score_probability.values())))
        results.append(list(score_probability.keys())[arg_max_prob])
        return results


In [32]:
data_train, data_test, classes_train, classes_test = train_test_split(
    data, classes, test_size=0.1, random_state=1992
)


In [33]:
classifier = NaiveBayesClassifier()
classifier.fit(data_train, classes_train)

In [34]:
classes_prediction = classifier.predict(data_test)

TypeError: unsupported operand type(s) for +: 'dict' and 'int'

In [None]:
classes_prediction

['spam']

In [None]:
accuracy_score(classes_test, classes_prediction)

ValueError: Found input variables with inconsistent numbers of samples: [518, 1]