### Exercises

1\. Describe in writing what an assumption is made when a naive Bayes classifier is created. Why the classifier is naive?

2\. Describe in writing what means maximum likelihood?

3\. Make a copy of a naive Bayes classifier that we used above to create a spam filter and try to improve its performance.
Split the data set into training, validation and test data. Select the best model using the validation dataset and then compute your final score on the testing data. To improve the model for example the whole message content can be taken into account instead of the subject only. Also lengths of tokens that are taken into account can be varied. May be it would be interesting to split the messages into digramms: couples of words going one after another. And so on.

4\. Try to improve the Gaussian naive Bayes classifier. Split the data set into training, validation and test data. Select the best model using the validation dataset and then compute your final score on the testing data.

5\. Previously we discussed that in the most cases data must be standardized before creation of a machine learning model. Why it does not influences the performance of a Gaussian naive Bayes classifier?

1. При создании наивного байесовского классификатора предполагается, что все признаки условно независимы от переменной класса. Это предположение "наивное", поскольку оно упрощает вычисление вероятности заданных признаков переменной класса, предполагая, что признаки независимы друг от друга. Однако редко это предположение верно для реальных данных, всегда есть какая-нибудь корелляция.

2. Метод максимального подобия — это метод, используемый в статистике для оценки параметров статистической модели, его цель — найти значения параметров модели, которые дадут максимум функции "подобия", которая, в свою очередь, является произведением функций плотности распределения вероятности для наблюдаемых данных.

5. В случае гауссовского наивного байесовского классификатора нормализация данных не требуется и не оказывает влияния на производительность модели, потому что здесь мы не учитываем величину параметров, а работаем с распределениями (пдф) этих величин внутри каждого класса. Поскольку этот классификатор предполагает, что объекты следуют нормальному распределению и вычисляет среднее значение и стандартное отклонение каждого параметра внутри классов, в итоге масштаб не влияет на расчеты модели.

In [8]:
"""In the _text2tokens method, the code now considers the entire text content without focusing only on the subject."""
from collections import defaultdict
import re
import numpy as np
import csv
import requests
from io import BytesIO, TextIOWrapper
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
from sklearn import metrics


class NaiveBayes:
    def __init__(self, k, drop_short):
        self.k = k
        self.vocab = set()
        self.token_in_spam = defaultdict(int)
        self.token_in_ham = defaultdict(int)
        self.pcond_spam = self.pcond_ham = None
        self.spam_total = self.ham_total = 0
        self.p_spam_total = self.p_ham_total = None
        self.re_token = re.compile(r"[a-z']+")
        self.drop_short = drop_short

    def _text2tokens(self, text):
        text_lower = text.lower()
        all_tokens = self.re_token.findall(text_lower)
        good_tokens = [tok for tok in all_tokens if len(tok) > self.drop_short]
        return good_tokens

    def fit(self, messages, labels):
        for mes, lab in zip(messages, labels):
            tokens = self._text2tokens(mes)
            if lab == 'spam':
                self.spam_total += 1
                for token in tokens:
                    self.token_in_spam[token] += 1
            else:
                self.ham_total += 1
                for token in tokens:
                    self.token_in_ham[token] += 1
            self.vocab.update(tokens)
        self.pcond_spam = defaultdict(int)
        self.pcond_ham = defaultdict(int)
        for token in self.vocab:
            self.pcond_spam[token] = (self.token_in_spam[token] + self.k) / (self.spam_total + 2 * self.k)
            self.pcond_ham[token] = (self.token_in_ham[token] + self.k) / (self.ham_total + 2 * self.k)
        self.p_spam_total = self.spam_total / (self.spam_total + self.ham_total)
        self.p_ham_total = 1 - self.p_spam_total

    def predict(self, messages):
        pred = []
        for mes in messages:
            message_tokens = self._text2tokens(mes)
            log_sum_spam = np.log(self.p_spam_total)
            log_sum_ham = np.log(self.p_ham_total)
            for tok in self.vocab:
                p_spam = self.pcond_spam[tok]
                p_ham = self.pcond_ham[tok]
                if tok not in message_tokens:
                    p_spam = 1 - p_spam
                    p_ham = 1 - p_ham
                log_sum_spam += np.log(p_spam)
                log_sum_ham += np.log(p_ham)
            pred.append('spam' if log_sum_spam > log_sum_ham else 'ham')
        return pred

    def explore_vocab(self):
        spam_words = []
        for tok in self.vocab:
            p_spam = self.pcond_spam[tok] * self.p_spam_total
            p_ham = self.pcond_ham[tok] * self.p_ham_total
            if p_spam > p_ham:
                spam_words.append([tok, p_spam])

        spam_words = sorted(spam_words, key=lambda x: -x[1])
        words_only = [s[0] for s in spam_words]
        return words_only


def load_zipcsv_categorical(file_name):
    """Downloads zipped csv dataset from repo and return it as a nested list."""
    base_url = "https://raw.githubusercontent.com/kupav/data-sc-intro/main/data/"
    web_data = requests.get(base_url + file_name)
    assert web_data.status_code == 200
    zf = ZipFile(BytesIO(web_data.content))
    zipped_name = zf.namelist()[0]
    print(f"Download {file_name}, unzip {zipped_name}")
    with zf.open(zipped_name, 'r') as file:
        reader = csv.reader(TextIOWrapper(file), delimiter=',')
        data = []
        for row in reader:
            data.append(row)
    return data


raw_data = load_zipcsv_categorical("spam_and_ham.zip")
data_lab = [row[1] for row in raw_data[1:]]
data_mes = [row[2] for row in raw_data[1:]]
p_test = 0.1
n_test = round(p_test * len(data_lab))

X_train, X_test, y_train, y_test = train_test_split(data_mes, data_lab, random_state=0,
                                                    test_size=n_test, shuffle=True)
nbc = NaiveBayes(k=1, drop_short=2)
nbc.fit(X_train, y_train)
y_pred = nbc.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred, average='binary', pos_label='spam')
print(f"Accuracy = {acc:.4f}")
print(f"F1-score = {f1:.4f}")
spam_words = nbc.explore_vocab()
print(spam_words[:100])

Download spam_and_ham.zip, unzip spam_ham_dataset.csv


  log_sum_ham += np.log(p_ham)
  log_sum_spam += np.log(p_spam)


Accuracy = 0.7118
F1-score = 0.0000
['http', 'company', 'more', 'www', 'here', 'font', 'only', 'statements', 'email', 'nbsp', 'height', 'width', 'its', 'size', 'free', 'stock', 'within', 'pills', 'investment', 'money', 'inc', 'securities', 'align', 'computron', 'click', 'best', 'prices', 'looking', 'online', 'products', 'news', 'color', 'microsoft', 'net', 'face', 'windows', 'save', 'future', 'border', 'million', 'many', 'software', 'href', 'most', 'professional', 'link', 'internet', 'high', 'without', 'account', 'companies', 'market', 'info', 'world', 'international', 'viagra', 'such', 'adobe', 'src', 'act', 'reply', 'special', 'offer', 'stocks', 'low', 'cialis', 'remove', 'stop', 'section', 'advice', 'center', 'site', 'shares', 'home', 'own', 'results', 'visit', 'security', 'soft', 'buy', 'dollars', 'meds', 'performance', 'newsletter', 'style', 'full', 'family', 'prescription', 'even', 'top', 'quality', 'works', 'long', 'offers', 'limited', 'interest', 'bgcolor', 'pro', 'index', 'pal