# Importações

Aqui importamos algumas bibliotecas que iremos utilizar

In [1]:
import pandas as pd
import numpy as np
import re
import nltk

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus and load the English list that
# `preprocessing` filters tokens against.
nltk.download('stopwords')
stopwords_list = stopwords.words('english')

# Seed passed to train_test_split so the train/test partition is reproducible.
# (The CountVectorizer is instantiated in the cell that fits it, so it is not
# created here.)
SEED = 41

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Função para pré-processamento dos dados

In [2]:
def preprocessing(corpus, stop_words=None):
    """Normalize text documents before bag-of-words vectorization.

    Each document is lowercased, stripped of every character that is neither
    a word character nor whitespace, split on whitespace, filtered against a
    stopword collection and re-joined with single spaces.

    Note: punctuation is removed *before* the stopword filter runs, so a
    contraction such as "don't" becomes "dont" and will not match a stopword
    entry that contains an apostrophe.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents.
    stop_words : iterable of str, optional
        Tokens to discard. When omitted, the module-level ``stopwords_list``
        (the English NLTK list loaded at the top of the notebook) is used.

    Returns
    -------
    numpy.ndarray
        One cleaned string per input document, in input order.
    """
    if stop_words is None:
        stop_words = stopwords_list
    # A set gives constant-time membership checks; the original scanned the
    # whole stopword list once per token.
    stop_set = set(stop_words)

    preprocessed_corpus = []
    for document in corpus:
        # Lowercase, then drop punctuation (anything that is not \w or \s).
        document = re.sub(r'[^\w\s]', '', document.lower())

        # Whitespace tokenization followed by stopword removal.
        tokens = [token for token in document.split() if token not in stop_set]

        # Rebuild the cleaned document as a single space-separated string.
        preprocessed_corpus.append(' '.join(tokens))

    return np.array(preprocessed_corpus)

Leitura dos dados, que podem ser encontrados no [Kaggle](https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset).

In [3]:
PATH = "https://raw.githubusercontent.com/luiz826/naive_bayes/main/dataset/spam.csv"

# Read the CSV, keep only columns "v1" and "v2", and rename them to
# "label" and "text" respectively.
df = (
    pd.read_csv(PATH, sep=',', encoding="latin-1")
    .loc[:, ["v1", "v2"]]
    .rename(columns={"v1": "label", "v2": "text"})
)

Aqui separamos os dados em treino e teste. Isso é muito importante, pois passaremos a utilizar somente a nossa amostra de treino daqui em diante. O conjunto de teste só servirá para (uau!) testar o modelo.

In [4]:
# Features are the message texts; targets are the labels.
X = df["text"].values
y = df["label"].values

# Hold out 25% of the samples for testing; SEED fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(X),
    np.array(y),
    test_size=0.25,
    random_state=SEED,
)

# Clean the training texts only; the test texts are cleaned in a later cell.
X_train_prep = preprocessing(X_train)

In [5]:
# Show the first training sample: raw text, preprocessed text and its label,
# one per line.
print(X_train[0], X_train_prep[0], y_train[0], sep="\n")

\Happy valentines day\" I know its early but i have hundreds of handsomes and beauties to wish. So i thought to finish off aunties and uncles 1st..."
happy valentines day know early hundreds handsomes beauties wish thought finish aunties uncles 1st
ham


In [6]:
# Vetorização dos documentos pré-processados
vectorizer = CountVectorizer()
X_train_prep_vec = vectorizer.fit_transform(X_train_prep)

# Criação e treinamento do modelo Naive Bayes
model = MultinomialNB()
model.fit(X_train_prep_vec, y_train)

In [7]:
# Apply the same cleaning to the held-out texts.
X_test_prep = preprocessing(corpus=X_test)

# Encode them with the already-fitted vectorizer (transform only, no refit).
X_test_prep_vec = vectorizer.transform(raw_documents=X_test_prep)

Score nos dados de treino

In [8]:
# Predict on the data the model was fitted on.
y_pred_train = model.predict(X=X_train_prep_vec)

# Accuracy, then the confusion matrix and the per-class report.
print("Acurácia: ", accuracy_score(y_true=y_train, y_pred=y_pred_train))
print(
    confusion_matrix(y_true=y_train, y_pred=y_pred_train),
    classification_report(y_true=y_train, y_pred=y_pred_train),
    sep="\n",
)

Acurácia:  0.994256999282125
[[3616   12]
 [  12  539]]
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3628
        spam       0.98      0.98      0.98       551

    accuracy                           0.99      4179
   macro avg       0.99      0.99      0.99      4179
weighted avg       0.99      0.99      0.99      4179



Score nos dados de teste

In [9]:
# Predict on the held-out test data.
y_pred_test = model.predict(X=X_test_prep_vec)

# Accuracy, then the confusion matrix and the per-class report.
print("Acurácia: ", accuracy_score(y_true=y_test, y_pred=y_pred_test))
print(
    confusion_matrix(y_true=y_test, y_pred=y_pred_test),
    classification_report(y_true=y_test, y_pred=y_pred_test),
    sep="\n",
)

Acurácia:  0.9770279971284996
[[1185   12]
 [  20  176]]
              precision    recall  f1-score   support

         ham       0.98      0.99      0.99      1197
        spam       0.94      0.90      0.92       196

    accuracy                           0.98      1393
   macro avg       0.96      0.94      0.95      1393
weighted avg       0.98      0.98      0.98      1393



Testando com o Gaussian Naive Bayes

In [10]:
# Rebind `model` to a Gaussian Naive Bayes classifier (this replaces the
# previously fitted model under the same name).
model = GaussianNB()

# The count matrix is converted to a dense array before fitting.
model.fit(X=X_train_prep_vec.toarray(), y=y_train)

In [11]:
# Predict on the training data, passed as a dense array.
y_pred_train = model.predict(X=X_train_prep_vec.toarray())

# Accuracy, then the confusion matrix and the per-class report.
print("Acurácia: ", accuracy_score(y_true=y_train, y_pred=y_pred_train))
print(
    confusion_matrix(y_true=y_train, y_pred=y_pred_train),
    classification_report(y_true=y_train, y_pred=y_pred_train),
    sep="\n",
)

Acurácia:  0.957166786312515
[[3449  179]
 [   0  551]]
              precision    recall  f1-score   support

         ham       1.00      0.95      0.97      3628
        spam       0.75      1.00      0.86       551

    accuracy                           0.96      4179
   macro avg       0.88      0.98      0.92      4179
weighted avg       0.97      0.96      0.96      4179



In [12]:
# Predict on the held-out test data, passed as a dense array.
y_pred_test = model.predict(X=X_test_prep_vec.toarray())

# Accuracy, then the confusion matrix and the per-class report.
print("Acurácia: ", accuracy_score(y_true=y_test, y_pred=y_pred_test))
print(
    confusion_matrix(y_true=y_test, y_pred=y_pred_test),
    classification_report(y_true=y_test, y_pred=y_pred_test),
    sep="\n",
)

Acurácia:  0.8894472361809045
[[1061  136]
 [  18  178]]
              precision    recall  f1-score   support

         ham       0.98      0.89      0.93      1197
        spam       0.57      0.91      0.70       196

    accuracy                           0.89      1393
   macro avg       0.78      0.90      0.82      1393
weighted avg       0.92      0.89      0.90      1393



# Referências

- [Spam - Monty Python](https://www.youtube.com/watch?v=ycKNt0MhTkk)
- [Naive Bayes, Clearly Explained!!!](https://www.youtube.com/watch?v=O2L2Uv9pdDA)
- [Ciência de Dados: Classificador Naive Bayes](https://www.youtube.com/watch?v=Bk2mSIMw_XE)