In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Naive Bayes

### Load the Dataset

In [2]:
dataset = pd.read_table('DataSet/03_dataset.txt', header=None, names=['target', 'text'])

print(f'Dimensiones: {dataset.shape}')

dataset.head()

Dimensiones: (5572, 2)


Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Vectorize

Transform the input from text into a *bag of words* matrix.

In [3]:
vectorized_data = CountVectorizer().fit_transform(dataset.text)

vectorized_data

<5572x8713 sparse matrix of type '<class 'numpy.int64'>'
	with 74169 stored elements in Compressed Sparse Row format>

### Train/Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(vectorized_data, dataset.target, test_size=0.2)

### Model Training

In [5]:
clf = MultinomialNB().fit(X_train, y_train)

### Model Evaluation

In [6]:
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       956
        spam       0.94      0.94      0.94       159

    accuracy                           0.98      1115
   macro avg       0.96      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115

