# Multinomial Naive Bayes - Scikit-Learn

O classificador Multinomial Naive Bayes é adequado para classificação com variáveis discretas (por exemplo, contagens de palavras na classificação de textos). A distribuição multinomial normalmente requer contagens inteiras dos atributos (features). No entanto, na prática, contagens fracionárias, como as produzidas pelo tf-idf, também podem funcionar.

### Classificador de Notícias

http://qwone.com/~jason/20Newsgroups/

In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [2]:
# Categories to load: only 4 of the 20 available newsgroups are used so that
# training and classification run faster.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [3]:
# Load the training split, restricted to the selected categories.
# The documents are shuffled with a fixed random_state (42).
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [4]:
# Class names of the loaded training set. As the output shows, they are listed
# alphabetically, not in the order given in `categories`.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [5]:
# Number of documents in the training set
len(twenty_train.data)

2257

In [6]:
# Peek at the raw text: show the first three lines of the first training document
first_doc_lines = twenty_train.data[0].split("\n")
print("\n".join(first_doc_lines[:3]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [7]:
# Class name of the first training document.
# `target` holds an integer that is used as an index into `target_names`.
first_target = twenty_train.target[0]
print(twenty_train.target_names[first_target])

comp.graphics


In [8]:
# scikit-learn stores the labels as an array of integers (each one an index into
# target_names) rather than as strings, which is faster to work with.
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [9]:
# Class names of the first 10 training documents:
# translate each integer label into its name, then print one name per line.
first_labels = [twenty_train.target_names[idx] for idx in twenty_train.target[:10]]
for label_name in first_labels:
    print(label_name)

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


### Bag of Words (Saco de Palavras)

In [10]:
# Tokenization / bag of words: learn the vocabulary from the training texts and
# build the document-term count matrix in a single fit_transform call.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(twenty_train.data)

# The former bare `count_vect.vocabulary_.get(u'algorithm')` statement was removed:
# only the last expression of a cell is displayed, so its value was silently
# discarded. To inspect the column index assigned to a token, evaluate
# count_vect.vocabulary_.get('algorithm') in a cell of its own.

# Shape of the count matrix: (number of documents, number of columns)
x_train_counts.shape

(2257, 35788)

In [11]:
# From raw occurrence counts to frequencies.
# use_idf=False disables the inverse-document-frequency weighting, so this cell
# yields term frequencies only (TF, not full TF-IDF).
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(x_train_counts)
x_train_tf = tf_transformer.transform(x_train_counts)

# The transformation keeps the shape of the count matrix
x_train_tf.shape

(2257, 35788)

In [12]:
# Same workflow as the previous cell, but with fit and transform combined into a
# single fit_transform call. Unlike the previous cell, use_idf is left at its
# default here (IDF weighting enabled per the TfidfTransformer documentation),
# so only the shape matches x_train_tf; the values generally differ.
# NOTE(review): this rebinds tf_transformer, replacing the use_idf=False
# transformer created in the previous cell; the prediction cell further down
# calls tf_transformer.transform and therefore relies on this TF-IDF version.
tf_transformer = TfidfTransformer()
x_train_tfidf = tf_transformer.fit_transform(x_train_counts)
x_train_tfidf.shape

(2257, 35788)

In [13]:
# Build a Multinomial Naive Bayes classifier and train it on the TF-IDF features.
# The trailing ';' keeps the cell from displaying the estimator returned by fit().
clf = MultinomialNB()
clf.fit(x_train_tfidf, twenty_train.target);

In [14]:
# Predict the category of two unseen documents.
# They go through the same fitted vectorizer and transformer as the training data.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
x_new_counts = count_vect.transform(docs_new)
x_new_tfidf = tf_transformer.transform(x_new_counts)
predicted = clf.predict(x_new_tfidf)

# Map each predicted integer label back to its class name before printing
predicted_names = [twenty_train.target_names[label] for label in predicted]
for doc, name in zip(docs_new, predicted_names):
    print('%r => %s' % (doc, name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [15]:
# Composite classifier: chain the three steps into a single estimator
# (vectorizer => transformer => classifier).
# The step names are the prefixes used to address each step's parameters,
# e.g. 'clf__alpha' in the grid-search parameters.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)

In [16]:
# Fit the whole pipeline directly on the raw training texts and their labels.
# fit() trains text_clf in place; the trailing ';' hides the returned estimator's repr.
text_clf.fit(twenty_train.data, twenty_train.target);

In [17]:
# Model accuracy on the test split (same categories as the training split)
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Fraction of test documents whose predicted label equals the true label
is_correct = predicted == twenty_test.target
np.mean(is_correct)

0.8348868175765646

In [18]:
# Per-class metrics on the test split, labelled with the class names
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

              accuracy                           0.83      1502
             macro avg       0.89      0.82      0.83      1502
          weighted avg       0.88      0.83      0.84      1502



In [19]:
# Confusion matrix on the test split: rows are the true classes and columns the
# predicted classes (each row sums to that class's support in the report).
# The last column shows that most errors are documents wrongly predicted as the
# last class, soc.religion.christian.
metrics.confusion_matrix(twenty_test.target, predicted)

array([[192,   2,   6, 119],
       [  2, 347,   4,  36],
       [  2,  11, 322,  61],
       [  2,   2,   1, 393]])

In [20]:
# Hyperparameter grid for GridSearchCV.
# Each key is '<pipeline step name>__<parameter name>'.
parameters = {}
parameters['vect__ngram_range'] = [(1, 1), (1, 2)]  # n-gram ranges tried by the 'vect' step
parameters['tfidf__use_idf'] = (True, False)        # with and without IDF weighting
parameters['clf__alpha'] = (1e-2, 1e-3)             # alpha values tried by the 'clf' step

In [21]:
# Grid search over `parameters`, with the text pipeline as the estimator.
# n_jobs=-1 asks scikit-learn to use all available processors.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, n_jobs=-1)

In [22]:
# Run the grid search on the first 400 training documents only
# (presumably to keep the search fast).
# The trailing ';' hides the repr of the estimator returned by fit().
n_search_docs = 400
gs_clf.fit(twenty_train.data[:n_search_docs], twenty_train.target[:n_search_docs]);

In [23]:
# Sanity check: classify a single sentence with the fitted grid search and map
# the predicted integer label back to its class name
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

In [24]:
# Best score found by the grid search (mean cross-validated score per the
# GridSearchCV documentation). It was computed on the 400-document training
# subset used for fitting, so it is not directly comparable to the test-set
# accuracy reported earlier.
gs_clf.best_score_

0.9349999999999999

In [25]:
# Parameter combination selected by the grid search, listed in alphabetical order
for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print("%s: %r" % (param_name, best_value))

clf__alpha: 0.01
tfidf__use_idf: True
vect__ngram_range: (1, 2)
