## Naive Bayes 실습 - news 데이터셋
---

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Korean text in matplotlib: switch the default font family to the one stored in malgun.ttf.
import os
from matplotlib import font_manager, rc

# This path only exists on Windows. Loading a missing font file fails, so check first
# and keep matplotlib's default font when the file is not there.
font_path = "c:/Windows/Fonts/malgun.ttf"
font_name = None
if os.path.exists(font_path):
    font_name = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=font_name)
else:
    # Warnings are silenced earlier in this notebook, so report via print instead.
    print('Font file not found, keeping the default matplotlib font: {0}'.format(font_path))

In [3]:
from sklearn.datasets import fetch_20newsgroups

In [4]:
# subset='train' is the loader's default; spelling it out makes it clear that only the
# training portion of the dataset is loaded here.
news = fetch_20newsgroups(subset='train')

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [5]:
# Print the description text bundled with the dataset.
print(news.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features       

In [6]:
# Separate the independent variable (document texts) from the dependent variable
# (class indices), and keep the list of class names for later display.
X = news.data
Y = news.target
labels = news.target_names

In [7]:
# Number of documents loaded (X is news.data).
len(X)

11314

In [8]:
# Peek at a 500-element slice of the class labels.
print(Y[3000:3500])

[17  9 17  5  1  3 19 13  5 11 19 15  6 15 12  1 14  8 14 15  6 19  6  5
 19  1  6  3 14 17 15 14 17  7 16  3 11  6 12 12 15 10  7 14 14 10  1  2
 13  3  2 12 13  6  3  1  9  4  9  4  5 18  8 18 15 18 13  8  9 15 14  9
  6  1 14  9 15  1  2  4  3  4  8  9  2 18  6 16  1 13 14 11 12  8  1 16
 14 14  6 15 17 17  8  7  9  5  4 17  4 12  7  1  7 12 12  9 10 17  2  1
  1  0  6 15  1 17  5  6  9 13  5  7 13 17 16 14 17 13  0  0  4 16  0 13
 17  8  8  5  3 19 10  4 17  3  9  3  0 17  1  4  5 13 10  6 18  0 15 10
  2  6 11  3  0 15  2 19  5 13  6  6 19 11 16  7 18 13  6 13  3 15  0 16
 19  1  6  4 16  6  2 15 17  8 14 18 18  5 18 15 13  3 18 17 19 13 14  8
 15  2  1 18 11 15 14  5  1  8  7  4 16  9  2 11 11 15 18 11 11  0 13 11
  9  6  9  7 15  3  3  4 15  9 12 13 18  8  1 13  8 17 19 15  3  1 19 17
  1 13 10 16  5  8 11 11 14  4  8 11 14  6 14  1 15  4 17  2  8 14 15 11
  3 19  8 10 12  2 14 14 14  1  9  8  6 16  6 13 16  8 13  5 15  6  1  7
  6 10  2  3 18 10  9  7 17  8 18  1  6 13 18  9 14

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Split into training and test data: 70% / 30%, with a fixed seed for repeatability
# and stratified on Y so both parts share the same class proportions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    test_size=0.3,
    random_state=1234,
    stratify=Y,
)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [17]:
# Data preprocessing (vectorisation): a count vectorizer and a TF-IDF transformer,
# both with default settings.
vectorizer, tfid = CountVectorizer(), TfidfTransformer()

In [21]:
# Fit the vectorizer on the training texts only, then reuse the fitted vectorizer
# to encode the test texts.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
# Author's note: this raised an error when the input was in DataFrame form.

In [22]:
# Fit the TF-IDF transformer on the training counts, then apply the fitted
# transformer to the test counts.
X_train_tfid = tfid.fit_transform(X_train_vec)
X_test_tfid = tfid.transform(X_test_vec)

In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

In [26]:
# (Multiclass) Naive Bayes: tune the smoothing parameter alpha with a 5-fold grid search.
# NOTE(review): X_train_tfid was produced by fitting the vectorizer and TF-IDF transformer
# on all of X_train before cross-validation, so the validation folds are not fully unseen;
# putting those steps in a Pipeline inside GridSearchCV would avoid that.
# NOTE(review): the recorded run selected alpha=0.01, the smallest value in this grid —
# consider extending the grid below 0.01.
nb = MultinomialNB()
param_grid = [{'alpha': np.linspace(0.01, 1, 100)}]
gs = GridSearchCV(
    estimator=nb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
gs.fit(X_train_tfid, Y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'alpha': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22,
       0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33,
       0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0....
       0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54, 0.55,
       0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66,
       0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77,
       0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88,
       0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99,
       1.  ])}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=

In [28]:
# Report the selected parameters and the score the grid search recorded for them
# (scored with 'accuracy' over the cross-validation folds).
print('best hyperparameter: ' + str(gs.best_params_))
print('accuracy: ' + format(gs.best_score_, '.2f'))

best hyperparameter: {'alpha': 0.01}
accuracy: 0.90


In [29]:
# Build the Naive Bayes model: take the best estimator found by the grid search.
naive_bayes_m = gs.best_estimator_

In [30]:
# Predict a class index for every document in the vectorised test set and show the result.
Y_predict = naive_bayes_m.predict(X_test_tfid)
print(Y_predict)

[12  7 15 ...  9  4 13]


In [31]:
# Accuracy of the selected model on the held-out test split.
accuracy_test = naive_bayes_m.score(X_test_tfid, Y_test)
print('Test dataset Accuracy: ' + format(accuracy_test, '.2f'))

Test dataset Accuracy: 0.90


In [32]:
# Compare actual and predicted class names for the first 10 test documents.
Y_predict = naive_bayes_m.predict(X_test_tfid)
# Slicing plus zip stops early instead of raising IndexError when the test split
# holds fewer than 10 documents; with 10 or more the output is unchanged.
for actual_idx, predicted_idx in zip(Y_test[:10], Y_predict[:10]):
    print('Actual: {0} || Predicted: {1}'.format(labels[actual_idx], labels[predicted_idx]))

Actual: comp.sys.ibm.pc.hardware || Predicted: sci.electronics
Actual: rec.autos || Predicted: rec.autos
Actual: soc.religion.christian || Predicted: soc.religion.christian
Actual: talk.politics.mideast || Predicted: talk.politics.mideast
Actual: sci.electronics || Predicted: sci.electronics
Actual: comp.sys.ibm.pc.hardware || Predicted: comp.sys.ibm.pc.hardware
Actual: talk.politics.guns || Predicted: talk.politics.guns
Actual: rec.motorcycles || Predicted: rec.motorcycles
Actual: sci.crypt || Predicted: sci.crypt
Actual: sci.crypt || Predicted: sci.crypt
