In [1]:
#We are accessing a built in dataset called 20NG dataset, which contains 18,000 news posts on 20 categories. We will only use 4-category subset in this demo.
from sklearn.datasets import fetch_20newsgroups
import numpy as np

categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

labels = dataset.target
true_k = np.unique(labels).shape[0]
data = dataset.data  

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
labels.shape

(3387,)

In [5]:
type(data)

list

In [6]:
# This is to vectorize the text corpus. After these codes, the X object will be the input vector for machine learning models.
# When transform into vectors, we do NOT use the raw count of a word in a document. Instead, we use the word's tf-idf score in a document.
# max_df=0.5 means ingoring words that appear in more than 50% of the documents; min_df=2 means ignoring words that appear in less than 2 documents.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english', use_idf=True)

X = vectorizer.fit_transform(data)

In [7]:
#X in the previuos step is hidimentional data, we need to use some dimentionality reduction technique. In this case, we will use SVD (Singular Value Decomposition), which is a common matrix decomposition technique.
#We want to reduce the dimentionality to 5.
#We have to re-normalize after we run our SVD on the dataset.
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

n_components = 5
svd = TruncatedSVD(n_components)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X)

In [8]:
X.shape

(3387, 5)

In [9]:
#Randomly select 80% (3387*80% = 2710) rows from X as the training set
import numpy as np
training_idx = np.random.choice(X.shape[0], size=2710, replace=False)
X_training = X[training_idx, :]

In [10]:
#The remaining is the test set
test_idx = list(set(range(X.shape[0])) - set(training_idx))
X_test = X[test_idx, :]

In [11]:
#The same split for the labels list
labels_training = [labels[i] for i in training_idx]
labels_test = [labels[j] for j in test_idx]

In [12]:
#This is to use Naive Bayes Classifier to predict the label of each news article.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_training, labels_training)

GaussianNB(priors=None, var_smoothing=1e-09)

In [11]:
#This is the prediction result.
gnb.predict(X_test)

array([1, 3, 1, 1, 3, 2, 2, 1, 2, 1, 0, 1, 0, 2, 3, 2, 2, 2, 3, 0, 2, 2,
       2, 0, 3, 1, 0, 2, 1, 3, 3, 3, 0, 1, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2,
       0, 2, 1, 3, 2, 1, 1, 1, 0, 0, 1, 1, 3, 2, 3, 3, 0, 0, 1, 3, 1, 3,
       2, 3, 2, 2, 0, 3, 1, 2, 1, 0, 0, 3, 3, 1, 3, 1, 3, 0, 3, 2, 0, 0,
       0, 1, 3, 2, 1, 3, 0, 2, 1, 0, 0, 2, 2, 3, 3, 0, 2, 2, 1, 0, 1, 1,
       0, 2, 0, 3, 1, 0, 3, 3, 1, 3, 1, 0, 2, 1, 2, 2, 3, 0, 0, 0, 0, 0,
       0, 2, 3, 1, 3, 2, 2, 2, 0, 3, 0, 0, 2, 2, 1, 1, 2, 2, 3, 1, 3, 0,
       0, 1, 1, 0, 3, 1, 0, 3, 1, 3, 0, 1, 2, 1, 1, 2, 3, 3, 3, 1, 1, 1,
       1, 1, 1, 0, 2, 2, 1, 1, 2, 0, 0, 0, 1, 1, 2, 3, 2, 1, 1, 0, 1, 0,
       2, 1, 2, 0, 1, 1, 1, 2, 3, 2, 1, 2, 2, 0, 3, 2, 0, 1, 0, 2, 2, 0,
       3, 0, 0, 1, 0, 0, 1, 2, 3, 3, 0, 1, 2, 0, 1, 2, 1, 0, 0, 3, 2, 0,
       0, 1, 0, 2, 0, 2, 0, 0, 2, 0, 3, 2, 1, 1, 2, 1, 1, 0, 2, 1, 2, 1,
       2, 0, 3, 1, 0, 0, 3, 1, 0, 2, 2, 0, 2, 1, 3, 0, 0, 3, 3, 1, 1, 1,
       0, 1, 0, 2, 0, 0, 2, 0, 3, 2, 3, 2, 0, 2, 2,

In [12]:
#This is the confusion matrix.
import sklearn.metrics
sklearn.metrics.confusion_matrix(gnb.predict(X_test),labels_test)

array([[106,   4,   2,  58],
       [  2, 179,   7,   2],
       [  2,   5, 178,   3],
       [ 52,   4,   3,  70]])

In [13]:
#This is the accuracy score.
sklearn.metrics.accuracy_score(gnb.predict(X_test),labels_test)

0.7872968980797637