In [9]:
# Modules
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import classification_report
import numpy as np

Load and view data

In [4]:
# Fetch the complete 20 Newsgroups corpus (subset="all" rather than a single split)
news = fetch_20newsgroups(subset="all")

# Pull out the raw message bodies and their integer topic labels,
# then list the human-readable topic names.
text, target = news["data"], news["target"]
print(f'There are the 20 topics that a message ("document") can belong to: {news["target_names"]}')

There are the 20 topics that a message ("document") can belong to: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [7]:
# Inspect a single post together with its topic label
i_sample = 1
sample_label = target[i_sample]                    # integer topic id of the post
sample_topic = news["target_names"][sample_label]  # matching newsgroup name
print(f'A post with topic number "{sample_label}", {sample_topic}')
print('')
print(text[i_sample])

A post with topic number "3", comp.sys.ibm.pc.hardware

From: mblawson@midway.ecn.uoknor.edu (Matthew B Lawson)
Subject: Which high-performance VLB video card?
Summary: Seek recommendations for VLB video card
Nntp-Posting-Host: midway.ecn.uoknor.edu
Organization: Engineering Computer Network, University of Oklahoma, Norman, OK, USA
Keywords: orchid, stealth, vlb
Lines: 21

  My brother is in the market for a high-performance video card that supports
VESA local bus with 1-2MB RAM.  Does anyone have suggestions/ideas on:

  - Diamond Stealth Pro Local Bus

  - Orchid Farenheit 1280

  - ATI Graphics Ultra Pro

  - Any other high-performance VLB card


Please post or email.  Thank you!

  - Matt

-- 
    |  Matthew B. Lawson <------------> (mblawson@essex.ecn.uoknor.edu)  |   
  --+-- "Now I, Nebuchadnezzar, praise and exalt and glorify the King  --+-- 
    |   of heaven, because everything he does is right and all his ways  |   
    |   are just." - Nebuchadnezzar, king of Babylon, 562 B

Split posts into train/test sets

In [8]:
# Partition the posts and their labels into training and evaluation subsets.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    text,
    target,
    random_state=0,
)

# Report how many messages landed in each subset
print(f"The training set has {len(X_train)} messages.")
print(f"The test set has {len(X_test)} messages.")

The training set has 14134 messages.
The test set has 4712 messages.


Create a feature representation. I chose to use Term Frequency - Inverse Document Frequency (TF-IDF).

In [16]:
%%time
# Produces a TF-IDF representation of the data

tfidfer = TfidfVectorizer()
# Learn the vocabulary and IDF weights from the training posts only, and
# vectorize them in the same pass: fit_transform gives the same matrix as
# fit() followed by transform() without tokenizing the training set twice.
X_train_tfidf = tfidfer.fit_transform(X_train)
# Reuse the vocabulary fitted on the training posts for the test posts,
# so no test data influences the learned features.
X_test_tfidf = tfidfer.transform(X_test)

CPU times: user 7.43 s, sys: 195 ms, total: 7.63 s
Wall time: 7.69 s


In [17]:
# Dimensions of the training feature matrix: (training messages, TF-IDF features)
X_train_tfidf.shape

(14134, 141276)

Now let's set up a classifier to predict the topics of posts. I chose to use Naive Bayes.

In [19]:
# Train a Multinomial Naive Bayes classifier (`mnb`) on the TF-IDF training
# features and labels, then evaluate it on the held-out test posts.
mnb = MultinomialNB()
mnb.fit(X_train_tfidf, y_train)

y_pred = mnb.predict(X_test_tfidf)

# Per-topic precision / recall / F1. Rows are labelled with the integer
# topic ids, which index into news["target_names"].
print("Results for MNB")
print("-"*60)
print(classification_report(y_test, y_pred))
print("-"*60)

Results for MBD
------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.88      0.73      0.80       205
           1       0.86      0.76      0.81       245
           2       0.84      0.81      0.82       250
           3       0.73      0.87      0.80       243
           4       0.94      0.82      0.88       255
           5       0.91      0.85      0.88       240
           6       0.94      0.74      0.83       249
           7       0.84      0.91      0.88       219
           8       0.96      0.93      0.95       246
           9       0.91      0.97      0.94       227
          10       0.95      0.97      0.96       287
          11       0.73      0.98      0.83       234
          12       0.90      0.76      0.82       247
          13       0.97      0.89      0.93       250
          14       0.89      0.97      0.93       240
          15       0.55      0.97      0.70       250
    