In [1]:
#Import the libraries
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint

In [9]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the
# last one; later cells rely on this to show several results at once.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Get the training set. No category filter is passed here; on this run the
# call triggered a ~14 MB download of the dataset.
newsgroups_train = fetch_20newsgroups(subset='train')

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
# Inspect the container type returned by fetch_20newsgroups.
type(newsgroups_train)

sklearn.utils.Bunch

In [4]:
# Pretty-print the category labels, one per line.
pprint([label for label in newsgroups_train.target_names])

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [5]:
# Print the shape of the filenames array, then of the target array.
for per_document_array in (newsgroups_train.filenames, newsgroups_train.target):
    print(per_document_array.shape)

(11314,)
(11314,)


In [6]:
# Peek at the first ten integer labels; each indexes into target_names.
newsgroups_train.target[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [10]:
# Reload the training set restricted to two categories.
# NOTE: this rebinds newsgroups_train, replacing the full training set loaded earlier.
cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
# Each bare expression below is displayed in turn: names, shapes, first labels.
list(newsgroups_train.target_names)
newsgroups_train.filenames.shape
newsgroups_train.target.shape
newsgroups_train.target[:10]

['alt.atheism', 'sci.space']

(1073,)

(1073,)

array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0])

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# The four categories used for the classification experiment below.
categories = ['alt.atheism', 'talk.religion.misc','comp.graphics', 'sci.space']
# Rebinds newsgroups_train again, now to the four-category training subset.
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)

In [12]:
# Fit the TF-IDF vectorizer on the training documents and vectorize them in one step.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
# Displayed shape is (number of documents, number of vocabulary terms).
vectors.shape

(2034, 34118)

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [14]:
# Load the test split for the same four categories used for training.
newsgroups_test = fetch_20newsgroups(subset='test',categories=categories)

In [15]:
# transform (not fit_transform): reuse the vectorizer already fitted on the training data.
vectors_test = vectorizer.transform(newsgroups_test.data)

In [16]:
# Train a multinomial naive Bayes classifier on the training vectors and labels.
clf = MultinomialNB(alpha=.01)
# Kept as a bare expression so the fitted estimator's repr is displayed.
clf.fit(vectors, newsgroups_train.target)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [17]:
# Predict a category index for every test document.
pred = clf.predict(vectors_test)

In [18]:
# Macro-averaged F1 on the unmodified test set (headers, footers and quotes included).
metrics.f1_score(newsgroups_test.target, pred, average='macro')

0.88213592402729568

In [19]:
# Reload the test split with headers, footers and quoted text removed.
# NOTE: rebinds newsgroups_test, replacing the unstripped test set.
newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=categories)

In [20]:
# Vectorize the stripped test documents with the vectorizer that was fitted
# on the unstripped training text.
vectors_test = vectorizer.transform(newsgroups_test.data)

In [21]:
# Predict with the classifier trained on the unstripped training data.
pred = clf.predict(vectors_test)

In [22]:
# Macro-averaged F1 on the stripped test set. Arguments are passed in
# (y_true, y_pred) order, matching the other f1_score calls in this notebook;
# the macro-F1 value itself is the same either way.
metrics.f1_score(newsgroups_test.target, pred, average='macro')

0.77310350681274775

In [23]:
# Repeat the whole pipeline with metadata stripped from the training data too.
# NOTE: rebinds newsgroups_train, vectors, clf, vectors_test and pred.
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
# Refit the existing vectorizer on the stripped training text.
vectors = vectorizer.fit_transform(newsgroups_train.data)
clf = MultinomialNB(alpha=.01)
# Bare expression: the fitted estimator's repr is displayed.
clf.fit(vectors, newsgroups_train.target)
# newsgroups_test is still the stripped test set loaded in an earlier cell.
vectors_test = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(vectors_test)
metrics.f1_score(newsgroups_test.target, pred, average='macro')

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

0.76995175184521725

In [24]:
import numpy as np
def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted feature names for each category.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose a 2-D per-class feature weight array, one row per
        category, as ``feature_log_prob_`` (naive Bayes) or ``coef_``.
    vectorizer : fitted vectorizer
        Must provide ``get_feature_names_out()`` or the legacy
        ``get_feature_names()``; entry ``j`` names feature column ``j``.
    categories : iterable of str
        Category labels, in the same order as the rows of the weight array.

    Returns
    -------
    None
        One line per category is printed as ``"<category>: <names>"``, with
        names ordered from lowest to highest weight among the top ten.
    """
    # Newer scikit-learn releases only provide get_feature_names_out();
    # fall back to the legacy accessor for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    feature_names = np.asarray(names)

    # Naive Bayes models no longer expose coef_ in current scikit-learn, and
    # the old alias had a single row for two-class problems. Prefer
    # feature_log_prob_, which always has one row per class.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_

    for i, category in enumerate(categories):
        # argsort is ascending, so the last ten indices are the largest weights.
        top10 = np.argsort(weights[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))

In [25]:
# List the top features per category for the classifier trained on stripped text.
# With no stop-word filtering, the output below is dominated by common words.
show_top10(clf, vectorizer, newsgroups_train.target_names)

alt.atheism: not in and it you is that of to the
comp.graphics: graphics you in it is for of and to the
sci.space: for that it space is in and of to the
talk.religion.misc: not it in you is and that to of the
