In [46]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

import numpy as np

In [2]:
# Work with a four-topic slice of the corpus; both splits are fetched with the
# same category list and the same stripping of headers, footers and quotes.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=subset_name,
                       remove=('headers', 'footers', 'quotes'),
                       categories=categories)
    for subset_name in ('train', 'test')
)

In [3]:
# Show the label names followed by the distinct integer codes found in the targets.
print(newsgroups_train.target_names, set(newsgroups_train.target), sep='\n')

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
{np.int64(0), np.int64(1), np.int64(2), np.int64(3)}


In [4]:
# Raw documents and integer labels, one pair per split.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

In [6]:
# The vectorizer is fitted on the training documents only; the test documents
# are projected onto that same vocabulary.
tfidf = TfidfVectorizer(max_df=0.5, min_df=3, max_features=2000)
X_train_tf, X_test_tf = tfidf.fit_transform(X_train), tfidf.transform(X_test)
print(*(matrix.shape for matrix in (X_train_tf, X_test_tf)))

(2034, 2000) (1353, 2000)


In [7]:
# Pair the first 50 vocabulary entries with their TF-IDF weights in the first
# training document, printed on a single comma-separated line.
first_row = X_train_tf[0].toarray()[0, :50]
for idx, word in enumerate(tfidf.get_feature_names_out()[:50]):
    print(word, ':', first_row[idx], end=', ')

00 : 0.0, 000 : 0.0, 01 : 0.0, 04 : 0.0, 05 : 0.0, 10 : 0.0, 100 : 0.0, 1000 : 0.0, 11 : 0.0, 12 : 0.0, 128 : 0.0, 129 : 0.0, 13 : 0.0, 130 : 0.0, 14 : 0.0, 15 : 0.0, 16 : 0.0, 17 : 0.0, 18 : 0.0, 19 : 0.0, 1988 : 0.0, 1989 : 0.0, 1990 : 0.0, 1991 : 0.0, 1992 : 0.0, 1993 : 0.0, 20 : 0.0, 200 : 0.0, 202 : 0.0, 21 : 0.0, 22 : 0.0, 23 : 0.0, 24 : 0.0, 25 : 0.0, 256 : 0.0, 26 : 0.0, 27 : 0.0, 28 : 0.0, 2d : 0.0, 30 : 0.0, 300 : 0.0, 31 : 0.0, 32 : 0.0, 33 : 0.0, 34 : 0.0, 35 : 0.0, 39 : 0.0, 3d : 0.0, 40 : 0.0, 400 : 0.0, 

In [9]:
# Baseline naive Bayes with default settings; report train accuracy, then test accuracy.
nb_clf = MultinomialNB().fit(X_train_tf, y_train)
print(nb_clf.score(X_train_tf, y_train),
      nb_clf.score(X_test_tf, y_test),
      sep='\n')

0.8623402163225172
0.7390983000739099


In [12]:
# Predicted topic names for the first ten test documents.
pred = nb_clf.predict(X_test_tf[:10])
for label in pred:
    # target_names is indexed by the integer class code
    print(newsgroups_train.target_names[label])

sci.space
comp.graphics
comp.graphics
comp.graphics
comp.graphics
comp.graphics
sci.space
sci.space
alt.atheism
sci.space


In [19]:
# Same pipeline as before with a larger vocabulary cap (5000 instead of 2000).
# This rebinds tfidf, X_train_tf and X_test_tf for every cell that follows.
tfidf = TfidfVectorizer(
    max_features=5000,
    min_df=3,
    max_df=0.5,
)
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)
print(*(matrix.shape for matrix in (X_train_tf, X_test_tf)))

(2034, 5000) (1353, 5000)


In [25]:
# Refit naive Bayes with alpha=0.1 on the 5000-feature matrices and report
# train accuracy, then test accuracy.
nb_clf = MultinomialNB(alpha=0.1).fit(X_train_tf, y_train)
print(nb_clf.score(X_train_tf, y_train),
      nb_clf.score(X_test_tf, y_test),
      sep='\n')

0.9454277286135693
0.7760532150776053


In [33]:
# Compare predictions with ground truth for the first ten test documents.
pred = nb_clf.predict(X_test_tf[:10])
# zip() would stop after the ten predictions anyway; the slice makes that explicit.
for predicted, actual in zip(pred, y_test[:10]):
    print('pred:', newsgroups_train.target_names[predicted],
          '/ y_test:', newsgroups_train.target_names[actual],
          sep=' ')

pred: sci.space / y_test: sci.space
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: sci.space / y_test: sci.space
pred: sci.space / y_test: sci.space
pred: alt.atheism / y_test: alt.atheism
pred: sci.space / y_test: sci.space


In [56]:
def top_n_features(classifier, vectorizer, categories, n):
    """Print the ``n`` highest-scoring feature names for every category.

    The per-class score matrix is taken from ``classifier.feature_count_``
    when the classifier exposes it (as ``MultinomialNB`` does) and from
    ``classifier.coef_`` otherwise (linear models such as
    ``LogisticRegression``).

    Note that ``feature_count_`` ranks by accumulated feature weight per
    class, so frequent words tend to dominate that ranking, whereas ``coef_``
    ranks by learned weight.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose either ``feature_count_`` or ``coef_``, laid out as one
        row per class.
    vectorizer : fitted vectorizer
        Must provide ``get_feature_names_out()``; its order has to match the
        columns the classifier was trained on.
    categories : iterable of str
        Class names, in the same order as the classifier's classes.
    n : int
        Number of feature names to print per category.

    Returns
    -------
    None
        One line per category is written to stdout, formatted as
        ``"<category>: <name>, <name>, ..."``.

    Raises
    ------
    AttributeError
        If the classifier has neither ``feature_count_`` nor ``coef_``.
    """
    feature_names = np.asarray(vectorizer.get_feature_names_out())
    categories = list(categories)

    # Duck-type on the attribute rather than on one concrete class, so any
    # classifier that keeps per-class feature counts takes this branch.
    if hasattr(classifier, 'feature_count_'):
        scores = classifier.feature_count_
    else:
        scores = classifier.coef_
        # Two-class linear models store a single coefficient row describing
        # the second class; mirror it so that both categories get a ranking
        # instead of failing with an IndexError on the second row.
        if np.shape(scores)[0] == 1 and len(categories) == 2:
            scores = np.vstack([-scores[0], scores[0]])

    for i, category in enumerate(categories):
        top_n = np.argsort(-scores[i])[:n]
        # Outer double quotes keep this f-string valid on Python < 3.12, where
        # reusing the enclosing quote inside a replacement field is an error.
        print(f"{category}: {', '.join(feature_names[top_n])}")

In [57]:
# Twenty top-ranked terms per newsgroup according to the naive Bayes model.
top_n_features(
    classifier=nb_clf,
    vectorizer=tfidf,
    categories=newsgroups_train.target_names,
    n=20,
)

alt.atheism: you, not, are, be, this, have, as, what, they, if, god, do, but, your, or, an, so, was, we, on
comp.graphics: you, graphics, on, this, have, any, or, can, thanks, with, if, be, but, there, image, are, files, file, me, anyone
sci.space: space, on, you, be, was, this, as, they, are, have, at, would, or, if, from, not, but, nasa, with, there
talk.religion.misc: you, not, he, are, this, as, be, god, was, they, have, with, jesus, your, who, but, or, by, what, his


In [80]:
# Logistic regression on the same TF-IDF features; prints train and test
# accuracy side by side.
logreg_clf = LogisticRegression(C=2, max_iter=1000).fit(X_train_tf, y_train)
print(
    logreg_clf.score(X_train_tf, y_train),
    logreg_clf.score(X_test_tf, y_test),
)

0.9670599803343166 0.7671840354767184


In [81]:
# Compare logistic-regression predictions with ground truth for the first ten
# test documents.
pred = logreg_clf.predict(X_test_tf[:10])
# zip() would stop after the ten predictions anyway; the slice makes that explicit.
for predicted, actual in zip(pred, y_test[:10]):
    print('pred:', newsgroups_train.target_names[predicted],
          '/ y_test:', newsgroups_train.target_names[actual],
          sep=' ')

pred: sci.space / y_test: sci.space
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: sci.space / y_test: sci.space
pred: sci.space / y_test: sci.space
pred: alt.atheism / y_test: alt.atheism
pred: sci.space / y_test: sci.space


In [82]:
# Twenty highest-weighted terms per newsgroup according to logistic regression.
top_n_features(
    classifier=logreg_clf,
    vectorizer=tfidf,
    categories=newsgroups_train.target_names,
    n=20,
)

alt.atheism: atheism, religion, atheists, bobby, islam, deletion, islamic, atheist, motto, up, punishment, must, post, god, him, satan, people, an, isn, you
comp.graphics: graphics, image, file, computer, 3d, files, hi, looking, points, code, format, package, video, 68070, images, color, card, anyone, screen, windows
sci.space: space, nasa, orbit, launch, moon, spacecraft, shuttle, earth, sci, dc, lunar, solar, flight, mars, cost, get, satellite, like, year, at
talk.religion.misc: christian, god, christians, jesus, objective, fbi, he, his, blood, koresh, christ, children, order, see, rosicrucian, who, amorc, abortion, values, hudson
