In [1]:
import re
import numpy as np
import pandas as pd
import pyLDAvis.sklearn

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import pairwise_distances
from nltk.corpus import stopwords

In [2]:
data = fetch_20newsgroups(categories=['rec.autos','rec.motorcycles', 'sci.crypt', 
                                      'sci.electronics','sci.med','sci.space','talk.politics.guns',
                                      'talk.politics.mideast','talk.religion.misc'])
reg = re.compile(r'[0-9]+')
data.data = [reg.sub("", i) for i in data.data]

vectorizer = CountVectorizer(stop_words=stopwords.words('english'), max_df=0.3, min_df=10)
vectorized_data = vectorizer.fit_transform(data.data)

In [3]:
model = LatentDirichletAllocation(n_components=20)
model.fit(vectorized_data)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=20, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [31]:
pylda = pyLDAvis.sklearn.prepare(model, vectorized_data, vectorizer, mds='mmds')
#pylda

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [25]:
X_train, X_test, y_train, y_test = train_test_split(vectorized_data, data.target, test_size=0.1)

In [32]:
model = LogisticRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)

In [33]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.94      0.90      0.92        68
           1       0.93      0.93      0.93        60
           2       0.98      0.94      0.96        53
           3       0.91      1.00      0.95        50
           4       0.94      0.98      0.96        62
           5       0.97      0.96      0.97        74
           6       0.93      0.95      0.94        40
           7       0.97      0.93      0.95        61
           8       1.00      0.97      0.99        38

   micro avg       0.95      0.95      0.95       506
   macro avg       0.95      0.95      0.95       506
weighted avg       0.95      0.95      0.95       506



In [53]:
args = np.argsort(model.coef_, axis=1)

In [54]:
inverse = {j:i for i, j in vectorizer.vocabulary_.items()}

In [56]:
for i in range(len(data.target_names)):
    print('CLASS:', data.target_names[i])
    print('TOP 10:\n')
    for j in range(10):
        print(inverse[args[i][-j]])
    print()

CLASS: rec.autos
TOP 10:

bike
car
cars
toyota
auto
automotive
dealer
ford
eliot

CLASS: rec.motorcycles
TOP 10:

car
dod
bike
bikes
motorcycle
ride
motorcycles
riding
sale
bmw

CLASS: sci.crypt
TOP 10:

gun
clipper
encryption
key
code
tapped
crypto
nsa
pgp
gtoal

CLASS: sci.electronics
TOP 10:

space
circuit
motorola
power
electronics
mhz
ee
tv
usl
grace

CLASS: sci.med
TOP 10:

space
doctor
cancer
pitt
medical
disease
msg
treatment
vaked
information

CLASS: sci.space
TOP 10:

car
space
orbit
moon
dc
launch
planets
earth
rockets
sunset

CLASS: talk.politics.guns
TOP 10:

space
gun
guns
waco
firearms
atf
weapons
batf
ranch
cnn

CLASS: talk.politics.mideast
TOP 10:

distribution
israeli
israel
serdar
turkish
argic
jewish
jews
iran
angmar

CLASS: talk.religion.misc
TOP 10:

thanks
christian
morality
bible
god
koresh
homosexuality
beast
frank
happened

