In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

cats = ['rec.motorcycles', 'rec.sport.baseball', 'comp.graphics', 'comp.windows.x','talk.politics.mideast',
        'soc.religion.christian','sci.electronics','sci.med']

news_df = fetch_20newsgroups(subset = 'all', remove = ('header','footer','quotes'), categories=cats, random_state = 0)

#LDA는 COUNTVECT만 가능
count_vect = CountVectorizer(max_df = 0.95, max_features = 1000, min_df = 2, stop_words = 'english', ngram_range = (1,2))

feat_vect = count_vect.fit_transform(news_df.data)
print('CountVectorizer shape : ', feat_vect.shape)

CountVectorizer shape :  (7862, 1000)


In [5]:
lda = LatentDirichletAllocation(n_components=8, random_state = 0)
lda.fit(feat_vect)

LatentDirichletAllocation(n_components=8, random_state=0)

In [8]:
print(lda.components_.shape)
lda.components_

(8, 1000)


array([[3.13483523e+02, 1.80694863e+02, 4.03990113e+01, ...,
        8.44355322e+01, 1.25001932e-01, 1.25001646e-01],
       [9.44043581e+01, 1.44080013e+01, 1.30242052e+02, ...,
        7.82561496e+01, 1.25016648e-01, 1.25014688e-01],
       [4.44766267e+00, 5.88923250e-01, 1.25170282e-01, ...,
        4.02190900e+01, 1.25003957e-01, 1.25003865e-01],
       ...,
       [9.40699411e+00, 4.55386620e+01, 1.25194439e-01, ...,
        3.24877033e+01, 1.25029032e-01, 1.25000579e-01],
       [2.10365037e+01, 7.02607423e+00, 1.35839302e+01, ...,
        2.20584629e+01, 1.25014981e-01, 1.25001025e-01],
       [1.25057672e-01, 1.25115975e-01, 1.25009770e-01, ...,
        4.22456002e+01, 1.25001380e-01, 1.25001457e-01]])

In [9]:
def display_topics(model, feature_names, no_top_words):
    for topic_index, topic in enumerate(model.components_):
        print('topic #', topic_index)
        
        topic_word_indexes = topic.argsort()[::-1]
        top_indexes = topic_word_indexes[:no_top_words]
        
        feature_concat =  ' '.join([feature_names[i] for i in top_indexes])
        print(feature_concat)
        
feature_names = count_vect.get_feature_names()
display_topics(lda, feature_names, 15)

topic # 0
edu medical 10 health information 1993 research pitt disease cancer pitt edu 00 new patients 12
topic # 1
said people know don didn just went like say time did going came told ac
topic # 2
don use just like good time make know way think does used organization people ve
topic # 3
file edu image graphics program use available window mit software windows ftp version jpeg server
topic # 4
armenian israel turkish jews armenians people israeli jewish armenia government turks world war turkey muslim
topic # 5
com organization don like just bike think netcom good ibm hp time com organization game ve
topic # 6
edu organization posting university host nntp nntp posting posting host organization university cs reply distribution ca thanks know
topic # 7
god people jesus church believe christ christian think organization christians does say edu bible sin
