In [1]:
import numpy  as np
import lda

In [5]:
# 导入测试数据
X = lda.datasets.load_reuters()
print('文档数量%d, 词语数量%d' % X.shape)

文档数量395, 词语数量4258


In [11]:
# 词汇
vocab = lda.datasets.load_reuters_vocab()
print('语料词语前10个：', vocab[:10])
# 标题
title = lda.datasets.load_reuters_titles()
print('第1篇文档的标题是', title[0])

语料词语前10个： ('church', 'pope', 'years', 'people', 'mother', 'last', 'told', 'first', 'world', 'year')
第1篇文档的标题是 0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20


In [None]:
# 构建模型并训练数据
model = lda.LDA(n_topics=20, n_iter=2000, random_state=9)
model.fit(X)

In [18]:
# np.argsort() 作用：将数组按从小到大进行排列，并返回其索引值
a = [4, 21, 5, 1, 7, 6]
np.argsort(a)

array([3, 0, 2, 5, 4, 1])

In [25]:
# 查看主题-词语
topic_word = model.topic_word_
n_top_words = 10
for i, topic_dist in enumerate(topic_word):
    # 将每个主题中的所有词汇从小到大进行排列，并取出出最大的10个
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: police miami simpson versace cunanan york home city beach
Topic 1: pope paul trip church poland vatican catholic john visit
Topic 2: political minister government party last president prime leader against
Topic 3: pope vatican hospital surgery roman doctors operation appendix pontiff
Topic 4: years church people last year world first time during
Topic 5: against churchill sale letters bardot british former million papers
Topic 6: mother teresa order heart charity nuns calcutta missionaries sister
Topic 7: harriman clinton u.s ambassador churchill president paris pamela france
Topic 8: yeltsin russian russia kremlin moscow operation president heart communist
Topic 9: charles diana royal prince parker bowles camilla queen marriage
Topic 10: king prince years quebec irish married died bertil day
Topic 11: film festival poster hollywood director flynt madonna china people
Topic 12: germany german war nazi letter jews scientology book christian
Topic 13: city art museum century mil

In [32]:
# 查看文档主题,其结果是一个n*k的矩阵，n 是文档数，k 是主题数
doc_topic = model.doc_topic_
for i in range(10):
    # 前10篇文档，在20个主题中概率最大的是
    print("{} (top topic: {})".format(title[i], doc_topic[i].argmax()))

0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20 (top topic: 9)
1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany 1996-08-21 (top topic: 4)
2 INDIA: Mother Teresa's condition said still unstable. CALCUTTA 1996-08-23 (top topic: 6)
3 UK: Palace warns British weekly over Charles pictures. LONDON 1996-08-25 (top topic: 9)
4 INDIA: Mother Teresa, slightly stronger, blesses nuns. CALCUTTA 1996-08-25 (top topic: 6)
5 INDIA: Mother Teresa's condition unchanged, thousands pray. CALCUTTA 1996-08-25 (top topic: 6)
6 INDIA: Mother Teresa shows signs of strength, blesses nuns. CALCUTTA 1996-08-26 (top topic: 6)
7 INDIA: Mother Teresa's condition improves, many pray. CALCUTTA, India 1996-08-25 (top topic: 6)
8 INDIA: Mother Teresa improves, nuns pray for "miracle". CALCUTTA 1996-08-26 (top topic: 6)
9 UK: Charles under fire over prospect of Queen Camilla. LONDON 1996-08-26 (top topic: 9)


In [None]:
# 使用transfrom 预测更多的文章
doc_topic_test = model.transform(X_test)

# LDA 文档翻译
## 作用：隐狄利克雷模型使用API
## 用法及参数：lda.LDA(n_topics, n_iter=2000, alpha=0.1, eta=0.01, random_state=None, refresh=10)
- n_topics:主题数量
- n_iter: 最大迭代次数
- alpha: 主题分布的狄利克雷参数
- eta: 词分布的狄利克雷参数
## 属性
- components_：主题分布的点估计
- topic_word_:主题-词语矩阵
- nzw_: 在最终迭代中记录主题词分配的计数矩阵。
- ndz_: 在最终迭代中记录文档-主题的计数矩阵
- doc_topic_: 文档主题分布矩阵
- nz_: 最终迭代中主题分配计数的数组。
## 方法
- fit(X): 训练模型
- fit_transform(X): 训练并转换
- loglikelihood()： 计算对数似然
- transform(X, max_iter=20): 根据fit()拟合的模型对数据进行变换