# 基于潜在狄里克雷分配（LDA）的内容主题挖掘 

In [6]:
import os
import tarfile  # tar压缩包库
if not os.path.exists('./news_data'):
    with tarfile.open('./datacode_for_book/chapter8/news_data.tar.gz') as tar:
        print(tar.getnames())
        for name in tar.getmembers():
            tar.extract(name,path='./datacode_for_book/chapter8/')
    

['./news_data/news.sohunews.010806.txt', './news_data/news.sohunews.020806.txt', './news_data/news.sohunews.030806.txt', './news_data/news.sohunews.040806.txt', './news_data/news.sohunews.050806.txt', './news_data/news.sohunews.060806.txt', './news_data/news.sohunews.070806.txt', './news_data/news.sohunews.080806.txt', './news_data/news.sohunews.110806.txt', './news_data/news.sohunews.120806.txt']


In [1]:
from bs4 import BeautifulSoup

# 全角转半角
def str_convert(content):
    '''
    将内容中的全角字符，包含英文字母、数字键、符号等转换为半角字符
    :param content: 要转换的字符串内容
    :return: 转换后的半角字符串
    '''
    new_str = ''
    for each_char in content:  # 循环读取每个字符
        code_num = ord(each_char)  # 读取字符的ASCII值或Unicode值
        if code_num == 12288:  # 全角空格直接转换
            code_num = 32
        elif (code_num >= 65281 and code_num <= 65374):  # 全角字符（除空格）根据关系转化
            code_num -= 65248
        new_str += chr(code_num)
    return new_str

def data_parse(data):
    '''
    从原始文件中解析出文本内容数据
    :param data: 包含代码的原始内容
    :return: 文本中的所有内容，列表型
    '''
    raw_code = BeautifulSoup(data, "lxml")  # 建立BeautifulSoup对象
    content_code = raw_code.find_all('content')  # 从包含文本的代码块中找到content标签
    content_list = []  # 建立空列表，用来存储每个content标签的内容
    for each_content in content_code:  # 循环读出每个content标签
        if len(each_content) > 0:  # 如果content标签的内容不为空
            raw_content = each_content.text  # 获取原始内容字符串
            convert_content = str_convert(raw_content)  # 将全角转换为半角
            content_list.append(convert_content)  # 将content文本内容加入列表
    return content_list

In [12]:
# 汇总所有内容
import os
print ('walk files and get content...')
all_content = []  # 总列表，用于存储所有文件的文本内容
for root, dirs, files in os.walk('./datacode_for_book/chapter8/news_data'):  # 分别读取遍历目录下的根目录、子目录和文件列表
    for file in files:  # 读取每个文件
        file_name = os.path.join(root, file)  # 将目录路径与文件名合并为带有完整路径的文件名
        with open(file_name,'r',encoding='utf-8') as f:  # 以只读方式打开文件
            data = f.read()  # 读取文件内容
        all_content.extend(data_parse(data))  # 从文件内容中获取文本并将结果追加到总列表

walk files and get content...


In [21]:
import jieba.posseg as pseg
# 中文分词
def jieba_cut(text):
    '''
    将输入的文本句子根据词性标注做分词
    :param text: 文本句子，字符串型
    :return: 符合规则的分词结果
    '''
    rule_words = ['z', 'vn', 'v', 't', 'nz', 'nr', 'ns', 'n', 'l', 'i', 'j', 'an',
                  'a']  # 只保留状态词、名动词、动词、时间词、其他名词、人名、地名、名词、习用语、简称略语、成语、形容词、名形词
    words = pseg.cut(text)  # 分词
    seg_list = []  # 列表用于存储每个文件的分词结果
    for word in words:  # 循环得到每个分词
        if word.flag in rule_words:
            seg_list.append(word.word)  # 将分词追加到列表
    return seg_list

In [22]:
# 获取每条内容的分词结果
print ('get word list...')
words_list = []  # 分词列表，用于存储所有文件的分词结果
for each_content in all_content:  # 循环读出每个文本内容
    words_list.append(list(jieba_cut(each_content)))  # 将文件内容的分词结果以列表的形式追加到列表

Building prefix dict from the default dictionary ...


get word list...


Dumping model to file cache C:\Users\longf\AppData\Local\Temp\jieba.cache
Loading model cost 0.838 seconds.
Prefix dict has been built succesfully.


NameError: name 'wordlist' is not defined

In [26]:
# 文本预处理
from gensim import corpora, models  # gensim的词频统计和主题建模模块
def text_pro(words_list, tfidf_object=None, training=True):
    '''
    gensim主题建模预处理过程，包含分词类别转字典、生成语料库和TF-IDF转换
    :param words_list: 分词列表，列表型
    :param tfidf_object: TF-IDF模型对象，该对象在训练阶段生成
    :param training: 是否训练阶段，用来针对训练和预测两个阶段做预处理
    :return: 如果是训练阶段，返回词典、TF-IDF对象和TF-IDF向量空间数据；如果是预测阶段，返回TF-IDF向量空间数据
    '''
    # 分词列表转字典
    dic = corpora.Dictionary(words_list)  # 将分词列表转换为字典形式
    print(('{:*^60}'.format('token & word mapping review:')))
    for i, w in list(dic.items())[:5]:  # 循环读出字典前5条的每个key和value，对应的是索引值和分词
        print(('token:%s -- word:%s' % (i, w)))
    # 生成语料库
    corpus = []  # 建立一个用于存储语料库的列表
    for words in words_list:  # 读取每个分词列表
        corpus.append(dic.doc2bow(words))  # 将每个分词列表转换为语料库词袋（bag of words）形式的列表
    print(('{:*^60}'.format('bag of words review:')))
    print((corpus[0]))  # 打印输出第一条语料库
    # TF-IDF转换
    if training == True:
        tfidf = models.TfidfModel(corpus)  # 建立TF-IDF模型对象
        corpus_tfidf = tfidf[corpus]  # 得到TF-IDF向量稀疏矩阵
        print(('{:*^60}'.format('TF-IDF model review:')))
        for doc in corpus_tfidf:  # 循环读出每个向量
            print(doc)  # 打印第一条向量
            break  # 跳出循环
        return dic, corpus_tfidf, tfidf
    else:
        return tfidf_object[corpus]

In [28]:
# 建立主题模型
print ('train topic model...')
dic, corpus_tfidf, tfidf = text_pro(words_list, tfidf_object=None, training=True)  # 训练集的文本预处理
num_topics = 3  # 设置主题个数
lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=num_topics)  # 通过LDA进行主题建模
print ('{:*^60}'.format('topic model review:'))
for i in range(num_topics):  # 输出每一类主题的结果
    print (lda.print_topic(i))  # 输出对应主题

train topic model...
****************token & word mapping review:****************
token:0 -- word:仇恨
token:1 -- word:侮辱
token:2 -- word:侵害
token:3 -- word:凶杀
token:4 -- word:危害
********************bag of words review:********************
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 2), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)]
********************TF-IDF model review:********************
[(0, 0.16762633852828174), (1, 0.16660204914253687), (2, 0.1643986382302142), (3, 0.168282481745965), (4, 0.16197667368712637), (5, 0.14602961468426073), (6, 0.16282320045073903), (7, 0.10154448591145282), (8, 0.12365275311464316), (9, 0.12399080729729553), (10, 0.16703117734810868), (11, 0.163124879458702), (12, 0.16844765669812112), (13, 0.16409043499326897), (14, 0.1662290891913951), (15, 0.1685028172752526), (16, 0

In [31]:
# 新数据集的主题模型预测
print ('topic forecast...')
with open('./datacode_for_book/chapter8/article.txt','r',encoding='utf-8') as f:  # 打开新的文本
    text_new = f.read()  # 读取文本数据
text_content = data_parse(data)  # 解析新的文本
words_list_new = jieba_cut(text_new)  # 将文本转换为分词列表
corpus_tfidf_new = text_pro([words_list_new], tfidf_object=tfidf, training=False)  # 新文本数据集的预处理
corpus_lda_new = lda[corpus_tfidf_new]  # 获取新的分词列表（文档）的主题概率分布
print ('{:*^60}'.format('topic forecast:'))
print (list(corpus_lda_new))

topic forecast...
****************token & word mapping review:****************
token:0 -- word:一鸣惊人
token:1 -- word:三剑客
token:2 -- word:上演
token:3 -- word:不败
token:4 -- word:专业培训
********************bag of words review:********************
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 2), (17, 1), (18, 1), (19, 3), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 3), (29, 2), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 2), (36, 1), (37, 2), (38, 1), (39, 1), (40, 2), (41, 2), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 2), (51, 1), (52, 1), (53, 1), (54, 2), (55, 3), (56, 1), (57, 1), (58, 1), (59, 2), (60, 1), (61, 1), (62, 2), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 2), (71, 1), (72, 1), (73, 4), (74, 1), (75, 1), (76, 1), (77, 7), (78, 5), (79, 2), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85,