In [1]:
import logging
import sys
import numpy as np
sys.path.append('..')
from function import GibbsLDA
from function import vbLDA
from function.utils import convert_cnt_to_list, get_top_words, get_corpus_ids_cnt

In [2]:
# 文件路径
corpus_path = '../../../../resources/corpus/solutions/mining_qa/sentiment/0103.txt'
vocab_path = '../../../../resources/vocab/vocab_tmp' 
stopwords_path = '../../../../resources/vocab/stopwords_hit_copy.txt'
# 参数设计
num_doc = 10000 # 文档个数 
max_voca = 6000 # 词典个数
max_iter = 100 # 迭代轮数(TODO 参数优化)
n_topic = 30 # 主题个数(TODO 参数优化，借鉴CRP DPMM 等)

In [3]:
# 通过原始语料构造训练数据
lines, (voca, doc_ids, doc_cnt) = get_corpus_ids_cnt(corpus_path=corpus_path, vocab_path=vocab_path, stopwords_path=stopwords_path, num_doc=num_doc, max_voca=max_voca)
docs = convert_cnt_to_list(doc_ids, doc_cnt)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/dz/0yc_qtt536x7_tgb5wb5phg40000gn/T/jieba.cache
Loading model cost 0.465 seconds.
Prefix dict has been built successfully.


In [4]:
print(lines[:5])

['哈哈', '嘿嘿嘿', '嘻嘻', '嘿嘿', '我爱你']


In [5]:
model = GibbsLDA(len(docs), len(voca), n_topic)
model.fit(docs, max_iter=max_iter)
model.save('tmp.pkl')

2023-01-05 16:52:13 INFO:GibbsLDA:[ITER] 0,	elapsed time:0.14,	log_likelihood:-126437.41
2023-01-05 16:52:13 INFO:GibbsLDA:[ITER] 1,	elapsed time:0.14,	log_likelihood:-118751.24
2023-01-05 16:52:13 INFO:GibbsLDA:[ITER] 2,	elapsed time:0.14,	log_likelihood:-113948.64
2023-01-05 16:52:13 INFO:GibbsLDA:[ITER] 3,	elapsed time:0.14,	log_likelihood:-110044.93
2023-01-05 16:52:13 INFO:GibbsLDA:[ITER] 4,	elapsed time:0.14,	log_likelihood:-106923.89
2023-01-05 16:52:14 INFO:GibbsLDA:[ITER] 5,	elapsed time:0.14,	log_likelihood:-104778.37
2023-01-05 16:52:14 INFO:GibbsLDA:[ITER] 6,	elapsed time:0.16,	log_likelihood:-102465.01
2023-01-05 16:52:14 INFO:GibbsLDA:[ITER] 7,	elapsed time:0.14,	log_likelihood:-100574.51
2023-01-05 16:52:14 INFO:GibbsLDA:[ITER] 8,	elapsed time:0.14,	log_likelihood:-99335.32
2023-01-05 16:52:14 INFO:GibbsLDA:[ITER] 9,	elapsed time:0.14,	log_likelihood:-98187.82
2023-01-05 16:52:15 INFO:GibbsLDA:[ITER] 10,	elapsed time:0.14,	log_likelihood:-97303.62
2023-01-05 16:52:15 INF

In [6]:
# 显示每个主题的关键词
for ti in range(n_topic):
    top_words = get_top_words(model.TW, voca, ti, n_words=5)
    print('Topic', ti ,': ', ','.join(top_words))

Topic 0 :  也,就,不,说,嘛
Topic 1 :  爱,人,最,她,乱
Topic 2 :  吗,靠,别,能,被
Topic 3 :  不,人,吗,还,让
Topic 4 :  说,跟,没,又,哼
Topic 5 :  在,呀,那,讲,妹
Topic 6 :  唉,要,去,看,走
Topic 7 :  来,呸,给,把,呵
Topic 8 :  说,不,就,丑,骂
Topic 9 :  真,想,笑,揍,可
Topic 10 :  那,就,吗,吃,呀
Topic 11 :  说,还,么,看,咋
Topic 12 :  想,很,和,做,打
Topic 13 :  他,骂,要,跟,她
Topic 14 :  个,吵,臭,太,用
Topic 15 :  去,给,打,就,找
Topic 16 :  嗯,在,说,个,惹
Topic 17 :  啦,聋,八,拉,改
Topic 18 :  个,吗,猪,狗,帅
Topic 19 :  呢,笨,还,很,咋
Topic 20 :  太,这,也,吧,哇
Topic 21 :  吧,好,呀,哭,疯
Topic 22 :  不,啥,呀,对,说
Topic 23 :  个,大,鬼,才,怪
Topic 24 :  没,这,都,哎,还
Topic 25 :  好,哦,很,爱,哟
Topic 26 :  有,吗,病,吧,会
Topic 27 :  谁,呀,叫,都,烦
Topic 28 :  听,不,要,哈,着
Topic 29 :  不,都,跟,玩,懂


In [7]:
# 聚类结果展示前10条
topic_results = model.inference(docs, max_iter=10) 
cluster_results = []
new_cluster_results = []
for topic, line in zip(topic_results, lines):
    topic = topic.tolist()
    cluster_results.append((topic.index(max(topic)), line))
    new_cluster_results = sorted(cluster_results, key=lambda k:k[0])

In [8]:
cluster_results[:5]

[(0, '哈哈'), (0, '嘿嘿嘿'), (0, '嘻嘻'), (0, '嘿嘿'), (0, '我爱你')]

In [9]:
for item in new_cluster_results:
    print(item)

(0, '哈哈')
(0, '嘿嘿嘿')
(0, '嘻嘻')
(0, '嘿嘿')
(0, '我爱你')
(0, '哈哈哈哈哈哈哈哈')
(0, '哈哈哈哈哈')
(0, '哈哈哈')
(0, '讨厌')
(0, '哈哈哈哈哈哈哈')
(0, '哈哈哈哈')
(0, '我喜欢')
(0, '你真笨')
(0, '笨蛋')
(0, '哈哈哈哈哈哈哈哈哈哈哈哈')
(0, '哈哈哈哈哈哈哈哈哈哈')
(0, '嘿嘿嘿嘿嘿嘿嘿')
(0, '嘿嘿嘿嘿嘿嘿')
(0, '哼哼哼哼哼哼')
(0, '哈哈哈哈哈哈')
(0, '嘿嘿嘿嘿')
(0, '你太笨了')
(0, '你真棒')
(0, '真笨')
(0, '嘿嘿嘿嘿嘿嘿嘿嘿嘿嘿嘿嘿')
(0, '哼哼哼哼哼哼哼哼哼哼')
(0, '嘿嘿嘿嘿嘿嘿嘿嘿')
(0, '呵呵呵呵呵呵')
(0, '你是不是笨蛋')
(0, '嘿嘿嘿嘿嘿')
(0, '你是笨蛋')
(0, '神经病')
(0, '我哈哈')
(0, '太笨了')
(0, '你笨蛋')
(0, '垃圾')
(0, '喜欢')
(0, '哈哈哈哈哈哈哈哈哈')
(0, '我爱你我爱你')
(0, '哈哈哈哈嘿嘿')
(0, '你这个笨蛋')
(0, '你神经病啊')
(0, '你是大坏蛋')
(0, '你太烦人了')
(0, '这个笨蛋')
(0, '说你大爷')
(0, '神经病啊')
(0, '我生气了')
(0, '喜欢你')
(0, '哈哈啊')
(0, '别吵了')
(0, '你妹啊')
(0, '你太笨')
(0, '麻烦')
(0, '烦人')
(0, '废话')
(0, '帅哥')
(0, '呜呜')
(0, '哈哈哈哈哈哈哈哈哈哈哈哈哈哈')
(0, '哼哼哼哼哼哼哼哼')
(0, '老婆老婆我爱你')
(0, '我也喜欢你')
(0, '我不理你了')
(0, '你别说话了')
(0, '伤心太平洋')
(0, '笨蛋笨蛋')
(0, '我喜欢的')
(0, '你太丑了')
(0, '你别说话')
(0, '生气了')
(0, '漂亮的')
(0, '我烦你')
(0, '小可爱')
(0, '对不起')
(0, '太棒了')
(0, '你真帅')
(0, '你好看')
(0, '真棒')
(0, '混蛋')
(0, '坑爹')
(0, '伤心