In [1]:
import jieba
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import warnings

In [2]:
# 读取停用词
with open('./resources/stop_words_zh_cn.txt', mode='r', encoding='utf-8') as stopword_f:
    stopwords = list(map(lambda x: x.rstrip('\n'), stopword_f.readlines()))
# 去除网页转义符
stopwords.extend(['nbsp','gt','lt','quot','amp'])
stopwords = frozenset(stopwords)

In [3]:
# 读取数据集
corpus = dataset = pd.read_csv('./resources/train.csv',sep="\t",names=["label","comment"],skiprows=1,encoding='utf-8')['comment'].tolist()
corpus[0]

'一如既往地好吃，希望可以开到其他城市'

In [4]:
# 切词
corpus = list(map(lambda x: ' '.join(jieba.cut(x)), corpus))
print(len(corpus))
corpus[0]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lonel\AppData\Local\Temp\jieba.cache
Loading model cost 2.619 seconds.
Prefix dict has been built succesfully.
10000


'一如既往 地 好吃 ， 希望 可以 开 到 其他 城市'

In [5]:
# 统计词频
warnings.filterwarnings("ignore")
count_vectorizer = CountVectorizer(stop_words=stopwords)
count_tf = count_vectorizer.fit_transform(corpus)
count_tf.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [6]:
# 所有的主题词
feature_names = count_vectorizer.get_feature_names()
feature_names[-10:]

['龙啸', '龙头', '龙抄手', '龙湖', '龙湾', '龙眼', '龙虾', '龙骨', '龟苓膏', '龟速']

In [7]:
topics = ['好评','差评']

In [8]:
# 进行 lda 转换
lda = LatentDirichletAllocation(n_topics=len(topics),max_iter=10,learning_method="online",
learning_offset=50,random_state=0)
docres = lda.fit_transform(count_tf)

In [9]:
# 每个主题每个词的重要指数
lda_comp = lda.components_
lda_comp

array([[2.21908771, 0.50721466, 0.51659712, ..., 1.20594492, 1.77215328,
        1.40764435],
       [0.50380219, 1.48958115, 1.5545252 , ..., 0.7296863 , 3.63065909,
        0.51953256]])

In [10]:
# 每次主题的前10重要主题词
for i, word in enumerate(topics):
    topic_comp = lda_comp[i]
    indexed_topic_words = list(zip(range(len(topic_comp)),topic_comp))
    indexed_topic_words = sorted(indexed_topic_words,key=lambda x: x[1],reverse=True)
    most_important_topic_words = [feature_names[x[0]] for x in indexed_topic_words[:10]]
    print(f"[{word}主题最重要主题词]:{','.join(most_important_topic_words)}")

[好评主题最重要主题词]:难吃,真的,好吃,东西,拉肚子,差评,味道,外卖,第一次,小时
[差评主题最重要主题词]:味道,不错,好吃,服务,菜品,环境,喜欢,下次,分量,老板


In [11]:
docres

array([[0.12043239, 0.87956761],
       [0.09808001, 0.90191999],
       [0.05178284, 0.94821716],
       ...,
       [0.05764773, 0.94235227],
       [0.0838649 , 0.9161351 ],
       [0.08468626, 0.91531374]])

In [12]:
# 分析文章中重要性排名前30
longest_doc_id, max_len = 0, 0
for i,c in enumerate(corpus):
    if len(c)>max_len:
        max_len = len(c)
        longest_doc_id = i
test_doc = corpus[longest_doc_id]
doc_bow = count_tf[longest_doc_id]
doc_topics = list(zip(feature_names,[docres[longest_doc_id,0] * lda_comp[0,i] + docres[longest_doc_id,1] * lda_comp[1,i] for i in range(len(feature_names))]))
top_words = [x[0] for x in sorted(doc_topics,key=lambda x: x[1], reverse=True)[:30]]
print(','.join(top_words))

味道,不错,好吃,服务,菜品,环境,喜欢,下次,分量,老板,服务态度,新鲜,特别,推荐,团购,热情,态度,几次,朋友,口味,划算,还会,价格,一如既往,感觉,实惠,位置,值得,蛋糕,干净
