# sklearn-LDA

## 预处理

In [None]:
import os
import pandas as pd
import re
import jieba
import jieba.posseg as psg
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print and collect the highest-weighted words of each topic.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``argsort``).
        feature_names: Indexable collection mapping a column position to
            its word.
        n_top_words: How many words to keep per topic.

    Returns:
        A list with one space-joined string of words per topic, ordered
        from the largest weight to the smallest.
    """
    summaries = []
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[pos] for pos in strongest]
        line = " ".join(words)
        summaries.append(line)
        print(line)
    return summaries

In [None]:
# Per-file LDA pipeline: for every .xlsx workbook in `folder_path`, build a
# bag-of-words matrix from the 'cut_comment' column, fit an 8-topic LDA model,
# label each row with its dominant topic, save the labelled table, and then
# plot perplexity against the number of topics.

# Folder that holds the input workbooks.
folder_path = "data_cut"
# Every .xlsx file in that folder. Sorted so the processing order (and hence
# which file's result is written last, see the note at to_excel below) does
# not depend on the arbitrary order returned by os.listdir.
file_list = sorted(f for f in os.listdir(folder_path) if f.endswith('.xlsx'))

# Settings shared by all files; defined once rather than on every iteration.
n_features = 1000  # vocabulary size cap; lower it for small corpora, raise it for large ones
n_topics = 8  # number of topics for the main model
n_top_words = 25  # words printed per topic
n_max_topics = 16  # the scan fits models with 1 .. n_max_topics - 1 topics
n_t = 15  # exclusive right end of the plotted topic counts; must not exceed n_max_topics

for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)
    data = pd.read_excel(file_path)
    # Missing cells become empty documents so the vectorizer can process them.
    data['cut_comment'] = data['cut_comment'].fillna('')

    # NOTE(review): stop_words='english' only removes English stop words; if
    # 'cut_comment' holds segmented Chinese text (the file imports jieba), a
    # custom stop-word list is probably wanted -- confirm.
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    max_features=n_features,
                                    stop_words='english',
                                    max_df=0.5,
                                    min_df=10)
    tf = tf_vectorizer.fit_transform(data["cut_comment"])

    # doc_topic_prior / topic_word_prior are left at their library defaults.
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                    learning_method='batch',
                                    learning_offset=50,
                                    random_state=0)
    lda.fit(tf)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        tf_feature_names = tf_vectorizer.get_feature_names_out()
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()
    topic_word = print_top_words(lda, tf_feature_names, n_top_words)

    # Document-topic matrix; each row's dominant topic is the column with the
    # largest value (argmax returns the first one on ties).
    topics = lda.transform(tf)
    topic = topics.argmax(axis=1).tolist()
    data['topic'] = topic
    # NOTE(review): every input file writes to this same path, so only the
    # result of the last processed file survives. Path kept unchanged because
    # other code may read it -- confirm whether a per-file name is wanted.
    data.to_excel("data_topic.xlsx", index=False)

    # Fit one model per candidate topic count and record its perplexity and
    # log-likelihood score; plexs[k] belongs to the model with k + 1 topics.
    plexs = []
    scores = []
    for i in range(1, n_max_topics):
        print(i)
        lda = LatentDirichletAllocation(n_components=i, max_iter=50,
                                        learning_method='batch',
                                        learning_offset=50, random_state=0)
        lda.fit(tf)
        plexs.append(lda.perplexity(tf))
        scores.append(lda.score(tf))

    # x holds topic counts 1 .. n_t - 1, whose perplexities are the first
    # n_t - 1 entries of plexs (slicing from index 1 would shift the curve
    # by one topic).
    x = list(range(1, n_t))
    plt.plot(x, plexs[:n_t - 1])
    plt.xlabel("number of topics")
    plt.ylabel("perplexity")
    plt.show()