In [17]:
import os # 系统库，用于处理文件和目录
import re # 正则表达式库，用于对文本进行清洗
# 中文分词库，用于对中文文本进行分词处理
import jieba
# 数据处理和分析库，用于处理和分析数据
import pandas as pd
# Gensim库中的corpora和models模块，用于主题建模和文本处理
from gensim import corpora, models
# Scikit-learn库中的CountVectorizer，用于将文本转换为词频矩阵
from sklearn.feature_extraction.text import CountVectorizer
# 绘图库，用于数据可视化
import matplotlib.pyplot as plt
# 忽略警告信息
import warnings
warnings.filterwarnings('ignore', category=Warning)
# 数值计算库，用于数值运算
import numpy as np
# pyLDAvis库中的gensim_models模块，用于可视化LDA主题模型
import pyLDAvis.gensim_models 
# Gensim库中的CoherenceModel、LdaModel和Dictionary，用于计算主题一致性和LDA模型
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
# 中文显示支持，设置字体为SimHei
plt.rcParams['font.sans-serif'] = ['SimHei']

In [18]:
# 新词文件的路径
newwords_list = 'newwords.txt'  # 替换为你的文件路径

# 从文件中读取新词并添加到jieba字典中
with open(newwords_list, 'r', encoding='utf-8') as file:
    for word in file:
        jieba.add_word(word.strip())

In [19]:
# 读取停用词文件，返回停用词列
def remove_stopwords():
    stop_word = [line.strip() for line in open('stopwords.txt', 'r', encoding='utf-8')]
    return stop_word

In [20]:
# 分词统计函数，对文本进行分词并统计词频
def tongji(text, stopwords):
    text = text.replace('\n', '').replace(' ', '').replace('\r', '').replace('\u3000', '')\
        .replace('\t', '').replace('\xa0', '').replace('\u2002', '').replace('\ufeff', '')
    words = jieba.cut(text)
    words = [str(w) for w in words if str(w) not in stopwords]
    for word in words:
        if word not in words_dic:
            words_dic[word] = 0
        words_dic[word] += 1
    return ' '.join(words)

In [21]:
# 文本分词函数，对文本进行分词并返回连接后的字符串
def tokenize(text, stopwords):
    text = text.replace('\n', '').replace(' ', '').replace('\r', '').replace('\u3000', '')\
        .replace('\t', '').replace('\xa0', '').replace('\u2002', '').replace('\ufeff', '')
    words = jieba.cut(text)
    words = [str(w) for w in words if str(w) not in stopwords]
    return ' '.join(words)

In [22]:
# 数据预处理函数，读取数据、处理时间列、分词、去除低频和高频词，并返回处理后的数据框
def data_process(limit_min=1, limit_max=100):
    stopwords = remove_stopwords()
    df = pd.read_txt("【文本】政府数据治理机构职能")

    # 应用tongji函数，统计词频
    df['统计'] = df['全文'].apply(tongji, args=(stopwords,))
    print(len(words_dic))
    
    # 剔除低频和高频词
    words_rm = [k for k, v in words_dic.items() if v <= limit_min or v >= limit_max]
    print(len(words_rm))
    del df["统计"]
    stopwords = stopwords + words_rm
    stopwords = set(stopwords)
    print(len(stopwords))
    
    # 应用tokenize函数，进行分词
    df['content'] = df['全文'].apply(tokenize, args=(stopwords,))
    
    # 删除缺失值
    df.dropna(subset=["content"], inplace=True)
    
    # 打印处理后的数据框信息
    documents = df['content'].tolist()
    documents = [i.split(" ") for i in documents]
    print(max(len(sublist) for sublist in documents))
    
    return df

In [23]:
# 读取数据
df = data_process(limit_min=5,limit_max=50)
df.to_excel("./res/数据清洗.xlsx",index=False)
df 

AttributeError: module 'pandas' has no attribute 'read_txt'

In [24]:
data = []
file_paths = os.listdir('【文本】政府数据治理机构职能')
for i, file in enumerate(file_paths):
    with open('【文本】政府数据治理机构职能/'+file) as f:
        text = f.read()
        text = ''.join(re.findall('[\u4e00-\u9fa5]', text)) # 只保留中文
        doc = clean_stopword(text, stopwords)
        data.append(doc)

NameError: name 'clean_stopword' is not defined