In [None]:
!pip install nltk wordcloud matplotlib
!pip install spacy
!pip install "numpy<2" --force-reinstall
!python -m spacy download en_core_web_sm

### 用于提取词（全部词或仅名词）并保存到文件

In [None]:
import json
import spacy

# Load the `en_core_web_sm` spaCy pipeline once at module level;
# `extract_words` below calls this shared `nlp` object on each text chunk.
nlp = spacy.load("en_core_web_sm")

# Read a JSON file
def read_json_file(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return the decoded object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

# Collect the content of every user message
def extract_user_content(json_data):
    """Join the ``content`` of all ``role == "user"`` messages into one string.

    Args:
        json_data: Either a single conversation dict with a ``"messages"``
            list, or a list of such dicts. Any other value yields ``""``.

    Returns:
        The user message contents separated by single spaces, in input order.

    Malformed entries are skipped instead of raising: non-dict conversations
    or messages, a missing/non-list ``"messages"``, messages without a
    ``"role"``, and messages whose ``"content"`` is not a string (a ``None``
    content would otherwise make ``" ".join`` fail).
    """
    # Normalise both accepted shapes to "a list of conversations" so the
    # traversal below is written only once.
    if isinstance(json_data, dict):
        conversations = [json_data]
    elif isinstance(json_data, list):
        conversations = json_data
    else:
        conversations = []

    user_contents = []
    for conversation in conversations:
        if not isinstance(conversation, dict):
            continue
        messages = conversation.get("messages")
        if not isinstance(messages, (list, tuple)):
            continue
        for message in messages:
            if not isinstance(message, dict) or message.get("role") != "user":
                continue
            content = message.get("content")
            if isinstance(content, str):
                user_contents.append(content)
    return " ".join(user_contents)

# Extract words; `extract_all_words` selects every word vs. nouns only
def _iter_text_chunks(text, chunk_size):
    """Yield consecutive slices of ``text`` of at most ``chunk_size`` characters.

    A slice boundary that would fall inside a word is moved back to just after
    the previous whitespace character, so no word is cut in two. Only when a
    single whitespace-free run is longer than ``chunk_size`` is it split hard.
    """
    total = len(text)
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        if end < total and not text[end].isspace():
            # The character after the slice continues a word: back up to the
            # last whitespace inside this slice, if there is one.
            cut = end
            while cut > start and not text[cut - 1].isspace():
                cut -= 1
            if cut > start:
                end = cut
        yield text[start:end]
        start = end


def extract_words(text, extract_all_words, custom_filter=None, chunk_size=1000000):
    """Tokenise ``text`` with the module-level ``nlp`` pipeline and collect words.

    Args:
        text: The text to process.
        extract_all_words: If true, keep every token that is not punctuation;
            otherwise keep only tokens whose ``pos_`` is ``'NOUN'``.
        custom_filter: Optional iterable of words to drop. Matching is
            case-insensitive (both sides are lowercased).
        chunk_size: Maximum number of characters handed to ``nlp`` at once.
            The default of 1,000,000 is presumably chosen to stay within
            spaCy's default ``nlp.max_length`` — confirm if that is changed.

    Returns:
        A list of token texts in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive.

    Whitespace-only tokens are always skipped: the result is later written one
    word per line, and a token such as ``"\\n"`` would corrupt that format.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # A lowercased set gives constant-time, case-insensitive lookups.
    blocked = {word.lower() for word in custom_filter} if custom_filter else set()

    all_words = []
    for chunk in _iter_text_chunks(text, chunk_size):
        doc = nlp(chunk)
        for token in doc:
            if token.is_space:
                continue
            if extract_all_words:
                wanted = not token.is_punct
            else:
                wanted = token.pos_ == 'NOUN'
            if wanted and token.text.lower() not in blocked:
                all_words.append(token.text)
    return all_words

# Write the words to a file, one per line
def save_words_to_file(words, file_path):
    """Overwrite ``file_path`` (UTF-8) with each entry of ``words`` on its own line.

    Args:
        words: Iterable of strings to write.
        file_path: Destination path; any existing content is replaced.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        out.writelines(word + '\n' for word in words)

if __name__ == "__main__":
    # Read the JSON file and gather the text of all user messages
    json_file_path = './data_ok/data_sharegpt/data_sharegpt_8k.json'  # replace with your JSON file path
    # json_file_path = './data_test/test1.json'  # replace with your JSON file path
    json_data = read_json_file(json_file_path)
    user_text = extract_user_content(json_data)
    print("用户内容提取成功，内容长度:", len(user_text))
    # print("user_text:", user_text)

    # Mode switch: True extracts every word, False extracts nouns only
    extract_all_words = False

    # Optional list of words to drop from the result
    # custom_filter = ['obj', 'file', 'object', 'model']
    custom_filter = []
    words = extract_words(user_text, extract_all_words, custom_filter)
    # The output file name depends on the mode chosen above
    word_file_path = 'A_all_words8k.txt' if extract_all_words else 'A_nouns8k.txt'
    save_words_to_file(words, word_file_path)
    print(f"{'词' if extract_all_words else '名词'}提取并保存成功，数量: {len(words)}")


### 手动删除一些词

In [None]:
def remove_word_from_file(file_path, words_to_remove):
    """Rewrite ``file_path`` in place without the words in ``words_to_remove``.

    Each line is split on whitespace, the listed words are dropped, and the
    remaining words are re-joined with single spaces. Lines left with no words
    are omitted from the rewritten file.

    Args:
        file_path: UTF-8 text file to clean.
        words_to_remove: Container of words to delete (exact, case-sensitive match).

    Returns:
        None. Success and failure are both reported with ``print``; a missing
        file or any other exception is caught rather than propagated.
    """
    try:
        # Load everything first; the same path is reopened for writing below.
        with open(file_path, 'r', encoding='utf-8') as src:
            raw_lines = src.readlines()

        kept_lines = []
        for raw in raw_lines:
            survivors = [tok for tok in raw.split() if tok not in words_to_remove]
            if not survivors:
                continue  # nothing left on this line
            kept_lines.append(' '.join(survivors) + '\n')

        # Write the cleaned content back over the original file
        with open(file_path, 'w', encoding='utf-8') as dst:
            dst.writelines(kept_lines)

        print(f"已成功从文件 {file_path} 中移除所有 '{words_to_remove}'")
    except FileNotFoundError:
        print(f"未找到文件 {file_path}。")
    except Exception as e:
        print(f"处理文件时出现错误: {e}")


# Usage example
file_path = '.txt'  # placeholder — replace with the path of the word file to clean
words_to_remove = []  # words to delete (none listed here)
remove_word_from_file(file_path, words_to_remove)

In [None]:
# 统计频率
from collections import Counter


def count_word_frequency(file_path, top_n=100):
    """Return the most frequent words in a one-word-per-line text file.

    Args:
        file_path: UTF-8 file in which each line holds one word.
        top_n: How many ``(word, count)`` pairs to return, most frequent
            first. ``None`` returns every distinct word. Defaults to 100.

    Returns:
        A list of ``(word, count)`` tuples, or ``None`` if the file is missing
        or cannot be processed (the error is reported with ``print``).

    Lines are stripped of surrounding whitespace; blank lines are ignored so
    that the empty string is never reported as a word.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            stripped_lines = (line.strip() for line in file)
            words = [word for word in stripped_lines if word]
        return Counter(words).most_common(top_n)
    except FileNotFoundError:
        print(f"错误：文件 {file_path} 未找到。")
    except Exception as e:
        print(f"错误：发生了未知错误 {e}。")
    # Reached only after an error was reported above.
    return None


if __name__ == "__main__":
    # Analyse the file the extraction cell actually wrote. That cell stores its
    # output path in `word_file_path`, which is 'A_nouns8k.txt' when
    # `extract_all_words` is False, so a hard-coded 'A_all_words8k.txt' would
    # not exist on a fresh run. If the extraction cell has not been run in this
    # kernel, fall back to the previous default path.
    file_path = globals().get('word_file_path', 'A_all_words8k.txt')
    result = count_word_frequency(file_path)
    if result:
        # Print the words as a quoted, comma-separated list
        # (presumably for pasting into a filter/removal list — confirm).
        formatted_output = ','.join([f"'{word}'" for word, _ in result])
        print(formatted_output)

### 用于生成词云并保存

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build a word cloud from a text file and save it as an image
def generate_wordcloud_from_file(noun_file_path, wordcloud_image_path):
    """Render the text in ``noun_file_path`` as a word cloud image.

    Args:
        noun_file_path: UTF-8 text file whose full contents feed the word cloud.
        wordcloud_image_path: Where the rendered image is written.

    The cloud is 800x400 pixels on a white background.
    """
    with open(noun_file_path, 'r', encoding='utf-8') as source:
        corpus = source.read()
    cloud = WordCloud(width=800, height=400, background_color='white').generate(corpus)
    cloud.to_file(wordcloud_image_path)

if __name__ == "__main__":
    import os

    # Use the file the extraction cell actually wrote. That cell stores its
    # output path in `word_file_path`, which is 'A_nouns8k.txt' when
    # `extract_all_words` is False, so a hard-coded 'A_all_words8k.txt' would
    # not exist on a fresh run. If the extraction cell has not been run in this
    # kernel, fall back to the previous default path.
    noun_file_path = globals().get('word_file_path', 'A_all_words8k.txt')  # word list to visualise
    # Name the image after its input so nouns and all-words clouds do not
    # overwrite each other ('A_all_words8k.txt' -> 'A_all_words8k.png').
    wordcloud_image_path = os.path.splitext(noun_file_path)[0] + '.png'
    generate_wordcloud_from_file(noun_file_path, wordcloud_image_path)
    print("词云生成并保存成功")