In [None]:
////

In [48]:
import jieba
import json
import csv
from collections import Counter


def load_stopwords(file_path='./stopwords.txt'):
    """
    Load stop words from a text file into a set.

    The file is read line by line, one stop word per line. Surrounding
    whitespace is stripped, blank lines are skipped, and a leading UTF-8
    byte-order mark (if any) is discarded so that it cannot become part of
    the first stop word.

    :param file_path: path of the stop-word file, defaults to './stopwords.txt'
    :return: set of stop words
    :raises OSError: if the file cannot be opened
    """
    # 'utf-8-sig' decodes plain UTF-8 as well as UTF-8 that starts with a BOM.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        return {word for word in stripped_lines if word}


def load_user_dict(user_dict_path='./user_dict.txt'):
    """
    Register the words of a user dictionary file with jieba.

    Each non-blank line is parsed as a jieba user-dictionary entry
    ("word [frequency] [tag]"); only the word itself is passed to
    ``jieba.add_word`` and the optional frequency and part-of-speech tag
    are dropped. Blank lines are skipped.

    :param user_dict_path: path of the user dictionary, defaults to './user_dict.txt'
    :return: None
    :raises OSError: if the file cannot be opened
    """
    import re

    # Same entry layout jieba uses for its own dictionary files: the word,
    # then an optional numeric frequency, then an optional lowercase tag.
    entry_pattern = re.compile(r'^(.+?)( [0-9]+)?( [a-z]+)?$')

    # 'utf-8-sig' also accepts files saved with a UTF-8 byte-order mark.
    with open(user_dict_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                continue
            match = entry_pattern.match(entry)
            word = match.group(1).strip() if match else entry
            jieba.add_word(word)


def tokenize_text(text, stopwords):
    """
    Segment text with jieba and keep only multi-character Chinese words.

    A token is kept when it is longer than one character, every character
    lies in the CJK range U+4E00..U+9FFF, and it is not a stop word. Because
    of the range check, whitespace-only and punctuation-only tokens can never
    pass, so no separate test for them is needed.

    :param text: input text
    :param stopwords: collection of stop words to drop
    :return: list of the remaining tokens, in their original order
    """
    def is_chinese_word(token):
        # More than one character, all of them CJK ideographs.
        return len(token) > 1 and all('\u4e00' <= char <= '\u9fff' for char in token)

    return [
        token
        for token in jieba.cut(text)
        if is_chinese_word(token) and token not in stopwords
    ]


def process_video(video_info, stopwords):
    """
    Tokenize the comments and bullet comments of a single video.

    The "所有评论" and "所有弹幕" entries of ``video_info`` are each replaced,
    in place, by one flat list holding the tokens of all their texts. A
    field that is missing or null is treated as empty and ends up as an
    empty list.

    :param video_info: dict holding the video's comments and bullet comments
    :param stopwords: collection of stop words passed on to tokenize_text
    :return: the same ``video_info`` dict, after modification
    """
    # Both fields get identical treatment: comments first, then bullets.
    for field in ("所有评论", "所有弹幕"):
        tokens = []
        for text in video_info.get(field) or []:
            tokens.extend(tokenize_text(text, stopwords))
        video_info[field] = tokens
    return video_info


def process_partition(partition, stopwords=None):
    """
    Tokenize every video of one partition.

    :param partition: dict mapping video ids to video info dicts
    :param stopwords: optional collection of stop words; when omitted the
        stop words are loaded with load_stopwords(). Passing it lets a caller
        that handles several partitions read the stop-word file only once.
    :return: dict mapping each video id to its processed video info
    """
    if stopwords is None:
        stopwords = load_stopwords()
    return {
        video_id: process_video(video_info, stopwords)
        for video_id, video_info in partition.items()
    }


def analyze_word_frequency(video_info, stopwords=None, top_n=20):
    """
    Count word frequencies over a video's comments and bullet comments.

    Every entry of "所有评论" and then of "所有弹幕" is run through
    tokenize_text and the resulting tokens are counted.

    :param video_info: dict with the "所有评论" and "所有弹幕" lists
    :param stopwords: optional collection of stop words; when omitted it is
        loaded with load_stopwords(), once per call and only if there is
        text to analyze
    :param top_n: number of most frequent words to return, defaults to 20
    :return: list of (word, count) pairs, most frequent first
    :raises KeyError: if either list is missing from ``video_info``
    """
    texts = [*video_info["所有评论"], *video_info["所有弹幕"]]
    word_counts = Counter()
    if texts:
        # Load the stop words a single time instead of re-reading the file
        # for every comment and bullet.
        if stopwords is None:
            stopwords = load_stopwords()
        for text in texts:
            word_counts.update(tokenize_text(text, stopwords))
    return word_counts.most_common(top_n)


def process_json(input_json_path, output_json_path, output_csv_path):
    """
    Tokenize a whole JSON file and export the most frequent words per video.

    The input JSON maps partition names to partitions. Each partition is run
    through process_partition; for every processed video one CSV row is
    written holding the partition name, the video id and the words returned
    by analyze_word_frequency. The processed data is finally dumped to the
    output JSON file.

    :param input_json_path: path of the input JSON file
    :param output_json_path: path of the output JSON file
    :param output_csv_path: path of the output CSV file
    """
    with open(input_json_path, 'r', encoding='utf-8') as source:
        partitions = json.load(source)

    results = {}
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_out:
        csv_writer = csv.DictWriter(csv_out, fieldnames=['分区', '视频bv', '高频词'])
        csv_writer.writeheader()

        for name, videos in partitions.items():
            results[name] = process_partition(videos)
            for bv, info in results[name].items():
                frequent_words = [pair[0] for pair in analyze_word_frequency(info)]
                # The word list is handed to the csv writer as a list object,
                # so the cell holds its string representation.
                csv_writer.writerow({'分区': name, '视频bv': bv, '高频词': frequent_words})

    with open(output_json_path, 'w', encoding='utf-8') as sink:
        json.dump(results, sink, ensure_ascii=False, indent=4)


def main():
    """Run the preprocessing pipeline on the default input and output files."""
    process_json(
        './video_info.json',   # input JSON
        './output.json',       # processed JSON output
        'word_frequency.csv',  # per-video frequent-word CSV output
    )


if __name__ == "__main__":
    main()

In [None]:
////

In [52]:
import wordcloud
import json


def generate_word_cloud(partition_name, text, font_path):
    """
    Render a word cloud for one partition and save it as a PNG file.

    The image is written to ./worldcloud/<partition_name>_word_cloud.png;
    the output directory is created first if it does not exist yet.

    :param partition_name: partition name, used in the output file name
    :param text: text to build the word cloud from
    :param font_path: path of the local font file used for rendering
    """
    import os

    # 800x600 cloud rendered with the local font on a white background.
    wordcloud_obj = wordcloud.WordCloud(
        width=800,
        height=600,
        font_path=font_path,
        background_color='white'
    ).generate(text)

    # The directory name (spelled "worldcloud") is kept as is so existing
    # outputs and consumers keep working.
    output_file = f"./worldcloud/{partition_name}_word_cloud.png"
    # Saving fails when the target directory is missing, so create it.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    wordcloud_obj.to_file(output_file)


def process_json(json_file_path, font_path):
    """
    Read a JSON file and generate one word cloud per partition.

    For every partition the "所有评论" and "所有弹幕" entries of all its
    videos are collected and joined with single spaces, so that the last
    word of one list never fuses with the first word of the next list or
    of the next video.

    :param json_file_path: path of the JSON file
    :param font_path: path of the local font file
    :raises KeyError: if a video lacks "所有评论" or "所有弹幕"
    """
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for partition_name, partition in data.items():
        words = []
        for video in partition.values():
            words.extend(video["所有评论"])
            words.extend(video["所有弹幕"])

        # A single join keeps a separator between every pair of words.
        generate_word_cloud(partition_name, " ".join(words), font_path)


# Script entry point: generate the word clouds from the default files.
if __name__ == "__main__":
    json_file_path = './output.json'  # JSON file to read
    font_path = './YeZiGongChangTangYingHei-2.ttf'  # local font file passed on for rendering
    process_json(json_file_path, font_path)

In [50]:
# This is the version that includes the bullet comments (danmaku).
import pandas as pd
import json


def json_to_csv(json_file_path, csv_file_path):
    """
    Flatten the partition/video JSON into a CSV with one row per video.

    Each row holds a running number, the partition name, the video id and
    the video's comments and bullet comments joined into one
    space-separated string. The column set is fixed, so the CSV carries the
    full header even when the JSON contains no videos.

    :param json_file_path: path of the input JSON file
    :param csv_file_path: path of the CSV file to write
    """
    # 'utf-8-sig' accepts JSON files with or without a byte-order mark.
    with open(json_file_path, 'r', encoding='utf-8-sig') as f:
        data = json.load(f)

    rows = []
    for partition, videos in data.items():
        for video_id, video_info in videos.items():
            # Merge all comments and bullets; a missing field counts as empty.
            all_text = " ".join(video_info.get("所有评论", [])) + " " + " ".join(video_info.get("所有弹幕", []))
            rows.append({
                "分区": partition,
                "视频 ID": video_id,
                "所有评论和弹幕": all_text
            })

    # Naming the columns explicitly keeps the header stable for empty input.
    df = pd.DataFrame(rows, columns=["分区", "视频 ID", "所有评论和弹幕"])

    # Prepend a 1-based running number as the first column.
    df.insert(0, "序号", range(1, len(df) + 1))

    df.to_csv(csv_file_path, index=False)
    print(f"数据已保存至 {csv_file_path}")


# Call the function: convert ./output.json into ./output_withdanmaku_csv.csv.
json_file_path = './output.json'
csv_file_path = './output_withdanmaku_csv.csv'
json_to_csv(json_file_path, csv_file_path)

数据已保存至 ./output_withdanmaku_csv.csv
