In [1]:
import os
import time
import logging
import pickle
import tqdm
import json
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import collections
import numpy as np
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from numba import jit


2024-11-08 17:10:17.540 python[12499:84999] getMetalPluginClassForService: Failed to find bundle for accelerator bundle named: AGXMetalA12 errno: 0




In [2]:
BASE_DIR = '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling'

In [3]:
experiment_id = f"bertopic_experiment_{int(time.time())}"
experiment_dir = os.path.join(BASE_DIR, 'experiments', experiment_id)
if not os.path.exists(experiment_dir):
    os.makedirs(experiment_dir)

In [4]:
log_file_path = os.path.join(experiment_dir, f"bertopic_experiment_{experiment_id}.log")
logger = logging.getLogger()
if logger.hasHandlers():
    logger.handlers.clear()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
file_handler = logging.FileHandler(log_file_path)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
logger.info(f"启动实验 {experiment_id}，日志记录到 {log_file_path}")
print(f"启动实验 {experiment_id}，日志记录到 {log_file_path}")

2024-11-08 17:10:31,885 - INFO - 启动实验 bertopic_experiment_1731082231，日志记录到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1731082231/bertopic_experiment_bertopic_experiment_1731082231.log


启动实验 bertopic_experiment_1731082231，日志记录到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1731082231/bertopic_experiment_bertopic_experiment_1731082231.log


In [5]:
experiment_config = {
    "parameters": {
        "n_gram_range": (1, 2),
        "min_topic_size": 2,
        "nr_topics": "auto",
        "umap_params": {
            "n_neighbors": 15,
            "min_dist": 0.1,
            "n_components": 5,
            "random_state": 42
        }
    }
}
config_path = os.path.join(experiment_dir, 'config.json')
with open(config_path, 'w') as config_file:
    json.dump(experiment_config, config_file, indent=4)
logger.info(f"实验配置已保存到 {config_path}")
print(f"实验配置已保存到 {config_path}")

2024-11-08 17:10:31,931 - INFO - 实验配置已保存到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1731082231/config.json


实验配置已保存到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1731082231/config.json


In [6]:
logger.info("加载 Latin-BERT 嵌入模型...")
model_path = "/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/latin-bert/models/latin_bert"
tokenizer = AutoTokenizer.from_pretrained(model_path)
bert_model = AutoModel.from_pretrained(model_path)
def get_embeddings(texts):
    tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        embeddings = bert_model(**tokens).last_hidden_state.mean(dim=1)
    return embeddings

2024-11-08 17:10:31,947 - INFO - 加载 Latin-BERT 嵌入模型...


In [7]:
vectorizer_model = CountVectorizer(
    ngram_range=experiment_config["parameters"]["n_gram_range"],
    token_pattern=r"(?u)\b\w+\b"
)
custom_umap = UMAP(
    n_neighbors=experiment_config["parameters"]["umap_params"]["n_neighbors"],
    min_dist=experiment_config["parameters"]["umap_params"]["min_dist"],
    n_components=experiment_config["parameters"]["umap_params"]["n_components"],
    random_state=experiment_config["parameters"]["umap_params"]["random_state"]
)

In [8]:
testset_dir = os.path.join(BASE_DIR, 'data/testset')
documents = []
logger.info("加载测试集数据...")
test_files = [f for f in os.listdir(testset_dir) if f.endswith('.txt')]
for test_file in tqdm.tqdm(test_files, desc="Loading testset files"):
    file_path = os.path.join(testset_dir, test_file)
    with open(file_path, 'r', encoding='utf-8') as file:
        documents.append(file.read())
logger.info(f"加载了 {len(documents)} 个文档用于 BERTopic 实验。")
print(documents[:5])
if not documents:
    logger.error("文档列表为空，无法进行 BERTopic 分析。")
else:
    logger.info(f"文档列表包含 {len(documents)} 个文档。")

2024-11-08 17:10:32,589 - INFO - 加载测试集数据...
Loading testset files: 100%|██████████| 61/61 [00:00<00:00, 7556.86it/s]
2024-11-08 17:10:32,641 - INFO - 加载了 61 个文档用于 BERTopic 实验。
2024-11-08 17:10:32,642 - INFO - 文档列表包含 61 个文档。


['humanus infirmitas caro membrum romanus qui¬dam liber communis membrum exhibeo servio iniquitas immunditia iniquitas exhibeo membrum servio justitia sanc¬tificatio doctor gens hu¬manum loquor respondeo quae¬stio christianus propono debeo assequor vita aeternus saltem servio justitia sanctificatio servio iniquitas im¬munditiae iniquitas perfec¬ta iustitia requiro magnus dilectio humaniter dico2 saltem cum2 parvus fer¬ventia servio justitia prior ser¬viebatur immunditia consido unus¬quisque volo christianus acqui¬renda sanctificatio saltem sollicitus servio immunditia iniquitas converto servitium justitia derelinquo servitium iniqui¬tas consido fructus iniquitas cogito immunditiam pecco caro genero eru¬bescentiam finis pecco mors libero verus pecco fides christus fructus sancti¬ficatio finis verus vita aeternus stipendium pecco mors gratia deus sanc¬ti vito aeternus christus iesus attendo cum2 diligentia iniquitas caro immunditia servio asse¬quendum iniquitas cum2 vigilo labor adhibeo 

In [9]:
logger.info("初始化 BERTopic 模型...")
topic_model = BERTopic(
    vectorizer_model=CountVectorizer(ngram_range=experiment_config["parameters"]["n_gram_range"]),
    embedding_model=get_embeddings,
    umap_model=UMAP(**experiment_config["parameters"]["umap_params"]),
    min_topic_size=experiment_config["parameters"]["min_topic_size"],
    nr_topics=experiment_config["parameters"]["nr_topics"],
    language="multilingual"
)

2024-11-08 17:10:32,650 - INFO - 初始化 BERTopic 模型...


In [None]:
if documents:
    logger.info("开始训练 BERTopic 模型...")
    topics, probabilities = topic_model.fit_transform(documents)
    logger.info("BERTopic 模型训练成功。")
    unique_topics = set(topics)
    logger.info(f"生成了 {len(unique_topics)} 个唯一主题。")
    logger.info("开始进行主题的简单可视化...")
    try:
        topics_info = topic_model.get_topic_info()
        if len(topics_info) > 0:
            for topic_num in topics_info['Topic'][:10]:
                if topic_num != -1:
                    words_weights = topic_model.get_topic(topic_num)
                    words_str = ', '.join([word for word, _ in words_weights])
                    print(f"主题 {topic_num}: {words_str}")
                    logger.info(f"主题 {topic_num}: {words_str}")
        else:
            logger.warning("未生成有效的主题，无法进行可视化。")
    except Exception as e:
        logger.error(f"可视化时发生错误: {e}")
    logger.info("保存每个文档的主题分配结果...")
    document_topic_data = []
    for doc_idx, (topic, prob) in enumerate(zip(topics, probabilities)):
        document_name = os.path.basename(test_files[doc_idx])
        document_topic_data.append([document_name, topic, prob])
    df_document_topics = pd.DataFrame(document_topic_data, columns=["Document", "Assigned Topic", "Probability"])
    document_topics_csv_path = os.path.join(experiment_dir, 'bertopic_document_topic_distribution.csv')
    df_document_topics.to_csv(document_topics_csv_path, index=False)
    logger.info(f"每个文档的主题分配结果已保存至 {document_topics_csv_path}。")
else:
    logger.error("文档加载失败，无法进行 BERTopic 分析。")

2024-11-08 17:10:32,660 - INFO - 开始训练 BERTopic 模型...
2024-11-08 17:10:32,675 - INFO - Use pytorch device_name: mps
2024-11-08 17:10:32,676 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [None]:
fig = topic_model.visualize_topics()
fig.show()
fig = topic_model.visualize_hierarchy()
fig.show()

In [None]:
# Step 10: 评估模块
try:
    logger.info("开始评估 BERTopic 模型...")

    # 提取每个主题的前10个关键词
    bertopic_topics = [
        [word for word, _ in topic_model.get_topic(topic_num)]
        for topic_num in range(len(topic_model.get_topics()))
        if topic_model.get_topic(topic_num)
    ]

    # 使用 gensim.corpora.Dictionary 创建词典
    texts = [doc.split() for doc in documents]
    dictionary = Dictionary(texts)

    # Step 10.1: 计算主题一致性（Topic Coherence） - TC-NPMI
    logger.info("计算主题一致性 (TC-NPMI)...")
    coherence_model_npmi = CoherenceModel(topics=bertopic_topics, texts=texts, dictionary=dictionary, coherence='c_npmi')
    coherence_score_npmi = coherence_model_npmi.get_coherence()
    logger.info(f"平均主题一致性 (TC-NPMI): {coherence_score_npmi}")
    print(f"平均主题一致性 (TC-NPMI): {coherence_score_npmi}")

    # Step 10.2: 计算主题一致性（Topic Coherence） - TC-LCP
    logger.info("计算主题一致性 (TC-LCP)...")
    coherence_model_lcp = CoherenceModel(topics=bertopic_topics, texts=texts, dictionary=dictionary, coherence='c_uci')
    coherence_score_lcp = coherence_model_lcp.get_coherence()
    logger.info(f"平均主题一致性 (TC-LCP): {coherence_score_lcp}")
    print(f"平均主题一致性 (TC-LCP): {coherence_score_lcp}")

    # Step 10.3: 计算平均成对 Jaccard 相似度（Mean Pairwise Jaccard Similarity）
    logger.info("计算平均成对 Jaccard 相似度 (MPJ)...")

    @jit(nopython=True)
    def jaccard_similarity(set1, set2):
        intersection = set1 & set2
        union = set1 | set2
        return len(intersection) / len(union) if union else 0

    @jit(nopython=True)
    def calculate_avg_jaccard(bertopic_topics):
        jaccard_scores = [
            jaccard_similarity(set(bertopic_topics[i]), set(bertopic_topics[j]))
            for i in range(len(bertopic_topics))
            for j in range(i + 1, len(bertopic_topics))
        ]
        return np.mean(jaccard_scores)

    avg_jaccard = calculate_avg_jaccard(bertopic_topics)
    logger.info(f"平均成对 Jaccard 相似度 (MPJ): {avg_jaccard}")
    print(f"平均成对 Jaccard 相似度 (MPJ): {avg_jaccard}")

    # Step 10.4: 高频词汇分布分析
    logger.info("进行高频词汇分布分析...")

    # 统计所有文档中的词频
    word_freq = collections.Counter(word for doc in documents for word in doc.split())

    # 获取高频词汇
    high_freq_words = {word for word, _ in word_freq.most_common(50)}

    # 检查每个主题中高频词汇的比例
    high_freq_word_ratios = [
        sum(1 for word in topic if word in high_freq_words) / len(topic)
        for topic in bertopic_topics
    ]

    avg_high_freq_word_ratio = np.mean(high_freq_word_ratios)
    logger.info(f"平均高频词汇比例: {avg_high_freq_word_ratio}")
    print(f"平均高频词汇比例: {avg_high_freq_word_ratio}")

    # 保存评估结果
    evaluation_results_path = os.path.join(experiment_dir, 'bertopic_evaluation_results.txt')
    with open(evaluation_results_path, 'w') as eval_file:
        eval_file.write(f"平均主题一致性 (TC-NPMI): {coherence_score_npmi}\n")
        eval_file.write(f"平均主题一致性 (TC-LCP): {coherence_score_lcp}\n")
        eval_file.write(f"平均成对 Jaccard 相似度 (MPJ): {avg_jaccard}\n")
        eval_file.write(f"平均高频词汇比例: {avg_high_freq_word_ratio}\n")

    logger.info(f"BERTopic 模型评估结果已保存至 {evaluation_results_path}")
except Exception as e:
    logger.error(f"评估 BERTopic 模型时发生错误: {e}")
    raise