In [15]:
import os
import time
import logging
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from transformers import AutoTokenizer, AutoModel
import torch
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora
from tqdm import tqdm
import pandas as pd

In [None]:
# Select the compute device: the MPS backend when torch reports it as
# available, otherwise the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(
    "Using MPS backend on Apple Silicon."
    if device.type == "mps"
    else "MPS backend not available, using CPU instead."
)

In [None]:
# Project root. It can be overridden through the CUSANUS_BASE_DIR environment
# variable; when the variable is unset the original local path is used.
BASE_DIR = os.environ.get(
    'CUSANUS_BASE_DIR',
    '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling'
)

# Unique experiment ID built from the current local time (second resolution).
experiment_id = f"latinbert_{time.strftime('%Y%m%d%H%M%S')}"
experiment_dir = os.path.join(BASE_DIR, 'experiments', 'latinbert', experiment_id)

# Create the experiment directory. exist_ok=True replaces the separate
# exists() check and makes re-running the cell within the same second harmless.
os.makedirs(experiment_dir, exist_ok=True)

# Log file lives inside the experiment directory.
log_file_path = os.path.join(experiment_dir, f"{experiment_id}.log")

# Configure the root logger with one file handler and one console handler.
logger = logging.getLogger()
# Detach AND close handlers left over from a previous run of this cell;
# clearing the list alone would leave the earlier log file open.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
# Messages contain non-ASCII text, so the file encoding is fixed explicitly
# instead of relying on the platform default.
file_handler = logging.FileHandler(log_file_path, encoding='utf-8')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

logger.info(f"启动实验 {experiment_id}，日志记录到 {log_file_path}")
print(f"启动实验 {experiment_id}，日志记录到 {log_file_path}")


In [None]:
# Experiment configuration. It is written to config.json in the experiment
# directory so each run records the settings it was produced with.
# NOTE(review): earlier comments described min_topic_size, n_neighbors and
# n_components as having been "increased", yet 2 / 10 / 2 are at or below
# BERTopic's documented defaults (10 / 15 / 5) — confirm the intended values.
experiment_config = {
    "parameters": {
        "n_gram_range": (1, 2),  # topic keywords may be unigrams or bigrams
        "min_topic_size": 2,  # smallest cluster that is accepted as a topic
        "nr_topics": "auto",  # let BERTopic determine the number of topics
        "umap_params": {
            "n_neighbors": 10,  # size of the local neighbourhood used by UMAP
            "min_dist": 0.1,  # minimum spacing between points in the reduced space
            "n_components": 2,  # dimensionality of the reduced space
            "random_state": 42  # fixed seed so runs are repeatable
        }
    }
}

config_path = os.path.join(experiment_dir, 'config.json')
# Explicit encoding keeps the file identical across platforms.
# (json.dump stores the n_gram_range tuple as a JSON list.)
with open(config_path, 'w', encoding='utf-8') as config_file:
    json.dump(experiment_config, config_file, indent=4)

logger.info(f"实验配置已保存到 {config_path}")
print(f"实验配置已保存到 {config_path}")



In [None]:
# Load the local Latin BERT checkpoint and move the model to the selected device.
logger.info("加载 Latin-BERT 嵌入模型...")
# Derived from BASE_DIR instead of repeating the absolute project path, so the
# model location follows the project root.
model_path = os.path.join(BASE_DIR, 'latin-bert', 'models', 'latin_bert')
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path).to(device)


In [None]:
# Embedding helper for the Latin BERT model.
def get_latin_bert_embeddings(texts, batch_size=32):
    """Encode texts as fixed-size vectors by mean-pooling Latin BERT token states.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.
    Inputs are truncated to 256 tokens. The mean is weighted by the attention
    mask so padding positions do not contribute; an unmasked mean would make
    a text's vector depend on how much padding its batch needed.

    Args:
        texts: Sequence of strings to embed (must support ``len`` and slicing).
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A numpy array with one row per input text.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    if len(texts) == 0:
        # np.vstack([]) would raise a far less helpful ValueError further down.
        raise ValueError("get_latin_bert_embeddings() needs at least one text to embed")

    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = list(texts[start:start + batch_size])
        tokens = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=256).to(device)
        with torch.no_grad():
            hidden_states = model(**tokens).last_hidden_state
            # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
            mask = tokens["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
            # clamp guards against division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1)
            batch_embeddings = (hidden_states * mask).sum(dim=1) / token_counts
        embeddings.append(batch_embeddings.cpu().numpy())
    return np.vstack(embeddings)



In [None]:
# Read the paragraph-level test set from disk.
testset_path = os.path.join(BASE_DIR, 'data/testset_paragraphs_level.json')
with open(testset_path, 'r', encoding='utf-8') as json_file:
    testset_data = json.load(json_file)

# Flatten the nested structure into four parallel lists: the paragraph text
# (kept twice, as `documents` and `paragraph_contents`), the id of the owning
# document, and the 1-based position of the paragraph within that document.
documents, paragraph_contents = [], []
document_ids, paragraph_nums = [], []
for document in testset_data["documents"]:
    numbered_paragraphs = enumerate(document["paragraphs"], start=1)
    for paragraph_num, paragraph in numbered_paragraphs:
        # Paragraphs without a "content" field are skipped, but they still
        # consume a paragraph number.
        if "content" not in paragraph:
            continue
        documents.append(paragraph["content"])
        paragraph_contents.append(paragraph["content"])
        document_ids.append(document["document_id"])
        paragraph_nums.append(paragraph_num)

logger.info(f"从测试集加载了 {len(documents)} 个段落")


In [None]:
# Generate one embedding per paragraph in `documents` with the Latin BERT helper.
# The resulting array is passed to BERTopic's fit_transform as precomputed embeddings.
document_embeddings = get_latin_bert_embeddings(documents)

In [None]:
# Build the CountVectorizer and UMAP models from the experiment configuration.
# (This cell used to be followed by a verbatim copy of itself; the copy is removed.)
vectorizer_model = CountVectorizer(
    ngram_range=experiment_config["parameters"]["n_gram_range"],
    # Treat every run of word characters as a token, including single letters.
    token_pattern=r"(?u)\b\w+\b"
)

umap_params = experiment_config["parameters"]["umap_params"]
# NOTE(review): no `metric` is passed, so UMAP uses its own default distance,
# whereas BERTopic's built-in UMAP uses metric='cosine' — confirm which is intended.
custom_umap = UMAP(
    n_neighbors=umap_params["n_neighbors"],
    min_dist=umap_params["min_dist"],
    n_components=umap_params["n_components"],
    random_state=umap_params["random_state"]
)


In [None]:
# Assemble the BERTopic pipeline. Both `embedding_model` and `language` are
# None because the embeddings are computed separately and supplied at fit time.
topic_model = BERTopic(
    language=None,  # no language-specific embedding model
    embedding_model=None,  # do not use the default embedding model
    umap_model=custom_umap,
    vectorizer_model=vectorizer_model,
    nr_topics=experiment_config["parameters"]["nr_topics"],
    min_topic_size=experiment_config["parameters"]["min_topic_size"]
)


In [None]:
# Fit BERTopic on the precomputed embeddings, print the leading topics and
# export the per-paragraph topic assignments as CSV and JSON.
if documents:
    print("Starting BERTopic model training...")
    logger.info("开始训练 BERTopic 模型...")

    # Hand the precomputed numpy embeddings to fit_transform.
    topics, probabilities = topic_model.fit_transform(documents, embeddings=document_embeddings)
    logger.info("BERTopic 模型训练成功。")
    print("BERTopic model training completed.")

    # Show the keywords of the first ten rows of get_topic_info(), skipping
    # topic -1.
    topics_info = topic_model.get_topic_info()
    for topic_num in topics_info['Topic'][:10]:
        if topic_num != -1:
            words_weights = topic_model.get_topic(topic_num)
            words_str = ', '.join([word for word, _ in words_weights])
            print(f"主题 {topic_num}: {words_str}")
            logger.info(f"主题 {topic_num}: {words_str}")

    # Resolve each topic's keywords once instead of once per paragraph.
    topic_keywords = {}
    for assigned_topic in set(topics):
        words_weights = topic_model.get_topic(assigned_topic)
        # A falsy result (no such topic) yields an empty keyword list rather
        # than a TypeError during unpacking.
        topic_keywords[int(assigned_topic)] = (
            [word for word, _ in words_weights] if words_weights else []
        )

    # Build the CSV rows and the JSON records in a single pass.
    document_topic_data = []
    document_topic_distributions = []
    for doc_idx, (topic, prob) in enumerate(zip(topics, probabilities)):
        document_id = document_ids[doc_idx]
        paragraph_num = paragraph_nums[doc_idx]
        # Plain int so json.dump accepts the value even if it is a numpy integer.
        topic_id = int(topic)
        document_topic_data.append([document_id, paragraph_num, topic_id, prob])
        document_topic_distributions.append({
            "Document": document_id,
            "Paragraph": int(paragraph_num),
            "Content": paragraph_contents[doc_idx],
            "Topic": topic_id,
            "Topic Keywords": topic_keywords[topic_id],
            "Probability": float(prob)
        })

    # Save the assignments as CSV.
    df_document_topics = pd.DataFrame(document_topic_data, columns=["Document", "Paragraph", "Assigned Topic", "Probability"])
    document_topics_csv_path = os.path.join(experiment_dir, 'bertopic_document_topic_distribution.csv')
    df_document_topics.to_csv(document_topics_csv_path, index=False)
    logger.info(f"每个文档的主题分配结果已保存至 {document_topics_csv_path}。")

    # Save the assignments, with paragraph text and topic keywords, as JSON.
    json_output_path = os.path.join(experiment_dir, 'document_topic_distributions.json')
    with open(json_output_path, 'w', encoding='utf-8') as json_file:
        json.dump(document_topic_distributions, json_file, ensure_ascii=False, indent=4)
    logger.info(f"文档的主题分配情况已保存至 {json_output_path}")


In [None]:
# Export the BERTopic figures as standalone HTML files.
try:
    topics_fig_path = os.path.join(experiment_dir, 'bertopic_topics.html')
    fig = topic_model.visualize_topics()
    fig.write_html(topics_fig_path)
    logger.info(f"主题可视化图表已保存至 {topics_fig_path}")

    # Build the hierarchical clustering figure.
    fig_hierarchy = topic_model.visualize_hierarchy()
    hierarchy_fig_path = os.path.join(experiment_dir, 'bertopic_hierarchy.html')
    # Write the file before displaying it, so a failure in show() cannot
    # prevent the HTML from being saved.
    fig_hierarchy.write_html(hierarchy_fig_path)
    logger.info(f"层次聚类图表已保存至 {hierarchy_fig_path}")
    fig_hierarchy.show()
except Exception as e:
    # logger.exception also writes the traceback to the log; the re-raise
    # keeps the failure visible in the notebook.
    logger.exception(f"保存可视化图表时发生错误: {e}")
    raise


In [None]:
# Evaluate the fitted model: topic coherence (NPMI) and topic diversity.
logger.info("开始评估 BERTopic 模型...")

# Collect the keywords of every topic. The ids are read from get_topics()
# (excluding topic -1) instead of being guessed with range(len(...)).
bertopic_topics = []
for topic_num in sorted(t for t in topic_model.get_topics() if t != -1):
    topic = topic_model.get_topic(topic_num)
    if topic:  # skip a falsy result (topic not found)
        # Drop empty-string keywords; BERTopic appears to use them as fillers
        # for topics with few terms — TODO confirm.
        keywords = [word for word, _ in topic if word]
        if keywords:
            bertopic_topics.append(keywords)

# Tokenize the documents with the CountVectorizer's own analyzer so that the
# coherence corpus uses the same tokens as the topic keywords; a plain
# whitespace split produces different tokens, so keywords can be missing
# from the dictionary.
analyzer = vectorizer_model.build_analyzer()
texts = [analyzer(doc) for doc in documents]
dictionary = corpora.Dictionary(texts)

# Topic coherence (NPMI).
logger.info("计算主题一致性 (NPMI)...")
coherence_model_npmi = CoherenceModel(topics=bertopic_topics, texts=texts, dictionary=dictionary, coherence='c_npmi')
coherence_score_npmi = coherence_model_npmi.get_coherence()
logger.info(f"平均主题一致性 (NPMI): {coherence_score_npmi}")
print(f"平均主题一致性 (NPMI): {coherence_score_npmi}")

# Topic diversity: share of unique keywords among all topic keywords.
unique_words = set()
total_words = 0
for topic in bertopic_topics:
    unique_words.update(topic)
    total_words += len(topic)
# With no keywords the ratio is undefined; report NaN instead of dividing by zero.
topic_diversity = len(unique_words) / total_words if total_words else float("nan")
logger.info(f"主题多样性: {topic_diversity}")
print(f"主题多样性: {topic_diversity}")

# WEPS score.
logger.info("计算 WEPS 评分...")

# Embedding helper: one vector per keyword of a topic.
def get_average_embedding_for_topic(topic_words, tokenizer, model, device):
    """Embed each keyword of one topic.

    Despite the name, the result holds one row per keyword, not a single
    vector for the whole topic: the averaging is over the tokens of each
    keyword. The mean is weighted by the attention mask so that the padding
    added to shorter keywords does not contribute.

    Args:
        topic_words: List of keyword strings belonging to one topic.
        tokenizer: Callable tokenizer; its result must offer ``.to(device)``
            and an ``"attention_mask"`` entry.
        model: Model whose output exposes ``last_hidden_state``.
        device: Device the tokenized batch is moved to.

    Returns:
        A numpy array with one row per entry of ``topic_words``.
    """
    tokens = tokenizer(topic_words, padding=True, truncation=True, return_tensors="pt", max_length=256).to(device)
    with torch.no_grad():
        hidden_states = model(**tokens).last_hidden_state
        # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
        mask = tokens["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        # clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1)
        embeddings = (hidden_states * mask).sum(dim=1) / token_counts
    return embeddings.cpu().numpy()

# Embed the keywords of every topic (the helper returns one row per keyword).
topic_embeddings = []
for topic in bertopic_topics:
    topic_embedding = get_average_embedding_for_topic(topic, tokenizer, model, device)
    topic_embeddings.append(topic_embedding)

if len(topic_embeddings) < 2:
    # A between-topic similarity needs at least two topics.
    similarity_matrix = None
    weps = float("nan")
    logger.warning("WEPS is undefined for fewer than two topics; reporting NaN.")
else:
    # Row ranges of each topic inside the stacked keyword matrix.
    boundaries = np.cumsum([0] + [len(block) for block in topic_embeddings])
    topic_embeddings = np.concatenate(topic_embeddings)
    similarity_matrix = cosine_similarity(topic_embeddings)

    # WEPS as similarity BETWEEN topics: for every pair of distinct topics,
    # average the cosine similarity over all cross-topic keyword pairs, then
    # average over the topic pairs. A mean over the whole matrix would also
    # count each keyword against itself and against keywords of its own topic.
    pair_scores = []
    n_topics = len(boundaries) - 1
    for first in range(n_topics):
        for second in range(first + 1, n_topics):
            cross_block = similarity_matrix[
                boundaries[first]:boundaries[first + 1],
                boundaries[second]:boundaries[second + 1],
            ]
            pair_scores.append(cross_block.mean())
    weps = float(np.mean(pair_scores))

logger.info(f"WEPS: {weps}")
print(f"WEPS: {weps}")

# Save the evaluation results. The labels are non-ASCII, so the encoding is
# set explicitly rather than left to the platform default.
evaluation_results_path = os.path.join(experiment_dir, 'bertopic_evaluation_results.txt')
with open(evaluation_results_path, 'w', encoding='utf-8') as eval_file:
    eval_file.write(f"平均主题一致性 (NPMI): {coherence_score_npmi}\n")
    eval_file.write(f"主题多样性: {topic_diversity}\n")
    eval_file.write(f"WEPS: {weps}\n")

logger.info(f"BERTopic 模型评估结果已保存至 {evaluation_results_path}")

In [None]:
# Write the generated topics together with the evaluation metrics to one file.
results_file_path = os.path.join(experiment_dir, 'bertopic_results.txt')
with open(results_file_path, 'w', encoding='utf-8') as f:
    # Topic keywords, one topic per line. The label is the position of the
    # topic in `bertopic_topics`.
    f.write("BERTopic 生成的主题：\n")
    for idx, topic in enumerate(bertopic_topics):
        topic_str = f"Topic {idx}: {', '.join(topic)}"
        f.write(topic_str + '\n')
    f.write("\n")

    # Evaluation metrics. WEPS is included so this summary lists the same
    # three metrics as bertopic_evaluation_results.txt.
    f.write(f"平均主题一致性 (NPMI): {coherence_score_npmi}\n")
    f.write(f"主题多样性: {topic_diversity}\n")
    f.write(f"WEPS: {weps}\n")

logger.info(f"生成的主题和评估结果已保存至 {results_file_path}")