In [None]:
import os
import time
import logging
import pickle
import tqdm
import json
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd

In [None]:
# 设置工作目录
BASE_DIR = '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling'

In [62]:
# 生成唯一实验ID
experiment_id = f"bertopic_experiment_{int(time.time())}"
experiment_dir = os.path.join(BASE_DIR, 'experiments', experiment_id)

# 创建实验目录
if not os.path.exists(experiment_dir):
    os.makedirs(experiment_dir)

# 获取当前 experiment_id 的日志文件路径
log_file_path = os.path.join(experiment_dir, f"bertopic_experiment_{experiment_id}.log")

# 配置日志记录，使每个实验的日志记录到独立的文件中
logger = logging.getLogger()
if logger.hasHandlers():
    logger.handlers.clear()  # 清除现有的处理器，避免重复添加

logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

# 添加文件日志处理器
file_handler = logging.FileHandler(log_file_path)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# 添加控制台日志处理器
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

# 确保在配置 logging 后马上打印一条信息，便于确认新的日志文件被创建
logger.info(f"启动实验 {experiment_id}，日志记录到 {log_file_path}")
print(f"启动实验 {experiment_id}，日志记录到 {log_file_path}")

2024-11-08 00:51:55,768 - INFO - 启动实验 bertopic_experiment_1731023515，日志记录到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1731023515/bertopic_experiment_bertopic_experiment_1731023515.log


启动实验 bertopic_experiment_1731023515，日志记录到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1731023515/bertopic_experiment_bertopic_experiment_1731023515.log


In [63]:
# 实验配置
experiment_config = {
    "parameters": {
        "n_gram_range": (1, 2),
        "min_topic_size": 2,
        "nr_topics": "auto",
        "umap_params": {
            "n_neighbors": 15,
            "min_dist": 0.1,
            "n_components": 5,
            "random_state": 42
        }
    }
}

config_path = os.path.join(experiment_dir, 'config.json')
with open(config_path, 'w') as config_file:
    json.dump(experiment_config, config_file, indent=4)

logger.info(f"实验配置已保存到 {config_path}")
print(f"实验配置已保存到 {config_path}")

2024-11-08 00:51:55,791 - INFO - 实验配置已保存到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1731023515/config.json


实验配置已保存到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1731023515/config.json


In [64]:
# 加载预训练的 Latin BERT 模型
logger.info("加载 Latin-BERT 嵌入模型...")
model_path = "/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/latin-bert/models/latin_bert"
tokenizer = AutoTokenizer.from_pretrained(model_path)
bert_model = AutoModel.from_pretrained(model_path)

def get_embeddings(texts):
    """
    使用 Latin-BERT 模型为文本生成嵌入。
    """
    tokens = tokenizer(
        texts, padding=True, truncation=True, return_tensors="pt", max_length=512
    )
    with torch.no_grad():
        embeddings = bert_model(**tokens).last_hidden_state.mean(dim=1)  # 取最后一层的平均表示
    return embeddings

2024-11-08 00:51:55,803 - INFO - 加载 Latin-BERT 嵌入模型...


In [65]:
# 创建 CountVectorizer 和 UMAP
vectorizer_model = CountVectorizer(
    ngram_range=experiment_config["parameters"]["n_gram_range"],  # 可调整的 n-gram 范围
    token_pattern=r"(?u)\b\w+\b"
)
custom_umap = UMAP(
    n_neighbors=experiment_config["parameters"]["umap_params"]["n_neighbors"],
    min_dist=experiment_config["parameters"]["umap_params"]["min_dist"],
    n_components=experiment_config["parameters"]["umap_params"]["n_components"],
    random_state=experiment_config["parameters"]["umap_params"]["random_state"]
)


In [66]:
# 加载测试集文档
testset_dir = os.path.join(BASE_DIR, 'data/testset')
documents = []

logger.info("加载测试集数据...")
test_files = [f for f in os.listdir(testset_dir) if f.endswith('.txt')]
for test_file in tqdm.tqdm(test_files, desc="Loading testset files"):
    file_path = os.path.join(testset_dir, test_file)
    with open(file_path, 'r', encoding='utf-8') as file:
        documents.append(file.read())

logger.info(f"加载了 {len(documents)} 个文档用于 BERTopic 实验。")

# 打印一些文档内容进行检查
print(documents[:5])

# 检查文档列表是否为空
if not documents:
    logger.error("文档列表为空，无法进行 BERTopic 分析。")
else:
    logger.info(f"文档列表包含 {len(documents)} 个文档。")

2024-11-08 00:51:56,561 - INFO - 加载测试集数据...
Loading testset files: 100%|██████████| 61/61 [00:00<00:00, 224.61it/s]
2024-11-08 00:51:56,846 - INFO - 加载了 61 个文档用于 BERTopic 实验。
2024-11-08 00:51:56,847 - INFO - 文档列表包含 61 个文档。


['induimini dominus iesum christum romanus epistula dominicus quoad intellego aliqual iesus secundus adam filius deus andreas declaro aegeae lego his¬toria lombardica tertius folium considero epistula aposto¬lus hora somnus surgo probo apostolus dilectio com¬pletio lex prae¬cepto dilectio propior complicantur om¬nia quoad propior subjungo plenitudo lex dilectio scio hora somnus sur¬gere dilectio plenitudo lex scio surgen¬dum somnus vigilanter attenden¬dum propior salus cum2 credo expono tempus credo fides recipio salus mors christus conformabimur exspecto dico2 credo salus lex spiri¬tualiter intellego scien¬tes spiritual intellego dilectio¬nem propior salus unus mando1 dilectio adimpleo diligo nolo malus infero salus intel¬lectu diligo removeo difficilis potestas pauper dives nobilis ignobilis graecus barbaris diligo plenitu¬do lex deus adimplemus mandatum ser¬vamus vita ingredior salus magister veritas volo1 vita ingredior servo mandatum somnus sopor teneo dilectio plenitudo lex ignor

In [67]:
# 创建 BERTopic 模型
logger.info("初始化 BERTopic 模型...")
topic_model = BERTopic(
    vectorizer_model=CountVectorizer(ngram_range=experiment_config["parameters"]["n_gram_range"]),
    embedding_model=get_embeddings,
    umap_model=UMAP(**experiment_config["parameters"]["umap_params"]),
    min_topic_size=experiment_config["parameters"]["min_topic_size"],
    nr_topics=experiment_config["parameters"]["nr_topics"],
    language="multilingual"
)

2024-11-08 00:51:56,875 - INFO - 初始化 BERTopic 模型...


In [68]:
# 训练 BERTopic 模型
if documents:
    logger.info("开始训练 BERTopic 模型...")
    topics, probabilities = topic_model.fit_transform(documents)
    logger.info("BERTopic 模型训练成功。")

    # 检查生成的主题数量
    unique_topics = set(topics)
    logger.info(f"生成了 {len(unique_topics)} 个唯一主题。")

    # 可视化主题，直接显示生成的主题和前10个主题词
    logger.info("开始进行主题的简单可视化...")
    try:
        topics_info = topic_model.get_topic_info()
        if len(topics_info) > 0:
            for topic_num in topics_info['Topic'][:10]:
                if topic_num != -1:  # 排除噪声主题
                    words_weights = topic_model.get_topic(topic_num)
                    words_str = ', '.join([word for word, _ in words_weights])
                    print(f"主题 {topic_num}: {words_str}")
                    logger.info(f"主题 {topic_num}: {words_str}")
        else:
            logger.warning("未生成有效的主题，无法进行可视化。")
    except Exception as e:
        logger.error(f"可视化时发生错误: {e}")

    # 保存每个文档的主题分配结果
    logger.info("保存每个文档的主题分配结果...")
    document_topic_data = []

    for doc_idx, (topic, prob) in enumerate(zip(topics, probabilities)):
        document_name = os.path.basename(test_files[doc_idx])
        document_topic_data.append([document_name, topic, prob])

    df_document_topics = pd.DataFrame(document_topic_data, columns=["Document", "Assigned Topic", "Probability"])
    document_topics_csv_path = os.path.join(experiment_dir, 'bertopic_document_topic_distribution.csv')
    df_document_topics.to_csv(document_topics_csv_path, index=False)
    logger.info(f"每个文档的主题分配结果已保存至 {document_topics_csv_path}。")

else:
    logger.error("文档加载失败，无法进行 BERTopic 分析。")

2024-11-08 00:51:56,952 - INFO - 开始训练 BERTopic 模型...
2024-11-08 00:51:57,004 - INFO - Use pytorch device_name: mps
2024-11-08 00:51:57,005 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2024-11-08 00:52:05,304 - INFO - BERTopic 模型训练成功。
2024-11-08 00:52:05,336 - INFO - 生成了 12 个唯一主题。
2024-11-08 00:52:05,338 - INFO - 开始进行主题的简单可视化...
2024-11-08 00:52:05,348 - INFO - 主题 0: scribo, deus, homo, christus, pontifex, pater, doctrina, vita, sol, fides
2024-11-08 00:52:05,349 - INFO - 主题 1: deus, gratia, secundus, cum2, mare, corpus, dominus, anima, homo, christus
2024-11-08 00:52:05,350 - INFO - 主题 2: deus, filius, gaudium, spiritus, iesus, novitas, christus, homo, natura, dominus
2024-11-08 00:52:05,351 - INFO - 主题 3: deus, verbum, audio, mundus, cum2, homo, verbum deus, peccator, fides, accipio
2024-11-08 00:52:05,351 - INFO - 主题 4: deus, locus, via, mundus, tempus, jesus, pilo1, vox, manus, caelum
2024-11-08 00:52:05,351 - INFO - 主题 5: christus, deus, pate

主题 0: scribo, deus, homo, christus, pontifex, pater, doctrina, vita, sol, fides
主题 1: deus, gratia, secundus, cum2, mare, corpus, dominus, anima, homo, christus
主题 2: deus, filius, gaudium, spiritus, iesus, novitas, christus, homo, natura, dominus
主题 3: deus, verbum, audio, mundus, cum2, homo, verbum deus, peccator, fides, accipio
主题 4: deus, locus, via, mundus, tempus, jesus, pilo1, vox, manus, caelum
主题 5: christus, deus, pater, caritas, latitudo, thesaurus, homo, longitudo, fecunditas, cum2
主题 6: pater, mundus, filius, deus, peto, aeternus, deus pater, intellego, cognitio, venio
主题 7: mors, sanguis, deus, christus, veritas, vereor, oblatio, filius, offero, libero
主题 8: spiritus, anima, ferrum, pastor, caro, numerus, magnes1, moveo, carus1, ovo


In [69]:
# 假设您已经训练好了 BERTopic 模型
fig = topic_model.visualize_topics()
fig.show()


In [70]:
fig = topic_model.visualize_hierarchy()
fig.show()


In [71]:
import torch
import numpy as np
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity

logger.info("开始进行模型评估...")



2024-11-08 01:02:53,717 - INFO - 开始进行模型评估...


In [73]:
# Step 1: 获取每个主题的前 10 个关键词
logger.info("获取每个主题的前10个关键词...")
topics = []

# 遍历所有可能的主题编号，确保每个主题返回的不是 `bool`
for topic_num in range(len(topic_model.get_topics())):
    topic = topic_model.get_topic(topic_num)
    if topic:  # 检查 topic 是否有效
        topics.append(topic)

# 提取前 10 个关键词
topic_words = [[word for word, _ in topic[:10]] for topic in topics]


2024-11-08 01:03:49,473 - INFO - 获取每个主题的前10个关键词...


In [74]:
# Step 2: 定义获取嵌入的函数（使用您提供的 Latin-BERT 模型）
def get_word_embedding(word):
    tokens = tokenizer(word, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        embedding = bert_model(**tokens).last_hidden_state.mean(dim=1).squeeze().numpy()  # 取最后一层的平均表示
    return embedding



In [75]:
# Step 3: 计算 TC-EMBED (Topic Coherence - Embed)
logger.info("计算 TC-EMBED (Topic Coherence - Embed) 指标...")
tc_embed_scores = []
for topic in topic_words:
    embeddings = np.array([get_word_embedding(word) for word in topic])
    cos_sim_matrix = cosine_similarity(embeddings)
    upper_triangle = cos_sim_matrix[np.triu_indices_from(cos_sim_matrix, k=1)]  # 取余弦相似度矩阵的上三角部分
    tc_embed_scores.append(upper_triangle.mean())  # 计算均值

avg_tc_embed = np.mean(tc_embed_scores)
logger.info(f"平均 TC-EMBED 得分: {avg_tc_embed}")
print(f"平均 TC-EMBED 得分: {avg_tc_embed}")



2024-11-08 01:03:54,334 - INFO - 计算 TC-EMBED (Topic Coherence - Embed) 指标...
2024-11-08 01:03:59,324 - INFO - 平均 TC-EMBED 得分: 0.9823591709136963


平均 TC-EMBED 得分: 0.9823591709136963


In [76]:
# Step 4: 计算 Mean Pairwise Jaccard Similarity
logger.info("计算平均成对 Jaccard 相似度指标...")
def jaccard_similarity(set1, set2):
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union) if union else 0

jaccard_scores = []
for topic1, topic2 in combinations(topic_words, 2):
    jaccard_scores.append(jaccard_similarity(set(topic1), set(topic2)))

avg_jaccard = np.mean(jaccard_scores)
logger.info(f"平均成对 Jaccard 相似度: {avg_jaccard}")
print(f"平均成对 Jaccard 相似度: {avg_jaccard}")

# 保存评估结果
evaluation_results_path = os.path.join(experiment_dir, 'evaluation_results.txt')
with open(evaluation_results_path, 'w') as eval_file:
    eval_file.write(f"平均 TC-EMBED 得分: {avg_tc_embed}\n")
    eval_file.write(f"平均成对 Jaccard 相似度: {avg_jaccard}\n")
logger.info(f"评估结果已保存至 {evaluation_results_path}")


2024-11-08 01:04:03,003 - INFO - 计算平均成对 Jaccard 相似度指标...
2024-11-08 01:04:03,007 - INFO - 平均成对 Jaccard 相似度: 0.08991618976139097
2024-11-08 01:04:03,012 - INFO - 评估结果已保存至 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1731023515/evaluation_results.txt


平均成对 Jaccard 相似度: 0.08991618976139097
