In [None]:
import os
import time
import logging
import pickle
import tqdm
import json
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd

In [82]:
# 设置工作目录
BASE_DIR = '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling'

In [83]:
# 生成唯一实验ID
experiment_id = f"bertopic_experiment_{int(time.time())}"
experiment_dir = os.path.join(BASE_DIR, 'experiments', experiment_id)

# 创建实验目录
if not os.path.exists(experiment_dir):
    os.makedirs(experiment_dir)

# 获取当前 experiment_id 的日志文件路径
log_file_path = os.path.join(experiment_dir, f"bertopic_experiment_{experiment_id}.log")

# 配置日志记录，使每个实验的日志记录到独立的文件中
logger = logging.getLogger()
if logger.hasHandlers():
    logger.handlers.clear()  # 清除现有的处理器，避免重复添加

logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

# 添加文件日志处理器
file_handler = logging.FileHandler(log_file_path)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# 添加控制台日志处理器
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

# 确保在配置 logging 后马上打印一条信息，便于确认新的日志文件被创建
logger.info(f"启动实验 {experiment_id}，日志记录到 {log_file_path}")
print(f"启动实验 {experiment_id}，日志记录到 {log_file_path}")

2024-11-07 16:09:28,031 - INFO - 启动实验 bertopic_experiment_1730992167，日志记录到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1730992167/bertopic_experiment_bertopic_experiment_1730992167.log


启动实验 bertopic_experiment_1730992167，日志记录到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1730992167/bertopic_experiment_bertopic_experiment_1730992167.log


In [84]:
# 实验配置
experiment_config = {
    "parameters": {
        "n_gram_range": (1, 2),  # 捕捉更多的上下文短语
        "min_topic_size": 2,  # 增大最小主题大小以减少噪声
        "nr_topics": "auto",  # 自动确定主题数量
        "umap_params": {
            "n_neighbors": 10,  # 增加邻居数使得降维更平滑
            "min_dist": 0.1,  # 增大最小距离使主题更分离
            "n_components": 5,  # 增大维度以保留更多特征信息
            "random_state": 42  # 确保实验可重复
        }
    }
}

config_path = os.path.join(experiment_dir, 'config.json')
with open(config_path, 'w') as config_file:
    json.dump(experiment_config, config_file, indent=4)

logger.info(f"实验配置已保存到 {config_path}")
print(f"实验配置已保存到 {config_path}")

2024-11-07 16:09:28,050 - INFO - 实验配置已保存到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1730992167/config.json


实验配置已保存到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1730992167/config.json


In [85]:
# 加载预训练的 Latin BERT 模型
logger.info("加载 Latin-BERT 嵌入模型...")
model_path = "/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/latin-bert/models/latin_bert"
tokenizer = AutoTokenizer.from_pretrained(model_path)
bert_model = AutoModel.from_pretrained(model_path)

def get_embeddings(texts):
    """
    使用 Latin-BERT 模型为文本生成嵌入。
    """
    tokens = tokenizer(
        texts, padding=True, truncation=True, return_tensors="pt", max_length=512
    )
    with torch.no_grad():
        embeddings = bert_model(**tokens).last_hidden_state.mean(dim=1)  # 取最后一层的平均表示
    return embeddings

2024-11-07 16:09:28,063 - INFO - 加载 Latin-BERT 嵌入模型...


In [86]:
# 创建 CountVectorizer 和 UMAP
vectorizer_model = CountVectorizer(
    ngram_range=experiment_config["parameters"]["n_gram_range"],  # 可调整的 n-gram 范围
    token_pattern=r"(?u)\b\w+\b"
)
custom_umap = UMAP(
    n_neighbors=experiment_config["parameters"]["umap_params"]["n_neighbors"],
    min_dist=experiment_config["parameters"]["umap_params"]["min_dist"],
    n_components=experiment_config["parameters"]["umap_params"]["n_components"],
    random_state=experiment_config["parameters"]["umap_params"]["random_state"]
)


In [87]:
# 检查嵌入生成
sample_text = ["Gallia est omnis divisa in partes tres."]
embeddings = get_embeddings(sample_text)
print(embeddings)

tensor([[ 5.0366e-01, -8.2940e-02,  6.8993e-01, -6.4628e-01, -5.4111e-01,
          1.3897e-01,  3.0640e-01, -3.8169e-01, -6.9933e-02, -2.7038e-01,
          3.1233e-01,  8.5968e-01,  2.7322e-02, -2.3399e-01, -5.9250e-01,
         -1.1730e-01, -5.7347e-01, -1.2630e-01, -2.6752e-01,  1.5595e+00,
          2.0561e-01,  8.4108e-02,  1.3122e-01, -3.5626e-01, -1.1722e-01,
         -2.0510e-01, -5.6573e-01,  7.0868e-02,  9.3588e-01, -1.3381e-01,
          4.3802e-01, -4.6198e-01, -5.5544e-01, -4.5095e-02,  2.5489e-01,
          1.0352e-02, -2.6475e-01, -3.1353e-01, -3.2140e-01,  2.5392e-01,
          4.2841e-02, -5.3262e-01, -4.7118e-01,  2.0216e-02,  1.9804e-01,
          1.7835e-01,  5.8115e-01, -1.4839e-02, -1.1122e-01,  1.4460e-01,
         -3.0590e-01, -6.8662e-02, -4.8024e-01, -1.7965e-02,  6.8085e-01,
         -5.7150e-02,  2.8283e-01,  4.7284e-01, -4.3270e-01,  4.1282e-01,
         -3.8502e-01, -3.2037e-01, -1.3186e-01, -1.9784e-01, -3.6641e-01,
         -2.9638e-01,  4.5709e-01, -4.

In [88]:
# 加载测试集文档
testset_dir = os.path.join(BASE_DIR, 'data/testset')
documents = []

logger.info("加载测试集数据...")
test_files = [f for f in os.listdir(testset_dir) if f.endswith('.txt')]
for test_file in tqdm.tqdm(test_files, desc="Loading testset files"):
    file_path = os.path.join(testset_dir, test_file)
    with open(file_path, 'r', encoding='utf-8') as file:
        documents.append(file.read())

logger.info(f"加载了 {len(documents)} 个文档用于 BERTopic 实验。")

# 打印一些文档内容进行检查
print(documents[:5])

# 检查文档列表是否为空
if not documents:
    logger.error("文档列表为空，无法进行 BERTopic 分析。")
else:
    logger.info(f"文档列表包含 {len(documents)} 个文档。")

2024-11-07 16:09:30,922 - INFO - 加载测试集数据...
Loading testset files: 100%|██████████| 61/61 [00:00<00:00, 2220.85it/s]
2024-11-07 16:09:30,992 - INFO - 加载了 61 个文档用于 BERTopic 实验。
2024-11-07 16:09:30,998 - INFO - 文档列表包含 61 个文档。


['thesaurus cor evangelium locus iustus iustus iustitia albus albus al¬bedo christianus christianus christus christus maneo volo necessarius induamus induo maneo comessatio ebrietas cubilis im¬puditiis ambulo ambulo volo1 iustus studeo induo iustitia vo¬lens calidus induo caliditas thesaurus volo calidus certus ignis calidum reperio om¬nia participatio ignis calidum plenitudo ignis thesaurus recipio calidum caliditas christus plenitudo gratia plenitudo gratia ad-sequor recipio calor1 fomentum vi¬tae sensibilis calidus umidus consisto gratia pasco spiritus gratia ra¬dius sol justitia christus deus vivo amo amo gratia amo christianus verus christianus vivo gratia chri¬sti studium quaero assimilaris diligo cupio thesaurus verus christiani christus cor locus reperio christus verus minister christus minister locus sequor minister se¬quitur christum mors cum2 christus resurrectio incoho ieiunium quadragesimal ecclesia institus spiritus christus moveo christus apostolus successo¬ribus rector 

In [89]:
# 创建 BERTopic 模型
logger.info("初始化 BERTopic 模型...")
topic_model = BERTopic(
    vectorizer_model=CountVectorizer(ngram_range=experiment_config["parameters"]["n_gram_range"]),
    embedding_model=get_embeddings,
    umap_model=UMAP(**experiment_config["parameters"]["umap_params"]),
    min_topic_size=experiment_config["parameters"]["min_topic_size"],
    nr_topics=experiment_config["parameters"]["nr_topics"],
    language="multilingual"
)

2024-11-07 16:09:31,027 - INFO - 初始化 BERTopic 模型...


In [90]:
# 训练 BERTopic 模型
if documents:
    logger.info("开始训练 BERTopic 模型...")
    topics, probabilities = topic_model.fit_transform(documents)
    logger.info("BERTopic 模型训练成功。")

    # 检查生成的主题数量
    unique_topics = set(topics)
    logger.info(f"生成了 {len(unique_topics)} 个唯一主题。")

    # 可视化主题，直接显示生成的主题和前10个主题词
    logger.info("开始进行主题的简单可视化...")
    try:
        topics_info = topic_model.get_topic_info()
        if len(topics_info) > 0:
            for topic_num in topics_info['Topic'][:10]:
                if topic_num != -1:  # 排除噪声主题
                    words_weights = topic_model.get_topic(topic_num)
                    words_str = ', '.join([word for word, _ in words_weights])
                    print(f"主题 {topic_num}: {words_str}")
                    logger.info(f"主题 {topic_num}: {words_str}")
        else:
            logger.warning("未生成有效的主题，无法进行可视化。")
    except Exception as e:
        logger.error(f"可视化时发生错误: {e}")

    # 保存每个文档的主题分配结果
    logger.info("保存每个文档的主题分配结果...")
    document_topic_data = []

    for doc_idx, (topic, prob) in enumerate(zip(topics, probabilities)):
        document_name = os.path.basename(test_files[doc_idx])
        document_topic_data.append([document_name, topic, prob])

    df_document_topics = pd.DataFrame(document_topic_data, columns=["Document", "Assigned Topic", "Probability"])
    document_topics_csv_path = os.path.join(experiment_dir, 'bertopic_document_topic_distribution.csv')
    df_document_topics.to_csv(document_topics_csv_path, index=False)
    logger.info(f"每个文档的主题分配结果已保存至 {document_topics_csv_path}。")

else:
    logger.error("文档加载失败，无法进行 BERTopic 分析。")

2024-11-07 16:09:31,082 - INFO - 开始训练 BERTopic 模型...
2024-11-07 16:09:31,169 - INFO - Use pytorch device_name: mps
2024-11-07 16:09:31,170 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2




2024-11-07 16:09:40,305 - INFO - BERTopic 模型训练成功。
2024-11-07 16:09:40,317 - INFO - 生成了 8 个唯一主题。
2024-11-07 16:09:40,317 - INFO - 开始进行主题的简单可视化...
2024-11-07 16:09:40,335 - INFO - 主题 0: deus, cum2, sapientia, mundus, spiritus, verbum, magister, misericordia, christus, homo
2024-11-07 16:09:40,335 - INFO - 主题 1: christus, deus, homo, rex, veritas, mors, filius, vereor, mundus, verbum
2024-11-07 16:09:40,336 - INFO - 主题 2: anima, vito, corpus, christus, deus, cibus, vita, spiritus, resurrectio, caro
2024-11-07 16:09:40,337 - INFO - 主题 3: fides, credo, deus, spiritus, intellego, domus, virtus, unus, filius, christus
2024-11-07 16:09:40,337 - INFO - 主题 4: verbum, deus, anima, cum2, spiritus, homo, christus, corpus, carus1, numerus
2024-11-07 16:09:40,338 - INFO - 主题 5: spiritus, deus, vita, sancio, pater, spiritus sancio, christus, filius, fructus, venio
2024-11-07 16:09:40,338 - INFO - 主题 6: christus, spiritus, thesaurus, jejuno, substantia, apostolus, mundus, pater, deus, sacramentum
2024-

主题 0: deus, cum2, sapientia, mundus, spiritus, verbum, magister, misericordia, christus, homo
主题 1: christus, deus, homo, rex, veritas, mors, filius, vereor, mundus, verbum
主题 2: anima, vito, corpus, christus, deus, cibus, vita, spiritus, resurrectio, caro
主题 3: fides, credo, deus, spiritus, intellego, domus, virtus, unus, filius, christus
主题 4: verbum, deus, anima, cum2, spiritus, homo, christus, corpus, carus1, numerus
主题 5: spiritus, deus, vita, sancio, pater, spiritus sancio, christus, filius, fructus, venio
主题 6: christus, spiritus, thesaurus, jejuno, substantia, apostolus, mundus, pater, deus, sacramentum
