In [1]:
import os
import time
import logging
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import silhouette_score
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from transformers import AutoTokenizer, AutoModel
import torch
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora
from tqdm import tqdm

In [2]:
# 检查 MPS 是否可用
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS backend on Apple Silicon.")
else:
    device = torch.device("cpu")
    print("MPS backend not available, using CPU instead.")

Using MPS backend on Apple Silicon.


In [3]:
# 基础路径设置
BASE_DIR = '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling'

# 生成唯一实验ID
experiment_id = f"latinbert_{time.strftime('%Y%m%d%H%M%S')}"
experiment_dir = os.path.join(BASE_DIR, 'experiments', 'latinbert', experiment_id)

# 创建实验目录
if not os.path.exists(experiment_dir):
    os.makedirs(experiment_dir)

# 获取日志文件路径
log_file_path = os.path.join(experiment_dir, f"{experiment_id}.log")

# 配置日志记录
logger = logging.getLogger()
if logger.hasHandlers():
    logger.handlers.clear()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
file_handler = logging.FileHandler(log_file_path)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

logger.info(f"启动实验 {experiment_id}，日志记录到 {log_file_path}")
print(f"启动实验 {experiment_id}，日志记录到 {log_file_path}")


2024-11-10 19:53:50,355 - INFO - 启动实验 latinbert_20241110195350，日志记录到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/latinbert/latinbert_20241110195350/latinbert_20241110195350.log


启动实验 latinbert_20241110195350，日志记录到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/latinbert/latinbert_20241110195350/latinbert_20241110195350.log


In [4]:
# 实验配置
experiment_config = {
    "parameters": {
        "n_gram_range": (1, 2),  # 捕捉更多的上下文短语
        "min_topic_size": 2,  # 增大最小主题大小以减少噪声
        "nr_topics": "auto",  # 自动确定主题数量
        "umap_params": {
            "n_neighbors": 10,  # 增加邻居数使得降维更平滑
            "min_dist": 0.1,  # 增大最小距离使主题更分离
            "n_components": 2,  # 增大维度以保留更多特征信息
            "random_state": 42  # 确保实验可重复
        }
    }
}

config_path = os.path.join(experiment_dir, 'config.json')
with open(config_path, 'w') as config_file:
    json.dump(experiment_config, config_file, indent=4)

logger.info(f"实验配置已保存到 {config_path}")
print(f"实验配置已保存到 {config_path}")


2024-11-10 19:53:50,360 - INFO - 实验配置已保存到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/latinbert/latinbert_20241110195350/config.json


实验配置已保存到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/latinbert/latinbert_20241110195350/config.json


In [5]:
# 加载本地 Latin BERT 模型
logger.info("加载 Latin-BERT 嵌入模型...")
model_path = "/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/latin-bert/models/latin_bert"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path).to(device)


2024-11-10 19:53:50,366 - INFO - 加载 Latin-BERT 嵌入模型...


In [6]:
# 定义嵌入生成函数
def get_latin_bert_embeddings(texts):
    tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=256).to(device)
    with torch.no_grad():
        embeddings = model(**tokens).last_hidden_state.mean(dim=1)
    return embeddings.cpu().numpy()


In [7]:
# 加载测试集文档
testset_dir = os.path.join(BASE_DIR, 'data/testset')
documents = []

logger.info("加载测试集数据...")
test_files = [f for f in os.listdir(testset_dir) if f.endswith('.txt')]
for test_file in tqdm(test_files, desc="Loading testset files"):
    file_path = os.path.join(testset_dir, test_file)
    with open(file_path, 'r', encoding='utf-8') as file:
        documents.append(file.read())

logger.info(f"加载了 {len(documents)} 个文档用于 BERTopic 实验。")
print(documents[:5])


2024-11-10 19:53:51,963 - INFO - 加载测试集数据...
Loading testset files: 100%|██████████| 61/61 [00:00<00:00, 1942.43it/s]
2024-11-10 19:53:52,093 - INFO - 加载了 61 个文档用于 BERTopic 实验。


['induimini dominus iesum christum romanus epistula dominicus quoad intellego aliqual iesus secundus adam filius deus andreas declaro aegeae lego his¬toria lombardica tertius folium considero epistula aposto¬lus hora somnus surgo probo apostolus dilectio com¬pletio lex prae¬cepto dilectio propior complicantur om¬nia quoad propior subjungo plenitudo lex dilectio scio hora somnus sur¬gere dilectio plenitudo lex scio surgen¬dum somnus vigilanter attenden¬dum propior salus credo expono tempus credo fides recipio salus mors christus conformabimur exspecto dico2 credo salus lex spiri¬tualiter intellego scien¬tes spiritual intellego dilectio¬nem propior salus unus mando1 dilectio adimpleo diligo nolo malus infero salus intel¬lectu diligo removeo difficilis potestas pauper dives nobilis ignobilis graecus barbaris diligo plenitu¬do lex deus adimplemus mandatum ser¬vamus vita ingredior salus magister veritas volo1 vita ingredior servo mandatum somnus sopor teneo dilectio plenitudo lex ignoro pec

In [8]:
# 嵌入生成
document_embeddings = get_latin_bert_embeddings(documents)


In [9]:
# 定义参数网格
param_grid = {
    'n_neighbors': [5, 10, 15],
    'min_dist': [0.0, 0.1, 0.5],
    'n_components': [2, 5, 10],
    'min_topic_size': [2, 5, 10],
    'nr_topics': ['auto', 10, 20]
}

# 创建参数组合
param_combinations = list(ParameterGrid(param_grid))

# 存储每次运行的结果
results = []


In [10]:
# 遍历每个参数组合
for params in param_combinations:
    logger.info(f"评估参数组合: {params}")
    
    # 创建 UMAP 和 BERTopic 模型
    custom_umap = UMAP(
        n_neighbors=params['n_neighbors'],
        min_dist=params['min_dist'],
        n_components=params['n_components'],
        random_state=42
    )
    
    topic_model = BERTopic(
        embedding_model=None,
        vectorizer_model=CountVectorizer(ngram_range=experiment_config["parameters"]["n_gram_range"]),
        umap_model=custom_umap,
        min_topic_size=params['min_topic_size'],
        nr_topics=params['nr_topics'],
        language=None
    )
    
    # 训练模型
    topics, probabilities = topic_model.fit_transform(documents, embeddings=document_embeddings)
    
    # 提取每个主题的前10个关键词
    bertopic_topics = []
    for topic_num in range(len(topic_model.get_topics())):
        topic = topic_model.get_topic(topic_num)
        if topic:  # 确保 topic 不是布尔值
            bertopic_topics.append([word for word, _ in topic])
    
    # 使用 gensim.corpora.Dictionary 创建词典
    texts = [doc.split() for doc in documents]
    dictionary = corpora.Dictionary(texts)
    
    # 计算主题一致性（NPMI）
    coherence_model_npmi = CoherenceModel(topics=bertopic_topics, texts=texts, dictionary=dictionary, coherence='c_npmi')
    coherence_score_npmi = coherence_model_npmi.get_coherence()
    
    # 计算主题多样性
    unique_words = set()
    total_words = 0
    for topic in bertopic_topics:
        unique_words.update(topic)
        total_words += len(topic)
    topic_diversity = len(unique_words) / total_words
    
    # 计算 WEPS 评分
    def get_average_embedding_for_topic(topic_words, tokenizer, model, device):
        tokens = tokenizer(topic_words, padding=True, truncation=True, return_tensors="pt", max_length=256).to(device)
        with torch.no_grad():
            embeddings = model(**tokens).last_hidden_state.mean(dim=1)
        return embeddings.cpu().numpy()

    topic_embeddings = []
    for topic in bertopic_topics:
        topic_embedding = get_average_embedding_for_topic(topic, tokenizer, model, device)
        topic_embeddings.append(topic_embedding)
    topic_embeddings = np.concatenate(topic_embeddings)
    similarity_matrix = cosine_similarity(topic_embeddings)
    weps = np.mean(similarity_matrix)
    
    silhouette_avg = silhouette_score(document_embeddings, topics)
    
    # 存储结果
    results.append({
        'params': params,
        'coherence_score_npmi': coherence_score_npmi,
        'topic_diversity': topic_diversity,
        'weps': weps,
        'silhouette_score': silhouette_avg
    })

# 找到最优的参数组合
best_result = max(results, key=lambda x: (x['coherence_score_npmi'], x['topic_diversity'], x['weps'], x['silhouette_score']))

# 输出最优结果
logger.info(f"最优参数组合: {best_result['params']}")
print(f"最优参数组合: {best_result['params']}")

# 保存最优参数
params_path = os.path.join(experiment_dir, 'best_params.json')
with open(params_path, 'w') as f:
    json.dump(best_result['params'], f, indent=4)
logger.info(f"最优参数已保存至 {params_path}")


2024-11-10 19:53:56,506 - INFO - 评估参数组合: {'min_dist': 0.0, 'min_topic_size': 2, 'n_components': 2, 'n_neighbors': 5, 'nr_topics': 'auto'}
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-11-10 19:54:01,581 - INFO - adding document #0 to Dictionary<0 unique tokens: []>
2024-11-10 19:54:01,604 - INFO - built Dictionary<9286 unique tokens: ['abicio', 'abscido', 'achaia', 'actus', 'adae']...> from 61 documents (total 52171 corpus positions)
2024-11-10 19:54:01,604 - INFO - Dictionary lifecycle event {'msg': "built Dictionary<9286 unique tokens: ['abicio', 'abscido', 'achaia', 'actus', 'adae']...> from 61 documents (total 52171 corpus positions)", 'datetime': '2024-11-10T19:54:01.604500', 'gensim': '4.3.3', 'python': '3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:35:25) [Clang 16.0.6 ]', 'platform': 'macOS-15.1-arm64-arm-64bit', 'event': 'created'}
2024-11-10 19:54:01,606 - INFO - using ParallelWordOccurrenceAccumulator<proce

IndexError: list index out of range

In [None]:
# 保存评估结果和主题词
results_file_path = os.path.join(experiment_dir, 'bertopic_results.txt')
with open(results_file_path, 'w', encoding='utf-8') as f:
    # 保存主题词
    f.write("BERTopic 生成的主题：\n")
    for idx, topic in enumerate(bertopic_topics):
        topic_str = f"Topic {idx}: {', '.join(topic)}"
        f.write(topic_str + '\n')
    f.write("\n")
    
    # 保存评估结果
    f.write(f"平均主题一致性 (NPMI): {best_result['coherence_score_npmi']}\n")
    f.write(f"主题多样性: {best_result['topic_diversity']}\n")
    f.write(f"WEPS: {best_result['weps']}\n")
    f.write(f"轮廓系数: {best_result['silhouette_score']}\n")

logger.info(f"生成的主题和评估结果已保存至 {results_file_path}")
print(f"生成的主题和评估结果已保存至 {results_file_path}")