In [1]:
import os
import time
import logging
import pickle
import tqdm
import json
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np

2024-11-08 22:04:09.024 python[52201:405582] getMetalPluginClassForService: Failed to find bundle for accelerator bundle named: AGXMetalA12 errno: 0


In [2]:
BASE_DIR = '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling'

In [3]:
os.environ["PYTORCH_MPS_SUPPORT"] = "0"


In [4]:
# 设置 CPU 设备
device = torch.device("cpu")

In [5]:
# 生成唯一实验ID
experiment_id = f"bertopic_experiment_{int(time.time())}"
experiment_dir = os.path.join(BASE_DIR, 'experiments', experiment_id)

# 创建实验目录
if not os.path.exists(experiment_dir):
    os.makedirs(experiment_dir)

# 获取日志文件路径
log_file_path = os.path.join(experiment_dir, f"bertopic_experiment_{experiment_id}.log")

# 配置日志记录
logger = logging.getLogger()
if logger.hasHandlers():
    logger.handlers.clear()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
file_handler = logging.FileHandler(log_file_path)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

logger.info(f"启动实验 {experiment_id}，日志记录到 {log_file_path}")
print(f"启动实验 {experiment_id}，日志记录到 {log_file_path}")

2024-11-08 22:04:16,629 - INFO - 启动实验 bertopic_experiment_1731099856，日志记录到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1731099856/bertopic_experiment_bertopic_experiment_1731099856.log


启动实验 bertopic_experiment_1731099856，日志记录到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1731099856/bertopic_experiment_bertopic_experiment_1731099856.log


In [6]:
# 实验配置
experiment_config = {
    "parameters": {
        "n_gram_range": (1, 2),  # 捕捉更多的上下文短语
        "min_topic_size": 2,  # 增大最小主题大小以减少噪声
        "nr_topics": "auto",  # 自动确定主题数量
        "umap_params": {
            "n_neighbors": 10,  # 增加邻居数使得降维更平滑
            "min_dist": 0.1,  # 增大最小距离使主题更分离
            "n_components": 2,  # 增大维度以保留更多特征信息
            "random_state": 42  # 确保实验可重复
        }
    }
}

config_path = os.path.join(experiment_dir, 'config.json')
with open(config_path, 'w') as config_file:
    json.dump(experiment_config, config_file, indent=4)

logger.info(f"实验配置已保存到 {config_path}")
print(f"实验配置已保存到 {config_path}")

2024-11-08 22:04:19,164 - INFO - 实验配置已保存到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1731099856/config.json


实验配置已保存到 /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/experiments/bertopic_experiment_1731099856/config.json


In [7]:
# 加载 mBERT 模型
logger.info("加载 mBERT 嵌入模型...")
model_name = "bert-base-multilingual-cased"  # 使用 mBERT 的模型名称
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

# 设置设备为 CPU
device = torch.device("cpu")
bert_model.to(device)

2024-11-08 22:04:21,743 - INFO - 加载 mBERT 嵌入模型...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [8]:
def get_embeddings_batch(texts, batch_size=4):
    all_embeddings = []
    print("Generating embeddings with progress bar:")
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding batches"):
        batch_texts = texts[i:i+batch_size]
        tokens = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=256).to(device)
        with torch.no_grad():
            embeddings = bert_model(**tokens).last_hidden_state.mean(dim=1).to("cpu")
        all_embeddings.append(embeddings)
    
    # 将嵌入从张量转换为 numpy 数组
    all_embeddings = torch.cat(all_embeddings, dim=0)
    return all_embeddings.numpy()

In [9]:
# 创建 CountVectorizer 和 UMAP
vectorizer_model = CountVectorizer(
    ngram_range=experiment_config["parameters"]["n_gram_range"],
    token_pattern=r"(?u)\b\w+\b"
)
custom_umap = UMAP(
    n_neighbors=experiment_config["parameters"]["umap_params"]["n_neighbors"],
    min_dist=experiment_config["parameters"]["umap_params"]["min_dist"],
    n_components=experiment_config["parameters"]["umap_params"]["n_components"],
    random_state=experiment_config["parameters"]["umap_params"]["random_state"]
)

In [10]:
from tqdm import tqdm

sample_text = ["Gallia est omnis divisa in partes tres."]
embeddings = get_embeddings_batch(sample_text)
print(embeddings)

Generating embeddings with progress bar:


Embedding batches:   0%|          | 0/1 [00:00<?, ?it/s]



Embedding batches: 100%|██████████| 1/1 [00:03<00:00,  3.20s/it]

[[-0.470485   -0.44551334  0.87620616  0.2539409   0.04651868 -0.1859973
  -0.31797978 -0.91170734  0.1361594   1.4825076   0.6874243  -0.2873758
   0.40294683  0.08532739 -0.27130303  0.00572636 -0.28052163  0.2920426
   0.08689792  0.3988412  -0.03843997 -0.30036142 -0.5438476   0.21518339
   0.04183191 -0.33924854 -0.15437032 -0.47068164  0.3841212  -0.60857266
   0.19729309  0.84971136  0.04278448 -0.21175192  0.06215887  0.2147707
  -0.05952524  0.41915867  0.56576985  0.35840595 -0.25020942  0.7506829
  -0.83813167 -0.26705465 -0.08727431 -0.09836506 -0.38725692  0.20073754
  -0.19582477 -0.25368464 -0.5044367  -0.10926254 -0.38802317 -0.08912248
  -0.10833127 -0.17533143  0.02227819 -0.31886753  0.09597922  0.2834816
   0.13880643 -0.23092322 -0.50962543  0.09932714 -0.23265117 -0.12123919
  -0.38961098  0.19686666 -0.79366225 -0.20112753  0.85688746  0.04029782
   0.49419764 -0.08749607 -0.16760086  0.17439248  0.66768587 -0.20600595
  -0.6811675   0.10808054 -0.05225864  0.391




In [11]:
# 加载测试集文档
testset_dir = os.path.join(BASE_DIR, 'data/berttest')
documents = []

logger.info("加载测试集数据...")
test_files = [f for f in os.listdir(testset_dir) if f.endswith('.txt')]
for test_file in tqdm(test_files, desc="Loading testset files"):
    file_path = os.path.join(testset_dir, test_file)
    with open(file_path, 'r', encoding='utf-8') as file:
        documents.append(file.read())

logger.info(f"加载了 {len(documents)} 个文档用于 BERTopic 实验。")
print(documents[:5])


2024-11-08 22:06:01,613 - INFO - 加载测试集数据...
Loading testset files: 100%|██████████| 5/5 [00:00<00:00, 2196.66it/s]
2024-11-08 22:06:01,623 - INFO - 加载了 5 个文档用于 BERTopic 实验。


['commemoratio lucae clementinis dominus sanctus reliquiae veneratio sanc¬tor vir honorabilis christus dilectus fra¬tres insto devotio tempus tempus piger cor redeo debeo salus cor inflammo cari¬tas ardor sedulus meditatio pensaris excusatio sperno eloquen¬tiae fuco scientia vito ineptissi¬mus sperno allegatio simplex quoad conor cunctipotentis gratia mens via salutaris meditatio incitaris meditatio verbum exprimo instituo inardescat ignis spirital affectio qui1 anima sacro passio tempus divinus amor gaudium vitalis incalesco eapropter dilectus utor doctrina urbanus reliquiae veneratio sancio dominus exordior intentio laus sacrosanctus sacramentum novus lex utor exordior commemoratio epilogus tres tres tempus magnus ecclesiastici vir superus studium meditaris debeo memoria acerbus passio salvator salutifer eucharistiae sacramentum dies pius testamentum legatus necnon tertius sacramentum confectio cum2 quattuor effectus materia forma per¬ceptio praeho verbum christus salvator ostendo th

In [None]:
# 初始化 BERTopic 模型，不设置 embedding_model
topic_model = BERTopic(
    embedding_model=None,  # 不使用默认的嵌入模型
    vectorizer_model=vectorizer_model,
    umap_model=custom_umap,
    min_topic_size=experiment_config["parameters"]["min_topic_size"],
    nr_topics=experiment_config["parameters"]["nr_topics"],
    language=None  # 禁用语言特定的嵌入
)

# 手动生成嵌入
print("Generating embeddings for documents...")
custom_embeddings = get_embeddings_batch(documents)

# 使用手动生成的嵌入进行主题模型训练
if documents:
    print("Starting BERTopic model training...")
    logger.info("开始训练 BERTopic 模型...")
    
    # 将生成的 numpy 格式的自定义嵌入传入 fit_transform
    topics, probabilities = topic_model.fit_transform(documents, embeddings=custom_embeddings)
    logger.info("BERTopic 模型训练成功。")
    print("BERTopic model training completed.")
    
    # 可视化和保存结果
    topics_info = topic_model.get_topic_info()
    for topic_num in topics_info['Topic'][:10]:  # 输出前 10 个主题
        if topic_num != -1:
            words_weights = topic_model.get_topic(topic_num)
            words_str = ', '.join([word for word, _ in words_weights])
            print(f"主题 {topic_num}: {words_str}")
            logger.info(f"主题 {topic_num}: {words_str}")

    # 保存文档的主题分配结果
    document_topic_data = []
    for doc_idx, (topic, prob) in enumerate(zip(topics, probabilities)):
        document_name = os.path.basename(test_files[doc_idx])
        document_topic_data.append([document_name, topic, prob])

    df_document_topics = pd.DataFrame(document_topic_data, columns=["Document", "Assigned Topic", "Probability"])
    document_topics_csv_path = os.path.join(experiment_dir, 'bertopic_document_topic_distribution.csv')
    df_document_topics.to_csv(document_topics_csv_path, index=False)
    logger.info(f"每个文档的主题分配结果已保存至 {document_topics_csv_path}。")


Generating embeddings for documents...
Generating embeddings with progress bar:


Embedding batches: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]
2024-11-08 22:06:06,248 - INFO - 开始训练 BERTopic 模型...
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Starting BERTopic model training...


In [None]:
# 保存可视化图表为 HTML 文件
try:
    topics_fig_path = os.path.join(experiment_dir, 'bertopic_topics.html')
    fig = topic_model.visualize_topics()
    fig.write_html(topics_fig_path)
    logger.info(f"主题可视化图表已保存至 {topics_fig_path}")

    # 生成并保存层次聚类图表
    fig_hierarchy = topic_model.visualize_hierarchy()
    fig_hierarchy.show()
    hierarchy_fig_path = os.path.join(experiment_dir, 'bertopic_hierarchy.html')
    fig_hierarchy.write_html(hierarchy_fig_path)
    logger.info(f"层次聚类图表已保存至 {hierarchy_fig_path}")
except Exception as e:
    logger.error(f"保存可视化图表时发生错误: {e}")
    raise