# 导入

In [None]:
import numpy as np
from bertopic import BERTopic
from umap import UMAP
from transformers.pipelines import pipeline

# 加载数据

In [None]:
# Step 1: read the corpus file, one document per line.
# (Presumably word-segmented text, judging by the file name -- confirm.)
corpus_path = './切词_100.txt'
with open(corpus_path, mode='r', encoding='utf-8') as file:
    # Iterating the handle yields the same lines readlines() would,
    # trailing newlines included.
    docs = list(file)

# Quick sanity check: document count plus a preview of the first entry.
doc_count = len(docs)
print('文档条数: ', doc_count)
print('预览第一条: ', docs[0])

# 创建词向量模型

官方文档
To use a Hugging Face transformers model, load in a pipeline and point to any model found on their model hub (https://huggingface.co/models):

from transformers.pipelines import pipeline

embedding_model = pipeline("feature-extraction", model="distilbert-base-cased")
topic_model = BERTopic(embedding_model=embedding_model)

In [None]:
# transformers.pipelines provides a set of predefined pipelines, one per NLP
# task. Calling a pipeline passes the input text to the matching pretrained
# model and returns that model's output.
# Feature extraction means deriving meaningful features/representations from
# text; those features can feed downstream tasks such as text classification.
# BERTopic embedding backends:
#   https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html#sentence-transformers
# FeatureExtractionPipeline reference:
#   https://huggingface.co/docs/transformers/v4.39.3/en/main_classes/pipelines#transformers.FeatureExtractionPipeline
model_name = "hfl/chinese-bert-wwm"  # alternative tried earlier: "bert-base-chinese"
embedding_model = pipeline(task="feature-extraction", model=model_name)

# Bare expression so the notebook cell displays the pipeline object.
embedding_model

# 加载已保存的词向量（直接读取 .npy 文件，速度很快）

In [None]:
# Read the embedding array back from its .npy file.
embeddings_path = 'emb.npy'
embeddings = np.load(embeddings_path)

# Report the container type and array shape as a sanity check.
emb_type, emb_shape = type(embeddings), embeddings.shape
print(emb_type, emb_shape)

# 文本聚类 ⭐

In [None]:
# Build the topic model.
# UMAP is stochastic, so without a fixed seed the topics can differ from run
# to run (the original note here warned that results may not match the
# author's). Supplying an explicit UMAP with a pinned random_state makes the
# dimensionality-reduction step repeatable. The remaining arguments are meant
# to mirror BERTopic's documented default UMAP settings -- confirm against the
# installed BERTopic version.
# NOTE(review): umap-learn documents that fixing the seed trades some
# parallelism for reproducibility; acceptable for a corpus of this size.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    low_memory=False,
    random_state=42,
)
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    min_topic_size=10,  # important parameter (minimum size of a topic)
    verbose=True,
)

# Fit the model. The embeddings are supplied explicitly, and the return value
# is deliberately not kept: results are read back from the fitted model below.
topic_model.fit_transform(docs, embeddings=embeddings)

# Fetch the clustering result; the bare expression displays it in the notebook.
topic_info = topic_model.get_topic_info()
topic_info

# 可视化

In [None]:
# ⭐ UMAP visualisation
# Reduce the embeddings to two components so they can be drawn on a plane.
umap_2d = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
)
reduced_embeddings = umap_2d.fit_transform(embeddings)

# Plot the documents with the precomputed 2-D coordinates; kept as the final
# bare expression so the notebook renders the returned figure.
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)