In [None]:
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from transformers.pipelines import pipeline
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# 加载数据

In [None]:
# Step 1: read the pre-segmented corpus; every line of the file is one document.
# Iterating the file handle yields the same lines (newline included) as readlines().
with open('../../data/切词.txt', mode='r', encoding='utf-8') as corpus_file:
  docs = list(corpus_file)

# Quick sanity check: document count and a preview of the first document.
print('条数: ', len(docs))
print('预览第一条: ', docs[0])

vectorizer_model = None

# 创建各子模型（词向量、降维、聚类、词频统计）

In [None]:
# Build the sub-models that are later passed to BERTopic.

# 1. Embedding model, plus the pre-computed document embeddings loaded from disk.
#    The original notes label both as bert-base-chinese.
embedding_model = pipeline("feature-extraction", model="bert-base-chinese")
embeddings = np.load('../../data/embedding_bbc.npy')
print(embeddings.shape)

# 2. UMAP dimensionality-reduction model.
umap_model = UMAP(
  n_neighbors=15,
  n_components=5,
  min_dist=0.0,
  metric='cosine',
  random_state=42  # fixed seed to avoid run-to-run randomness, see https://maartengr.github.io/BERTopic/faq.html
)

# 3. HDBSCAN clustering model.
# To reduce the number of outliers, the two parameters below can be lowered:
# https://hdbscan.readthedocs.io/en/latest/faq.html
hdbscan_model = HDBSCAN(
  min_cluster_size=20,
  min_samples=5,
  prediction_data=True  # must be set when calculate_probabilities=True is used later, see https://github.com/MaartenGr/BERTopic/issues/1103
)

# 4. CountVectorizer model.
# The corpus is already word-segmented, so no custom tokenizer is passed.
# It is created exactly once here; an earlier default-configured CountVectorizer()
# was immediately overwritten by this one and has been removed as dead code.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of two
# or more word characters, so single-character words are dropped — confirm this
# is intended for this corpus.
vectorizer_model = CountVectorizer(stop_words=['洛阳', '旅游', '文化'])

In [None]:
# Assemble the BERTopic model from the embedding, vectorizer, UMAP and HDBSCAN
# sub-models.
#
# About calculate_probabilities
# (https://maartengr.github.io/BERTopic/faq.html#how-do-i-calculate-the-probabilities-of-all-topics-in-a-document):
#   * Official docs: the parameter is only used with HDBSCAN or cuML's HDBSCAN;
#     it will not work with any other clustering model.
#   * Source docstring: calculate the probabilities of all topics per document
#     instead of the probability of the assigned topic per document. This could
#     slow down topic extraction with many documents (> 100_000). If False,
#     `visualize_probabilities` cannot be used. The values are an approximation
#     of topic probabilities as used in HDBSCAN, not an exact representation.
#   * In short: the default is False (only the probability of the assigned topic
#     is computed); True computes the probability of every topic for every
#     document, at a possible cost in speed.
topic_model = BERTopic(
  calculate_probabilities=True,
  hdbscan_model=hdbscan_model,
  umap_model=umap_model,
  vectorizer_model=vectorizer_model,
  embedding_model=embedding_model,
)

# fit_transform returns (per the source docstring):
#   predictions   - topic prediction for each document
#   probabilities - probability of the assigned topic per document; with
#                   calculate_probabilities=True it holds the probabilities of
#                   all topics across all documents instead, which slows down
#                   computation and may increase memory usage.
# The pre-computed embeddings are passed in alongside the documents.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

# Summary table of the discovered topics; the bare last expression displays it.
topic_info = topic_model.get_topic_info()
topic_info

# 先看一下 topic_model.fit_transform 输出的是什么

In [None]:
# Show the length and the first ten entries of each fit_transform output.
for fit_output in (topics, probs):
  print(len(fit_output), fit_output[:10])

In [None]:
# Display the shape of the probabilities returned by fit_transform.
# NOTE(review): with calculate_probabilities=True this is presumably
# (n_documents, n_topics) — confirm against the displayed output.
probs.shape