# 导入

In [None]:
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP

# 加载数据

In [None]:
# Step 1: read the corpus file. Every line of the file becomes one document,
# with its line ending kept exactly as stored.
# NOTE(review): the file name suggests the text is already word-segmented — confirm.
with open('./切词_100.txt', mode='r', encoding='utf-8') as corpus:
    docs = list(corpus)

# Quick sanity check: how many documents were read, and what the first looks like.
doc_count = len(docs)
print('文档条数: ', doc_count)
first_doc = docs[0]
print('预览第一条: ', first_doc)

# 创建句向量模型（SentenceTransformer）

In [None]:
# Instantiate the SentenceTransformer checkpoint that is later handed to
# BERTopic as its embedding_model.
embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# 加载预先计算好的文档向量（直接读取 .npy 文件，速度很快）

In [None]:
# Load the embedding array stored on disk.
# NOTE(review): presumably row i is the embedding of docs[i] — confirm against
# whatever produced emb.npy.
embeddings = np.load('./emb.npy')

# Show the container type and the array shape as a sanity check.
emb_type, emb_shape = type(embeddings), embeddings.shape
print(emb_type, emb_shape)

# 文本聚类 ⭐

In [None]:
# Build the topic model.
# UMAP is stochastic, so without a seed every run can yield different topics.
# An explicit UMAP with a fixed random_state makes repeated runs on the same
# embeddings repeatable (results may still differ across library versions).
# NOTE(review): the remaining UMAP arguments are meant to mirror BERTopic's
# built-in defaults — confirm against the installed BERTopic version.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    min_topic_size=10,  # important parameter
    verbose=True,
)

# Fit the model on the documents, passing the already-loaded embeddings.
# The per-document topic assignments and probabilities are kept for later use.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

# Fetch the topic summary; it is left as the final bare expression so the
# notebook renders it as the cell output.
topic_info = topic_model.get_topic_info()
topic_info

# 可视化

In [None]:
# ⭐ UMAP visualization: reduce the embeddings to two components, then plot the
# documents on that projection. The figure call stays last so the notebook
# displays it.
projector = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine')
reduced_embeddings = projector.fit_transform(embeddings)
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)