In [12]:
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from transformers.pipelines import pipeline
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# 加载数据

In [13]:
# step1 加载文件
with open('../../data/切词.txt', 'r', encoding='utf-8') as file:
  docs = file.readlines()
print('条数: ', len(docs))
print('预览第一条: ', docs[0])

vectorizer_model = None

条数:  2714
预览第一条:  This article shows how symbolic computing and the mathematical formalism induced by maximizing entropy and minimizing the mean deviation from statistical equilibrium may be effectively applied to obtaining probabilistic models for the structure of atoms, using trial wave functions compatible with an average shell picture of the atom. The objective is not only to recover the experimental value of the ground state mean energy of the atom, but rather to better approximate the unknown parameters of these trial



# 创建

In [14]:
# 1. 词向量模型，同时加载本地训练好的词向量
embedding_model = pipeline("feature-extraction", model="bert-base-chinese") # 使用bert-base-chinese
embeddings = np.load('../../data/emb.npy') # 使用向量
print(embeddings.shape)

# 2. 创建分词模型
vectorizer_model = CountVectorizer() # 因为我们已经分好词了，所以这里不需要传入分词函数了

# 3. 创建UMAP降维模型
umap_model = UMAP(
  n_neighbors=15,
  n_components=5,
  min_dist=0.0,
  metric='cosine',
  random_state=42  # ⚠️ 防止随机 https://maartengr.github.io/BERTopic/faq.html
)

# 4. 创建HDBSCAN聚类模型
# 如果要建设离群值，可以减小下面两个参数
# https://hdbscan.readthedocs.io/en/latest/faq.html
hdbscan_model = HDBSCAN(
  min_cluster_size=20,
  min_samples=20,
)

# 5. 创建CountVectorizer模型
vectorizer_model = CountVectorizer()

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(2714, 384)


In [15]:
topic_model = BERTopic(
  embedding_model=embedding_model,
  vectorizer_model=vectorizer_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
)

topics, probs = topic_model.fit_transform(docs, embeddings=embeddings) #传入训练好的词向量
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1236,-1_the_of_and_in,"[the, of, and, in, to, we, quantum, is, for, w...",[In the surface acoustic wave quantum computer...
1,0,222,0_states_of_entanglement_the,"[states, of, entanglement, the, we, for, that,...",[We introduce classification of mixed three-qu...
2,1,154,1_quantum_of_the_and,"[quantum, of, the, and, to, is, for, in, this,...",[In quantum algorithms discovered so far for s...
3,2,133,2_the_of_and_in,"[the, of, and, in, laser, is, to, with, by, are]",[Yb ions were trapped in quadrupole ion trap. ...
4,3,102,3_topological_majorana_the_in,"[topological, majorana, the, in, superconducto...",[3D topological insulators attract much attent...
5,4,93,4_the_spin_of_model,"[the, spin, of, model, we, and, to, is, in, qu...",[We consider two distant spin-1 particles (or ...
6,5,91,5_photon_optical_photons_quantum,"[photon, optical, photons, quantum, and, of, t...",[Quantum light sources play vital role in vari...
7,6,88,6_spin_silicon_the_in,"[spin, silicon, the, in, donor, si, and, of, e...",[The silicon vacancy (V Si in 3C-SiC is studie...
8,7,82,7_superconducting_qubit_the_to,"[superconducting, qubit, the, to, flux, of, qu...",[We introduce new type of superconducting char...
9,8,80,8_stars_the_of_ray,"[stars, the, of, ray, to, star, in, and, from,...",[We propose stellar locus outlier (SLOT) metho...


# 可视化

In [16]:
topic_model.visualize_barchart()

In [17]:

# Arguments:
#     topics: A selection of topics to visualize.
#     top_n_topics: Only select the top n most frequent topics.
#     n_words: Number of words to show in a topic
#     custom_labels: Whether to use custom topic labels that were defined using
#                `topic_model.set_topic_labels`.
#     title: Title of the plot.
#     width: The width of each figure.
#     height: The height of each figure.
topic_model.visualize_barchart(top_n_topics=4, n_words=3)#显示几个主题，显示多少个主题词

# 修改标签名

In [None]:
topic_model.set_topic_labels({
  0: '龙门石窟',
  1: '景区运营',
  2: '文旅建设'
})
topic_model.get_topic_info()

" topic_model.set_topic_labels({\n  0: '龙门石窟',\n  1: '景区运营',\n  2: '文旅建设'\n})\ntopic_model.get_topic_info() "

In [19]:
topic_model.visualize_barchart(custom_labels=True) # 自定义标签名

# 下载图像