In [1]:
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from transformers.pipelines import pipeline
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


# 加载数据

In [2]:
# step1 加载文件
with open('../../data/切词.txt', 'r', encoding='utf-8') as file:
  docs = file.readlines()
print('条数: ', len(docs))
print('预览第一条: ', docs[0])

vectorizer_model = None

条数:  2714
预览第一条:  This article shows how symbolic computing and the mathematical formalism induced by maximizing entropy and minimizing the mean deviation from statistical equilibrium may be effectively applied to obtaining probabilistic models for the structure of atoms, using trial wave functions compatible with an average shell picture of the atom. The objective is not only to recover the experimental value of the ground state mean energy of the atom, but rather to better approximate the unknown parameters of these trial



# 创建

In [4]:
# 1. 词向量模型，同时加载本地训练好的词向量
embedding_model = pipeline("feature-extraction", model="bert-base-chinese") # 使用bert-base-chinese
# embedding_model = pipeline("feature-extraction", model="hfl/chinese-bert-wwm") # 使用bert-base-chinese
embeddings = np.load('../../data/emb.npy') # 使用bert-base-chinese向量
print(embeddings.shape)

# 2. 创建分词模型
vectorizer_model = CountVectorizer() # 因为我们已经分好词了，所以这里不需要传入分词函数了

# 3. 创建UMAP降维模型
umap_model = UMAP(
  n_neighbors=15,
  n_components=5,
  min_dist=0.0,
  metric='cosine',
  random_state=42  # ⚠️ 防止随机 https://maartengr.github.io/BERTopic/faq.html
)

# 4. 创建HDBSCAN聚类模型
# 如果要建设离群值，可以减小下面两个参数
# https://hdbscan.readthedocs.io/en/latest/faq.html
hdbscan_model = HDBSCAN(
  min_cluster_size=20,
  min_samples=5,
)

# 5. 创建CountVectorizer模型
vectorizer_model = CountVectorizer(stop_words=['洛阳', '旅游', '文化'])

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(2714, 384)


In [5]:
topic_model = BERTopic(
  embedding_model=embedding_model,
  vectorizer_model=vectorizer_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
)

topics, probs = topic_model.fit_transform(docs, embeddings=embeddings) #传入训练好的词向量
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,943,-1_the_of_in_and,"[the, of, in, and, to, we, quantum, is, for, by]",[In the surface acoustic wave quantum computer...
1,0,212,0_entanglement_states_of_the,"[entanglement, states, of, the, we, that, for,...",[We introduce an operational entanglement clas...
2,1,142,1_quantum_of_to_and,"[quantum, of, to, and, the, algorithm, algorit...",[In quantum algorithms discovered so far for s...
3,2,98,2_topological_majorana_superconductor_supercon...,"[topological, majorana, superconductor, superc...",[Superconductors with topological surface or e...
4,3,98,3_laser_the_of_and,"[laser, the, of, and, ion, in, is, by, to, tra...",[Yb ions were trapped in quadrupole ion trap. ...
5,4,92,4_superconducting_qubit_flux_the,"[superconducting, qubit, flux, the, to, qubits...","[For many types of superconducting qubits, mag..."
6,5,88,5_state_entangled_states_teleportation,"[state, entangled, states, teleportation, sche...",[proposed scheme for bidirectional quantum tel...
7,6,87,6_photons_optical_photon_quantum,"[photons, optical, photon, quantum, light, and...",[Semiconductor quantum dots are currently emer...
8,7,80,7_stars_the_ray_of,"[stars, the, ray, of, star, to, galaxy, galaxi...",[We propose stellar locus outlier (SLOT) metho...
9,8,70,8_cavity_gate_the_scheme,"[cavity, gate, the, scheme, two, of, atoms, qu...",[We present one-step scheme for direct impleme...


# 可视化

In [6]:
# Arguments:
#           topics: A selection of topics to visualize
#           top_n_topics: Only select the top n most frequent topics
#           custom_labels: Whether to use custom topic labels that were defined using 
#                      `topic_model.set_topic_labels`.
#           title: Title of the plot.
#           width: The width of the figure.
#           height: The height of the figure.
topic_model.visualize_topics()


# 合并思路1：合并距离相近主题

In [None]:
topic_model.merge_topics(docs, [
  [8, 9],
  [0, 5, 10, 13],
  [1, 4],
  [2, 11, 12],
  [3, 6, 7]
])

topic_model.get_topic_info()

In [None]:
topic_model.visualize_topics()