In [1]:
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from transformers.pipelines import pipeline
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


# 加载数据

In [2]:
# step1 加载文件
with open('../../data/切词.txt', 'r', encoding='utf-8') as file:
  docs = file.readlines()
print('条数: ', len(docs))
print('预览第一条: ', docs[0])

vectorizer_model = None

条数:  2714
预览第一条:  This article shows how symbolic computing and the mathematical formalism induced by maximizing entropy and minimizing the mean deviation from statistical equilibrium may be effectively applied to obtaining probabilistic models for the structure of atoms, using trial wave functions compatible with an average shell picture of the atom. The objective is not only to recover the experimental value of the ground state mean energy of the atom, but rather to better approximate the unknown parameters of these trial



# 创建

In [3]:
# 1. 词向量模型，同时加载本地训练好的词向量
embedding_model = pipeline("feature-extraction", model="bert-base-chinese") # 使用bert-base-chinese
embeddings = np.load('../../data/emb.npy') # 使用bert-base-chinese向量
print(embeddings.shape)

# 2. 创建分词模型
vectorizer_model = CountVectorizer() # 因为我们已经分好词了，所以这里不需要传入分词函数了

# 3. 创建UMAP降维模型
umap_model = UMAP(
  n_neighbors=15,
  n_components=5,
  min_dist=0.0,
  metric='cosine',
  random_state=30  # ⚠️ 防止随机 https://maartengr.github.io/BERTopic/faq.html
)

# 4. 创建HDBSCAN聚类模型
# 如果要建设离群值，可以减小下面两个参数
# https://hdbscan.readthedocs.io/en/latest/faq.html
hdbscan_model = HDBSCAN(
  min_cluster_size=10,
  min_samples=5,
)

# 5. 创建CountVectorizer模型
vectorizer_model = CountVectorizer(stop_words=['洛阳', '旅游', '文化'])

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(2714, 384)


In [4]:
topic_model = BERTopic(
  embedding_model=embedding_model,
  vectorizer_model=vectorizer_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
)

topics, probs = topic_model.fit_transform(docs, embeddings=embeddings) #传入训练好的词向量
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,937,-1_the_of_we_and,"[the, of, we, and, in, to, quantum, for, is, t...",[We investigate the entanglement dynamics of t...
1,0,97,0_topological_majorana_superconductivity_super...,"[topological, majorana, superconductivity, sup...",[Superconductors with topological surface or e...
2,1,92,1_quantum_learning_computers_computing,"[quantum, learning, computers, computing, and,...",[quantum computer encodes information in quant...
3,2,87,2_laser_ion_the_ions,"[laser, ion, the, ions, energy, and, of, ioniz...",[Yb ions were trapped in quadrupole ion trap. ...
4,3,80,3_stars_ray_star_galaxies,"[stars, ray, star, galaxies, galaxy, the, of, ...",[We propose stellar locus outlier (SLOT) metho...
...,...,...,...,...,...
61,60,10,60_photons_conversion_fiber_fibers,"[photons, conversion, fiber, fibers, key, low,...",[We report single-stage bidirectional interfac...
62,61,10,61_environment_dynamics_qubit_le,"[environment, dynamics, qubit, le, general, ce...",[The Loschmidt echo (LE) of central two-level ...
63,62,10,62_cloning_covariant_machine_repair,"[cloning, covariant, machine, repair, optimal,...",[While exact cloning of an unknown quantum sta...
64,63,10,63_lzs_zener_landau_lz,"[lzs, zener, landau, lz, transition, interfere...",[single bichromatic field near resonant to qub...


# 层次聚类

In [5]:
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 64/64 [00:00<00:00, 191.52it/s]


# 参数：方向

In [None]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, orientation='bottom')

# label

In [None]:
topic_model.set_topic_labels({
  0: '景区运营',
  1: '知名景点',
})
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, custom_labels=True)

# 合并主题

In [None]:
topic_model.merge_topics(docs, [
  [19, 2],
  [9, 6, 1, 12, 20, 5, 15, 18],
  [11, 16, 4, 10, 7],
  [17, 13, 0, 3, 8, 14]
])

topic_info = topic_model.get_topic_info()
topic_info

In [None]:
topic_model.visualize_topics()