# 导入

In [1]:
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP

  from .autonotebook import tqdm as notebook_tqdm


# 加载数据

In [3]:
# Step 1: load the corpus, one document per line.
with open('./切词.txt', 'r', encoding='utf-8') as file:
  # Drop only the line terminator. The number and order of documents stay
  # exactly as in the file, which matters because `docs` is later passed to
  # BERTopic together with a precomputed embedding matrix.
  docs = [line.rstrip('\n') for line in file]
print('文档条数: ', len(docs))
print('预览第一条: ', docs[0])

文档条数:  2714
预览第一条:  This article shows how symbolic computing and the mathematical formalism induced by maximizing entropy and minimizing the mean deviation from statistical equilibrium may be effectively applied to obtaining probabilistic models for the structure of atoms, using trial wave functions compatible with an average shell picture of the atom. The objective is not only to recover the experimental value of the ground state mean energy of the atom, but rather to better approximate the unknown parameters of these trial



# 创建句向量模型

In [5]:
# Sentence-embedding backend that is handed to BERTopic further down.
# NOTE(review): the recorded output of the cached-embedding cell reports 384
# columns, while mpnet-base checkpoints normally emit 768-dimensional vectors.
# TODO confirm that emb.npy was produced by this exact checkpoint; if it was
# not, text encoded with this model is not comparable with the cached vectors.
embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# 加载预先计算好的句向量（直接读取，速度很快）

In [6]:
# Load the precomputed document embeddings from disk.
embeddings = np.load('./emb.npy')

# Row i of `embeddings` is used as the vector of docs[i] when both are passed
# to BERTopic, so verify the alignment here, at load time, with a message that
# names the two files involved.
if embeddings.ndim != 2 or embeddings.shape[0] != len(docs):
  raise ValueError(
    f'emb.npy has shape {embeddings.shape} but {len(docs)} documents were '
    'loaded from 切词.txt; regenerate the embeddings for the current corpus'
  )
print(type(embeddings), embeddings.shape)

<class 'numpy.ndarray'> (2714, 384)


# 文本聚类 ⭐

In [7]:
# Dimensionality-reduction step with a fixed seed so that Restart & Run All
# reproduces the same topics. Apart from `random_state`, the settings are meant
# to mirror BERTopic's documented default UMAP configuration.
# Note: umap-learn runs single-threaded when `random_state` is set.
umap_model = UMAP(
  n_neighbors=15,
  n_components=5,
  min_dist=0.0,
  metric='cosine',
  low_memory=False,
  random_state=42,
)

# Build the topic model.
topic_model = BERTopic(
  embedding_model=embedding_model,
  umap_model=umap_model,
  min_topic_size=10,  # important tuning parameter
  verbose=True,
)

# Fit on the precomputed embeddings and keep the per-document assignments
# instead of discarding the return value.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

# Summary table of the discovered topics (displayed as the cell output).
topic_info = topic_model.get_topic_info()
topic_info

2024-11-23 23:28:30,734 - BERTopic - Reduced dimensionality
2024-11-23 23:28:30,829 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1105,-1_the_of_in_and,"[the, of, in, and, we, to, is, quantum, for, t...",[quantum-mechanical many-particle system may e...
1,0,142,0_states_entanglement_of_we,"[states, entanglement, of, we, local, the, tha...",[Many-body quantum systems can be characterize...
2,1,98,1_topological_majorana_superconductor_supercon...,"[topological, majorana, superconductor, superc...",[Superconductors with topological surface or e...
3,2,92,2_laser_the_ion_of,"[laser, the, ion, of, and, transition, in, ene...",[Yb ions were trapped in quadrupole ion trap. ...
4,3,87,3_superconducting_qubit_flux_qubits,"[superconducting, qubit, flux, qubits, to, the...",[We introduce new type of superconducting char...
5,4,86,4_quantum_learning_and_to,"[quantum, learning, and, to, computing, classi...",[quantum computer encodes information in quant...
6,5,79,5_stars_ray_star_the,"[stars, ray, star, the, of, galaxy, galaxies, ...",[We propose stellar locus outlier (SLOT) metho...
7,6,64,6_cavity_gate_scheme_rydberg,"[cavity, gate, scheme, rydberg, the, trapped, ...",[simple scheme is proposed to generate an n-qu...
8,7,61,7_diamond_nv_vacancy_nitrogen,"[diamond, nv, vacancy, nitrogen, center, cente...",[Single defects in diamond and especially nega...
9,8,58,8_photons_photon_optical_light,"[photons, photon, optical, light, quantum, pho...",[Semiconductor quantum dots are currently emer...


# 可视化

In [8]:
# ⭐ UMAP visualization: project the embeddings to 2-D for plotting.
reduced_embeddings = UMAP(
  n_neighbors=10,
  n_components=2,
  min_dist=0.0,
  metric='cosine',
  random_state=42,  # fixed seed so the layout is identical on every run
).fit_transform(embeddings)

# Interactive scatter plot of the documents (displayed as the cell output).
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)