# 导入

In [1]:
import numpy as np
from bertopic import BERTopic
from transformers.pipelines import pipeline
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# 加载文件

In [2]:

with open('./data/切词.txt', 'r', encoding='utf-8') as file:
  docs = file.readlines()
print('条数: ', len(docs))
print('预览第一条: ', docs[0])

vectorizer_model = None

条数:  2714
预览第一条:  This article shows how symbolic computing and the mathematical formalism induced by maximizing entropy and minimizing the mean deviation from statistical equilibrium may be effectively applied to obtaining probabilistic models for the structure of atoms, using trial wave functions compatible with an average shell picture of the atom. The objective is not only to recover the experimental value of the ground state mean energy of the atom, but rather to better approximate the unknown parameters of these trial



# 创建模型

In [3]:
# 1. 词向量模型，同时加载本地训练好的词向量
embedding_model = pipeline("feature-extraction", model="bert-base-chinese") # 使用bert-base-chinese
embeddings = np.load('./data/emb.npy') # 使用bert-base-chinese向量
print('向量shape：', embeddings.shape)

# 替换: 使用hfl模型
# embedding_model = pipeline("feature-extraction", model="hfl/chinese-bert-wwm")
# embeddings = np.load('./data/embedding_hfl.npy') 
# print('向量shape：', embeddings.shape)

# 替换: 使用Sentencetransformers模型
# embedding_model = embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2',)
# embeddings = np.load('./data/embedding_sen.npy')
# print(embeddings.shape)

# 2. 创建UMAP降维模型
umap_model = UMAP(
  n_neighbors=15,
  n_components=5,
  min_dist=0.0,
  metric='cosine',
  random_state=30  # ⚠️ 防止随机 https://maartengr.github.io/BERTopic/faq.html
)

# 3. 创建HDBSCAN聚类模型
# 如果要建设离群值，可以减小下面两个参数min_cluster_size min_samples
# https://hdbscan.readthedocs.io/en/latest/faq.html
hdbscan_model = HDBSCAN(
  min_cluster_size=20,
  min_samples=5,
  metric='euclidean'
)

# 5. 创建CountVectorizer模型
vectorizer_model = CountVectorizer(stop_words=['and', 'of', 'in'])

# 6. 正式创建BERTopic模型
topic_model = BERTopic(
  embedding_model=embedding_model,
  vectorizer_model=vectorizer_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


向量shape： (2714, 384)


# 训练模型

In [4]:
# 查看主题
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings) #传入训练好的词向量
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,938,-1_the_to_we_quantum,"[the, to, we, quantum, is, for, that, with, by...",[We propose way for implementing two-step iSWA...
1,0,219,0_entanglement_states_we_the,"[entanglement, states, we, the, that, for, sta...",[Many-body quantum systems can be characterize...
2,1,107,1_superconducting_qubit_to_the,"[superconducting, qubit, to, the, qubits, flux...",[Semiconductor spin qubits may be coupled thro...
3,2,105,2_laser_the_ion_is,"[laser, the, ion, is, to, by, with, transition...",[laser-cooling experiment with Ca ions trapped...
4,3,97,3_topological_majorana_superconductivity_super...,"[topological, majorana, superconductivity, sup...",[Superconductors with topological surface or e...
5,4,92,4_quantum_to_learning_the,"[quantum, to, learning, the, this, computers, ...",[quantum computer encodes information in quant...
6,5,91,5_photons_optical_photon_quantum,"[photons, optical, photon, quantum, light, sin...",[Semiconductor quantum dots are currently emer...
7,6,80,6_stars_ray_the_star,"[stars, ray, the, star, galaxy, galaxies, to, ...",[We propose stellar locus outlier (SLOT) metho...
8,7,71,7_gate_cavity_two_the,"[gate, cavity, two, the, gates, atoms, qubit, ...",[We present one-step scheme for direct impleme...
9,8,61,8_diamond_nv_vacancy_nitrogen,"[diamond, nv, vacancy, nitrogen, center, cente...",[Single defects in diamond and especially nega...


# 保存聚类结果

In [5]:
topic_docs = topic_model.get_document_info(docs)
topic_docs.to_csv('./聚类结果.csv')

In [8]:
with open('./data/文本.txt', 'r', encoding='utf-8') as file:
  texts = file.readlines()
  print('文本条数：', len(texts))
  topic_docs.insert(1, '原文', texts)
with open('./data/时间.txt', 'r', encoding='utf-8') as file:
  years = file.readlines()
  print('文本条数：', len(years))
  topic_docs.insert(2, '时间', years)
topic_docs.to_csv('./聚类结果2.csv')

文本条数： 2714


ValueError: cannot insert 原文, already exists

# 可视化

In [9]:
topic_model.visualize_barchart()

In [10]:
topic_model.visualize_topics()

In [11]:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, hide_document_hover=True)

# 层次聚类

In [12]:
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 31/31 [00:00<00:00, 244.09it/s]


# 合并主题

In [13]:
topic_model.merge_topics(docs, [
 [30,25,10,0,22,19,26,5,4,27,15,7,1,13,20],
 [31,29,14],
 [3,16,23,17,18,24,2,12,6],
 [11,21,9,8,28]
])

topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,938,-1_the_to_we_quantum,"[the, to, we, quantum, is, for, with, that, by...","[Pair coherent state, is state of two-mode rad..."
1,0,936,0_the_quantum_to_we,"[the, quantum, to, we, is, for, qubit, that, s...",[We first propose an experimental scheme to im...
2,1,520,1_the_to_we_is,"[the, to, we, is, with, for, by, that, are, on]",[The thermal entanglement in two-qubit Heisenb...
3,2,227,2_the_spin_quantum_to,"[the, spin, quantum, to, we, nuclear, for, ele...",[Long coherence times of single spins in silic...
4,3,93,3_the_to_is_viscosity,"[the, to, is, viscosity, using, for, this, flo...",[The technology of using ultrasonic vibration ...


In [14]:
topic_model.visualize_topics()

In [15]:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, hide_document_hover=True)

# 动态主题模型

In [16]:
# 读取时间戳
with open('./data/时间.txt', "r", encoding='utf-8') as file:
    lines = file.readlines()
    timestamps = [int(line.strip()) for line in lines]
print(len(timestamps), timestamps[:10])

2714 [2024, 2006, 2025, 2017, 2008, 2004, 2011, 2001, 1998, 2004]


In [17]:
topics_over_time = topic_model.topics_over_time(docs, timestamps, global_tuning=False, evolution_tuning=False)
topic_model.visualize_topics_over_time(topics_over_time)