In [None]:
# Bootstrap the environment: install the `uv` package manager, initialise a uv
# project in the current working directory, then add the libraries one by one.
# NOTE(review): piping a remote script straight into `sh` runs it unreviewed —
# confirm this is acceptable for the environment the notebook runs in.
!curl -LsSf https://astral.sh/uv/install.sh | sh
!uv init
# NOTE(review): later cells also import `datasets`, `matplotlib` and `openai`,
# none of which are added here — presumably preinstalled; verify on a fresh kernel.
# NOTE(review): `uv add` targets the uv project environment, which may not be the
# environment the running kernel uses — TODO confirm.
!uv add torch
!uv add transformers
!uv add numpy
!uv add tqdm
!uv add bertopic

In [None]:
# Load the data from the Hugging Face Hub
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
import numpy as np

dataset = load_dataset("maartengr/arxiv_nlp")["train"]

# Extract the columns used by the rest of the notebook
abstracts = dataset["Abstracts"]
titles = dataset["Titles"]

# Create one embedding vector per abstract
embedding_model = SentenceTransformer("thenlper/gte-small")
embeddings = embedding_model.encode(abstracts, show_progress_bar=True)

# A bare expression in the middle of a cell is evaluated and thrown away;
# print the shape explicitly so it is actually reported.
print(embeddings.shape)

# Reduce the input embeddings to 5 dimensions before clustering
# (the original note states the input is 384-dimensional).
umap_settings = dict(n_components=5, min_dist=0.0, metric="cosine", random_state=42)
umap_model = UMAP(**umap_settings)
reduced_embeddings = umap_model.fit_transform(embeddings)

# Cluster the reduced embeddings: fit the model and pull out the label assigned
# to every document.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    metric="euclidean",
    cluster_selection_method="eom"
).fit(reduced_embeddings)
clusters = hdbscan_model.labels_

# How many distinct labels were produced? Printed explicitly because a bare
# expression in the middle of a cell is not displayed.
# Note: documents labelled -1 are treated as outliers later in this notebook, so
# this count includes that label whenever any document received it.
print(len(set(clusters)))

# Show the first three documents assigned to cluster 0, truncated to 300 characters
cluster = 0
member_rows = np.flatnonzero(clusters == cluster)
for doc_idx in member_rows[:3]:
  # int(...) converts the NumPy integer into a plain Python index
  snippet = abstracts[int(doc_idx)][:300]
  print(snippet + "... \n")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Project the embeddings down to two dimensions so they can be drawn
# (this rebinds `reduced_embeddings` to the 2-D projection).
umap_2d = UMAP(n_components=2, min_dist=0.0, metric="cosine", random_state=42)
reduced_embeddings = umap_2d.fit_transform(embeddings)

# Build a DataFrame with one row per document: coordinates, title, cluster label
df = pd.DataFrame(reduced_embeddings, columns=["x", "y"])
df["title"] = titles
df["cluster"] = [str(label) for label in clusters]

# Split into outliers (label "-1") and clustered documents
is_outlier = df.cluster == "-1"
outliers_df = df.loc[is_outlier, :]
clusters_df = df.loc[~is_outlier, :]

# Draw outliers as faint grey points, then clustered points coloured by label
plt.scatter(outliers_df.x, outliers_df.y, alpha=0.05, s=2, c="grey")
plt.scatter(
    clusters_df.x,
    clusters_df.y,
    c=clusters_df.cluster.astype(int),
    alpha=0.6,
    s=2,
    cmap="tab20b",
)
plt.axis("off")

In [None]:
!pip install bertopic

In [None]:
from bertopic import BERTopic

# Train the topic model, reusing the embedding, UMAP and HDBSCAN models defined
# earlier and the embeddings that were already computed.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True
).fit(abstracts, embeddings)

# Only the last expression of a cell is rendered automatically, so the
# intermediate results are passed to display() (provided by the IPython kernel).
display(topic_model.get_topic_info())
display(topic_model.get_topic(0))
# Search for topics related to a free-text query (query typo "modeing" fixed)
display(topic_model.find_topics("topic modeling"))
display(topic_model.get_topic(22))
# Topic assigned to the BERTopic paper itself, looked up by its title
topic_model.topics_[titles.index("BERTopic: Neural topic modeling with a class-based TF-IDF procedure")]

In [None]:
# Visualize topics and documents on the 2-D projection
fig = topic_model.visualize_documents(
    titles,
    reduced_embeddings=reduced_embeddings,
    width=1200,
    hide_annotations=True
)

# Enlarge the figure font for readability
fig.update_layout(font=dict(size=16))
# Figures that are not the last expression of the cell are never rendered on
# their own, so each intermediate figure is shown explicitly.
fig.show()

# Bar charts of the ranked keywords per topic
topic_model.visualize_barchart().show()

# Relationships between topics
topic_model.visualize_heatmap(n_clusters=30).show()

# Potential hierarchical structure of the topics (rendered as the cell output)
topic_model.visualize_hierarchy()

In [None]:
# Save the original topic representations so later updates can be compared to them
from copy import deepcopy
from bertopic.representation import KeyBERTInspired

# deepcopy yields an independent snapshot that does not share objects with
# topic_model, so it stays as the baseline for the comparisons below.
original_topics = deepcopy(topic_model.topic_representations_)

def topic_differences(model, original_topics, nr_topics=5):
  """Show how topic representations differ between a baseline and a model.

  Args:
    model: Object exposing ``get_topic(topic)``, which returns a sequence of
      entries whose first element is the keyword (e.g. ``(word, score)``).
    original_topics: Mapping from topic id to a sequence of entries in the
      same format, captured before the model was updated.
    nr_topics: Number of topics to compare; topic ids ``0 .. nr_topics - 1``.

  Returns:
    A pandas DataFrame with the columns ``Topic``, ``Original`` and
    ``Updated``; the two keyword columns hold the top five words of each
    topic joined by ``" | "``.
  """
  rows = []
  for topic in range(nr_topics):
    # Keep the first five keywords per topic for each representation
    og_words = " | ".join(entry[0] for entry in original_topics[topic][:5])
    new_words = " | ".join(entry[0] for entry in model.get_topic(topic)[:5])
    rows.append([topic, og_words, new_words])

  # Build the frame once and return it; the previous version assembled the
  # table but never returned it, so callers received None.
  return pd.DataFrame(rows, columns=["Topic", "Original", "Updated"])

# Update the topic representations using KeyBERTInspired
representation_model = KeyBERTInspired()
topic_model.update_topics(abstracts, representation_model=representation_model)
# Compare the updated keywords against the saved originals
topic_differences(topic_model, original_topics)

In [None]:
from bertopic.representation import MaximalMarginalRelevance

# Update the topic representations using Maximal Marginal Relevance
representation_model = MaximalMarginalRelevance(diversity=0.2)
topic_model.update_topics(abstracts, representation_model=representation_model)
# Compare the updated keywords against the saved originals
topic_differences(topic_model, original_topics)

In [None]:
from transformers import pipeline
from bertopic.representation import TextGeneration

# Prompt template; BERTopic substitutes [DOCUMENTS] and [KEYWORDS] per topic.
prompt = """I have a topic that contains the following documents:
         [DOCUMENTS]
         The topic is described by the following keywords: '[KEYWORDS]'.
         Based on the documents and keywords, what is this topic about?
         """
# Update the topic representations using FLAN-T5
generator = pipeline("text2text-generation", model="google/flan-t5-small")
representation_model = TextGeneration(
    generator,
    prompt=prompt,
    doc_length=50,
    # Fixed typo: "shitespace" is not a recognised tokenizer option, so the
    # doc_length truncation could not be applied; "whitespace" is the
    # documented value for splitting documents on whitespace.
    tokenizer="whitespace"
)
topic_model.update_topics(abstracts, representation_model=representation_model)

# Compare the updated representations against the saved originals
topic_differences(topic_model, original_topics)

In [None]:
import os

import openai
from bertopic.representation import OpenAI

prompt = """I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short topic label in the following
format:
topic: <short topic label>
"""
# Update the topic representations using GPT-3.5.
# The API key is read from the environment instead of being committed to the
# notebook. The key that was previously hard-coded here must be treated as
# leaked and revoked.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )

client = openai.OpenAI(
    api_key=api_key,
    base_url="https://api.apiyi.com/v1"
)

representation_model = OpenAI(
    client,
    model="gpt-3.5-turbo",
    exponential_backoff=True,
    chat=True,
    prompt=prompt
)

topic_model.update_topics(abstracts, representation_model=representation_model)

# Compare the updated representations against the saved originals
topic_differences(topic_model, original_topics)

In [None]:
!pip install datamapplot

In [None]:
import datamapplot

# Visualize topics and documents as a data map, limited to topic ids 0-19
shown_topics = list(range(20))
fig = topic_model.visualize_document_datamap(
    titles,
    topics=shown_topics,
    reduced_embeddings=reduced_embeddings,
    # Optional settings, left disabled:
    # label_font_size=11,
    # use_medoids=True,
)