# Gemini APIを用いたトピック名の生成



In [1]:
# %%capture
# sentence-transformers 3.2.0からStaticEmbeddingをサポート
!pip install -q bertopic==0.17.0 datasets==2.20.0 litellm==1.72.2 sentence-transformers==3.2.0 scikit-learn==1.5.0 umap-learn==0.5.6 pandas==2.2.2

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/255.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
from copy import deepcopy
from getpass import getpass

import pandas as pd
from bertopic import BERTopic
from bertopic.representation import LiteLLM
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.cluster import HDBSCAN
from umap import UMAP

# **ArXiv Articles: Computation and Language**

In [3]:
# Fetch the ArXiv "Computation and Language" corpus and keep its train split.
dataset = load_dataset("maartengr/arxiv_nlp")["train"]

# Pull out the abstract and title columns.
abstracts, titles = dataset["Abstracts"], dataset["Titles"]

# テキストクラスタリング

## **1. 文書の埋め込み**

In [4]:
# Load the pretrained "thenlper/gte-small" sentence-embedding model.
embedding_model = SentenceTransformer("thenlper/gte-small")
# Embed every abstract; the progress bar is enabled because encoding the whole
# corpus runs over many batches.
embeddings = embedding_model.encode(abstracts, show_progress_bar=True)

Batches:   0%|          | 0/1405 [00:00<?, ?it/s]

## **2. 埋め込みの次元削減**

In [5]:
# Project the embeddings down to 5 dimensions with UMAP using cosine distance.
# random_state is fixed to make runs repeatable.
# NOTE(review): min_dist=0.0 presumably lets points pack tightly, which suits
# the clustering step that follows — confirm against the UMAP documentation.
umap_model = UMAP(
    n_components=5, min_dist=0.0, metric="cosine", random_state=42
)
reduced_embeddings = umap_model.fit_transform(embeddings)

## **3. 次元削減した埋め込みのクラスタリング**

In [6]:
# Configure the clusterer: at least 50 points per cluster, Euclidean distance,
# and the "eom" cluster selection method.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    metric="euclidean",
    cluster_selection_method="eom",
)
# Fit on the reduced embeddings; the fitted estimator is kept under the same name.
hdbscan_model = hdbscan_model.fit(reduced_embeddings)

# Labels assigned by the fitted model.
clusters = hdbscan_model.labels_

# Number of distinct labels.
# NOTE(review): HDBSCAN presumably marks noise points with -1, so this count
# would include the noise label if any point is an outlier — confirm.
len(set(clusters))

167

# クラスタリングからトピックモデリングへ

## **BERTopic: A Modular Topic Modeling Framework**

In [7]:
# Assemble BERTopic from the embedding, dimensionality-reduction and clustering
# components defined in the earlier cells.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
)
# Fit on the abstracts, reusing the precomputed embeddings. The result of fit()
# is assigned back so the cell ends on an assignment and shows no output.
topic_model = topic_model.fit(abstracts, embeddings)

2025-06-07 23:11:39,816 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-07 23:12:36,321 - BERTopic - Dimensionality - Completed ✓
2025-06-07 23:12:36,324 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-07 23:12:53,661 - BERTopic - Cluster - Completed ✓
2025-06-07 23:12:53,683 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-07 23:12:57,636 - BERTopic - Representation - Completed ✓


## **表現モデル**

In [8]:
# Save original representations.
# A deep copy is taken so this snapshot is an independent object that can be
# passed to topic_differences() after the model's representations are updated.
original_topics = deepcopy(topic_model.topic_representations_)

In [9]:
def topic_differences(model, original_topics, nr_topics=5):
    """Compare original and updated topic representations side by side.

    Args:
        model: Topic model exposing ``get_topic(topic_id)``, expected to
            return a list of ``(word, score)`` pairs for a known topic.
        original_topics: Mapping from topic id to a list of ``(word, score)``
            pairs captured before the representations were updated.
        nr_topics: Number of topic ids to compare, starting at 0.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Topic``, ``Original`` and
        ``Updated``. Each representation is its first five words joined by
        ``" | "``. Topic ids that are missing on either side are skipped.
    """
    columns = ["Topic", "Original", "Updated"]
    rows = []
    for topic in range(nr_topics):
        updated = model.get_topic(topic)
        # Skip ids unknown to either side instead of raising; BERTopic's
        # get_topic is documented to return False for an unknown topic id.
        if topic not in original_topics or not updated:
            continue

        # Keep the first five words of each representation.
        og_words = " | ".join(word for word, *_ in original_topics[topic][:5])
        new_words = " | ".join(word for word, *_ in updated[:5])
        rows.append([topic, og_words, new_words])

    # Build the frame once rather than appending to it row by row.
    return pd.DataFrame(rows, columns=columns)

### Gemini

以下のページを参考に、APIキーを取得しましょう。

- [Gemini API キーを取得する](https://ai.google.dev/gemini-api/docs/api-key?hl=ja)

APIにはレート制限があります。詳細については、以下のページを参照してください。

- [レート制限](https://ai.google.dev/gemini-api/docs/rate-limits?hl=ja)

In [12]:
# Keep the secret out of the notebook: reuse GEMINI_API_KEY when it is already
# set in the environment, otherwise ask for it without echoing the input.
if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("GEMINI_API_KEY: ")

Gemini APIを呼び出すために、今回はLiteLLMを使用します。LiteLLMは、さまざまなLLMのAPIをOpenAIと同じ形式で呼び出せるようにするライブラリで、Gemini APIの呼び出しにも利用できます。BERTopicでのサポートは[バージョン0.17.0](https://github.com/MaartenGr/BERTopic/releases/tag/v0.17.0)からなので、それより古いBERTopicではこの方法は使えないことに注意してください。

In [11]:
# Prompt template used to label a topic. [DOCUMENTS] and [KEYWORDS] are
# placeholders that the representation model is expected to fill in per topic.
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short topic label in the following format:
topic: <short topic label>
"""

# Pass the custom prompt explicitly; without it LiteLLM falls back to its
# built-in default prompt and the template above is never used.
# delay_in_seconds spaces out the requests to stay within the API rate limits.
representation_model = LiteLLM(
    model="gemini/gemini-2.0-flash", prompt=prompt, delay_in_seconds=5
)
topic_model.update_topics(abstracts, representation_model=representation_model)

# Show the original keyword representations next to the generated labels.
topic_differences(topic_model, original_topics)

Unnamed: 0,Topic,Original,Updated
0,0,speech | asr | recognition | end | acoustic,Speech Recognition and Translation Systems
1,1,summarization | summaries | summary | abstract...,Text Summarization
2,2,translation | nmt | machine | bleu | neural,Neural Machine Translation
3,3,hate | offensive | speech | detection | toxic,Hate Speech Detection on Social Media
4,4,relation | extraction | re | relations | entity,Relation Extraction


## 参考資料

- [Gemini | LiteLLM](https://docs.litellm.ai/docs/providers/gemini)
- [LiteLLM | BERTopic](https://maartengr.github.io/BERTopic/api/representations.html#bertopic.representation.LiteLLM)