# Topic Explorer

In [1]:
import json
import numpy as np
import pandas as pd
from datasets import load_dataset

# Output directory: every artifact written by this notebook (embeddings, pickled
# example prompts, saved topic model, CSV tables, JSON export) is placed under it.
file_path = "/out"

## Specific Category

We began by summarizing the English prompts from the 06/2024 - 08/2024 leaderboard dataset into specific categories.

### Data Processing

From the dataset of 60,000 conversations, we selected those tagged as English and removed duplicate prompts.

In [7]:
# Load the conversation dataset.
df = pd.read_json("/mnt/disk0/weilin/tmp/battles_latest_20240819_freshness_20240619_md.json")

# Keep only the rows tagged as English; copy so the new column below is not
# written into a slice of `df`.
english_df = df[df['language'] == 'English'].copy()

# A row's prompt is every user turn of `conversation_a`, joined with spaces.
english_df['Prompt'] = english_df['conversation_a'].apply(
    lambda turns: ' '.join(turn['content'] for turn in turns if turn['role'] == 'user')
)

# Drop repeated prompts, then renumber the rows 0..n-1. Filtering and
# de-duplication leave gaps in the row labels, and `doc` is later handed to
# consumers that look items up by integer position; with a gapped integer
# index such lookups resolve by label and hit the wrong row or a missing key.
english_df = english_df.drop_duplicates(subset='Prompt').reset_index(drop=True)
doc = english_df['Prompt']

len(doc)

52483

### Create Embedding

In [None]:
from sentence_transformers import SentenceTransformer

# Encode every prompt into a dense vector with a pretrained sentence encoder.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# `encode` is documented for a string or a list of strings. Passing the pandas
# Series itself would make integer lookups resolve by index label instead of
# by position, so hand over a plain list (same order as `doc`).
embeddings = model.encode(doc.tolist(), show_progress_bar=True)

# Persist the embedding matrix to disk.
np.save(f"{file_path}/recent_english_embeddings.npy", embeddings)

### BERTopic Topic Clustering

We performed topic clustering on the English conversation dataset using BERTopic.

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
import openai
from bertopic.representation import OpenAI

# Bag-of-words settings used for the per-topic keyword representation:
# English stop words removed, unigrams and bigrams, terms seen at least twice.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)

# Density-based clustering; `prediction_data=True` is kept on so that
# membership probabilities can be computed.
hdbscan_model = HDBSCAN(
    min_cluster_size=30,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Dimensionality reduction applied before clustering (seeded).
umap_model = UMAP(
    n_neighbors=20,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Sentence encoder registered with the topic model; the document vectors
# themselves are supplied precomputed to `fit_transform` below.
embedding_model = SentenceTransformer('all-mpnet-base-v2')

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    top_n_words=10,
    verbose=True,
    calculate_probabilities=True,
)

# Fit on the prompts, reusing the embeddings computed earlier.
topics, probs = topic_model.fit_transform(doc, embeddings=embeddings)

Before reducing outliers, we selected 20 example prompts from each identified cluster. These prompts were chosen from those in the top 20% of the membership probability calculated by HDBSCAN clustering (at or above the 80th percentile across all prompts), i.e., the prompts most likely to belong to their cluster. We excluded extra-long (>100 words) and extra-short (<5 words) prompts for better readability.

In [None]:
import random
from collections import defaultdict

# Pick up to `n_examples` readable, high-confidence example prompts per topic.
sampled_prompts = defaultdict(list)
topic_info = topic_model.get_topic_info()
doc_info = topic_model.get_document_info(doc)

n_examples = 20

# Both of these are the same for every topic, so compute them once:
# the 80th-percentile probability over all documents, and each document's
# whitespace-delimited word count.
prob_cutoff = doc_info['Probability'].quantile(0.8)
word_counts = doc_info['Document'].str.split().str.len()

# Exclude the outlier bucket by its id (-1) rather than by assuming it is the
# first row of the topic table.
topic_ids = topic_info.loc[topic_info['Topic'] != -1, 'Topic']

for topic_id in topic_ids:
    keep = (
        (doc_info['Topic'] == topic_id)
        & (doc_info['Probability'] >= prob_cutoff)
        & (word_counts >= 5)
    )
    filtered_docs = doc_info[keep]
    filtered_counts = word_counts[keep]

    # Prefer prompts of at most 100 words. If that leaves fewer than
    # `n_examples`, relax the cap in steps of 50 words. The loop terminates
    # because `filtered_docs` itself holds at least `n_examples` rows.
    # Topics with fewer candidates than `n_examples` keep all of them.
    res = filtered_docs
    if len(filtered_docs) >= n_examples:
        cap = 100
        res = filtered_docs[filtered_counts <= cap]
        while len(res) < n_examples:
            cap += 50
            res = filtered_docs[filtered_counts <= cap]

    # Print every tenth topic id as a progress indicator.
    if topic_id % 10 == 0:
        print(topic_id)

    sampled_docs = res.sample(
        n=min(n_examples, len(res)),
        random_state=42,
        replace=False,
    )

    sampled_prompts[topic_id] = sampled_docs['Document'].tolist()

In [None]:
import pickle 

# Write the topic-id -> example-prompt-list mapping to disk as a pickle.
examples_path = f"{file_path}/example_prompts.pkl"
with open(examples_path, 'wb') as pkl_file:
    pickle.dump(sampled_prompts, pkl_file)

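Reduce all outliers: every outlier prompt is reassigned to a topic based on its topic probabilities, the topic representations are refreshed, and the model is saved.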

In [None]:
# Reassign outlier documents using the per-topic probabilities from fitting.
new_topics = topic_model.reduce_outliers(doc, topics, probabilities=probs, strategy="probabilities")

# Rebuild the topic representations for the new assignment. BERTopic's
# `update_topics` uses a default CountVectorizer when none is passed, which
# would drop the stop-word / n-gram / min_df settings chosen at construction,
# so pass the model's own vectorizer and keyword count back in.
topic_model.update_topics(
    doc,
    topics=new_topics,
    vectorizer_model=topic_model.vectorizer_model,
    top_n_words=topic_model.top_n_words,
)

# Save the updated model (with its c-TF-IDF data) under the output directory.
topic_model.save(
    path=f"{file_path}/model",
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-mpnet-base-v2"
)

### Summarize Category Names

For each cluster, we used GPT-4o to assign a category name based on the selected example prompts.

In [None]:
def summarize_topic(prompts):
    """Ask GPT-4o for a short topic label describing a group of prompts.

    Args:
        prompts: Either a list of prompt strings, which are separated by
            blank lines in the request, or a single already-assembled string,
            which is sent as one block.

    Returns:
        The raw text of the model's reply. The system message asks for the
        form "The topic of doc is '...'.", so callers are expected to pull
        the label out of the quotes themselves.
    """
    # A bare string is itself iterable: joining it directly would insert
    # "\n\n" between every single character. Treat it as one prompt instead.
    if isinstance(prompts, str):
        prompts = [prompts]

    input_text = "Based on the sampled prompts below, extract a short but highly descriptive topic label of at most 5 words:\n\n" + "\n\n".join(prompts)
    client = openai.OpenAI()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You help summarize the category of the given prompts. Make sure it is in the following format: The topic of doc is '...'."},
            {"role": "user", "content": input_text}
        ],
        # Zero temperature is used to keep the labels as stable as possible.
        temperature=0
    )

    return response.choices[0].message.content

# One LLM call per cluster; keep the raw reply keyed by topic id.
summaries = {}
for topic_id, prompts in sampled_prompts.items():
    summary = summarize_topic(prompts)
    summaries[topic_id] = summary

    # Echo every 50th topic as a progress indicator.
    if topic_id % 50 == 0:
        print(topic_id, ': ', summary)

summaries_df = pd.DataFrame(list(summaries.items()), columns=['Topic', 'Summary'])

# Pull the label out of the requested "The topic of doc is '...'." format.
# The greedy pattern spans from the first to the last quote, so labels that
# contain an apostrophe are kept whole. Replies without any quoted text fall
# back to the stripped reply instead of raising.
quoted_label = summaries_df['Summary'].str.extract(r"'(.*)'", expand=False)
summaries_df['Category'] = quoted_label.fillna(summaries_df['Summary'].str.strip())

# NOTE(review): `topic_info` was captured before `reduce_outliers` ran, so
# these counts exclude the reassigned outlier prompts -- confirm that is the
# intended basis for Count / Percentage.
topic_info_modified = topic_info[['Topic', 'Count']]
summaries_df = summaries_df.merge(topic_info_modified, on='Topic')[['Topic', 'Category', 'Count']]
summaries_df['Percentage'] = summaries_df['Count'] / summaries_df['Count'].sum()

# Store each topic's example prompts as one '|||'-delimited string.
summaries_df['Example Prompt'] = summaries_df['Topic'].apply(
    lambda topic: '|||'.join(sampled_prompts[topic])
)

summaries_df.to_csv(f"{file_path}/recent_english_narrow_categories.csv", index=False)

## Broad Category

We performed topic clustering again on the category names of these 279 specific categories, summarizing them into 12 broad categories. The summarization process followed almost the same approach as before.

In [None]:
# The documents for this second pass are the narrow category names.
broad_doc = summaries_df['Category']

# Create embeddings (a plain list of strings is what `encode` is documented for)
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(broad_doc.tolist(), show_progress_bar=True)

# BERTopic
embedding_model = SentenceTransformer('all-mpnet-base-v2')
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=4, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    top_n_words=3,
    verbose=True
)

topics, probs = topic_model.fit_transform(broad_doc, embeddings=embeddings)

# Reduce all outliers
new_topics = topic_model.reduce_outliers(broad_doc, topics, strategy="embeddings")
# `update_topics` uses a default vectorizer and keyword count unless they are
# passed, so repeat the settings chosen above.
topic_model.update_topics(
    broad_doc,
    topics=new_topics,
    vectorizer_model=vectorizer_model,
    top_n_words=3,
)

# Summarize category names
broad_topic_info = topic_model.get_topic_info()
broad_doc_info = topic_model.get_document_info(broad_doc)
summaries = {}

# Iterate over the topic ids that actually exist instead of assuming they are
# exactly 0..n-1; the outlier bucket (-1), if still present, is not summarized.
for topic_id in broad_topic_info['Topic']:
    if topic_id == -1:
        continue
    # Pass the member names as a list: a pre-joined string would be iterated
    # character by character when `summarize_topic` joins its input.
    members = broad_doc_info.loc[broad_doc_info['Topic'] == topic_id, 'Document'].tolist()
    summaries[topic_id] = summarize_topic(members)


# Combine results
broad_summaries_df = pd.DataFrame(list(summaries.items()), columns=['Topic', 'Summary'])
# Label = text between the first and last quote of the reply; replies without
# quoted text yield NaN here and become 'Other' below.
broad_summaries_df['Category'] = broad_summaries_df['Summary'].str.extract(r"'(.*)'", expand=False)
topic_info_modified = broad_topic_info[['Topic', 'Count']]
# Right merge keeps every topic of the model, including one that received no
# summary, so no narrow category is dropped when the tables are joined later.
broad_summaries_df = broad_summaries_df.merge(topic_info_modified, on='Topic', how='right')[['Topic', 'Category', 'Count']]
broad_summaries_df['Percentage'] = broad_summaries_df['Count'] / broad_summaries_df['Count'].sum()
# Only the label column gets the 'Other' placeholder; numeric columns are left alone.
broad_summaries_df['Category'] = broad_summaries_df['Category'].fillna('Other')

broad_summaries_df.to_csv(f"{file_path}/recent_english_broad_categories.csv", index=False)

## Data Processing: combine broad and narrow topics

The clustering results were stored in JSON format to facilitate future visualizations.

In [None]:
# Merge categories
# Attach each narrow category (row i of `summaries_df`) to the broad topic
# assigned to document i, then bring in the broad category's own label.
narrow_with_broad_id = broad_doc_info[['Topic']].merge(
    summaries_df, left_index=True, right_index=True
)
merged = narrow_with_broad_id.merge(broad_summaries_df, left_on='Topic_x', right_on='Topic')

# After the two merges, `_x` columns describe the narrow category (except
# `Topic_x`, the broad topic id) and `_y` columns the broad one. The mapping
# below both selects the columns to keep, in order, and gives them final names.
column_names = {
    'Topic_x': 'broad_category_id',
    'Category_y': 'broad_category',
    'Topic_y': 'narrower_category_id',
    'Category_x': 'narrower_category',
    'Count_x': 'prompt_count',
    'Percentage_x': 'prompt_percentage',
    'Example Prompt': 'example_prompt',
}
merged = merged[list(column_names)].rename(columns=column_names)

# Build a two-level tree: broad categories, each holding its narrow categories.
root = {
    "name": "categories",
    "children": []
}
for (broad_id, broad_name), group in merged.groupby(["broad_category_id", "broad_category"]):
    narrow_nodes = [
        {
            "id": row["narrower_category_id"],
            "name": row["narrower_category"],
            "count": row["prompt_count"],
            "percent": row['prompt_percentage'],
        }
        for _, row in group.iterrows()
    ]
    root["children"].append({
        # Group keys are cast to a built-in int before serialization.
        "id": int(broad_id),
        "name": broad_name,
        "children": narrow_nodes,
    })

# Serialize the tree and write it out.
json_output = json.dumps(root, indent=4)

with open(f"{file_path}/recent_english_piechart.json", "w") as f:
    f.write(json_output)
