# Vision Arena Data Clustering

In [6]:
import json
import pandas as pd
import re

file_path = "/home/ygtang/arena-leaderboard-v2/vision/data"

### Data Processing

In [5]:
df = pd.read_json("/tmp/vision-arena-embeddings/filtered_chat_random.jsonl", lines=True)

In [29]:
df['Prompt'] = df.apply(
    lambda x: ' '.join([i['content'][0] for i in x['conversation'] if i['role'] == 'user']),
    axis=1
)
prompts = list(df['Prompt'])

len(prompts)

50000

In [11]:
df.head()

Unnamed: 0,conversation_id,model,conversation,turn,language,user_id,tstamp
0,8b13aa65d904446cabbe0eda34872df2,gpt-4o-2024-05-13,"[{'role': 'user', 'content': ['Consider a diss...",2,English,73440,1726803000.0
1,d977ac4c9f444214920658ff68c169b2,phi-3-vision-128k-instruct,"[{'role': 'user', 'content': ['make this logo ...",2,English,50827,1724256000.0
2,841e83c09d264f3d87f0c8e3b396d5b8,gpt-4-turbo-2024-04-09,"[{'role': 'user', 'content': ['Solve this', ['...",1,English,13299,1719507000.0
3,77981e52a0414f10a64a53dc12b35d12,cogvlm2-llama3-chat-19b,"[{'role': 'user', 'content': ['answer', ['bc08...",1,English,21720,1720807000.0
4,19e1c334f5214800b7ccdedcdce6b25c,gemini-1.5-pro-api-0514,"[{'role': 'user', 'content': ['Which option co...",1,English,64922,1726063000.0


In [36]:
df['conversation'][1]

[{'role': 'user',
  'content': ['make this logo transparent for any website',
   ['b1e84cb07dba0a7e7064ee407439ee87']]},
 {'role': 'assistant',
  'content': "To make the logo transparent, you would need to open the image in a graphics editing software, then go to the image's properties or adjustment layer, and set the opacity to 0%. However, I cannot perform actions on images, I can only describe them."},
 {'role': 'user',
  'content': 'help me write a splunk query where I want to see all host between abc10 to 25'},
 {'role': 'assistant',
  'content': 'To write a Splunk query for this purpose, you would use the following command:\n\n\n```\n\nsearch "server.hostname" between "abc10" and "25"\n\n```\n\n\nThis command searches for the `server.hostname` field in the Splunk index for `server` and filters the results to include only those with a hostname between "abc10" and "25".'}]

### Create Embedding

In [12]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(doc, show_progress_bar=True)

np.save(f"{file_path}/recent_english_embeddings.npy", embeddings)

Batches:   0%|          | 0/1641 [00:00<?, ?it/s]

In [16]:
print("Embedding dimensions:", model.get_sentence_embedding_dimension())

Embedding dimensions: 768


### BERTopic Topic Clustering

We performed topic clustering on the english conversation dataset using BERTopic.

In [14]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
import openai

embedding_model = SentenceTransformer('all-mpnet-base-v2')
umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        
        top_n_words=10,
        verbose=True,
        calculate_probabilities=True
)

topics, probs = topic_model.fit_transform(doc, embeddings=embeddings)

2025-01-06 23:14:49,051 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-01-06 23:15:45,107 - BERTopic - Dimensionality - Completed ✓
2025-01-06 23:15:45,109 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

In [23]:
len(topic_model.get_topic_info())

194

In [19]:
topic_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,28485,-1_data_self_like_return,"[data, self, like, return, new, use, time, 10,...","[Summarize each letter, one line each: ""Thanks..."
1,0,904,0_song_chorus_lyrics_verse,"[song, chorus, lyrics, verse, dub, oh, bridge,...",[# Role: Lyrics Generator\n\n# Profile\n- lang...
2,1,756,1_story_mark_write story_write,"[story, mark, write story, write, girl, hypnot...",[Write a story featuring a loving and devoted ...
3,2,512,2_strawberry_word strawberry_word_strawberry word,"[strawberry, word strawberry, word, strawberry...","[How many ""r"" are there in the word ""strawberr..."
4,3,502,3_solve_numbers_x²_equation,"[solve, numbers, x², equation, sqrt, y², minim...","[Solve 1 +1, 2\t3\t-1\t | 5\n4\t-1\t2\t | 6\n-..."


Before reducing outliers, we selected 20 example prompts from each identified cluster. These prompts were chosen from those in the first 20th percentile of probability calculated by HDBSCAN clustering, representing the likelihood that they belong to the cluster. We excluded extra-long (>100 words) and extra-short (<5 words) prompts for better readability.

In [18]:
from collections import defaultdict

sampled_prompts = defaultdict(list)
topic_info = topic_model.get_topic_info()
doc_info = topic_model.get_document_info(doc)

for topic_id in topic_info['Topic'][1:]:
    filtered_docs = doc_info[(doc_info['Topic'] == topic_id) & 
                             (doc_info['Probability'] >= doc_info['Probability'].quantile(0.8)) &
                             (doc_info['Document'].str.split().str.len() >= 5)]

    res = filtered_docs
    cap = 100
    if len(filtered_docs) >= 20:
        while len(res) < 20:
            res = filtered_docs[
                filtered_docs['Document'].str.split().str.len() <= cap
            ]
            cap += 50
    
    sampled_docs = res.sample(n=min(20, 
                            len(res)),
                            random_state=42,
                            replace=False)
    
    sampled_prompts[topic_id] = sampled_docs['Document'].tolist()

In [20]:
sampled_prompts[0]

['Please compare the following playlist 1 and playlist 2 and tell me if playlist 2 has any missing or differing entries. show these differences in a nicely formatted table, but ignore featured artists if the main artists match and ignore small differences in the titles. note if positions differ. summarize only the major differences.\n\nplaylist 1:\nNo.\tTitle\tArtist(s)\n1\t"Cherry Pink And Apple Blossom White"\tPerez Prado\n2\t"Rock Around the Clock"\tBill Haley & His Comets\n3\t"The Yellow Rose of Texas"\tMitch Miller\n4\t"Autumn Leaves"\tRoger Williams\n5\t"Unchained Melody"\tLes Baxter\n6\t"The Ballad of Davy Crockett"\tBill Hayes\n7\t"Love Is a Many-Splendored Thing"\tThe Four Aces\n8\t"Sincerely"\tThe McGuire Sisters\n9\t"Ain\'t That a Shame"\tPat Boone\n10\t"The Wallflower (Dance with Me, Henry)"\tGeorgia Gibbs\n11\t"The Crazy Otto Medley"\tJohnny Maddox\n12\t"Melody of Love"\tBilly Vaughn\n13\t"Sixteen Tons"\tTennessee Ernie Ford\n14\t"Learnin\' the Blues"\tFrank Sinatra\n15\t"

In [21]:
import pickle 

with open(f"{file_path}/example_prompts.pkl", 'wb') as f:
    pickle.dump(sampled_prompts, f)

Reduce all outliers.

In [24]:
new_topics = topic_model.reduce_outliers(doc, topics, probabilities=probs, strategy="probabilities")
topic_model.update_topics(doc, topics=new_topics)

topic_model.save(
    path=f"{file_path}/model",
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-mpnet-base-v2"
)



In [26]:
len(topic_model.get_topic_info())

193

In [25]:
topic_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,972,0_song_chorus_lyrics_verse,"[song, chorus, lyrics, verse, you, my, me, oh,...",[# Role: Lyrics Generator\n\n# Profile\n- lang...
1,1,1322,1_her_story_his_she,"[her, story, his, she, he, mark, him, write, w...",[Write a story featuring a loving and devoted ...
2,2,607,2_cookedcooked_strawberry_many_how,"[cookedcooked, strawberry, many, how, word, co...","[How many ""r"" are there in the word ""strawberr..."
3,3,651,3_numbers_solve_number_what,"[numbers, solve, number, what, equation, x², i...","[Solve 1 +1, 2\t3\t-1\t | 5\n4\t-1\t2\t | 6\n-..."
4,4,963,4_patient_blood_stent_patients,"[patient, blood, stent, patients, cancer, of, ...",[Things to remember - Dr. Pankaj Harkut is a h...


### Summarize Category Names

For each cluster, we used ChatGPT-4o to assign a category name based on the selected example prompts. 

In [None]:
def summarize_topic(prompts):
    input_text = "Based on the sampled prompts below, extract a short but highly descriptive topic label of at most 5 words:\n\n" + "\n\n".join(prompts)
    client = openai.OpenAI()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You help summarize the category of the given prompts. Make sure it is in the following format: The topic of doc is '...'."},
            {"role": "user", "content": input_text}
        ],
        temperature=0
    )

    return response.choices[0].message.content

summaries = {}
for topic_id, prompts in sampled_prompts.items():
    summary = summarize_topic(prompts)
    summaries[topic_id] = summary

In [29]:
summaries_df = pd.DataFrame(list(summaries.items()), columns=['Topic', 'Summary'])
summaries_df['Category'] = summaries_df['Summary'].apply(lambda x: re.search(r"'(.*?)'", x).group(1))
topic_info_modified = topic_info[['Topic', 'Count']]
summaries_df = summaries_df.merge(topic_info_modified, on='Topic')[['Topic', 'Category', 'Count']]
summaries_df['Percentage'] = summaries_df['Count'] / summaries_df['Count'].sum()
summaries_df['Example Prompt'] = summaries_df.apply(lambda x: sampled_prompts[x.Topic], axis=1)
summaries_df['Example Prompt'] = summaries_df['Example Prompt'].str.join('|||')

summaries_df.to_csv(f"{file_path}/recent_english_narrow_categories.csv", index=False)

In [30]:
summaries_df.head()

Unnamed: 0,Topic,Category,Count,Percentage,Example Prompt
0,0,Music Lyrics and Songwriting,904,0.03767,Please compare the following playlist 1 and pl...
1,1,Strength and Endurance Stories,756,0.031503,"Write a story, Lily's first person narration. ..."
2,2,Counting Letters in Words,512,0.021335,"How many ""r""s in strawberry|||How many r's are..."
3,3,Mathematical Problem Solving,502,0.020918,"Assuming that 1+2=4, what is the value of 4-2?..."
4,4,Medical Diagnosis and Pathology,485,0.02021,What are typical hormone levels for a woman in...


## Broad Category

We performed topic clustering again on the category names of these 193 specific categories, summarizing them into 12 broad categories. The summarization process followed an almost identical approach as before.

In [32]:
broad_doc = list(summaries_df['Category'])

# Create embeddings
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(broad_doc, show_progress_bar=True)

# BERTopic
embedding_model = SentenceTransformer('all-mpnet-base-v2')
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=4, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))
topic_model= BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,

        top_n_words=3,
        verbose=True
)

topics, probs = topic_model.fit_transform(broad_doc, embeddings=embeddings)

# Reduce all outliers
new_topics = topic_model.reduce_outliers(broad_doc, topics, strategy="embeddings")
topic_model.update_topics(broad_doc, topics=new_topics)

# Summarize category names
broad_topic_info = topic_model.get_topic_info()
broad_doc_info = topic_model.get_document_info(broad_doc)
summaries = {}

for topic_id in range(len(broad_topic_info)):
    cat = ', '.join(list(broad_doc_info[broad_doc_info['Topic'] == topic_id]['Document']))
    summary = summarize_topic(cat)
    summaries[topic_id] = summary


# Combine results 
broad_summaries_df = pd.DataFrame(list(summaries.items()), columns=['Topic', 'Summary'])
broad_summaries_df['Category'] = broad_summaries_df['Summary'].apply(lambda x: re.search(r"'(.*?)'", x).group(1))
topic_info_modified = broad_topic_info[['Topic', 'Count']]
broad_summaries_df = broad_summaries_df.merge(topic_info_modified, on='Topic')[['Topic', 'Category', 'Count']]
broad_summaries_df['Percentage'] = broad_summaries_df['Count'] / broad_summaries_df['Count'].sum()
broad_summaries_df = broad_summaries_df.fillna('Other')

broad_summaries_df.to_csv(f"{file_path}/recent_english_broad_categories.csv", index=False)

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2025-01-06 23:35:51,598 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-01-06 23:35:54,329 - BERTopic - Dimensionality - Completed ✓
2025-01-06 23:35:54,330 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-01-06 23:35:54,336 - BERTopic - Cluster - Completed ✓
2025-01-06 23:35:54,337 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-01-06 23:35:54,348 - BERTopic - Representation - Completed ✓


In [34]:
topic_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,19,0_ai_art_generation_image,"[ai, art, generation, image, creation, prompt,...","[Image Generation Requests, AI Strategy and Im..."
1,1,19,1_recommendations_game_fanfiction_discussions,"[recommendations, game, fanfiction, discussion...","[STALKER Game Modding Discussions, Video Game ..."
2,2,23,2_writing_translation_in_content,"[writing, translation, in, content, words, let...","[Translation and Language Tasks, Translation a..."
3,3,14,3_analysis_discussion_of_and,"[analysis, discussion, of, and, assassination,...","[Legal and Constitutional Analysis, Wars and C..."
4,4,12,4_python_operations_development_data,"[python, operations, development, data, pyspar...","[JSON and Python Data Manipulation, Python pro..."


## Data Processing: combine broad and narrow topics

The clustering results were stored in JSON format to facilitate future visualizations.

In [39]:
# Merge categories
merged = broad_doc_info[['Topic']].merge(summaries_df, left_index=True, right_index=True)
merged = merged.merge(broad_summaries_df, left_on='Topic_x', right_on='Topic')
merged = merged[['Topic_x', 'Category_y', 'Topic_y', 'Category_x', 'Count_x', 'Percentage_x', 'Example Prompt']]
merged = merged.rename(columns={
    'Topic_x': 'broad_category_id', 
    'Category_y': 'broad_category', 
    'Topic_y': 'narrower_category_id',
    'Category_x': 'narrower_category',
    'Count_x': 'prompt_count',
    'Percentage_x': 'prompt_percentage',
    'Example Prompt': 'example_prompt'})

# Export results in JSON format
root = {
    "name": "categories",
    "children": []
}
for broad_category, group in merged.groupby(["broad_category_id", "broad_category"]):
    parent = {
        "id": int(broad_category[0]),
        "name": broad_category[1],
        "children": []
    }
    
    for _, row in group.iterrows():
        child = {
            "id": row["narrower_category_id"],
            "name": row["narrower_category"],
            "count": row["prompt_count"],
            "percent": row['prompt_percentage'],
        }

        parent["children"].append(child)
    
    root["children"].append(parent)

json_output = json.dumps(root, indent=4)

with open(f"{file_path}/recent_english_piechart.json", "w") as f:
    f.write(json_output)
