In [2]:
import os

import numpy as np
import pandas as pd
import plotly.express as px

In [4]:
# Directory holding the clustering result CSVs.  The default is the original
# hard-coded location; set TOPIC_CLUSTERING_DATA_DIR to point the notebook at
# a different checkout without editing this cell.
file_path = os.environ.get(
    "TOPIC_CLUSTERING_DATA_DIR",
    "/home/ygtang/arena-leaderboard-v2/topic_clustering/data/embedding_ablation/mpnet",
)
df = pd.read_csv(f"{file_path}/recent_english_broad_percentage.csv")

In [5]:
# Peek at the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,broad_category_id,broad_category,count,percentage
0,0,Tech Programming,3508,35.08
1,1,Math Puzzles,1384,13.84
2,5,Creative Arts,1196,11.96
3,-1,Miscellaneous,696,6.96
4,3,Multidisciplinary Studies,693,6.93


In [6]:
# Order the rows by their `percentage` column, largest first.
broad = df.sort_values(by='percentage', ascending=False)
broad

Unnamed: 0,broad_category_id,broad_category,count,percentage
0,0,Tech Programming,3508,35.08
1,1,Math Puzzles,1384,13.84
2,5,Creative Arts,1196,11.96
3,-1,Miscellaneous,696,6.96
4,3,Multidisciplinary Studies,693,6.93
5,2,Communication & Healthcare,676,6.76
6,6,Professional Development,630,6.3
7,4,Language Education,555,5.55
8,8,Philosophy & Education,337,3.37
9,7,Game Strategies,325,3.25


In [7]:
# One-decimal copy of `percentage`; the chart below uses it for both the bar
# length and the bar label.
broad['Percentage (%)'] = broad['percentage'].round(decimals=1)
# Smallest first (sort_values is ascending by default).
broad = broad.sort_values('Percentage (%)')
broad

Unnamed: 0,broad_category_id,broad_category,count,percentage,Percentage (%)
9,7,Game Strategies,325,3.25,3.2
8,8,Philosophy & Education,337,3.37,3.4
7,4,Language Education,555,5.55,5.6
6,6,Professional Development,630,6.3,6.3
5,2,Communication & Healthcare,676,6.76,6.8
4,3,Multidisciplinary Studies,693,6.93,6.9
3,-1,Miscellaneous,696,6.96,7.0
2,5,Creative Arts,1196,11.96,12.0
1,1,Math Puzzles,1384,13.84,13.8
0,0,Tech Programming,3508,35.08,35.1


In [10]:
def wrap_text(label, n):
    """Re-flow ``label`` so that each line holds at most ``n`` words.

    The label is split on whitespace, words on the same line are joined with
    a single space, and lines are joined with ``<br>``.
    """
    tokens = label.split()
    lines = []
    for start in range(0, len(tokens), n):
        lines.append(' '.join(tokens[start:start + n]))
    return '<br>'.join(lines)

# Break long category names into lines of at most three words for the y axis.
wrapped_labels = [wrap_text(label, 3) for label in broad['broad_category']]

# The title text is given here, once.  Previously a different title
# ('Top 10 Prompt Categories') was passed and then overwritten in
# update_layout, so it never appeared on the chart.
fig = px.bar(
    broad,
    x='Percentage (%)',
    y='broad_category',
    title='Broad Categories',
    labels={'broad_category': 'Category'},
    text='Percentage (%)',
    orientation='h'
)

# Draw each bar's value just past the end of the bar.
fig.update_traces(textposition='outside')
fig.update_layout(
    xaxis_title='Percentage (%)',
    yaxis_title='Category',
    # Only the title position is set here; the text comes from px.bar above.
    title={
        'x': 0.5,
        'y': 0.9,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    # Tick positions 0..n-1 are paired with the wrapped labels in row order.
    # NOTE(review): this assumes every row of `broad` is its own category on
    # the axis (no repeated names) -- confirm if the input ever changes.
    yaxis=dict(
        tickmode='array',
        tickvals=list(range(len(broad))),
        ticktext=wrapped_labels
    ),
    xaxis=dict(
        tickformat=".2f",
        range=[0, 45]
    ),
    height=600,
    width=600
)

fig.show()

In [11]:
# Per-subcategory summary table, read from the same directory as the
# broad-category CSV.
# NOTE(review): the name `all` shadows Python's built-in `all()` for the rest
# of the notebook.  `plot_subcat` reads this global by that name, so a rename
# has to be made in both places at once.
all = pd.read_csv(f"{file_path}/recent_english_category_summary.csv")

In [30]:
def _wrap_words(label, n):
    """Break ``label`` into ``<br>``-separated lines of at most ``n`` words."""
    words = label.split()
    return '<br>'.join(' '.join(words[i:i + n]) for i in range(0, len(words), n))


def plot_subcat(cat, summary=None, x_max=3.5):
    """Build a horizontal bar chart of the subcategories of one broad category.

    Parameters
    ----------
    cat : str
        Value of the ``broad_category`` column whose rows are plotted.
    summary : pandas.DataFrame, optional
        Frame with ``broad_category``, ``narrower_category`` and
        ``prompt_percentage`` columns.  When omitted, the notebook-level
        ``all`` frame is used, which is what this function always read.
    x_max : float, optional
        Upper limit of the x axis, in percent.  The axis is widened only when
        the largest bar would not fit, so bars are never cut off.

    Returns
    -------
    The Plotly figure; it is not shown here, the caller calls ``.show()``.

    Notes
    -----
    ``prompt_percentage`` is multiplied by 100 before plotting, so it is
    presumably stored as a fraction -- confirm against the CSV.
    """
    if summary is None:
        # Notebook global holding the category summary (it shadows the builtin).
        summary = all

    rows = summary.loc[
        summary['broad_category'] == cat,
        ['narrower_category', 'prompt_percentage']
    ]
    # .assign builds a new frame, so the filtered slice of `summary` is never
    # written to (column assignment on it can trigger SettingWithCopyWarning).
    sub = (
        rows
        .assign(**{'Percentage (%)': (rows['prompt_percentage'] * 100).round(2)})
        .sort_values(by='Percentage (%)', ascending=True)
    )

    wrapped_labels = [_wrap_words(label, 6) for label in sub['narrower_category']]

    # Keep the requested axis limit unless the largest bar would be clipped;
    # then leave ~15% headroom for the value label drawn outside the bar.
    # For an empty selection max() is NaN, the comparison is False, and the
    # requested limit is kept.
    largest = sub['Percentage (%)'].max()
    x_upper = x_max
    if largest > x_max:
        x_upper = float(largest) * 1.15

    # The title text is given here, once.  Previously an unrelated title
    # ('Top 10 Prompt Categories') was passed and then overwritten below.
    fig = px.bar(
        sub,
        x='Percentage (%)',
        y='narrower_category',
        title=f'Subcategories of "{cat}"',
        labels={'narrower_category': 'Category'},
        text='Percentage (%)',
        orientation='h'
    )

    # Draw each bar's value just past the end of the bar.
    fig.update_traces(textposition='outside')
    fig.update_layout(
        xaxis_title='Percentage (%)',
        yaxis_title='Category',
        # Only the title position is set here; the text comes from px.bar.
        title={
            'x': 0.5,
            'y': 0.9,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        # Tick positions 0..n-1 are paired with the wrapped labels in row
        # order.  NOTE(review): assumes subcategory names are not repeated
        # within one broad category -- confirm.
        yaxis=dict(
            tickmode='array',
            tickvals=list(range(len(sub))),
            ticktext=wrapped_labels
        ),
        xaxis=dict(
            tickformat=".2f",
            range=[0, x_upper]
        ),
        height=800,
        width=600
    )

    return fig

In [33]:
# Build and display the subcategory chart for one broad category.
fig_prog = plot_subcat(cat='Multidisciplinary Studies')
fig_prog.show()