In [None]:
# Source data: https://docs.google.com/spreadsheets/d/1Q6KiM2kosM1q1_NK-FzFjJ2o0mLS2YHe0sUkqTj6xbU/edit?gid=506716267#gid=506716267

import pandas as pd
from io import StringIO

# Long-format leaderboard table: one row per (LLM, System) pair, 9 LLMs x 6
# systems = 54 rows, with columns LLM, System, Rank, Score.
# `Score` is on each system's own scale (Chatbot Arena values are in the
# 1100-1300 range, every other system is below 100); the analysis below uses
# Spearman (rank) correlation on this column.
# NOTE(review): the row `gpt-4-0613,LMC-EI (ours),9,44.2` is internally
# inconsistent -- a score of 44.2 is higher than the scores listed for ranks
# 6-8 (38.6, 34.4, 30). The correlations are computed from `Score`, not from
# `Rank`, so this row influences them; verify it against the source sheet.
data = """LLM,System,Rank,Score
qwen1.5-110B-Chat,LMC-EI (ours),1,65.6
gpt-4o-2024-05-13,LMC-EI (ours),2,59.2
claude-3-opus,LMC-EI (ours),3,50.1
qwen1.5-32B-Chat,LMC-EI (ours),4,50
llama-3-70b-chat,LMC-EI (ours),5,45.1
claude-3-haiku,LMC-EI (ours),6,38.6
mixtral-8x7b,LMC-EI (ours),7,34.4
llama-3-8b-chat,LMC-EI (ours),8,30
gpt-4-0613,LMC-EI (ours),9,44.2
qwen1.5-110B-Chat,Human Study,3,53
gpt-4o-2024-05-13,Human Study,2,54
claude-3-opus,Human Study,1,55.7
qwen1.5-32B-Chat,Human Study,4,50
llama-3-70b-chat,Human Study,5,49.7
claude-3-haiku,Human Study,6,49
mixtral-8x7b,Human Study,7,48.9
llama-3-8b-chat,Human Study,9,40
gpt-4-0613,Human Study,8,43.9
gpt-4o-2024-05-13,Chatbot Arena,1,1287
gpt-4-0613,Chatbot Arena,3,1246
mixtral-8x7b,Chatbot Arena,9,1114
llama-3-70b-chat,Chatbot Arena,4,1208
llama-3-8b-chat,Chatbot Arena,7,1153
claude-3-opus,Chatbot Arena,2,1248
claude-3-haiku,Chatbot Arena,5,1178
qwen1.5-110B-Chat,Chatbot Arena,6,1164
qwen1.5-32B-Chat,Chatbot Arena,8,1126
gpt-4o-2024-05-13,MMLU,1,88.7
gpt-4-0613,MMLU,3,86.4
mixtral-8x7b,MMLU,8,70.6
llama-3-70b-chat,MMLU,4,82.0
llama-3-8b-chat,MMLU,9,68.4
claude-3-opus,MMLU,2,86.8
claude-3-haiku,MMLU,6,75.2
qwen1.5-110B-Chat,MMLU,5,80.2
qwen1.5-32B-Chat,MMLU,7,74.3
gpt-4o-2024-05-13,EQ-Bench,3,83.51
gpt-4-0613,EQ-Bench,1,84.79
mixtral-8x7b,EQ-Bench,7,72.37
llama-3-70b-chat,EQ-Bench,5,82.13
llama-3-8b-chat,EQ-Bench,8,68.88
claude-3-opus,EQ-Bench,4,82.19
claude-3-haiku,EQ-Bench,9,63.65
qwen1.5-110B-Chat,EQ-Bench,2,83.68
qwen1.5-32B-Chat,EQ-Bench,6,75.59
gpt-4o-2024-05-13,Chatbot Arena mined EQ subset (n=100),1,79.56
gpt-4-0613,Chatbot Arena mined EQ subset (n=100),9,30.87
mixtral-8x7b,Chatbot Arena mined EQ subset (n=100),8,37.62
llama-3-70b-chat,Chatbot Arena mined EQ subset (n=100),2,77.21
llama-3-8b-chat,Chatbot Arena mined EQ subset (n=100),4,53.69
claude-3-opus,Chatbot Arena mined EQ subset (n=100),5,50.66
claude-3-haiku,Chatbot Arena mined EQ subset (n=100),7,40.94
qwen1.5-110B-Chat,Chatbot Arena mined EQ subset (n=100),3,69.35
qwen1.5-32B-Chat,Chatbot Arena mined EQ subset (n=100),6,50"""

# Parse the inline CSV text into a DataFrame (header row supplies the column names).
df = pd.read_csv(StringIO(data))

In [None]:
# Pairwise Spearman correlation between leaderboards.
# Start from a wide table: one row per LLM, one column per system, cells = Score.
pivot_df = df.pivot(
    index='LLM',
    columns='System',
    values='Score',
)

# Display order for both rows and columns; 'Human Study' is kept last.
system_order = [
    'EQ-Bench',
    'Chatbot Arena',
    'Chatbot Arena mined EQ subset (n=100)',
    'MMLU',
    'LMC-EI (ours)',
    'Human Study',
]

# Correlate the system columns, then arrange the square matrix in that order.
spearman_corr = pivot_df.corr(method='spearman').loc[system_order, system_order]
spearman_corr

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the pairwise Spearman correlations, with the 'Human Study' row
# (its annotations and its y tick label) emphasised in bold.
highlight_system = 'Human Study'

# Pre-format every coefficient to two decimals. The strings are handed to
# seaborn as-is (fmt=''), which lets individual cells carry their own markup.
annot = spearman_corr.apply(lambda column: column.map('{:.2f}'.format))

# Bold the highlighted row via mathtext. A raw f-string is required here:
# in a normal string literal '\m' is an invalid escape sequence.
if highlight_system in annot.index:
    annot.loc[highlight_system] = [
        rf'$\mathbf{{{text}}}$' for text in annot.loc[highlight_system]
    ]

# Plot the heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(15, 13))
sns.heatmap(
    spearman_corr,
    annot=annot,
    fmt='',
    cmap='coolwarm',
    linewidths=3,
    annot_kws={"size": 14},
    cbar_kws={'label': 'Spearman Correlation'},
    ax=ax,
)

# Drop the automatic axis titles and style the tick labels.
ax.set_xlabel("")
ax.set_ylabel("")
plt.setp(ax.get_xticklabels(), fontsize=14, rotation=90)
plt.setp(ax.get_yticklabels(), fontsize=14, rotation=0)

# Bold the highlighted y tick label through its font weight. Wrapping the
# label in $\mathbf{...}$ would drop the space, because mathtext ignores
# whitespace, and render it as "HumanStudy".
for tick_label in ax.get_yticklabels():
    if tick_label.get_text() == highlight_system:
        tick_label.set_fontweight('bold')

fig.tight_layout()
# Saved relative to the working directory so the notebook also runs on
# machines other than the original author's.
fig.savefig("leaderboard_comparison.png")
plt.show()

## Individual Judges

In [None]:
# Source data: https://docs.google.com/spreadsheets/d/1Q6KiM2kosM1q1_NK-FzFjJ2o0mLS2YHe0sUkqTj6xbU/edit?gid=506716267#gid=506716267

import pandas as pd
from io import StringIO

# Expanded long-format table with the same columns as the first one
# (LLM, System, Rank, Score): 9 LLMs x 26 systems = 234 rows.
# The five non-LMC systems are repeated (EQ-Bench scores are given to one
# decimal here, two in the first table) and "LMC-EI (ours)" is replaced by one
# "LMC-EI (<name>)" system per judge model / council configuration.
# NOTE(review): the row `gpt-4-0613,LMC-EI (council.no_agg),9,44.2` is
# internally inconsistent -- 44.2 is higher than the scores listed for ranks
# 6-8 of that system (38.59, 34.45, 30.04). Correlations are computed from
# `Score`, so this row influences them; verify it against the source sheet.
# NOTE: "LMC-EI (mixtral-8x7b)" lists two rank-2 rows (tied at 67.16), so its
# ranks end at 8 rather than 9.
data = """LLM,System,Rank,Score
qwen1.5-110B-Chat,Human Study,3,53
gpt-4o-2024-05-13,Human Study,2,54
claude-3-opus,Human Study,1,55.7
qwen1.5-32B-Chat,Human Study,4,50
llama-3-70b-chat,Human Study,5,49.7
claude-3-haiku,Human Study,6,49
mixtral-8x7b,Human Study,7,48.9
llama-3-8b-chat,Human Study,9,40
gpt-4-0613,Human Study,8,43.9
gpt-4o-2024-05-13,Chatbot Arena,1,1287
gpt-4-0613,Chatbot Arena,3,1246
mixtral-8x7b,Chatbot Arena,9,1114
llama-3-70b-chat,Chatbot Arena,4,1208
llama-3-8b-chat,Chatbot Arena,7,1153
claude-3-opus,Chatbot Arena,2,1248
claude-3-haiku,Chatbot Arena,5,1178
qwen1.5-110B-Chat,Chatbot Arena,6,1164
qwen1.5-32B-Chat,Chatbot Arena,8,1126
gpt-4o-2024-05-13,MMLU,1,88.7
gpt-4-0613,MMLU,3,86.4
mixtral-8x7b,MMLU,8,70.6
llama-3-70b-chat,MMLU,4,82.0
llama-3-8b-chat,MMLU,9,68.4
claude-3-opus,MMLU,2,86.8
claude-3-haiku,MMLU,6,75.2
qwen1.5-110B-Chat,MMLU,5,80.2
qwen1.5-32B-Chat,MMLU,7,74.3
gpt-4o-2024-05-13,EQ-Bench,3,83.5
gpt-4-0613,EQ-Bench,1,84.8
mixtral-8x7b,EQ-Bench,7,72.4
llama-3-70b-chat,EQ-Bench,5,82.1
llama-3-8b-chat,EQ-Bench,8,68.9
claude-3-opus,EQ-Bench,4,82.2
claude-3-haiku,EQ-Bench,9,63.7
qwen1.5-110B-Chat,EQ-Bench,2,83.7
qwen1.5-32B-Chat,EQ-Bench,6,75.6
gpt-4o-2024-05-13,Chatbot Arena mined EQ subset (n=100),1,79.56
gpt-4-0613,Chatbot Arena mined EQ subset (n=100),9,30.87
mixtral-8x7b,Chatbot Arena mined EQ subset (n=100),8,37.62
llama-3-70b-chat,Chatbot Arena mined EQ subset (n=100),2,77.21
llama-3-8b-chat,Chatbot Arena mined EQ subset (n=100),4,53.69
claude-3-opus,Chatbot Arena mined EQ subset (n=100),5,50.66
claude-3-haiku,Chatbot Arena mined EQ subset (n=100),7,40.94
qwen1.5-110B-Chat,Chatbot Arena mined EQ subset (n=100),3,69.35
qwen1.5-32B-Chat,Chatbot Arena mined EQ subset (n=100),6,50
qwen1.5-110B-Chat,LMC-EI (llama-3-70b-chat),1,70.53
gpt-4o-2024-05-13,LMC-EI (llama-3-70b-chat),2,65.38
llama-3-70b-chat,LMC-EI (llama-3-70b-chat),3,56.36
claude-3-opus,LMC-EI (llama-3-70b-chat),4,51.64
qwen1.5-32B-Chat,LMC-EI (llama-3-70b-chat),5,50
claude-3-haiku,LMC-EI (llama-3-70b-chat),6,38.66
llama-3-8b-chat,LMC-EI (llama-3-70b-chat),7,29.77
mixtral-8x7b,LMC-EI (llama-3-70b-chat),8,26.74
gpt-4-0613,LMC-EI (llama-3-70b-chat),9,16.9
qwen1.5-110B-Chat,LMC-EI (qwen1.5-110B-Chat),1,68.63
gpt-4o-2024-05-13,LMC-EI (qwen1.5-110B-Chat),2,55.5
qwen1.5-32B-Chat,LMC-EI (qwen1.5-110B-Chat),3,50
claude-3-opus,LMC-EI (qwen1.5-110B-Chat),4,45.5
llama-3-70b-chat,LMC-EI (qwen1.5-110B-Chat),5,35.78
mixtral-8x7b,LMC-EI (qwen1.5-110B-Chat),6,31.68
claude-3-haiku,LMC-EI (qwen1.5-110B-Chat),7,23.35
llama-3-8b-chat,LMC-EI (qwen1.5-110B-Chat),8,19.58
gpt-4-0613,LMC-EI (qwen1.5-110B-Chat),9,18.87
llama-3-70b-chat,LMC-EI (llama-3-8b-chat),1,62.65
qwen1.5-110B-Chat,LMC-EI (llama-3-8b-chat),2,58.53
gpt-4o-2024-05-13,LMC-EI (llama-3-8b-chat),3,52.24
llama-3-8b-chat,LMC-EI (llama-3-8b-chat),4,51.27
claude-3-opus,LMC-EI (llama-3-8b-chat),5,50.92
qwen1.5-32B-Chat,LMC-EI (llama-3-8b-chat),6,50
claude-3-haiku,LMC-EI (llama-3-8b-chat),7,48.82
mixtral-8x7b,LMC-EI (llama-3-8b-chat),8,43.03
gpt-4-0613,LMC-EI (llama-3-8b-chat),9,33.23
qwen1.5-110B-Chat,LMC-EI (gpt-4-0613),1,68
gpt-4o-2024-05-13,LMC-EI (gpt-4-0613),2,59.25
qwen1.5-32B-Chat,LMC-EI (gpt-4-0613),3,50
claude-3-opus,LMC-EI (gpt-4-0613),4,43
llama-3-70b-chat,LMC-EI (gpt-4-0613),5,38.12
claude-3-haiku,LMC-EI (gpt-4-0613),6,28.19
mixtral-8x7b,LMC-EI (gpt-4-0613),7,25.96
gpt-4-0613,LMC-EI (gpt-4-0613),8,20.79
llama-3-8b-chat,LMC-EI (gpt-4-0613),9,14.22
qwen1.5-110B-Chat,LMC-EI (claude-3-opus),1,77
gpt-4o-2024-05-13,LMC-EI (claude-3-opus),2,59.25
qwen1.5-32B-Chat,LMC-EI (claude-3-opus),3,50
claude-3-opus,LMC-EI (claude-3-opus),4,42.5
claude-3-haiku,LMC-EI (claude-3-opus),5,35
llama-3-70b-chat,LMC-EI (claude-3-opus),6,26.5
mixtral-8x7b,LMC-EI (claude-3-opus),7,23.75
gpt-4-0613,LMC-EI (claude-3-opus),8,18.14
llama-3-8b-chat,LMC-EI (claude-3-opus),9,9
qwen1.5-110B-Chat,LMC-EI (mixtral-8x7b),1,69.42
gpt-4o-2024-05-13,LMC-EI (mixtral-8x7b),2,67.16
llama-3-70b-chat,LMC-EI (mixtral-8x7b),2,67.16
claude-3-opus,LMC-EI (mixtral-8x7b),3,63.81
qwen1.5-32B-Chat,LMC-EI (mixtral-8x7b),4,50
llama-3-8b-chat,LMC-EI (mixtral-8x7b),5,44.86
claude-3-haiku,LMC-EI (mixtral-8x7b),6,43.69
mixtral-8x7b,LMC-EI (mixtral-8x7b),7,41.28
gpt-4-0613,LMC-EI (mixtral-8x7b),8,34.43
gpt-4o-2024-05-13,LMC-EI (gpt-4o-2024-05-13),1,67.42
qwen1.5-110B-Chat,LMC-EI (gpt-4o-2024-05-13),2,62.31
qwen1.5-32B-Chat,LMC-EI (gpt-4o-2024-05-13),3,50
llama-3-70b-chat,LMC-EI (gpt-4o-2024-05-13),4,43.02
claude-3-opus,LMC-EI (gpt-4o-2024-05-13),5,41.67
mixtral-8x7b,LMC-EI (gpt-4o-2024-05-13),6,29.55
claude-3-haiku,LMC-EI (gpt-4o-2024-05-13),7,26.15
gpt-4-0613,LMC-EI (gpt-4o-2024-05-13),8,23.36
llama-3-8b-chat,LMC-EI (gpt-4o-2024-05-13),9,19.66
qwen1.5-110B-Chat,LMC-EI (claude-3-haiku),1,61
claude-3-opus,LMC-EI (claude-3-haiku),2,55.5
llama-3-70b-chat,LMC-EI (claude-3-haiku),3,53
gpt-4o-2024-05-13,LMC-EI (claude-3-haiku),4,52.97
qwen1.5-32B-Chat,LMC-EI (claude-3-haiku),5,50
claude-3-haiku,LMC-EI (claude-3-haiku),6,46.08
llama-3-8b-chat,LMC-EI (claude-3-haiku),7,42.5
mixtral-8x7b,LMC-EI (claude-3-haiku),8,37.62
gpt-4-0613,LMC-EI (claude-3-haiku),9,33.65
qwen1.5-110B-Chat,LMC-EI (council.mean_pooling),1,65.75
gpt-4o-2024-05-13,LMC-EI (council.mean_pooling),2,58.5
claude-3-opus,LMC-EI (council.mean_pooling),3,51.25
qwen1.5-32B-Chat,LMC-EI (council.mean_pooling),4,50
llama-3-70b-chat,LMC-EI (council.mean_pooling),5,46.25
claude-3-haiku,LMC-EI (council.mean_pooling),6,36.75
mixtral-8x7b,LMC-EI (council.mean_pooling),7,32.75
llama-3-8b-chat,LMC-EI (council.mean_pooling),8,30
gpt-4-0613,LMC-EI (council.mean_pooling),9,23.53
qwen1.5-110B-Chat,LMC-EI (council.majority),1,78
gpt-4o-2024-05-13,LMC-EI (council.majority),2,68
qwen1.5-32B-Chat,LMC-EI (council.majority),3,50
claude-3-opus,LMC-EI (council.majority),4,47
llama-3-70b-chat,LMC-EI (council.majority),5,36.5
claude-3-haiku,LMC-EI (council.majority),6,27.5
mixtral-8x7b,LMC-EI (council.majority),7,21.5
gpt-4-0613,LMC-EI (council.majority),8,14.71
llama-3-8b-chat,LMC-EI (council.majority),9,13
qwen1.5-110B-Chat,LMC-EI (council.no_agg),1,65.62
gpt-4o-2024-05-13,LMC-EI (council.no_agg),2,59.19
claude-3-opus,LMC-EI (council.no_agg),3,50.09
qwen1.5-32B-Chat,LMC-EI (council.no_agg),4,50
llama-3-70b-chat,LMC-EI (council.no_agg),5,45.08
claude-3-haiku,LMC-EI (council.no_agg),6,38.59
mixtral-8x7b,LMC-EI (council.no_agg),7,34.45
llama-3-8b-chat,LMC-EI (council.no_agg),8,30.04
gpt-4-0613,LMC-EI (council.no_agg),9,44.2
qwen1.5-110B-Chat,LMC-EI (best_council.no_agg),1,68.1
gpt-4o-2024-05-13,LMC-EI (best_council.no_agg),2,65.45
qwen1.5-32B-Chat,LMC-EI (best_council.no_agg),3,50
claude-3-opus,LMC-EI (best_council.no_agg),4,42.91
llama-3-70b-chat,LMC-EI (best_council.no_agg),5,38.75
claude-3-haiku,LMC-EI (best_council.no_agg),6,31.98
mixtral-8x7b,LMC-EI (best_council.no_agg),7,29.5
gpt-4-0613,LMC-EI (best_council.no_agg),8,24.55
llama-3-8b-chat,LMC-EI (best_council.no_agg),9,20.02
qwen1.5-110B-Chat,LMC-EI (flagships_council.no_agg),1,69.49
gpt-4o-2024-05-13,LMC-EI (flagships_council.no_agg),2,62.78
qwen1.5-32B-Chat,LMC-EI (flagships_council.no_agg),3,50
claude-3-opus,LMC-EI (flagships_council.no_agg),4,46.32
llama-3-70b-chat,LMC-EI (flagships_council.no_agg),5,40.57
claude-3-haiku,LMC-EI (flagships_council.no_agg),6,33.1
mixtral-8x7b,LMC-EI (flagships_council.no_agg),7,28.76
gpt-4-0613,LMC-EI (flagships_council.no_agg),8,21.24
llama-3-8b-chat,LMC-EI (flagships_council.no_agg),9,21.11
qwen1.5-110B-Chat,LMC-EI (smallest_council.no_agg),1,59.52
claude-3-opus,LMC-EI (smallest_council.no_agg),2,53.46
gpt-4o-2024-05-13,LMC-EI (smallest_council.no_agg),3,53.39
llama-3-70b-chat,LMC-EI (smallest_council.no_agg),4,51.39
qwen1.5-32B-Chat,LMC-EI (smallest_council.no_agg),5,50
claude-3-haiku,LMC-EI (smallest_council.no_agg),6,44.75
llama-3-8b-chat,LMC-EI (smallest_council.no_agg),7,39.81
mixtral-8x7b,LMC-EI (smallest_council.no_agg),8,39.22
gpt-4-0613,LMC-EI (smallest_council.no_agg),9,31.48
qwen1.5-110B-Chat,LMC-EI (qwen1.5-72B-Chat),1,63.45
claude-3-opus,LMC-EI (qwen1.5-72B-Chat),2,58.55
gpt-4o-2024-05-13,LMC-EI (qwen1.5-72B-Chat),3,50.43
qwen1.5-32B-Chat,LMC-EI (qwen1.5-72B-Chat),4,50
llama-3-70b-chat,LMC-EI (qwen1.5-72B-Chat),5,46.43
claude-3-haiku,LMC-EI (qwen1.5-72B-Chat),6,43.3
llama-3-8b-chat,LMC-EI (qwen1.5-72B-Chat),7,40.83
mixtral-8x7b,LMC-EI (qwen1.5-72B-Chat),8,40.43
gpt-4-0613,LMC-EI (qwen1.5-72B-Chat),9,33.06
qwen1.5-110B-Chat,LMC-EI (command-r-plus),1,64.85
gpt-4o-2024-05-13,LMC-EI (command-r-plus),2,59.65
qwen1.5-32B-Chat,LMC-EI (command-r-plus),3,50
claude-3-opus,LMC-EI (command-r-plus),4,48.5
llama-3-70b-chat,LMC-EI (command-r-plus),5,43.56
claude-3-haiku,LMC-EI (command-r-plus),6,42.57
mixtral-8x7b,LMC-EI (command-r-plus),7,34.16
llama-3-8b-chat,LMC-EI (command-r-plus),8,25.74
gpt-4-0613,LMC-EI (command-r-plus),9,19.16
qwen1.5-110B-Chat,LMC-EI (gemini-1.5-pro),1,72.25
gpt-4o-2024-05-13,LMC-EI (gemini-1.5-pro),2,70.25
qwen1.5-32B-Chat,LMC-EI (gemini-1.5-pro),3,50
claude-3-opus,LMC-EI (gemini-1.5-pro),4,45.79
llama-3-70b-chat,LMC-EI (gemini-1.5-pro),5,44
claude-3-haiku,LMC-EI (gemini-1.5-pro),6,36.63
gpt-4-0613,LMC-EI (gemini-1.5-pro),7,30.24
mixtral-8x7b,LMC-EI (gemini-1.5-pro),8,29.95
llama-3-8b-chat,LMC-EI (gemini-1.5-pro),9,27.72
qwen1.5-110B-Chat,LMC-EI (qwen1.5-32B-Chat),1,56.14
claude-3-opus,LMC-EI (qwen1.5-32B-Chat),2,54.36
qwen1.5-32B-Chat,LMC-EI (qwen1.5-32B-Chat),3,50
mixtral-8x7b,LMC-EI (qwen1.5-32B-Chat),4,47.99
gpt-4o-2024-05-13,LMC-EI (qwen1.5-32B-Chat),5,47.54
claude-3-haiku,LMC-EI (qwen1.5-32B-Chat),6,47.35
llama-3-70b-chat,LMC-EI (qwen1.5-32B-Chat),7,42.46
llama-3-8b-chat,LMC-EI (qwen1.5-32B-Chat),8,37.61
gpt-4-0613,LMC-EI (qwen1.5-32B-Chat),9,33.79
qwen1.5-110B-Chat,LMC-EI (gemini-1.0-pro),1,60.05
gpt-4o-2024-05-13,LMC-EI (gemini-1.0-pro),2,51.44
qwen1.5-32B-Chat,LMC-EI (gemini-1.0-pro),3,50
claude-3-opus,LMC-EI (gemini-1.0-pro),4,49.3
claude-3-haiku,LMC-EI (gemini-1.0-pro),5,45.63
llama-3-70b-chat,LMC-EI (gemini-1.0-pro),6,42.89
mixtral-8x7b,LMC-EI (gemini-1.0-pro),7,40.2
gpt-4-0613,LMC-EI (gemini-1.0-pro),8,35.88
llama-3-8b-chat,LMC-EI (gemini-1.0-pro),9,34.22
qwen1.5-110B-Chat,LMC-EI (gpt-3.5-turbo-0125),1,75.36
gpt-4o-2024-05-13,LMC-EI (gpt-3.5-turbo-0125),2,68.08
claude-3-opus,LMC-EI (gpt-3.5-turbo-0125),3,50.76
qwen1.5-32B-Chat,LMC-EI (gpt-3.5-turbo-0125),4,50
llama-3-70b-chat,LMC-EI (gpt-3.5-turbo-0125),5,40.48
mixtral-8x7b,LMC-EI (gpt-3.5-turbo-0125),6,34.46
claude-3-haiku,LMC-EI (gpt-3.5-turbo-0125),7,33.46
gpt-4-0613,LMC-EI (gpt-3.5-turbo-0125),8,29.79
llama-3-8b-chat,LMC-EI (gpt-3.5-turbo-0125),9,28.08
qwen1.5-110B-Chat,LMC-EI (dbrx-instruct),1,61
gpt-4o-2024-05-13,LMC-EI (dbrx-instruct),2,57.5
claude-3-opus,LMC-EI (dbrx-instruct),3,50.25
qwen1.5-32B-Chat,LMC-EI (dbrx-instruct),4,50
claude-3-haiku,LMC-EI (dbrx-instruct),5,45.39
llama-3-70b-chat,LMC-EI (dbrx-instruct),6,40.93
mixtral-8x7b,LMC-EI (dbrx-instruct),7,34.52
gpt-4-0613,LMC-EI (dbrx-instruct),8,31.6
llama-3-8b-chat,LMC-EI (dbrx-instruct),9,28.19"""

# Parse the inline CSV text; this rebinds `df` from the first table to the expanded one.
df = pd.read_csv(StringIO(data))


# Wide LLM x System table of scores, followed by the pairwise Spearman
# correlation between the system columns.
pivot_df = df.pivot(
    index='LLM',
    columns='System',
    values='Score',
)
spearman_corr = pivot_df.corr(method='spearman')

In [None]:
# Display the wide score table (rows: LLM, columns: System) for inspection.
pivot_df

In [None]:
# Spearman correlation of every system with 'Human Study', highest first.
# The 'Human Study' entry itself (its self-correlation) is part of this Series.
spearman_corr["Human Study"].sort_values(ascending=False)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.patches as mpatches

# Series to plot: each system's Spearman correlation with 'Human Study',
# sorted from highest to lowest.
# NOTE: this rebinds `data`, which holds the inline CSV text in the loading cells.
data = spearman_corr["Human Study"].sort_values(ascending=False)

# Define a color mapping function
def get_color(key):
    """Pick the bar colour for a system label by substring matching.

    The rules are checked in the order listed and the first substring found
    in ``key`` decides the colour, so a label containing "council" is
    'lightblue' even though it may also contain "LMC". Labels that match no
    rule fall back to 'gray'.
    """
    ordered_rules = (
        ("council", 'lightblue'),
        ("LMC", "tab:blue"),
        ("mined", "tab:purple"),
        ("EQ-Bench", 'tab:green'),
        ("MMLU", 'red'),
        ("Chatbot Arena", 'tab:purple'),
    )
    for fragment, color in ordered_rules:
        if fragment in key:
            return color
    # default color for other bars
    return 'gray'

# Horizontal bar chart of each system's Spearman correlation with the human
# study, coloured by system family.

# One colour per bar, in the same order as the entries of `data`.
colors = [get_color(key) for key in data.index]

# Explicit figure/axes; the figure is wide so long system names are not cut off.
fig, ax = plt.subplots(figsize=(12, 8))

# `hue` repeats the y variable so that `palette` assigns one colour per bar:
# passing `palette` without `hue` is deprecated in recent seaborn releases.
# `dodge=False` keeps each bar centred on its own row. Any legend seaborn adds
# for `hue` is replaced by the custom legend created further down.
sns.barplot(
    x=data.values,
    y=data.index,
    hue=data.index,
    palette=colors,
    dodge=False,
    ax=ax,
)

# Write the coefficient (three decimals) at the end of each bar.
for position, corr in enumerate(data.values):
    ax.text(corr, position, f'{corr:.3f}', color='black', ha='left', va='center')

# Set labels
ax.set_xlabel('')
ax.set_ylabel('System')
ax.set_title('Spearman correlation with EI Human Study')

# 10% head-room on the right for the value labels. The left limit is 0 unless
# some correlation is negative, in which case it extends to show that bar.
ax.set_xlim(min(0, data.min()), data.max() * 1.1)

# Custom legend: one entry per colour family returned by get_color. The
# "mined" Chatbot Arena subset shares 'tab:purple' with 'Chatbot Arena', and
# the 'gray' fallback colour has no entry.
legend_labels = {
    'LMC-EI (council), in-domain': 'lightblue',
    'LMC-EI (single judge), in-domain': 'tab:blue',
    'EQ-Bench': 'tab:green',
    'MMLU': 'red',
    'Chatbot Arena': 'tab:purple',
}
legend_patches = [
    mpatches.Patch(color=color, label=label)
    for label, color in legend_labels.items()
]

# Place the legend outside the axes, to the right of the plot.
ax.legend(handles=legend_patches, bbox_to_anchor=(1.05, 1), loc='upper left')

# Use tight layout to ensure everything fits
fig.tight_layout()

# Saved relative to the working directory so the notebook also runs on
# machines other than the original author's.
fig.savefig("leaderboard_comparison_expanded.pdf")
plt.show()
