# Cross-Setting Analysis

**Section 3.3, Appendix F:** cross-arena deployment analysis, cross-benchmark analysis.

In [None]:
import re
import pandas as pd
import numpy as np
from scipy.stats import binomtest
import plotly.express as px
import plotly.graph_objects as go
from datasets import load_dataset

from sklearn.linear_model import LinearRegression
from scipy.stats import rankdata, kendalltau, spearmanr, pearsonr
from itertools import combinations

import warnings
warnings.filterwarnings("ignore")

### Utils

In [2]:
def run_binom_test(num_wins, num_losses, num_ties=0):
    """Run a one-sided exact binomial test on win/loss counts against p = 0.5.

    Ties, when supplied, are split evenly: ``num_ties // 2`` is added to both
    the success count and the failure count (an odd leftover tie is dropped).

    Args:
        num_wins: Number of successes.
        num_losses: Number of failures.
        num_ties: Number of ties to split between the two sides (default 0).

    Returns:
        The result object returned by ``scipy.stats.binomtest``. The
        alternative is ``'greater'`` when ``num_wins > num_losses`` and
        ``'less'`` otherwise (including when the two counts are equal).
    """
    half_ties = num_ties // 2
    # Work on the counts directly; there is no need to materialise a 0/1 list
    # with one element per vote just to take its sum and length.
    n_success = num_wins + half_ties
    n_total = n_success + num_losses + half_ties
    alternative = 'greater' if num_wins > num_losses else 'less'
    # int() so NumPy integer counts (e.g. from Series.sum()) are passed as plain ints.
    return binomtest(int(n_success), int(n_total), p=0.5, alternative=alternative)

def run_binom_test_for_intent(data, count_ties=False):
    """Print a one-sided binomial test of search preference for every intent.

    For each distinct value of ``data['intent']`` (in sorted order) the rows
    labelled ``'search_better'`` and ``'search_worse'`` in
    ``data['group_name']`` are counted, and the majority side is tested
    against p = 0.5 with ``alternative='greater'``.

    Args:
        data: DataFrame with ``'intent'`` and ``'group_name'`` columns.
        count_ties: When True, half of the ``'tie'`` rows (integer division)
            is added to each side before testing.

    Returns:
        None. Results are printed.
    """
    for intent in data['intent'].value_counts().sort_index().index:
        data_intent = data[data['intent'] == intent]
        print(f"Intent: {intent}")
        num_search_preferred = (data_intent['group_name'] == 'search_better').sum()
        num_search_not_preferred = (data_intent['group_name'] == 'search_worse').sum()
        if count_ties:
            half_ties = (data_intent['group_name'] == 'tie').sum() // 2
            num_search_preferred += half_ties
            num_search_not_preferred += half_ties

        num_decisive = num_search_preferred + num_search_not_preferred
        if num_decisive == 0:
            # binomtest rejects n == 0, so an intent made up only of ties
            # (or of a single tie when count_ties=True) is reported and skipped
            # instead of aborting the whole loop.
            print("No decisive votes; skipping binomial test")
            print("\n")
            continue

        if num_search_preferred > num_search_not_preferred:
            print(f"Search better: {num_search_preferred} vs {num_search_not_preferred}")
            num_majority = num_search_preferred
        else:
            print(f"Search worse: {num_search_not_preferred} vs {num_search_preferred}")
            num_majority = num_search_not_preferred

        # Always test the larger side, so the alternative is 'greater' in both cases.
        result = binomtest(int(num_majority), int(num_decisive), p=0.5, alternative='greater')
        print(result)
        print("\n")

def plot_intent_dist_preference(data, title, intent_order):
    """Build a grouped bar chart of vote outcomes per intent.

    Rows of ``data`` are counted per (``group_name``, ``intent``) pair and each
    count is also expressed as a share of its intent's total. Rows whose
    intent is ``'Other'`` are dropped before plotting. Bar height is the raw
    count; the text above each bar is rendered from ``customdata[0]``
    (presumably the proportion, as the first ``hover_data`` entry - confirm).

    Args:
        data: DataFrame with ``'group_name'`` and ``'intent'`` columns.
            ``group_name`` values other than ``'tie'``, ``'search_better'``
            and ``'search_worse'`` are mapped to NaN by the label mapping.
        title: Figure title.
        intent_order: Order of the intents along the x axis.

    Returns:
        The plotly figure.
    """
    # --- STEP 1: Aggregate proportions ---
    count_df = data.groupby(['group_name', 'intent']).size().reset_index(name='count')
    total_per_intent = count_df.groupby('intent')['count'].transform('sum')
    count_df['proportion'] = count_df['count'] / total_per_intent
    # Explicit copy: the column assignment in STEP 2 must write to an
    # independent frame rather than to a boolean-filtered slice.
    count_df = count_df[count_df['intent'] != 'Other'].copy()

    # --- STEP 2: Format group names ---
    count_df['group_name'] = count_df['group_name'].map({
        'tie': 'Tie',
        'search_better': 'Search Preferred',
        'search_worse': 'Non-search Preferred'
    })

    # --- STEP 3: Plot ---
    fig = px.bar(
        count_df,
        x='intent',
        y='count',
        color='group_name',
        category_orders={'intent': intent_order},
        color_discrete_map={
            'Tie': 'gray',
            'Search Preferred': '#1f77b4',
            'Non-search Preferred': '#d62728'
        },
        hover_data={'proportion': ':.2%', 'count': True, 'intent': False},
        labels={'count': 'Count', 'group_name': '', 'intent': 'Intent'},
        barmode='group',
        opacity=0.8,
    )

    # --- STEP 4: Text and style ---
    fig.update_traces(
        texttemplate='%{customdata[0]:.0%}',
        textposition='outside',
        textfont=dict(size=26),
        marker_line_color='black',
        marker_line_width=1.2
    )

    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        margin=dict(l=10, r=10, t=40, b=10),
        title=dict(
            text=title,
            font=dict(size=22, color='black'),
            x=0.06
        ),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1,
            xanchor="right",
            x=1,
            font=dict(size=16)
        ),
        xaxis=dict(
            title='',
            tickangle=30,
            tickfont=dict(size=16, color='black'),
            linecolor='black',
        ),
        yaxis=dict(
            title=dict(text='Count', font=dict(size=18, color='black')),
            tickfont=dict(size=14, color='black'),
            linecolor='black',
            gridcolor='lightgray',
            zeroline=False
        ),
        width=1000,
        height=500
    )
    return fig
    
def remove_inline_citations(text):
    """Strip bracketed numeric citation markers such as ``[1]`` or ``[2, 3]``.

    Whitespace *before* a marker is removed together with it, while whitespace
    after it is kept, so neighbouring words and lines are not glued together
    (``"foo [1] bar"`` becomes ``"foo bar"``). A marker must start with a digit
    after the opening bracket, so digit-free brackets such as ``[ ]`` are left
    alone. Note that any bracketed number is treated as a citation, including
    things like ``arr[0]`` inside code.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with the citation markers removed.
    """
    return re.sub(r'\s*\[\s*\d[\d\s,]*\]', '', text)
    
def preprocess_for_describing_diff(data, intent, output_file):
    """Export search vs. non-search answers for one intent as a CSV.

    Keeps the non-tie rows of ``data`` with the given intent, de-duplicates
    them on the first message of side A (the question), and emits two output
    rows per battle: side A's answer followed by side B's answer, each tagged
    ``'search'`` or ``'non-search'``. Inline citation markers are removed from
    the answers. The result is written to ``output_file`` and its first three
    rows are shown with ``display`` (not imported in this file; presumably the
    IPython notebook built-in).

    Args:
        data: DataFrame with ``'intent'``, ``'group_name'``, ``'model_a'`` and
            either ``conversation_a``/``conversation_b`` or
            ``messages_a``/``messages_b`` columns. Each conversation is indexed
            as ``[0]['content']`` (question) and ``[1]['content']`` (answer).
        intent: Intent value to select.
        output_file: Destination CSV path.

    Returns:
        None.
    """
    chat_col_name = 'conversation_' if 'conversation_a' in data.columns else 'messages_'
    col_a, col_b = chat_col_name + 'a', chat_col_name + 'b'

    def is_search_model(model_name):
        # '...-wo-search' contains the keyword 'search' but is the model this
        # notebook treats as the non-search side (its wins are labelled
        # 'search_worse'), so it must be excluded before the keyword match.
        if 'wo-search' in model_name:
            return False
        return any(keyword in model_name for keyword in ('grounding', 'search', 'sonar'))

    keep = (data['intent'] == intent) & (data['group_name'] != 'tie')
    # Copy so the new column is added to an independent frame, not to a slice of `data`.
    data_intent = data[keep].copy()
    data_intent['question'] = data_intent[col_a].apply(lambda x: x[0]['content'])
    data_intent = data_intent.drop_duplicates(subset=['question'])

    processed = []
    for _, row in data_intent.iterrows():
        if is_search_model(row['model_a']):
            label_a, label_b = 'search', 'non-search'
        else:
            label_a, label_b = 'non-search', 'search'
        # Side A is always emitted before side B.
        for col, label in ((col_a, label_a), (col_b, label_b)):
            processed.append({
                'question': row['question'],
                'answer': remove_inline_citations(row[col][1]['content']),
                'group_name': label,
            })
    processed = pd.DataFrame(processed)
    print(f"Number of rows: {len(processed)}")
    processed.to_csv(output_file)
    display(processed.head(3))


## Cross-Arena Analysis

### Text Arena Analysis

In [None]:
# Load the text-arena battles (JSON Lines: one record per line).
text_arena_battles_path = '../data/text_arena_battles.jsonl'
text_arena_battles = pd.read_json(text_arena_battles_path, orient='records', lines=True)
print(f'Number of rows: {text_arena_battles.shape[0]}')
text_arena_battles.head()

In [None]:
# Label each battle from the point of view of the grounding model:
# 'search_better' when it won, 'search_worse' when its opponent won,
# and 'tie' for everything else.
grounding_model = 'gemini-2.5-pro-exp-03-25-grounding-none'
grounding_is_a = text_arena_battles['model_a'] == grounding_model
grounding_is_b = text_arena_battles['model_b'] == grounding_model
a_won = text_arena_battles['winner'] == 'model_a'
b_won = text_arena_battles['winner'] == 'model_b'

text_arena_battles['group_name'] = 'tie'
text_arena_battles.loc[(a_won & grounding_is_a) | (b_won & grounding_is_b), 'group_name'] = 'search_better'
# Assigned last, so it takes precedence if both masks ever match the same row.
text_arena_battles.loc[(b_won & grounding_is_a) | (a_won & grounding_is_b), 'group_name'] = 'search_worse'

text_arena_battles['group_name'].value_counts()

In [None]:
# Overall vote counts, then binomial tests overall and per intent.
text_groups = text_arena_battles['group_name']
num_wins = text_groups.eq('search_better').sum()
num_losses = text_groups.eq('search_worse').sum()
num_ties = text_groups.eq('tie').sum()
for line in (
    f"search preferred: {num_wins}",
    f"search not preferred: {num_losses}",
    f"ties: {num_ties}",
):
    print(line)
# Ties are reported above but not passed to the overall test.
print(run_binom_test(num_wins, num_losses))
print("Running binom tests broken down by intent")
run_binom_test_for_intent(text_arena_battles)

In [None]:
# Per-intent vote distribution for the text-arena setting, shown and saved as PDF.
INTENT_ORDER = [
    'Factual Lookup',
    'Info Synthesis',
    'Analysis',
    'Recommendation',
    'Explanation',
    'Creative Generation',
    'Guidance',
    'Text Processing',
]
fig = plot_intent_dist_preference(
    text_arena_battles,
    title='Text Arena Settings',
    intent_order=INTENT_ORDER,
)
fig.show()
fig.write_image("plots/lang_vote_dist.pdf")

In [None]:
# Export search vs. non-search answers for selected intents (text-arena setting).
text_arena_diff_outputs = [
    ('Text Processing', '../data/describing_diffs/lang_text_processing_qa.csv'),
    ('Factual Lookup', '../data/describing_diffs/lang_factual_qa.csv'),
    ('Info Synthesis', '../data/describing_diffs/lang_synthesis_qa.csv'),
]
for diff_intent, diff_path in text_arena_diff_outputs:
    preprocess_for_describing_diff(text_arena_battles, intent=diff_intent, output_file=diff_path)

### Search Arena Analysis

In [None]:
# Load the search-arena battles and keep only voted battles in which one side
# is the 'wo-search' Gemini model and the opponent's name does not contain 'gpt'.
search_battles = pd.read_json('../data/search_arena_24k.jsonl', orient='records', lines=True)
search_battles = search_battles[search_battles['winner'].notna()]

wo_search_model = 'gemini-2.5-pro-exp-03-25-wo-search'
a_is_wo_search = search_battles['model_a'] == wo_search_model
b_is_wo_search = search_battles['model_b'] == wo_search_model
a_is_gpt = search_battles['model_a'].str.contains('gpt')
b_is_gpt = search_battles['model_b'].str.contains('gpt')
search_battles = search_battles[(a_is_wo_search & ~b_is_gpt) | (b_is_wo_search & ~a_is_gpt)]

# Use the same column name as the text-arena data so the shared helpers apply.
search_battles = search_battles.rename(columns={'primary_intent': 'intent'})
print(f'Number of rows: {search_battles.shape[0]}')
search_battles.head()

In [None]:
# Label each battle relative to the 'wo-search' model: a win for it counts as
# 'search_worse', a win for its opponent as 'search_better', anything else 'tie'.
wo_search_name = 'gemini-2.5-pro-exp-03-25-wo-search'
wo_search_is_a = search_battles['model_a'] == wo_search_name
wo_search_is_b = search_battles['model_b'] == wo_search_name
side_a_won = search_battles['winner'] == 'model_a'
side_b_won = search_battles['winner'] == 'model_b'

search_battles['group_name'] = 'tie'
search_battles.loc[(side_a_won & wo_search_is_a) | (side_b_won & wo_search_is_b), 'group_name'] = 'search_worse'
# Assigned last, so it takes precedence if both masks ever match the same row.
search_battles.loc[(side_b_won & wo_search_is_a) | (side_a_won & wo_search_is_b), 'group_name'] = 'search_better'

search_battles['group_name'].value_counts()

In [None]:
# Overall vote counts, then binomial tests overall and per intent.
search_groups = search_battles['group_name']
num_wins = search_groups.eq('search_better').sum()
num_losses = search_groups.eq('search_worse').sum()
for line in (
    f"search preferred: {num_wins}",
    f"search not preferred: {num_losses}",
):
    print(line)
print(run_binom_test(num_wins, num_losses))
print("Running binom tests broken down by intent")
run_binom_test_for_intent(search_battles)

In [None]:
# Per-intent vote distribution for the search-arena setting, shown and saved as PDF.
INTENT_ORDER = [
    'Factual Lookup',
    'Info Synthesis',
    'Analysis',
    'Recommendation',
    'Explanation',
    'Creative Generation',
    'Guidance',
    'Text Processing',
]
fig = plot_intent_dist_preference(
    search_battles,
    title='Search Arena Settings',
    intent_order=INTENT_ORDER,
)
fig.show()
fig.write_image("plots/search_vote_dist.pdf")

In [None]:
# Export search vs. non-search answers for selected intents (search-arena setting).
search_arena_diff_outputs = [
    ('Factual Lookup', '../data/describing_diffs/search_factual_qa.csv'),
    ('Info Synthesis', '../data/describing_diffs/search_synthesis_qa.csv'),
]
for diff_intent, diff_path in search_arena_diff_outputs:
    preprocess_for_describing_diff(search_battles, intent=diff_intent, output_file=diff_path)

### Prompt differences across datasets

#### Loading datasets

In [None]:
# Load the full search-arena dataset (JSON Lines: one record per line).
search_arena_path = '../data/search_arena_24k.jsonl'
search_arena = pd.read_json(search_arena_path, orient='records', lines=True)
print(f"Number of rows: {search_arena.shape[0]}")
search_arena.head()

In [None]:
# Fetch the train split of the text-arena preference dataset as a DataFrame.
text_arena = load_dataset(
    "lmarena-ai/arena-human-preference-100k",
    split="train",
).to_pandas()
print(f"Number of rows: {text_arena.shape[0]}")
text_arena.head()

In [None]:
# Download the SimpleQA test set.
simpleqa_url = "https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv"
simpleqa = pd.read_csv(simpleqa_url)
print(f"Number of rows: {simpleqa.shape[0]}")
simpleqa.head()

In [None]:
import base64
import hashlib

def derive_key(password: str, length: int) -> bytes:
    """Return ``length`` bytes built by repeating SHA-256(password).

    The 32-byte digest of the encoded password is repeated as many whole
    times as fit, then a prefix of it fills the remainder.
    """
    digest = hashlib.sha256(password.encode()).digest()
    full_repeats, remainder = divmod(length, len(digest))
    return digest * full_repeats + digest[:remainder]

def decrypt(ciphertext_b64: str, password: str) -> str:
    """Base64-decode the ciphertext, XOR it with the password-derived key, and decode to text."""
    cipher_bytes = base64.b64decode(ciphertext_b64)
    # The key stream is as long as the ciphertext, so zip pairs every byte.
    key_stream = derive_key(password, len(cipher_bytes))
    plain_bytes = bytes([cipher_byte ^ key_byte for cipher_byte, key_byte in zip(cipher_bytes, key_stream)])
    return plain_bytes.decode()

# Download BrowseComp and decrypt its 'problem' and 'answer' columns in place,
# using each row's 'canary' value as the password.
browsecomp_url = "https://openaipublic.blob.core.windows.net/simple-evals/browse_comp_test_set.csv"
browsecomp = pd.read_csv(browsecomp_url)
for encrypted_field in ("problem", "answer"):
    browsecomp[encrypted_field] = browsecomp.apply(
        lambda row, field=encrypted_field: decrypt(row.get(field, ""), row.get("canary", "")),
        axis=1,
    )
print(f"Number of rows: {browsecomp.shape[0]}")
browsecomp.head()

#### Creating comparison datasets for describing diffs

In [17]:
# search vs lm arena
# English search-arena prompts (first listed language is 'English') against an
# equally sized random sample of English text-arena prompts.
first_language_is_english = search_arena['languages'].apply(
    lambda langs: langs[0] == 'English' if len(langs) > 0 else False
)
search_arena_english = search_arena[first_language_is_english]
text_arena_english = text_arena[text_arena['language'] == 'English'].sample(n=len(search_arena_english))

# The prompt is the content of the first message of side A.
search_arena_prompts = [messages[0]['content'] for messages in search_arena_english['messages_a']]
text_arena_prompts = [conversation[0]['content'] for conversation in text_arena_english['conversation_a']]

search_arena_vs_text_arena = (
    [{"text": prompt, "group_name": "search_arena"} for prompt in search_arena_prompts]
    + [{"text": prompt, "group_name": "language_arena"} for prompt in text_arena_prompts]
)
search_arena_vs_language_arena = pd.DataFrame(search_arena_vs_text_arena)
search_arena_vs_language_arena.to_csv('../data/describing_diffs/search_arena_vs_text_arena.csv')

In [18]:
# search vs simpleqa
# All SimpleQA problems against an equally sized random sample of English
# search-arena prompts (first listed language is 'English').
simpleqa_prompts = list(simpleqa['problem'])
first_language_is_english = search_arena['languages'].apply(
    lambda langs: langs[0] == 'English' if len(langs) > 0 else False
)
search_arena_english = search_arena[first_language_is_english].sample(len(simpleqa_prompts))

# The prompt is the content of the first message of side A.
search_arena_prompts = [messages[0]['content'] for messages in search_arena_english['messages_a']]

search_arena_vs_simpleqa = (
    [{"text": prompt, "group_name": "search_arena"} for prompt in search_arena_prompts]
    + [{"text": prompt, "group_name": "simpleqa"} for prompt in simpleqa_prompts]
)
search_arena_vs_simpleqa = pd.DataFrame(search_arena_vs_simpleqa)
search_arena_vs_simpleqa.to_csv('../data/describing_diffs/search_arena_vs_simpleqa.csv')

In [19]:
# search vs browsecomp
# All BrowseComp problems against an equally sized random sample of English
# search-arena prompts (first listed language is 'English').
browsecomp_prompts = list(browsecomp['problem'])
first_language_is_english = search_arena['languages'].apply(
    lambda langs: langs[0] == 'English' if len(langs) > 0 else False
)
search_arena_english = search_arena[first_language_is_english].sample(n=len(browsecomp))

# The prompt is the content of the first message of side A.
search_arena_prompts = [messages[0]['content'] for messages in search_arena_english['messages_a']]

search_arena_vs_browsecomp = (
    [{"text": prompt, "group_name": "search_arena"} for prompt in search_arena_prompts]
    + [{"text": prompt, "group_name": "browsecomp"} for prompt in browsecomp_prompts]
)
search_arena_vs_browsecomp = pd.DataFrame(search_arena_vs_browsecomp)
search_arena_vs_browsecomp.to_csv('../data/describing_diffs/search_arena_vs_browsecomp.csv')

## Cross-Benchmark Analysis

In [None]:
# Radar chart comparing a subset of models across benchmarks.
# The score lists below are indexed by position and are presumably parallel to
# `models` (one score per model, same order) - TODO confirm against the source table.
models = [
    "sonar-reasoning-pro-high",
    "gemini-2.5-pro-grounding",
    "sonar-pro-high",
    "sonar-reasoning",
    "sonar-pro",
    "sonar",
    "gemini-2.5-flash-grounding",
    "gemini-2.0-flash-grounding",
    "gpt-4o-search-preview-high",
    "gpt-4o-mini-search-preview"    
]

searcharena_scores = [66.6, 66.7, 63.7, 60.1, 57.5, 55.8, 49.8, 41.7, 34.8, 29.3]
# NOTE(review): this list has 11 entries while `models` and every other score
# list have 10, and it contains adjacent repeated values (56.8, 56.8 and
# 54.8, 54.8). The correlation cell further down uses a 10-entry variant that
# drops the final 30.3. One entry looks extraneous, which would misalign the
# scores read at the higher indices in `include_idx` - TODO confirm the values.
searcharena_factual_scores = [68.8, 68.3, 59.9, 58.8, 56.8, 56.8, 54.8, 54.8, 42.8, 36.2, 30.3]
searcharena_nonfactual_scores = [65.0, 65.3, 66.7, 61.3, 58.4, 56.8, 46.1, 41.0, 33.6, 28.6]
simpleqa_scores = [91.8, 90.6, 93.2, 92.6, 92.6, 91.8, 89.8, 89.2, 89.4, 91.2]
arenahard_scores = [28.3, 33.5, 43.4, 29.1, 39.9, 36.2, 22.6, 13.4, 16.6, 8.3]

# Positions in `models` (and in the score lists) of the models that are drawn.
include_idx = [0, 1, 2, 5, 6, 8, 9]
# Axis labels, in the same order as the `r` values built for each trace below.
categories = ["Search Arena", "Search Arena (Fact+Synth)", "SimpleQA", "ArenaHard", "Search Arena (Other)"]

colors = px.colors.qualitative.Vivid

fig = go.Figure()

# One polar trace per selected model; colours cycle through the palette.
for i, idx in enumerate(include_idx):
    fig.add_trace(go.Scatterpolar(
        r=[
            searcharena_scores[idx],
            searcharena_factual_scores[idx],
            simpleqa_scores[idx],
            arenahard_scores[idx],
            searcharena_nonfactual_scores[idx],
            # First value repeated so the polygon closes on itself.
            searcharena_scores[idx],
        ],
        # First category repeated to match the repeated first value above.
        theta=categories + [categories[0]],
        name=models[idx],
        line=dict(color=colors[i % len(colors)], width=2)
    ))

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            tickfont=dict(size=14, color='black'),
            tickvals=[0.0, 20, 40, 60, 80, 100],
        range=[0, 100],
        ),
        angularaxis=dict(
            tickfont=dict(size=18, color='black'),
            rotation=90,
            direction="clockwise"
        )
    ),
    legend=dict(
        font=dict(size=16, color='black'),
        orientation="v",
        yanchor="top",
        y=1.1,
        xanchor="left",
        x=0.8,
    ),
    template="plotly_white",
    margin=dict(r=20, t=20, b=20, l=20),
    width=1000,
    height=600
)

fig.show()
fig.write_image("plots/cross_benchmark_scores.pdf")

In [None]:
# Pairwise agreement between benchmarks: Kendall tau and Spearman rho on the
# rank orderings, Pearson correlation on the raw scores.
# NOTE(review): the radar-chart cell defines an 11-entry version of
# searcharena_factual_scores ending in 30.3, and the list contains adjacent
# repeated values (56.8, 56.8 and 54.8, 54.8). This 10-entry variant may be a
# truncation that misaligns scores with models - TODO confirm the values.
searcharena_scores = [66.6, 66.7, 63.7, 60.1, 57.5, 55.8, 49.8, 41.7, 34.8, 29.3]
searcharena_factual_scores = [68.8, 68.3, 59.9, 58.8, 56.8, 56.8, 54.8, 54.8, 42.8, 36.2]
searcharena_nonfactual_scores = [65.0, 65.3, 66.7, 61.3, 58.4, 56.8, 46.1, 41.0, 33.6, 28.6]
simpleqa_scores = [91.8, 90.6, 93.2, 92.6, 92.6, 91.8, 89.8, 89.2, 89.4, 91.2]
arenahard_scores = [28.3, 33.5, 43.4, 29.1, 39.9, 36.2, 22.6, 13.4, 16.6, 8.3]

score_lists = {
    "SearchArena": searcharena_scores,
    "SearchArena (Factual)": searcharena_factual_scores,
    "SearchArena (Nonfactual)": searcharena_nonfactual_scores,
    "SimpleQA": simpleqa_scores,
    "ArenaHard": arenahard_scores,
}

# Negate the scores so rank 1 goes to the highest score; 'ordinal' gives tied
# scores distinct ranks in order of appearance.
ranked_lists = {}
for benchmark, scores in score_lists.items():
    ranked_lists[benchmark] = rankdata([-score for score in scores], method='ordinal')

results = []
for name1, name2 in combinations(score_lists, 2):
    tau, _ = kendalltau(ranked_lists[name1], ranked_lists[name2])
    rho, _ = spearmanr(ranked_lists[name1], ranked_lists[name2])
    pearson, _ = pearsonr(score_lists[name1], score_lists[name2])
    results.append(((name1, name2), tau, rho, pearson))

for (name1, name2), tau, rho, pearson in results:
    print(
        f"{name1} vs {name2}:\n"
        f"  Kendall Tau   = {tau:.3f}\n"
        f"  Spearman Rho  = {rho:.3f}\n"
        f"  Pearson Correlation = {pearson:.3f}\n"
    )

In [None]:
# Scatter plot of SimpleQA score (x) against ArenaHard score (y) for the search
# and non-search variant of each model, with error bars and a linear fit.
# NOTE: `arenahard_scores` and `simpleqa_scores` are rebound here from the lists
# used in the earlier cells to dicts keyed by model name.
# Each tuple is unpacked below as (score, lower offset, upper offset); the
# offsets are used as asymmetric error-bar lengths.
arenahard_scores = {
    "gemini-2.5-pro": {"search": (33.5, -2.1, 2.4), "non-search": (51.8, -2.6, 2.3)},
    "gemini-2.5-flash": {"search": (22.6, -1.4, 1.8), "non-search": (45.5, -2.3, 2.3)},
    "gemini-2.0-flash": {"search": (13.4, -1.0, 1.1), "non-search": (19.5, -1.4, 1.2)}, 
}

simpleqa_scores = {
    "gemini-2.5-pro": {"search": (90.6, -2.6, 2.4), "non-search": (48.6, -4.4, 4.4)},
    "gemini-2.5-flash": {"search": (89.8, -2.6, 2.6), "non-search": (31.4, -4.0, 4.2)},
    "gemini-2.0-flash": {"search": (89.2, -2.8, 2.6), "non-search": (26.4, -3.8, 3.8)},
}

colors = px.colors.qualitative.Plotly
# Marker shape encodes the variant; colour encodes the model.
marker_map = {"search": "circle", "non-search": "square"}
neutral_color = "#888888"
fig = go.Figure()
# All plotted points, collected for the regression below.
x_vals, y_vals = [], []

for i, model in enumerate(arenahard_scores):
    color = colors[i % len(colors)]
    for version in ["search", "non-search"]:
        x, x_low, x_high = simpleqa_scores[model][version]
        y, y_low, y_high = arenahard_scores[model][version]

        x_vals.append(x)
        y_vals.append(y)

        # One single-point trace per (model, variant), labelled with the model
        # name and kept out of the legend.
        fig.add_trace(go.Scatter(
            x=[x],
            y=[y],
            mode='markers+text',
            showlegend=False,
            marker=dict(
                symbol=marker_map[version],
                size=10,
                color=color,
                line=dict(width=1, color='black')
            ),
            error_x=dict(
                type='data',
                symmetric=False,
                array=[x_high],
                # Lower offsets are stored as negative numbers; pass their magnitude.
                arrayminus=[abs(x_low)],
                thickness=1.5
            ),
            error_y=dict(
                type='data',
                symmetric=False,
                array=[y_high],
                arrayminus=[abs(y_low)],
                thickness=1.5
            ),
            text=[model],
            textposition="top left" if version == "search" else "top right",
            textfont=dict(size=16, color='black')
        ))

# Legend-only traces (no data points): one neutral-coloured marker per variant.
for version in marker_map:
    fig.add_trace(go.Scatter(
        x=[None], y=[None],
        mode='markers',
        name=version,
        marker=dict(
            symbol=marker_map[version],
            size=12,
            color=neutral_color,
            line=dict(width=1, color='black')
        ),
        showlegend=True,
        hoverinfo='skip',
    ))

# Least-squares line fitted through all points (search and non-search together).
X = np.array(x_vals).reshape(-1, 1)
y = np.array(y_vals)
reg = LinearRegression().fit(X, y)

slope = reg.coef_[0]
intercept = reg.intercept_
# Draw the fitted line only over the observed x range.
line_x = np.linspace(min(x_vals), max(x_vals), 100)
line_y = slope * line_x + intercept

fig.add_trace(go.Scatter(
    x=line_x,
    y=line_y,
    mode='lines',
    name=f"Fit: y = {slope:.2f}x + {intercept:.2f}",
    line=dict(color='black', dash='dash'),
))

fig.update_layout(
    xaxis=dict(
        title=dict(text="SimpleQA Score", font=dict(size=16, color='black')),
        tickfont=dict(size=16, color='black'),
    ),
    yaxis=dict(
        title=dict(text="ArenaHard Score", font=dict(size=16, color='black')),
        tickfont=dict(size=16, color='black'),
    ),
    legend=dict(
        font=dict(size=16, color='black'),
        orientation="v",
        yanchor="top",
        y=0.9,
        xanchor="left",
    ),
    template="plotly_white",
    margin=dict(r=20, t=20, b=20, l=20),
    width=1000,
    height=600
)

fig.show()
fig.write_image("plots/simpleqa_vs_arenahard.pdf")