# Preference Analysis

**Sections 3.1 and 3.2, Appendices D and E:** leaderboard analysis, controlled Bradley-Terry analysis, and citation control analysis.

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import sem, t
from scipy.stats import ttest_ind
import statsmodels.api as sm

from utils import *

import warnings
warnings.filterwarnings("ignore")

### Loading Data

In [None]:
# Load the battle log (one JSON record per line) and convert the timestamps,
# localized as UTC, to America/Los_Angeles.
battle_data = pd.read_json('../data/search_arena_24k.jsonl', orient='records', lines=True)
battle_data['timestamp'] = pd.to_datetime(battle_data['timestamp']).dt.tz_localize('UTC').dt.tz_convert('America/Los_Angeles')
# Keep only battles that actually received a vote.
battle_data = battle_data[battle_data['winner'].notna()].reset_index(drop=True)

# Raw API model identifiers -> display names used throughout the notebook.
# NOTE(review): "gemini-2.0-pro-exp-02-05-grounding" is mapped to
# "gemini-2.5-pro-grounding" (same target as the 2.5 pro entry) — confirm this
# merge is intentional and not a typo.
model_mapping = {
    "gpt-4o-mini-search-preview": "gpt-4o-mini-search",
    "gpt-4o-mini-search-preview-high": "gpt-4o-mini-search-high",
    "gpt-4o-search-preview": "gpt-4o-search",
    "gpt-4o-search-preview-high": "gpt-4o-search-high",
    "gpt-4o-search-preview-high-loc": "gpt-4o-search-high-loc",
    "gemini-2.0-flash-grounding": "gemini-2.0-flash-grounding",
    "gemini-2.5-flash-preview-04-17-grounding": "gemini-2.5-flash-grounding",
    "gemini-2.0-pro-exp-02-05-grounding": "gemini-2.5-pro-grounding",
    "gemini-2.5-pro-exp-03-25-grounding": "gemini-2.5-pro-grounding",
    "gemini-2.5-pro-exp-03-25-wo-search": "gemini-2.5-pro"
}
# Assign the result back instead of `battle_data[col].replace(..., inplace=True)`:
# an in-place call on a column selection is a chained update that does not reach
# the parent frame when pandas Copy-on-Write is active.
battle_data["model_a"] = battle_data["model_a"].replace(model_mapping)
battle_data["model_b"] = battle_data["model_b"].replace(model_mapping)

models_to_remove = ['gemini-2.5-pro'] # non-search model
battle_data = battle_data[~battle_data['model_a'].isin(models_to_remove) & ~battle_data['model_b'].isin(models_to_remove)].reset_index(drop=True)

# Derive the tie-free view AFTER the non-search model has been dropped so the
# win-rate plots cover the same set of models as the rest of the analysis.
battles_no_ties = battle_data[~battle_data['winner'].isin(['tie', 'tie (bothbad)'])]

print(f'Number of battles with votes: {len(battle_data)}')
display(battle_data.head())

### Search Arena Leaderboard

In [None]:
def visualize_battle_count_by_model(battles):
    """Bar chart of how many battles each model took part in.

    Appearances as `model_a` and as `model_b` are pooled before counting.

    Args:
        battles: DataFrame with `model_a` and `model_b` columns.

    Returns:
        The styled Plotly figure.
    """
    appearances = pd.concat([battles['model_a'], battles['model_b']])
    per_model_counts = appearances.value_counts()

    black = 'black'
    x_axis = dict(
        title='',
        title_font=dict(size=18, color=black),
        tickangle=-45,
        tickfont=dict(size=16, color=black),
        linecolor=black,
        linewidth=1.0,
    )
    y_axis = dict(
        title='Number of Battles',
        title_font=dict(size=18, color=black),
        tickfont=dict(size=16, color=black),
        showgrid=True,
        gridcolor='lightgray',
        linecolor=black,
        linewidth=1.0,
    )

    fig = px.bar(per_model_counts, text_auto="auto")
    # Print each count above its bar.
    fig.update_traces(textposition='outside', textfont=dict(size=15, color=black))
    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        showlegend=False,
        xaxis=x_axis,
        yaxis=y_axis,
        margin=dict(l=10, r=10, t=30, b=10),
        height=600,
        width=800,
    )
    return fig

# Plot per-model battle counts for the filtered dataset, show the figure inline,
# and write it to plots/battle_count.pdf.
fig = visualize_battle_count_by_model(battle_data)
fig.show()
fig.write_image('plots/battle_count.pdf')

In [None]:
def visualize_average_win_rate(battles):
    """Bar chart of each model's average pairwise win rate, highest first.

    The pairwise "row beats column" matrix comes from
    `compute_pairwise_win_fraction`; each row is averaged over its columns.

    Args:
        battles: battle DataFrame forwarded to `compute_pairwise_win_fraction`.

    Returns:
        The styled Plotly figure.
    """
    pairwise = compute_pairwise_win_fraction(battles, None, limit_show_number=None)
    avg_win_rate = pairwise.mean(axis=1).sort_values(ascending=False)

    black = 'black'
    x_axis = dict(
        title='',
        tickangle=-45,
        tickfont=dict(size=16, color=black),
        linecolor=black,
        linewidth=1.0,
    )
    y_axis = dict(
        title=dict(text='Average Win Rate', font=dict(size=18, color=black)),
        tickfont=dict(size=16, color=black),
        showgrid=True,
        gridcolor='lightgray',
        linecolor=black,
        linewidth=1.0,
    )

    fig = px.bar(avg_win_rate, text_auto=".2f")
    # Print each rate above its bar.
    fig.update_traces(textposition='outside', textfont=dict(size=15, color=black))
    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        showlegend=False,
        xaxis=x_axis,
        yaxis=y_axis,
        margin=dict(l=10, r=10, t=10, b=10),
        height=600,
        width=800,
    )
    return fig

# Average win rates are computed on battles with a decisive winner (ties excluded);
# show the figure inline and write it to plots/avg_win_rate.pdf.
fig = visualize_average_win_rate(battles_no_ties)
fig.show()
fig.write_image('plots/avg_win_rate.pdf')

In [None]:
def visualize_battle_count(battles, model_order):
    """Heatmap of the number of battles played between every pair of models.

    Counts are symmetric: a battle counts for the pair regardless of which
    model was shown as `model_a` or `model_b`.

    Args:
        battles: DataFrame with `model_a` and `model_b` columns.
        model_order: model names giving the row/column order of the heatmap.

    Returns:
        The styled Plotly figure.
    """
    ptbl = pd.pivot_table(
        battles, index="model_a", columns="model_b", aggfunc="size", fill_value=0
    )
    # A model that only ever appears on one side is missing from the rows or the
    # columns of `ptbl`; a plain `ptbl + ptbl.T` would turn its counts into NaN.
    # Align with fill_value=0, then zero the cells absent from both operands.
    battle_counts = ptbl.add(ptbl.T, fill_value=0).fillna(0)
    fig = px.imshow(
        battle_counts.loc[model_order, model_order],
        text_auto=True,
    )
    fig.update_traces(
        textfont=dict(size=14)
    )
    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        showlegend=False,
        xaxis=dict(
            title=dict(text='Model B', font=dict(size=18, color='black')),
            tickfont=dict(size=14, color='black')
        ),
        yaxis=dict(
            title=dict(text='Model A', font=dict(size=18, color='black')),
            tickfont=dict(size=14, color='black')
        ),
        xaxis_side="top",
        margin=dict(l=20, r=20, t=20, b=20),
        coloraxis_colorbar=dict(
            len=1.0,
            thickness=30,
            title="",
            xpad=5,
            tickfont=dict(size=14, color='black')
        ),
        height=800,
        width=1000,
    )
    return fig

# Pairwise battle-count heatmap, ordered by get_model_order(battle_data);
# show it inline and write it to plots/battle_count_pairwise.pdf.
fig = visualize_battle_count(battle_data, get_model_order(battle_data))
fig.show()
fig.write_image('plots/battle_count_pairwise.pdf')

In [None]:
def visualize_pairwise_win_fraction(battles, model_order):
    """Heatmap of the fraction of battles in which the row model beats the column model.

    Args:
        battles: battle DataFrame forwarded to `compute_pairwise_win_fraction`.
        model_order: model ordering forwarded to `compute_pairwise_win_fraction`.

    Returns:
        The styled Plotly figure.
    """
    black = 'black'

    def _axis(label):
        # Both axes share the same title/tick styling; only the label differs.
        return dict(
            title=dict(text=label, font=dict(size=18, color=black)),
            tickfont=dict(size=14, color=black),
        )

    colorbar = dict(
        len=1.0,
        thickness=30,
        title="",
        xpad=5,
        tickfont=dict(size=14, color=black),
    )

    win_fraction = compute_pairwise_win_fraction(battles, model_order)
    fig = px.imshow(win_fraction, color_continuous_scale="RdBu", text_auto=".2f")
    fig.update_traces(textfont=dict(size=14))
    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        showlegend=False,
        xaxis=_axis('Model B'),
        yaxis=_axis('Model A'),
        xaxis_side="top",
        margin=dict(l=20, r=20, t=20, b=20),
        coloraxis_colorbar=colorbar,
        height=800,
        width=1000,
    )
    return fig

# Pairwise win fractions use the tie-free battles, while the model order is taken
# from the full filtered dataset; show inline and write to plots/pairwise_winrates.pdf.
fig = visualize_pairwise_win_fraction(battles_no_ties, get_model_order(battle_data))
fig.show()
fig.write_image('plots/pairwise_winrates.pdf')

In [None]:
def get_bt_ratings(battle_data, anchor_model, anchor_rating=1000, num_bootstrap_samples=1000, style_elements=None):
    """Bootstrap Bradley-Terry ratings, shifted so `anchor_model` sits at `anchor_rating`.

    A point estimate is fitted first, only to derive the offset that places the
    anchor model at `anchor_rating`; that offset is then passed to the bootstrap.

    Args:
        battle_data: battle DataFrame passed to the `compute_*` helpers.
        anchor_model: model whose point-estimate rating defines the offset.
        anchor_rating: rating the anchor model is shifted to.
        num_bootstrap_samples: number of bootstrap rounds, used in both branches.
        style_elements: if None, fit plain BT; otherwise fit the style-controlled
            model with these style features.

    Returns:
        (bt_ratings_bootstrap, style_coef_bootstrap); the second element is None
        when `style_elements` is None.
    """
    if style_elements is None:
        bt_ratings = compute_bt(battle_data)
        offset_score = anchor_rating - bt_ratings[anchor_model]
        # Honour `num_bootstrap_samples` here as well; this branch previously
        # hard-coded num_round=100 and silently ignored the argument.
        bt_ratings_bootstrap = compute_bootstrap_bt(battle_data, num_round=num_bootstrap_samples, offset=offset_score)
        style_coef_bootstrap = None
    else:
        bt_ratings, _ = compute_style_control(battle_data, style_elements=style_elements)
        offset_score = anchor_rating - bt_ratings[anchor_model]
        bt_ratings_bootstrap, style_coef_bootstrap = compute_bootstrap_style_control(battle_data, style_elements=style_elements, num_round=num_bootstrap_samples, offset=offset_score)
    return bt_ratings_bootstrap, style_coef_bootstrap

def get_leaderboard_table(bt_ratings_bootstrap, battles=None):
    """Summarise bootstrap ratings into a leaderboard sorted by mean rating.

    Args:
        bt_ratings_bootstrap: DataFrame with one column per model and one row
            per bootstrap round.
        battles: battle DataFrame (`model_a` / `model_b` columns) used for the
            `num_battles` column. Pass the same subset the ratings were fitted
            on; when omitted, the notebook-level `battle_data` is used, which
            matches the previous behaviour.

    Returns:
        DataFrame indexed by model with columns `rating` (bootstrap mean),
        `variance`, `rating_q975`, `rating_q025`, `num_battles` and
        `final_ranking`, sorted by `rating` descending.
    """
    if battles is None:
        battles = battle_data  # notebook global; keeps existing one-argument calls working

    model_order = list(bt_ratings_bootstrap.columns)
    model_rating_q025 = bt_ratings_bootstrap.quantile(0.025).round(2)
    model_rating_q975 = bt_ratings_bootstrap.quantile(0.975).round(2)
    bt_ratings = bt_ratings_bootstrap.mean().round(2)
    bt_var = bt_ratings_bootstrap.var().round(2)

    # Rank = 1 + number of models whose lower bound lies strictly above this
    # model's upper bound. A model never counts against itself because its own
    # q025 cannot exceed its own q975.
    ranking = {
        model: 1 + int((model_rating_q025 > model_rating_q975[model]).sum())
        for model in model_order
    }

    # Battles per model, counting appearances on either side. Restrict to the
    # rated models so models absent from the ratings do not add all-NaN rows.
    num_battles = (
        battles["model_a"].value_counts()
        .add(battles["model_b"].value_counts(), fill_value=0)
        .reindex(model_order)
    )

    leaderboard_table = pd.DataFrame(
        {
            "rating": bt_ratings,
            "variance": bt_var,
            "rating_q975": model_rating_q975,
            "rating_q025": model_rating_q025,
            "num_battles": num_battles,
            "final_ranking": pd.Series(ranking),
        }
    )
    leaderboard_table = leaderboard_table.sort_values(by='rating', ascending=False)
    return leaderboard_table

# Overall leaderboard: plain BT ratings (no style control) anchored on 'gpt-4o-search'.
bt_ratings_bootstrap, _ = get_bt_ratings(battle_data, anchor_model='gpt-4o-search')
leaderboard_table = get_leaderboard_table(bt_ratings_bootstrap)
display(leaderboard_table)

In [None]:
def visualize_bootstrap_elo_rating(bt_ratings_bootstrap):
    """Scatter plot of mean bootstrap rating per model with 2.5%-97.5% error bars.

    Args:
        bt_ratings_bootstrap: DataFrame with one column per model and one row
            per bootstrap round.

    Returns:
        The styled Plotly figure, models ordered by descending mean rating.
    """
    summary = pd.DataFrame(
        {
            "lower": bt_ratings_bootstrap.quantile(0.025),
            "rating": bt_ratings_bootstrap.mean(),
            "upper": bt_ratings_bootstrap.quantile(0.975),
        }
    )
    bars = summary.reset_index(names="model").sort_values("rating", ascending=False)
    # Plotly wants the asymmetric error bars as distances from the plotted point.
    bars = bars.assign(
        error_y=bars["upper"] - bars["rating"],
        error_y_minus=bars["rating"] - bars["lower"],
        rating_rounded=np.round(bars["rating"]),
    )

    black = 'black'

    def _axis(title):
        # Shared axis styling; only the title differs between x and y.
        return dict(
            title=title,
            tickfont=dict(size=16, color=black),
            linecolor=black,
            linewidth=1.0,
            showgrid=True,
            gridcolor='lightgray',
        )

    fig = px.scatter(
        bars,
        x="model",
        y="rating",
        error_y="error_y",
        error_y_minus="error_y_minus",
        text="rating_rounded",
    )
    fig.update_traces(
        marker=dict(size=10, color="royalblue"),
        line=dict(width=1, color="royalblue"),
        textposition='top left',
        textfont=dict(size=16, color=black),
    )
    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        showlegend=False,
        xaxis=_axis(''),
        yaxis=_axis(dict(text='Rating', font=dict(size=18, color=black))),
        margin=dict(l=10, r=10, t=10, b=10),
        height=500,
        width=1200,
    )
    return fig

# Re-runs the bootstrap with the same arguments as the leaderboard cell and
# overwrites `bt_ratings_bootstrap`.
# NOTE(review): presumably this draws fresh bootstrap samples, so the plotted
# values may differ slightly from the table above — confirm whether reuse was intended.
bt_ratings_bootstrap, _ = get_bt_ratings(battle_data, anchor_model='gpt-4o-search')
fig = visualize_bootstrap_elo_rating(bt_ratings_bootstrap)
fig.show()
fig.write_image('plots/elo_ratings.pdf')

#### Leaderboard Ablations

In [None]:
# Leaderboards split by the first entry of each battle's `languages` value.
# Rows whose `languages` value is empty/falsy fall into neither subset.
# NOTE(review): get_leaderboard_table(...) is called with the ratings only, so its
# `num_battles` column is computed from the notebook-level `battle_data`, not from
# the subset used to fit the ratings.
print('Leaderboard (English)')
battle_data_english = battle_data[battle_data['languages'].apply(lambda x: x[0]=='English' if x else False)]
bt_ratings_bootstrap_english, _ = get_bt_ratings(battle_data_english, anchor_model='gpt-4o-search')
leaderboard_table_english = get_leaderboard_table(bt_ratings_bootstrap_english)
display(leaderboard_table_english)

print('Leaderboard (non English)')
battle_data_nonenglish = battle_data[battle_data['languages'].apply(lambda x: x[0]!='English' if x else False)]
bt_ratings_bootstrap_nonenglish, _ = get_bt_ratings(battle_data_nonenglish, anchor_model='gpt-4o-search')
leaderboard_table_nonenglish = get_leaderboard_table(bt_ratings_bootstrap_nonenglish)
display(leaderboard_table_nonenglish)

In [None]:
# Leaderboards split by intent: "factual" = primary_intent in
# {'Factual Lookup', 'Info Synthesis'}; "non-factual" = every other row
# (the complement of the isin mask).
# NOTE(review): get_leaderboard_table(...) is called with the ratings only, so its
# `num_battles` column is computed from the notebook-level `battle_data`, not from
# the subset used to fit the ratings.
print('Leaderboard (Factual)')
battle_data_fact = battle_data[battle_data['primary_intent'].isin(['Factual Lookup', 'Info Synthesis'])]
bt_ratings_bootstrap_fact, _ = get_bt_ratings(battle_data_fact, anchor_model='gpt-4o-search')
leaderboard_table_fact = get_leaderboard_table(bt_ratings_bootstrap_fact)
display(leaderboard_table_fact)

print('Leaderboard (non-Factual)')
battle_data_nonfact = battle_data[~battle_data['primary_intent'].isin(['Factual Lookup', 'Info Synthesis'])]
bt_ratings_bootstrap_nonfact, _ = get_bt_ratings(battle_data_nonfact, anchor_model='gpt-4o-search')
leaderboard_table_nonfact = get_leaderboard_table(bt_ratings_bootstrap_nonfact)
display(leaderboard_table_nonfact)

#### Win Rate Analysis

In [11]:
def compare_models(battle_data, model_1, model_2):
    """Print the overall win rates of two models and a one-sided t-test between them.

    Only decisive battles (winner is 'model_a' or 'model_b') between two
    different models are used. Each model's win rate is wins / (wins + losses)
    over all of its opponents. The test is `ttest_ind(..., alternative='greater')`
    on the two models' 0/1 outcome lists, i.e. H1: win rate of `model_1` is
    greater than that of `model_2`.

    Args:
        battle_data: DataFrame with `model_a`, `model_b` and `winner` columns.
        model_1: name of the model expected to have the higher win rate.
        model_2: name of the model to compare against.

    Returns:
        None; results are printed.
    """
    decisive = battle_data[battle_data['winner'].isin(['model_a', 'model_b'])]
    decisive = decisive[decisive['model_a'] != decisive['model_b']]

    a_wins = decisive[decisive['winner'] == 'model_a'].pivot_table(index='model_a', columns='model_b', values='winner', aggfunc='count')
    b_wins = decisive[decisive['winner'] == 'model_b'].pivot_table(index='model_a', columns='model_b', values='winner', aggfunc='count')

    # winner_count[r, c] = number of battles in which r beat c (either side).
    # A plain `a_wins + b_wins.T` yields NaN wherever only one of the two tables
    # has a cell, which silently drops those wins from the sums below; align
    # with fill_value=0 and make the matrix square over every model seen.
    models = sorted(set(decisive['model_a']) | set(decisive['model_b']))
    winner_count = (
        a_wins.add(b_wins.T, fill_value=0)
        .reindex(index=models, columns=models)
        .fillna(0)
    )

    def _wins_and_total(model):
        # Row sum = wins of `model`; column sum = losses of `model`.
        wins = winner_count.loc[model].sum()
        losses = winner_count.T.loc[model].sum()
        return wins, wins + losses

    win_count_1, total_count_1 = _wins_and_total(model_1)
    win_count_2, total_count_2 = _wins_and_total(model_2)

    # Expand the counts back into per-battle 0/1 outcomes for the t-test.
    battle_data_1 = [1] * int(win_count_1) + [0] * int(total_count_1 - win_count_1)
    battle_data_2 = [1] * int(win_count_2) + [0] * int(total_count_2 - win_count_2)

    win_rate_1 = win_count_1 / total_count_1
    win_rate_2 = win_count_2 / total_count_2

    print(f'{model_1} winrate: {round(win_rate_1, 2)}')
    print(f'{model_2} winrate: {round(win_rate_2, 2)}')

    print('t-test results (winrate_1 > winrate_2)')
    print(ttest_ind(battle_data_1, battle_data_2, alternative='greater'))

#### Search context analysis

In [None]:
# Is the win rate of 'sonar-pro-high' greater than that of 'sonar-pro'? (one-sided test)
compare_models(battle_data, 'sonar-pro-high', 'sonar-pro')

In [None]:
# Is the win rate of 'gpt-4o-search-high' greater than that of 'gpt-4o-search'? (one-sided test)
compare_models(battle_data, 'gpt-4o-search-high', 'gpt-4o-search')

#### Win rate by intent

In [None]:
# Bootstrap win rates on factual prompts (primary_intent is 'Factual Lookup' or 'Info Synthesis').
# Same filter as `battle_data_fact` in the leaderboard ablation cell.
battle_data_factual = battle_data[battle_data['primary_intent'].isin(['Factual Lookup', 'Info Synthesis'])]
bootstrap_winrates(battle_data_factual)

In [None]:
# Bootstrap win rates on every prompt whose primary_intent is NOT 'Factual Lookup' / 'Info Synthesis'.
# Same filter as `battle_data_nonfact` in the leaderboard ablation cell.
battle_data_nonfactual = battle_data[~battle_data['primary_intent'].isin(['Factual Lookup', 'Info Synthesis'])]
bootstrap_winrates(battle_data_nonfactual)

### Feature EDA

In [None]:
# Attach per-response features to the battles via helpers from `utils`.
# NOTE(review): presumably each helper writes `<name>_a` / `<name>_b` entries into
# `battle_data['conv_metadata']` in place — the next cell reads
# `response_length_*`, `citation_count_*` and `cites_<domain>_*` from there. Confirm in utils.
add_response_length_style(battle_data, "response_length") # average response length of assistant messages
add_num_citations_style(battle_data, "citation_count") # number of urls returned by the web search
add_domain_style(battle_data, "cites") # whether the models cite a domain group

In [17]:
def _per_side_features(side):
    """Build one row per battle for the given side ('a' or 'b'): model name plus its response features."""
    meta = battle_data['conv_metadata']
    columns = {
        'model': battle_data[f'model_{side}'],
        'response_len': meta.apply(lambda m: m[f'response_length_{side}']),
        'citation_count': meta.apply(lambda m: m[f'citation_count_{side}']),
    }
    for domain in DOMAIN_CATEGORIES:
        # Bind the key now so each column reads its own domain entry.
        columns[f'cites_{domain}'] = meta.apply(lambda m, key=f'cites_{domain}_{side}': m[key])
    return pd.DataFrame(columns)


# Model name -> provider family, keyed by the shortened names assigned below.
family_mapping = {
    'sonar': 'Perplexity',
    'sonar-pro': 'Perplexity',
    'sonar-pro-high': 'Perplexity',
    'sonar-reasoning': 'Perplexity',
    'sonar-reasoning-pro-high': 'Perplexity',
    'gpt-4o-mini-search': 'OpenAI',
    'gpt-4o-search': 'OpenAI',
    'gpt-4o-search-high': 'OpenAI',
    'gemini-2.0-flash': 'Google',
    'gemini-2.5-flash': 'Google',
    'gemini-2.5-pro': 'Google',
}

# One row per (battle, side): stack the model_a rows on top of the model_b rows.
model_data_a = _per_side_features('a')
model_data_b = _per_side_features('b')
model_data = pd.concat([model_data_a, model_data_b])

# Drop the location variant, then shorten the Gemini names for plotting.
model_data = model_data[model_data['model'] != 'gpt-4o-search-high-loc']
model_data = model_data.replace({
    'gemini-2.5-pro-grounding': 'gemini-2.5-pro',
    'gemini-2.5-flash-grounding': 'gemini-2.5-flash',
    'gemini-2.0-flash-grounding': 'gemini-2.0-flash',
})

model_data['model_family'] = model_data['model'].map(family_mapping)

In [None]:
# === STEP 0: Setup ===
# Side-by-side bar charts of mean response length (left) and mean citation count
# (right) per model, with 95% confidence intervals, coloured by model family.

# === STEP 0: Setup ===
# Families are plotted in this order; each family gets a fixed colour.
model_family_order = ['Perplexity', 'Google', 'OpenAI']
color_map = {
    'Google': '#e74c3c',
    'Perplexity': '#268bd2',
    'OpenAI': '#2ca02c'
}

# === STEP 1: Compute group stats ===
# Per-model mean and standard error (scipy.stats.sem) of both metrics, plus the sample size.
group_stats = model_data.groupby("model").agg(
    mean_response_len=('response_len', 'mean'),
    se_response_len=('response_len', sem),
    mean_citation_count=('citation_count', 'mean'),
    se_citation_count=('citation_count', sem),
    count=('response_len', 'count')
).reset_index()

# === STEP 2: Compute 95% CI ===
# Half-width = t critical value (two-sided 95%, n-1 degrees of freedom) * standard error.
# The same `count` (non-null response_len rows) supplies the degrees of freedom for both metrics.
group_stats['t_score'] = t.ppf(0.975, df=group_stats['count'] - 1)
group_stats['ci_response_len'] = group_stats['t_score'] * group_stats['se_response_len']
group_stats['ci_citation_count'] = group_stats['t_score'] * group_stats['se_citation_count']

# === STEP 3: Add model_family and sort ===
# Order bars by family (in model_family_order), then by descending mean response length.
model_family_map = model_data.drop_duplicates("model")[["model", "model_family"]]
summary_df = pd.merge(group_stats, model_family_map, on="model")
summary_df['model_family'] = pd.Categorical(summary_df['model_family'], categories=model_family_order, ordered=True)
summary_df = summary_df.sort_values(by=["model_family", "mean_response_len"], ascending=[True, False])
# Freeze the sorted order as the category order of `model`.
summary_df['model'] = pd.Categorical(summary_df['model'], categories=summary_df['model'], ordered=True)

# === STEP 4: Create subplots with independent y-axes ===
fig = make_subplots(
    rows=1, cols=2,
    shared_yaxes=False,
    horizontal_spacing=0.1
)

# === STEP 5: Add bars for both metrics ===
# One single-bar trace per model and metric so each bar can carry its family colour.
# color_map[...] raises KeyError for a model whose family is not one of the three above.
for i, row in summary_df.iterrows():
    color = color_map[row['model_family']]
    # Response Length
    fig.add_trace(
        go.Bar(
            x=[row['model']],
            y=[row['mean_response_len']],
            name=row['model_family'],
            marker_color=color,
            error_y=dict(type='data', array=[row['ci_response_len']], thickness=1.5),
            opacity=0.9,
            showlegend=False
        ),
        row=1, col=1
    )
    # Number of Citations
    fig.add_trace(
        go.Bar(
            x=[row['model']],
            y=[row['mean_citation_count']],
            name=row['model_family'],
            marker_color=color,
            error_y=dict(type='data', array=[row['ci_citation_count']], thickness=1.5),
            opacity=0.9,
            showlegend=False
        ),
        row=1, col=2
    )

# === STEP 6: Style updates ===
# Same x-axis styling for both panels.
for axis in ['xaxis', 'xaxis2']:
    fig.update_layout({axis: dict(
        tickangle=-45,
        tickfont=dict(size=16, color='black'),
        linecolor='black'
    )})

# Left panel y-axis.
fig.update_yaxes(
    title_text="Response Length (num words)",
    title_font=dict(size=20, color='black'),
    tickfont=dict(size=16, color='black'),
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    linecolor='black',
    row=1, col=1
)

# Right panel y-axis.
fig.update_yaxes(
    title_text="Citation Count",
    title_font=dict(size=20, color='black'),
    tickfont=dict(size=16, color='black'),
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    linecolor='black',
    row=1, col=2
)

fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    width=1200,
    height=800,
    margin=dict(l=20, r=20, t=20, b=20),
    font=dict(size=14, color='black'),
)

fig.show()
fig.write_image('plots/length_citation_count.pdf')

In [None]:
# Scatter of BT score vs. mean response length per model, with an OLS fit and its 95% CI band.

# Exclude battles involving the location variant and shorten the Gemini names so
# they match the model names used in `model_data`.
# NOTE(review): the in-place replace runs on a boolean-mask selection of
# `battle_data`; pandas may emit SettingWithCopyWarning here (warnings are
# silenced in the import cell).
curr_data = battle_data[~battle_data['model_a'].isin(['gpt-4o-search-high-loc']) & ~battle_data['model_b'].isin(['gpt-4o-search-high-loc'])]
curr_data.replace({'gemini-2.5-pro-grounding': 'gemini-2.5-pro',
                    'gemini-2.5-flash-grounding': 'gemini-2.5-flash',
                    'gemini-2.0-flash-grounding': 'gemini-2.0-flash'}, inplace=True)
# Overwrites the notebook-level `bt_ratings_bootstrap`; the next cell reuses this value.
bt_ratings_bootstrap, _ = get_bt_ratings(curr_data, anchor_model='gpt-4o-search')

# === STEP 1: Score stats from bootstrap ===
# Per-model bootstrap mean plus the 2.5% / 97.5% quantiles.
bt_summary = bt_ratings_bootstrap.describe(percentiles=[0.025, 0.975]).T
bt_summary = bt_summary[['mean']]
bt_summary['ci_lower'] = bt_ratings_bootstrap.quantile(0.025)
bt_summary['ci_upper'] = bt_ratings_bootstrap.quantile(0.975)
# Symmetric error bar: half the width of the quantile interval.
bt_summary['ci_y'] = (bt_summary['ci_upper'] - bt_summary['ci_lower']) / 2
# NOTE(review): the rename assumes reset_index() produces a column literally named
# 'index', i.e. the bootstrap frame's column index is unnamed — confirm.
bt_summary = bt_summary.reset_index().rename(columns={'index': 'model', 'mean': 'score'})

# === STEP 2: Response length stats ===
# Per-model mean and 95% CI half-width (t critical value * standard error).
grouped = model_data.groupby('model')['response_len']
length_mean = grouped.mean()
length_se = grouped.apply(sem)
length_n = grouped.count()
t_scores = t.ppf(0.975, df=length_n - 1)
ci_x = t_scores * length_se

length_summary = pd.DataFrame({
    'model': length_mean.index,
    'response_len': length_mean.values,
    'ci_x': ci_x.values
})

# === STEP 3: Merge summaries ===
# Default (inner) merge: only models present in both summaries are plotted.
merged = pd.merge(bt_summary, length_summary, on='model')

# === STEP 4: Plot ===
fig = px.scatter(
    merged,
    x='response_len',
    y='score',
    error_x='ci_x',
    error_y='ci_y',
)

fig.update_traces(
    marker=dict(size=10, color='#268bd2', line=dict(width=1, color='black')),
    textposition='top center',
    showlegend=False
)

fig.update_layout(
    xaxis=dict(
        title=dict(text='Average Response Length (words)', font=dict(size=18, color='black')),
        tickfont=dict(size=16),
        linecolor='black',
        gridcolor='lightgray'
    ),
    yaxis=dict(
        title=dict(text='BT Score Estimate', font=dict(size=18, color='black')),
        tickfont=dict(size=16),
        linecolor='black',
        gridcolor='lightgray'
    ),
    font=dict(size=14, color='black'),
    plot_bgcolor='white',
    paper_bgcolor='white',
    margin=dict(l=10, r=10, t=10, b=10),
    width=800,
    height=600
)

# === STEP 5: Fit linear regression ===
# OLS of score on mean response length, one observation per model.
X = merged['response_len']
y = merged['score']
X_const = sm.add_constant(X)
model = sm.OLS(y, X_const).fit()


# === STEP 6: Compute prediction line and CI ===
# Evaluate the fit on 100 evenly spaced x values; `mean_ci_*` is the 95% CI of the fitted mean.
x_range = np.linspace(X.min(), X.max(), 100)
x_range_const = sm.add_constant(x_range)
predictions = model.get_prediction(x_range_const)
pred_summary = predictions.summary_frame(alpha=0.05)
y_pred = pred_summary['mean']
ci_lower = pred_summary['mean_ci_lower']
ci_upper = pred_summary['mean_ci_upper']


# === STEP 7: Overlay the fit on the scatter ===
fig.add_scatter(
    x=x_range,
    y=y_pred,
    mode='lines',
    line=dict(color='black', width=2, dash='dash'),
    name='Linear Fit',
    showlegend=False
)

# CI band: trace the upper bound left-to-right, then the lower bound right-to-left,
# and fill the closed polygon.
fig.add_scatter(
    x=np.concatenate([x_range, x_range[::-1]]),
    y=np.concatenate([ci_upper, ci_lower[::-1]]),
    fill='toself',
    fillcolor='rgba(0, 0, 0, 0.1)',
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo="skip",
    showlegend=False
)

# Add regression equation
slope = model.params['response_len']
intercept = model.params['const']
equation_text = f"score = {slope:.3f} × length + {intercept:.2f}"

# Place the equation in the top-left corner (paper coordinates).
fig.add_annotation(
    text=equation_text,
    xref="paper", yref="paper",
    x=0.05, y=0.95,
    showarrow=False,
    font=dict(size=18, color="black"),
    bgcolor="white",
    bordercolor="black",
    borderwidth=1,
    borderpad=4
)

fig.show()
print(model.summary())
fig.write_image('plots/score_vs_length.pdf')

In [None]:
# Scatter of BT score vs. mean citation count per model, with an OLS fit and its 95% CI band.
# Relies on `bt_ratings_bootstrap` already being defined by an earlier cell.

# === STEP 1: Score stats from bootstrap ===
# Per-model bootstrap mean plus the 2.5% / 97.5% quantiles.
bt_summary = bt_ratings_bootstrap.describe(percentiles=[0.025, 0.975]).T
bt_summary = bt_summary[['mean']]
bt_summary['ci_lower'] = bt_ratings_bootstrap.quantile(0.025)
bt_summary['ci_upper'] = bt_ratings_bootstrap.quantile(0.975)
# Symmetric error bar: half the width of the quantile interval.
bt_summary['ci_y'] = (bt_summary['ci_upper'] - bt_summary['ci_lower']) / 2
# NOTE(review): the rename assumes reset_index() produces a column literally named
# 'index', i.e. the bootstrap frame's column index is unnamed — confirm.
bt_summary = bt_summary.reset_index().rename(columns={'index': 'model', 'mean': 'score'})

# === STEP 2: Citation count stats ===
# Rows with zero citations are excluded before averaging.
# Per-model mean and 95% CI half-width (t critical value * standard error).
grouped = model_data[model_data['citation_count']!=0].groupby('model')['citation_count']
cit_count_mean = grouped.mean()
cit_count_se = grouped.apply(sem)
cit_count_n = grouped.count()
t_scores = t.ppf(0.975, df=cit_count_n - 1)
ci_x = t_scores * cit_count_se

cit_count_summary = pd.DataFrame({
    'model': cit_count_mean.index,
    'citation_count': cit_count_mean.values,
    'ci_x': ci_x.values
})

# === STEP 3: Merge summaries ===
# Default (inner) merge: only models present in both summaries are plotted.
merged = pd.merge(bt_summary, cit_count_summary, on='model')

# === STEP 4: Plot ===
fig = px.scatter(
    merged,
    x='citation_count',
    y='score',
    error_x='ci_x',
    error_y='ci_y',
)

fig.update_traces(
    marker=dict(size=10, color='#268bd2', line=dict(width=1, color='black')),
    textposition='top center',
    showlegend=False
)

fig.update_layout(
    xaxis=dict(
        title=dict(text='Average Number of Citations', font=dict(size=18, color='black')),
        tickfont=dict(size=16),
        linecolor='black',
        gridcolor='lightgray'
    ),
    yaxis=dict(
        title=dict(text='BT Score Estimate', font=dict(size=18, color='black')),
        tickfont=dict(size=16),
        linecolor='black',
        gridcolor='lightgray'
    ),
    font=dict(size=14, color='black'),
    plot_bgcolor='white',
    paper_bgcolor='white',
    margin=dict(l=10, r=10, t=10, b=10),
    width=800,
    height=600
)

# === STEP 5: Fit linear regression ===
# OLS of score on mean citation count, one observation per model.
X = merged['citation_count']
y = merged['score']
X_const = sm.add_constant(X)
model = sm.OLS(y, X_const).fit()

# === STEP 6: Compute prediction line and CI ===
# Evaluate the fit on 100 evenly spaced x values; `mean_ci_*` is the 95% CI of the fitted mean.
x_range = np.linspace(X.min(), X.max(), 100)
x_range_const = sm.add_constant(x_range)
predictions = model.get_prediction(x_range_const)
pred_summary = predictions.summary_frame(alpha=0.05)
y_pred = pred_summary['mean']
ci_lower = pred_summary['mean_ci_lower']
ci_upper = pred_summary['mean_ci_upper']

# === STEP 7: Overlay the fit on the scatter ===
fig.add_scatter(
    x=x_range,
    y=y_pred,
    mode='lines',
    line=dict(color='black', width=2, dash='dash'),
    name='Linear Fit',
    showlegend=False
)

# CI band: trace the upper bound left-to-right, then the lower bound right-to-left,
# and fill the closed polygon.
fig.add_scatter(
    x=np.concatenate([x_range, x_range[::-1]]),
    y=np.concatenate([ci_upper, ci_lower[::-1]]),
    fill='toself',
    fillcolor='rgba(0, 0, 0, 0.1)',
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo="skip",
    showlegend=False
)

# Add regression equation
slope = model.params['citation_count']
intercept = model.params['const']
equation_text = f"score = {slope:.3f} × citations + {intercept:.2f}"

# Place the equation in the top-left corner (paper coordinates).
fig.add_annotation(
    text=equation_text,
    xref="paper", yref="paper",
    x=0.05, y=0.95,
    showarrow=False,
    font=dict(size=18, color="black"),
    bgcolor="white",
    bordercolor="black",
    borderwidth=1,
    borderpad=4
)

fig.show()
print(model.summary())
fig.write_image('plots/score_vs_citations.pdf')

In [None]:
# --- SETUP ---
# Grouped bar chart: for each domain category, the proportion of responses
# (per model family) that cite it, with 95% confidence intervals.

# --- SETUP ---
# The 'other' category is excluded from the plotted columns.
domain_columns = [f'cites_{d}' for d in DOMAIN_CATEGORIES if d != 'other']

model_family_order = ['Perplexity', 'Google', 'OpenAI']
# Display order of the x-axis categories (names after the "cites_" prefix is
# stripped and underscores are replaced by spaces below).
domain_order = ['youtube', 'social media', 'community blog', 'tech coding', 'academic journal', 'us news', 'foreign news', 'wiki', 'gov edu', 'retail', 'other']
color_map = {
    'Google': '#e74c3c',
    'Perplexity': '#268bd2',
    'OpenAI': '#2ca02c'
}

# --- Step 0: Filter out data with no citations ---
# Keeps rows whose plotted domain columns do not sum to zero.
# NOTE(review): this rebinds the notebook-level `model_data` to the filtered
# subset, so any cell that runs (or is re-run) afterwards sees only these rows.
model_data = model_data[model_data[domain_columns].sum(axis=1) != 0]

# --- STEP 1: Melt to long format for easier plotting ---
# One row per (response, domain) with the citation value in `cited`.
melted = model_data.melt(
    id_vars=["model_family"],
    value_vars=domain_columns,
    var_name="domain",
    value_name="cited"
)

# Clean domain names for xticks
melted["domain"] = melted["domain"].str.replace("cites_", "").str.replace("_", " ")

# --- STEP 2: Group and compute proportion + CI ---
# The mean of `cited` is used as the proportion of responses citing the domain.
summary = (
    melted.groupby(["domain", "model_family"])
    .agg(
        prop_cited=("cited", "mean"),
        n=("cited", "count")
    )
    .reset_index()
)

# Standard error for proportion: sqrt(p*(1-p)/n), 95% CI via t-score
summary["se"] = (summary["prop_cited"] * (1 - summary["prop_cited"]) / summary["n"])**0.5
summary["t_score"] = t.ppf(0.975, df=summary["n"] - 1)
summary["ci"] = summary["t_score"] * summary["se"]

# Preserve domain order and model family order
summary["domain"] = pd.Categorical(summary["domain"], categories=domain_order, ordered=True)
summary["model_family"] = pd.Categorical(summary["model_family"], categories=model_family_order, ordered=True)
summary.sort_values(by=['domain', 'model_family'], inplace=True)

# --- STEP 3: Plot ---
fig = px.bar(
    summary,
    x="domain",
    y="prop_cited",
    color="model_family",
    barmode="group",
    error_y="ci",
    color_discrete_map=color_map,
)

fig.update_layout(
    title='',
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=dict(
        title='',
        tickfont=dict(size=18, color='black'),
        tickangle=-45,
        zeroline=False,
        linecolor='black',
    ),
    yaxis=dict(
        title='Proportion of Responses',
        title_font=dict(size=20, color='black'),
        tickfont=dict(size=16, color='black'),
        # Fixed tick positions; values above 0.4 get no tick label.
        tickvals=[0, 0.1, 0.2, 0.3, 0.4],
        gridcolor='lightgray',
        zeroline=False,
        linecolor='black',
    ),
    # Horizontal legend anchored at the right edge, near the top of the plot area.
    legend=dict(
        title='',
        font=dict(size=18, color='black'),
        orientation='h',
        yanchor='bottom',
        y=0.9,
        xanchor='right',
        x=1
    ),
    width=1000,
    height=600,
    margin=dict(l=0, r=10, t=10, b=0)
)

fig.show()
fig.write_image('plots/domain_citations.pdf')

In [None]:
# Horizontal bar charts of mean response length (left) and mean citation count
# (right) per prompt intent, with 95% confidence intervals.

# One row per (battle, side): each battle's intent is paired with the features of
# response A and, separately, of response B.
intent_data_a = pd.DataFrame({'intent': battle_data['primary_intent'],
                             'response_len': battle_data['conv_metadata'].apply(lambda x: x['response_length_a']),
                             'citation_count': battle_data['conv_metadata'].apply(lambda x: x['citation_count_a'])
})
intent_data_b = pd.DataFrame({'intent': battle_data['primary_intent'],
                             'response_len': battle_data['conv_metadata'].apply(lambda x: x['response_length_b']),
                             'citation_count': battle_data['conv_metadata'].apply(lambda x: x['citation_count_b'])
})
intent_data = pd.concat([intent_data_a, intent_data_b])
# The 'Other' intent is excluded from this figure.
intent_data = intent_data[intent_data['intent']!='Other']

# Fixed colour per intent. This rebinds the notebook-level `color_map`, which
# earlier cells define as the model-family colour map.
# Lookups below raise KeyError for any intent not listed here.
color_map = {
    "Factual Lookup": "#bebada",
    "Info Synthesis": "#ffffb3",
    "Analysis": "#8dd3c7",
    "Recommendation": "#80b1d3",
    "Explanation": "#fb8072",
    "Creative Generation": "#fdb462",
    "Guidance": "#b3de69",
    "Text Processing": "#e78ac3",
}

# === STEP 1: Compute group stats ===
# Per-intent mean and standard error (scipy.stats.sem) of both metrics, plus the sample size.
group_stats = intent_data.groupby("intent").agg(
    mean_response_len=('response_len', 'mean'),
    se_response_len=('response_len', sem),
    mean_citation_count=('citation_count', 'mean'),
    se_citation_count=('citation_count', sem),
    count=('response_len', 'count')
).reset_index()

# === STEP 2: Compute 95% CI ===
# Half-width = t critical value (two-sided 95%, n-1 degrees of freedom) * standard error.
group_stats['t_score'] = t.ppf(0.975, df=group_stats['count'] - 1)
group_stats['ci_response_len'] = group_stats['t_score'] * group_stats['se_response_len']
group_stats['ci_citation_count'] = group_stats['t_score'] * group_stats['se_citation_count']

# === STEP 3: Sort independently for each plot ===
# Each panel is sorted by its own metric (ascending), so the intent order can differ between panels.
sorted_len = group_stats.sort_values(by='mean_response_len', ascending=True).copy()
sorted_len['intent'] = pd.Categorical(sorted_len['intent'], categories=sorted_len['intent'], ordered=True)

sorted_cit = group_stats.sort_values(by='mean_citation_count', ascending=True).copy()
sorted_cit['intent'] = pd.Categorical(sorted_cit['intent'], categories=sorted_cit['intent'], ordered=True)

# === STEP 4: Add horizontal bars ===
fig = make_subplots(
    rows=1, cols=2,
    shared_yaxes=False,
    horizontal_spacing=0.13,
)

# Left: Response Length (one single-bar trace per intent so each bar has its own colour)
for i, row in sorted_len.iterrows():
    color = color_map[row['intent']]
    fig.add_trace(
        go.Bar(
            x=[row['mean_response_len']],
            y=[row['intent']],
            orientation='h',
            marker_color=color,
            error_x=dict(type='data', array=[row['ci_response_len']]),
            name=row['intent'],
            showlegend=False,
            opacity=0.9
        ),
        row=1, col=1
    )

# Right: Citation Count
for i, row in sorted_cit.iterrows():
    color = color_map[row['intent']]
    fig.add_trace(
        go.Bar(
            x=[row['mean_citation_count']],
            y=[row['intent']],
            orientation='h',
            marker_color=color,
            error_x=dict(type='data', array=[row['ci_citation_count']]),
            name=row['intent'],
            showlegend=False,
            opacity=0.9
        ),
        row=1, col=2
    )

# === STEP 5: Layout styling ===
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    height=500,
    width=1500,
    margin=dict(l=10, r=10, t=30, b=10),
    font=dict(size=14, color='black'),
)

# Y-axis styling
# NOTE(review): the two panels use different tick font sizes (20 vs 18) — confirm this is intended.
fig.update_yaxes(
    tickfont=dict(size=20, color='black'),
    linecolor='black',
    showgrid=False,
    row=1, col=1
)
fig.update_yaxes(
    tickfont=dict(size=18, color='black'),
    linecolor='black',
    showgrid=False,
    row=1, col=2
)

# X-axis: response length
fig.update_xaxes(
    title='Response Length (words)',
    title_font=dict(size=20, color='black'),
    tickfont=dict(size=16, color='black'),
    linecolor='black',
    gridcolor='lightgray',
    zeroline=False,
    row=1, col=1
)

# X-axis: citation count
fig.update_xaxes(
    title='Citation Count',
    title_font=dict(size=20, color='black'),
    tickfont=dict(size=16, color='black'),
    linecolor='black',
    gridcolor='lightgray',
    zeroline=False,
    row=1, col=2
)

fig.show()
fig.write_image("plots/length_citation_count_intent.pdf")

### Controlled BT Analysis

In [23]:
def visualize_bootstrap_style_coefs(style_coef_bootstrap, style_elements):
    """Plot bootstrap style-coefficient estimates with 95% percentile intervals.

    Args:
        style_coef_bootstrap: Bootstrap coefficient samples. All statistics are
            taken along axis 0 (presumably one row per bootstrap round and one
            column per coefficient -- TODO confirm against get_bt_ratings).
        style_elements: Style feature names. Only the first half of the list
            is used for axis labels, with the last two characters of each name
            stripped (e.g. "response_length_a" -> "response_length").

    Returns:
        A plotly figure with one point per coefficient (the bootstrap mean),
        error bars spanning the 2.5th-97.5th percentiles, and a dashed
        reference line at zero.
    """
    ci_low, ci_high = np.percentile(style_coef_bootstrap, [2.5, 97.5], axis=0)
    center = np.mean(style_coef_bootstrap, axis=0)
    half = len(style_elements) // 2
    labels = [name[:-2] for name in style_elements[:half]]

    # Error bars are expressed as distances from the point estimate, so the
    # upper and lower arms may differ in length.
    coef_table = (
        pd.DataFrame({
            "model": labels,
            "lower": ci_low,
            "upper": ci_high,
            "rating": center,
        })
        .assign(
            error_y=lambda d: d["upper"] - d["rating"],
            error_y_minus=lambda d: d["rating"] - d["lower"],
            rating_rounded=lambda d: np.round(d["rating"], 2),
        )
        .sort_values("rating", ascending=False)
    )

    fig = px.scatter(
        coef_table,
        x="model",
        y="rating",
        error_y="error_y",
        error_y_minus="error_y_minus",
        text="rating_rounded",
    )
    fig.add_hline(y=0, line_width=1, line_dash="dash", line_color="gray")

    fig.update_traces(
        marker=dict(size=10, color="royalblue"),
        line=dict(width=1, color="royalblue"),
        textposition='top left',
        textfont=dict(size=16, color='black')
    )

    # Settings shared by both axes.
    axis_style = dict(
        tickfont=dict(size=16, color='black'),
        linecolor='black',
        linewidth=1.0,
        showgrid=True,
        gridcolor='lightgray',
    )
    # The y-range always reaches at least -0.1 at the bottom and pads the
    # interval bounds by 0.1 on both sides.
    y_floor = min(-0.1, ci_low.min() - 0.1)
    y_ceiling = ci_high.max() + 0.1

    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        showlegend=False,
        xaxis=dict(title='', **axis_style),
        yaxis=dict(
            title=dict(text='Coefficient Estimate', font=dict(size=18, color='black')),
            range=[y_floor, y_ceiling],
            **axis_style
        ),
        margin=dict(l=10, r=10, t=10, b=10),
        height=500,
        width=700,
    )
    return fig

def visualize_single_style_coef_across_subsets(subset_dict, style_elements, anchor_model='gpt-4o-search', num_bootstrap_samples=100):
    """Fit one style coefficient per data subset and plot them side by side.

    Args:
        subset_dict: Mapping of subset label -> battle DataFrame. One
            get_bt_ratings fit is run per entry.
        style_elements: Exactly two feature names (asserted); both are passed
            through to get_bt_ratings.
        anchor_model: Forwarded to get_bt_ratings.
        num_bootstrap_samples: Forwarded to get_bt_ratings.

    Returns:
        A plotly figure with one point per subset (the bootstrap mean of the
        first coefficient), error bars spanning the 2.5th-97.5th percentiles,
        subsets ordered by increasing estimate, and a dashed line at zero.
    """
    assert len(style_elements) == 2

    def _summarize(label, frame):
        # Only the second return value (the bootstrap coefficient samples) is
        # used; index [0] selects the first coefficient after reducing along
        # axis 0.
        _, coef_samples = get_bt_ratings(
            frame,
            anchor_model=anchor_model,
            num_bootstrap_samples=num_bootstrap_samples,
            style_elements=style_elements
        )
        point = np.mean(coef_samples, axis=0)[0]
        low = np.percentile(coef_samples, 2.5, axis=0)[0]
        high = np.percentile(coef_samples, 97.5, axis=0)[0]
        return {
            "Subset": label,
            "Estimate": point,
            "Lower": low,
            "Upper": high,
            "Text": f"{point:.2f}",
        }

    summary = pd.DataFrame([_summarize(label, frame) for label, frame in subset_dict.items()])
    # Error bars are distances from the estimate, so they may be asymmetric.
    summary["Error Plus"] = summary["Upper"] - summary["Estimate"]
    summary["Error Minus"] = summary["Estimate"] - summary["Lower"]
    summary = summary.sort_values(by='Estimate')

    fig = px.scatter(
        summary,
        x="Subset",
        y="Estimate",
        error_y="Error Plus",
        error_y_minus="Error Minus",
        text="Text"
    )
    fig.add_hline(y=0, line_width=1, line_dash="dash", line_color="gray")
    fig.update_traces(
        marker=dict(size=10, color="royalblue"),
        textposition='top right',
        textfont=dict(size=14, color='black')
    )

    # Settings shared by both axes; only the grid differs between them.
    axis_style = dict(
        tickfont=dict(size=16, color='black'),
        linecolor='black',
        linewidth=1.0,
    )
    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        showlegend=False,
        xaxis=dict(title='', showgrid=False, **axis_style),
        yaxis=dict(
            title=dict(text='Coefficient Estimate', font=dict(size=18, color='black')),
            showgrid=True,
            gridcolor='lightgray',
            **axis_style
        ),
        margin=dict(l=30, r=30, t=10, b=10),
        height=500,
        width=800,
    )

    return fig

#### Response length

In [None]:
# Overall response-length coefficient: a single fit on all battles with the
# response length of each side as the only style features.
style_elements = ['response_length_a', 'response_length_b']
bt_ratings_bootstrap, style_coef_bootstrap = get_bt_ratings(
    battle_data,
    anchor_model='gpt-4o-search',
    num_bootstrap_samples=100,
    style_elements=style_elements,
)
fig = visualize_bootstrap_style_coefs(style_coef_bootstrap, style_elements)
fig.show()

In [None]:
# Response-length coefficient per primary intent: one subset (and one fit)
# for every intent value except 'Other'.
style_elements = ['response_length_a', 'response_length_b']
subsets = {
    intent: battle_data[battle_data['primary_intent'] == intent]
    for intent in battle_data['primary_intent'].unique()
    if intent != 'Other'
}
fig = visualize_single_style_coef_across_subsets(subsets, style_elements)
fig.show()
fig.write_image("plots/response_length_intent.pdf")

#### Citation count

In [None]:
# Overall citation-count coefficient, fitted only on battles in which both
# responses contain at least one citation.
curr_data = battle_data[
    battle_data['conv_metadata'].apply(
        lambda meta: meta['citation_count_a'] > 0 and meta['citation_count_b'] > 0
    )
]
style_elements = ["citation_count_a", "citation_count_b"]
bt_ratings_bootstrap, style_coef_bootstrap = get_bt_ratings(
    curr_data,
    anchor_model='gpt-4o-search',
    num_bootstrap_samples=100,
    style_elements=style_elements,
)
fig = visualize_bootstrap_style_coefs(style_coef_bootstrap, style_elements)
fig.show()

In [None]:
# Citation-count coefficient per primary intent.
# Keep only battles in which both responses contain at least one citation
# (the same filter as the overall citation-count cell).
curr_data = battle_data[battle_data['conv_metadata'].apply(lambda x: x['citation_count_a'] > 0 and x['citation_count_b'] > 0)]
style_elements = ['citation_count_a', 'citation_count_b']
# Build the per-intent subsets from the *filtered* frame. Previously they were
# sliced from `battle_data`, so the filter above was computed but never applied.
# Intents are also enumerated from `curr_data`, so an intent left with no rows
# after filtering is not handed to the fit as an empty subset.
subsets = {
    primary_intent: curr_data[curr_data['primary_intent'] == primary_intent]
    for primary_intent in curr_data['primary_intent'].unique()
    if primary_intent != 'Other'
}
fig = visualize_single_style_coef_across_subsets(subsets, style_elements)
fig.show()
fig.write_image("plots/citation_count_intent.pdf")

#### Citation sources

In [None]:
# Decisive battles only (winner is model_a or model_b) in which both responses
# contain at least one citation.
curr_battle_data = battle_data[battle_data['conv_metadata'].apply(lambda x: x['citation_count_a'] > 0 and x['citation_count_b'] > 0)]
curr_battle_data = curr_battle_data[curr_battle_data['winner'].isin(['model_a', 'model_b'])]

DOMAIN_CATEGORIES = [
    "youtube",
    "gov_edu",
    "wiki",
    "us_news",
    "foreign_news",
    "social_media",
    "community_blog",
    "tech_coding",
    "academic_journal",
    "retail",
    "other"
]

# --- Step 1: Compute control coefficients ---
# Feature names come in `_a` / `_b` pairs: every model-A feature first, then
# the matching model-B features in the same order.
CONTROL_ELEMENTS = ["citation_count_a"] + [f"cites_{domain}_a" for domain in DOMAIN_CATEGORIES]
CONTROL_ELEMENTS += ["citation_count_b"] + [f"cites_{domain}_b" for domain in DOMAIN_CATEGORIES]
anchor_model = 'gpt-4o-search'
anchor_rating = 1000
bt_ratings, _ = compute_style_control(curr_battle_data, style_elements=CONTROL_ELEMENTS)
# Shift all ratings so the anchor model sits exactly at anchor_rating; the same
# offset is handed to the bootstrap.
offset_score = (anchor_rating - bt_ratings[anchor_model])
bt_ratings += offset_score
bt_ratings_bootstrap, coef_bootstrap = compute_bootstrap_style_control(curr_battle_data, style_elements=CONTROL_ELEMENTS, num_round=100, offset=offset_score)
# One label per `_a`/`_b` pair, with the two-character suffix stripped.
# NOTE(review): assumes the columns of coef_bootstrap follow the order of the
# first half of CONTROL_ELEMENTS -- confirm against compute_bootstrap_style_control.
coef_names = [s[:-2] for s in CONTROL_ELEMENTS[:(len(CONTROL_ELEMENTS)//2)]]

# --- Step 2: Compute CI and mean ---
lower = np.percentile(coef_bootstrap, 2.5, axis=0)
upper = np.percentile(coef_bootstrap, 97.5, axis=0)
mean = np.mean(coef_bootstrap, axis=0)
coef_df = pd.DataFrame({
    'coef_name': coef_names,
    'mean': mean,
    'ci_lower': lower,
    'ci_upper': upper,
    'ci_width': (upper - lower) / 2,
    # Distances from the mean to each percentile bound. The percentile interval
    # is generally not centred on the bootstrap mean, so the plotted error bars
    # must be asymmetric to end exactly at ci_lower / ci_upper -- the same
    # bounds that decide the significance colour below.
    'err_plus': upper - mean,
    'err_minus': mean - lower,
})
coef_df = coef_df.sort_values(by='mean', ascending=True)
coef_df['coef_name'] = coef_df['coef_name'].str.replace("cites_", "").str.replace("_", " ")
# The catch-all "other" domain is fitted as a control but not shown.
coef_df = coef_df[(coef_df['coef_name'] != 'other')]

def get_significance_color(row):
    """Return green if the 95% interval lies above 0, red if below, gray if it spans 0."""
    if row['ci_lower'] > 0:
        return '#2ca02c'
    elif row['ci_upper'] < 0:
        return '#e74c3c'
    else:
        return '#a9a9a9'
coef_df['color'] = coef_df.apply(get_significance_color, axis=1)

# --- Step 3: Plot ---
fig = px.scatter(
    coef_df,
    x='mean',
    y='coef_name',
    error_x='err_plus',
    error_x_minus='err_minus',
    color=coef_df['color'],
    color_discrete_map='identity'
)
fig.update_traces(
    marker=dict(size=14, symbol='circle'),
    error_x=dict(thickness=2.5),
    showlegend=False
)

fig.add_vline(
    x=0,
    line=dict(color='black', width=1.5, dash='dash')
)

fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    title='',
    xaxis=dict(
        title=dict(text='Coefficient Estimate', font=dict(size=18, color='black')),
        linecolor='black',
        tickfont=dict(size=16, color='black'),
        gridcolor='lightgray',
    ),
    yaxis=dict(
        title='',
        tickfont=dict(size=18, color='black'),
        linecolor='black',
    ),
    font=dict(size=14),
    width=800,
    height=500,
    margin=dict(l=10, r=10, t=20, b=20)
)

fig.show()
fig.write_image('plots/citation_coefs.pdf')

In [None]:
# Controls for the "controlled" ratings: response length, citation count and
# the per-domain `cites_<domain>` features (`_a` features first, then `_b`).
CONTROL_ELEMENTS = ["response_length_a", "citation_count_a"] + [f"cites_{domain}_a" for domain in DOMAIN_CATEGORIES]
CONTROL_ELEMENTS += ["response_length_b", "citation_count_b"] + [f"cites_{domain}_b" for domain in DOMAIN_CATEGORIES]

anchor_model = 'gpt-4o-search'
# Same anchor value as the citation-source cell; set here as well so this cell
# does not depend on a variable leaking from another cell.
anchor_rating = 1000

# Uncontrolled ratings, shifted so the anchor model sits at anchor_rating.
# NOTE(review): these are fitted on `battle_data`, while the controlled ratings
# below are fitted on `curr_battle_data` (the filtered frame built in the
# citation-source cell) -- confirm the two are meant to use different data.
bt_ratings = compute_bt(battle_data)
offset_score = (anchor_rating - bt_ratings[anchor_model])
bt_ratings += offset_score

# Style-controlled ratings, anchored the same way.
bt_ratings_style, _ = compute_style_control(curr_battle_data, style_elements=CONTROL_ELEMENTS)
offset_score = (anchor_rating - bt_ratings_style[anchor_model])
bt_ratings_style += offset_score

# Seed once, *before* the loop. Re-seeding inside the loop (as before) made
# every model draw the same two offsets, which shifted each column by a
# constant instead of jittering the individual models.
random.seed(42)
bt_change_scores = {}
for model in get_model_order(curr_battle_data):
    if model in ["gpt-4o-mini-search", "gpt-4o-search-high-loc", "sonar-pro"]:
        continue
    # model -> (original score, controlled score), each with a small
    # uniform(-2, 2) jitter.
    bt_change_scores[model] = (bt_ratings[model] + random.uniform(-2.0, 2.0), bt_ratings_style[model] + random.uniform(-2.0, 2.0))
    
def viz_change_scores(bt_change_scores, initial_name, final_name):
    """Draw one two-point line per model showing how its score changes.

    Args:
        bt_change_scores: Mapping of model name -> (initial score, final score).
        initial_name: x-axis label for the first score.
        final_name: x-axis label for the second score.

    Returns:
        A plotly figure with one 'lines+markers' trace per model. Models are
        grouped into families by name ("sonar*", "gemini*", contains "gpt-4o");
        each family draws its colours from its own sequential palette. Models
        matching no family are drawn in "#444444".
    """
    # (family key, palette, name test) -- the order matters: a model's legend
    # group is the first family it matches, its colour comes from the last.
    family_specs = [
        ("pplx", px.colors.sequential.Blues, lambda m: m.startswith("sonar")),
        ("gemini", px.colors.sequential.Oranges, lambda m: m.startswith("gemini")),
        ("openai", px.colors.sequential.Greens, lambda m: "gpt-4o" in m),
    ]

    model_color = {}
    model_family = {}
    for family, palette, belongs in family_specs:
        members = sorted(filter(belongs, bt_change_scores))
        # Use only the 30%-90% slice of the palette (falling back to the whole
        # palette if that slice is empty) and spread the members evenly over it.
        first = int(len(palette) * 0.3)
        last = int(len(palette) * 0.9)
        shades = palette[first:last] or palette
        picks = np.linspace(0, len(shades) - 1, len(members), dtype=int)
        for member, pick in zip(members, picks):
            model_color[member] = shades[pick]
            model_family.setdefault(member, family)

    fig = go.Figure()
    for model, (before, after) in bt_change_scores.items():
        shade = model_color.get(model, "#444444")
        fig.add_trace(go.Scatter(
            x=[initial_name, final_name],
            y=[before, after],
            mode='lines+markers',
            name=model,
            legendgroup=model_family.get(model),
            marker=dict(size=10, color=shade),
            line=dict(width=3, color=shade),
        ))

    # Settings shared by both axes.
    axis_frame = dict(
        showgrid=True,
        gridcolor='lightgray',
        linecolor='black',
        linewidth=1.0,
    )
    fig.update_layout(
        yaxis_title="Arena Score",
        height=600, width=800,
        plot_bgcolor="white",
        margin=dict(t=20, b=20, l=20, r=20),
        xaxis=dict(
            tickfont=dict(size=20, color='black'),
            **axis_frame
        ),
        yaxis=dict(
            tickfont=dict(size=16, color='black'),
            title_font=dict(size=18, color='black'),
            # Fixed score window with a tick every 25 points starting at 1000.
            range=[975, 1170],
            tickmode='linear',
            tick0=1000,
            dtick=25,
            **axis_frame
        ),
        legend=dict(traceorder="grouped", font=dict(size=15, color='black'))
    )
    return fig

# Render the original-vs-controlled score comparison and export it as a PDF.
fig = viz_change_scores(
    bt_change_scores,
    initial_name="original",
    final_name="controlled",
)
fig.show()
fig.write_image("plots/style_control_ranking.pdf")

### Citation attribution analysis

In [None]:
# Load the citation-attribution battles.
# NOTE: this rebinds `battle_data`, so from here on the name refers to the
# attribution dataset rather than the frame loaded at the top of the notebook.
battle_data = pd.read_json('../data/citation_attribution_data.jsonl', orient='records', lines=True)
battle_data['timestamp'] = pd.to_datetime(battle_data['timestamp']).dt.tz_localize('UTC').dt.tz_convert('America/Los_Angeles')
# Keep only battles that received a vote.
battle_data = battle_data[battle_data['winner'].notna()].reset_index(drop=True)
# NOTE(review): the next cell reads battle_data['conv_metadata']; presumably
# the helper calls below recreate that column after it is dropped here --
# confirm against utils.
battle_data = battle_data.drop(columns=['conv_metadata'])

# Raw model identifiers -> display names.
# NOTE(review): "gemini-2.0-pro-exp-02-05-grounding" maps to
# "gemini-2.5-pro-grounding"; confirm this merge is intentional and not a typo.
model_mapping = {
    "gpt-4o-mini-search-preview": "gpt-4o-mini-search",
    "gpt-4o-mini-search-preview-high": "gpt-4o-mini-search-high",
    "gpt-4o-search-preview": "gpt-4o-search",
    "gpt-4o-search-preview-high": "gpt-4o-search-high",
    "gpt-4o-search-preview-high-loc": "gpt-4o-search-high-loc",
    "gemini-2.0-flash-grounding": "gemini-2.0-flash-grounding",
    "gemini-2.5-flash-preview-04-17-grounding": "gemini-2.5-flash-grounding",
    "gemini-2.0-pro-exp-02-05-grounding": "gemini-2.5-pro-grounding",
    "gemini-2.5-pro-exp-03-25-grounding": "gemini-2.5-pro-grounding",
    "gemini-2.5-pro-exp-03-25-wo-search": "gemini-2.5-pro"
}
# Assign the result back instead of calling `.replace(..., inplace=True)` on a
# column selection: under pandas Copy-on-Write the in-place form acts on a
# temporary and leaves the DataFrame unchanged, and the accompanying warning is
# hidden by the notebook-wide warnings filter.
battle_data["model_a"] = battle_data["model_a"].replace(model_mapping)
battle_data["model_b"] = battle_data["model_b"].replace(model_mapping)

# Project helpers called for their side effects on `battle_data` (their return
# values are not used) -- presumably they add the citation-count and
# support/irrelevant/contradict count features read by the next cell.
add_num_citations_style(battle_data, "citation_count")
add_cit_misattribution_counts(battle_data)

print(f'Number of battles with votes: {len(battle_data)}')
display(battle_data.head())

In [None]:
# Drop battles involving the non-search model, then keep decisive battles
# (winner is model_a or model_b) in which both responses contain at least one
# citation.
models_to_remove = ["gemini-2.5-pro"]
curr_battle_data = battle_data[~battle_data["model_a"].isin(models_to_remove) & ~battle_data["model_b"].isin(models_to_remove)]
curr_battle_data = curr_battle_data[curr_battle_data['conv_metadata'].apply(lambda x: x['citation_count_a'] > 0 and x['citation_count_b'] > 0)]
curr_battle_data = curr_battle_data[curr_battle_data['winner'].isin(['model_a', 'model_b'])]

# --- Step 1: Compute control coefficients ---
# Feature names come in `_a` / `_b` pairs: model-A features first, then the
# matching model-B features in the same order.
CONTROL_ELEMENTS = ["support_count_a", "irrelevant_count_a", "contradict_count_a"]
CONTROL_ELEMENTS += ["support_count_b", "irrelevant_count_b", "contradict_count_b"]
anchor_model = 'gpt-4o-search'
anchor_rating = 1000
bt_ratings, _ = compute_style_control(curr_battle_data, style_elements=CONTROL_ELEMENTS)
# Shift all ratings so the anchor model sits exactly at anchor_rating; the same
# offset is handed to the bootstrap.
offset_score = (anchor_rating - bt_ratings[anchor_model])
bt_ratings += offset_score
bt_ratings_bootstrap, coef_bootstrap = compute_bootstrap_style_control(curr_battle_data, style_elements=CONTROL_ELEMENTS, num_round=100, offset=offset_score)
# One label per `_a`/`_b` pair, with the two-character suffix stripped.
# NOTE(review): assumes the columns of coef_bootstrap follow the order of the
# first half of CONTROL_ELEMENTS -- confirm against compute_bootstrap_style_control.
coef_names = [s[:-2] for s in CONTROL_ELEMENTS[:(len(CONTROL_ELEMENTS)//2)]]

# --- Step 2: Compute CI and mean ---
lower = np.percentile(coef_bootstrap, 2.5, axis=0)
upper = np.percentile(coef_bootstrap, 97.5, axis=0)
mean = np.mean(coef_bootstrap, axis=0)
coef_df = pd.DataFrame({
    'coef_name': coef_names,
    'mean': mean,
    'ci_lower': lower,
    'ci_upper': upper,
    'ci_width': (upper - lower) / 2,
    # Distances from the mean to each percentile bound. The percentile interval
    # is generally not centred on the bootstrap mean, so the plotted error bars
    # must be asymmetric to end exactly at ci_lower / ci_upper -- the same
    # bounds that decide the significance colour below.
    'err_plus': upper - mean,
    'err_minus': mean - lower,
})
coef_df = coef_df.sort_values(by='mean', ascending=True)
# Human-readable labels, e.g. "support_count" -> "support count".
coef_df['coef_name'] = coef_df['coef_name'].str.replace("_", " ")

def get_significance_color(row):
    """Return green if the 95% interval lies above 0, red if below, gray if it spans 0."""
    if row['ci_lower'] > 0:
        return '#2ca02c'
    elif row['ci_upper'] < 0:
        return '#e74c3c'
    else:
        return '#a9a9a9'
coef_df['color'] = coef_df.apply(get_significance_color, axis=1)

# --- Step 3: Plot ---
fig = px.scatter(
    coef_df,
    x='mean',
    y='coef_name',
    error_x='err_plus',
    error_x_minus='err_minus',
    color=coef_df['color'],
    color_discrete_map='identity'
)
fig.update_traces(
    marker=dict(size=14, symbol='circle'),
    error_x=dict(thickness=2.5),
    showlegend=False
)

fig.add_vline(
    x=0,
    line=dict(color='black', width=1.5, dash='dash')
)

fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    title='',
    xaxis=dict(
        title=dict(text='Coefficient Estimate', font=dict(size=18, color='black')),
        linecolor='black',
        tickfont=dict(size=16, color='black'),
        gridcolor='lightgray',
    ),
    yaxis=dict(
        title='',
        tickfont=dict(size=18, color='black'),
        linecolor='black',
    ),
    font=dict(size=14),
    width=450,
    height=500,
    margin=dict(l=0, r=0, t=5, b=5)
)

fig.show()
fig.write_image('plots/misattribution_coefs.pdf')