In [1]:
import pandas as pd
import numpy as np

In [2]:
llm_ajt = pd.read_csv("../results.csv")
# Take median of three scores for LLM AJT responses
llm_ajt = llm_ajt.groupby(["model", "stimulus_number"])["acceptability_rating"].median().reset_index()

human_ajt = pd.read_csv("../human-results.csv")

# Limit to utterances rated by both LLM and human annotators
shared_stimulus_ids = set(llm_ajt["stimulus_number"]).intersection(set(human_ajt["stimulus_number"]))
llm_ajt = llm_ajt[llm_ajt["stimulus_number"].isin(shared_stimulus_ids)]
human_ajt = human_ajt[human_ajt["stimulus_number"].isin(shared_stimulus_ids)]

print(llm_ajt.columns, llm_ajt.describe())
print(human_ajt.columns, human_ajt.describe())

Index(['model', 'stimulus_number', 'acceptability_rating'], dtype='object')        stimulus_number  acceptability_rating
count       720.000000            720.000000
mean        122.750000              5.238889
std          35.522623              1.629403
min          79.000000              1.000000
25%          90.750000              4.000000
50%         116.000000              5.000000
75%         148.000000              7.000000
max         180.000000              7.000000
Index(['participant', 'age', 'gender', 'education', 'aoa_english',
       'aoa_spanish', 'cultural_id', 'blp', 'blp_history_english',
       'blp_history_spanish', 'blp_use_english', 'blp_use_spanish',
       'blp_proficiency_english', 'blp_proficiency_spanish',
       'blp_attitudes_english', 'blp_attitudes_spanish', 'proficiency_english',
       'proficiency_spanish', 'lextale_english', 'lextale_spanish', 'section',
       'language', 'stimulus_number', 'structure', 'condition',
       'lexicalization', 'rating'

In [3]:
def generate_pairwise_preferences(df, annotator_col, stimulus_col, rating_col):
    """
    Generate pairwise preferences for each annotator
    """
    preferences = []
    
    for annotator in df[annotator_col].unique():
        annotator_data = df[df[annotator_col] == annotator]
        
        # Get all pairs of stimuli rated by this annotator
        for i, row1 in annotator_data.iterrows():
            for j, row2 in annotator_data.iterrows():
                if i >= j:  # Avoid duplicates and self-comparisons
                    continue
                
                stimulus_i = row1[stimulus_col]
                stimulus_j = row2[stimulus_col]
                rating_i = row1[rating_col]
                rating_j = row2[rating_col]
                
                # Skip ties
                if rating_i == rating_j:
                    continue
                
                # Record preference
                if rating_i > rating_j:
                    preferences.append({
                        'annotator_id': annotator,
                        'stimulus_i': stimulus_i,
                        'stimulus_j': stimulus_j,
                        'preference': f"{stimulus_i}>{stimulus_j}"
                    })
                else:
                    preferences.append({
                        'annotator_id': annotator,
                        'stimulus_i': stimulus_i,
                        'stimulus_j': stimulus_j,
                        'preference': f"{stimulus_j}>{stimulus_i}"
                    })
    
    return pd.DataFrame(preferences)

In [4]:
def calculate_pairwise_agreement(preferences_df):
    """
    Calculate pairwise agreement within the group
    """
    # Find stimulus pairs that multiple annotators evaluated
    pair_counts = preferences_df.groupby(['stimulus_i', 'stimulus_j']).size()
    shared_pairs = pair_counts[pair_counts > 1].index
    
    agreements = []
    total_comparisons = 0
    
    for stimulus_i, stimulus_j in shared_pairs:
        pair_prefs = preferences_df[
            (preferences_df['stimulus_i'] == stimulus_i) & 
            (preferences_df['stimulus_j'] == stimulus_j)
        ]
        
        if len(pair_prefs) < 2:
            continue
            
        # Check agreement for all annotator pairs who rated this stimulus pair
        annotators = pair_prefs['annotator_id'].tolist()
        preferences = pair_prefs['preference'].tolist()
        
        for i in range(len(annotators)):
            for j in range(i + 1, len(annotators)):
                total_comparisons += 1
                if preferences[i] == preferences[j]:
                    agreements.append(1)
                else:
                    agreements.append(0)
    
    if total_comparisons == 0:
        return 0.0, 0
    
    agreement_rate = np.mean(agreements)
    return agreement_rate, total_comparisons

In [5]:
def compare_group_agreements(human_prefs, llm_prefs):
    """
    Compare group-level agreement scores
    """
    human_agreement, human_comps = calculate_pairwise_agreement(human_prefs)
    llm_agreement, llm_comps = calculate_pairwise_agreement(llm_prefs)
    
    print(f"Human Agreement: {human_agreement:.3f} ({human_comps} comparisons)")
    print(f"LLM Agreement: {llm_agreement:.3f} ({llm_comps} comparisons)")
    print(f"Difference: {llm_agreement - human_agreement:.3f}")
    
    return human_agreement, llm_agreement

In [6]:
def bootstrap_agreement(prefs_df):
    """Helper function for bootstrap resampling"""
    # Resample annotators with replacement
    annotators = prefs_df['annotator_id'].unique()
    sampled_annotators = np.random.choice(annotators, size=len(annotators), replace=True)
    
    # Create bootstrap sample
    bootstrap_prefs = []
    for annotator in sampled_annotators:
        annotator_prefs = prefs_df[prefs_df['annotator_id'] == annotator].copy()
        bootstrap_prefs.append(annotator_prefs)
    
    if bootstrap_prefs:
        bootstrap_df = pd.concat(bootstrap_prefs, ignore_index=True)
        agreement, _ = calculate_pairwise_agreement(bootstrap_df)
        return agreement
    return 0.0

def bootstrap_iteration(args):
    """Single bootstrap iteration - must be at module level for multiprocessing"""
    human_prefs, llm_prefs, seed = args
    np.random.seed(seed)
    human_boot = bootstrap_agreement(human_prefs)
    llm_boot = bootstrap_agreement(llm_prefs)
    return human_boot, llm_boot

def bootstrap_significance_test(human_prefs, llm_prefs, n_bootstrap=1000, n_cores=6):
    """
    Step 4: Statistical significance testing via bootstrap (parallelized)
    """
    from multiprocessing import Pool
    try:
        from tqdm.auto import tqdm
        tqdm_available = True
    except ImportError:
        tqdm_available = False
        print("tqdm not available, running without progress bar")
    
    # Prepare arguments for parallel processing
    args_list = [(human_prefs, llm_prefs, i) for i in range(n_bootstrap)]
    
    # Run bootstrap iterations in parallel with progress bar
    with Pool(n_cores) as pool:
        if tqdm_available:
            results = list(tqdm(pool.imap(bootstrap_iteration, args_list), 
                              total=n_bootstrap, 
                              desc="Bootstrap iterations"))
        else:
            results = pool.map(bootstrap_iteration, args_list)
            print(f"Completed {n_bootstrap} bootstrap iterations")
    
    # Unpack results
    human_boots, llm_boots = zip(*results)
    human_boots = list(human_boots)
    llm_boots = list(llm_boots)
    
    # Calculate difference distribution
    diff_boots = np.array(llm_boots) - np.array(human_boots)
    
    # Two-tailed p-value
    observed_diff = np.mean(llm_boots) - np.mean(human_boots)
    p_value = np.mean(np.abs(diff_boots) >= np.abs(observed_diff))
    
    print(f"Bootstrap p-value: {p_value:.3f}")
    print(f"95% CI for difference: [{np.percentile(diff_boots, 2.5):.3f}, {np.percentile(diff_boots, 97.5):.3f}]")
    
    return p_value, diff_boots

In [7]:
# Step 1: Generate pairwise preferences
human_prefs = generate_pairwise_preferences(
    human_ajt, 'participant', 'stimulus_number', 'rating'
)
llm_prefs = generate_pairwise_preferences(
    llm_ajt, 'model', 'stimulus_number', 'acceptability_rating'
)

In [8]:
# Step 2: Compare agreements
human_agreement, llm_agreement = compare_group_agreements(human_prefs, llm_prefs)

Human Agreement: 0.672 (16342 comparisons)
LLM Agreement: 0.866 (60052 comparisons)
Difference: 0.193


In [9]:
# Step 3: Test significance
import multiprocessing

bootstrap_significance_test(human_prefs, llm_prefs, n_cores=multiprocessing.cpu_count())

Bootstrap iterations:   0%|          | 0/1000 [00:00<?, ?it/s]

Bootstrap p-value: 0.502
95% CI for difference: [0.077, 0.229]


(np.float64(0.502),
 array([0.21670358, 0.1173837 , 0.14321031, 0.19686647, 0.20435964,
        0.18550199, 0.2107745 , 0.19803433, 0.14682387, 0.14907066,
        0.16543062, 0.1817052 , 0.1557842 , 0.13083612, 0.15727408,
        0.19447326, 0.10860588, 0.17652082, 0.0831419 , 0.07718239,
        0.22514517, 0.21399122, 0.16583236, 0.15834233, 0.14247425,
        0.18074066, 0.2252591 , 0.17690553, 0.11319351, 0.1958681 ,
        0.13396077, 0.15003976, 0.09448314, 0.15561152, 0.18486094,
        0.14480757, 0.16761729, 0.08411716, 0.22448345, 0.12800722,
        0.16312736, 0.16346455, 0.20750253, 0.12292501, 0.10691937,
        0.17734833, 0.05695194, 0.15422835, 0.19848603, 0.15675815,
        0.20081492, 0.14268835, 0.06983843, 0.18171205, 0.17996832,
        0.22278148, 0.1643945 , 0.13290053, 0.1350899 , 0.13566995,
        0.14227682, 0.15244832, 0.10059158, 0.15472301, 0.1934317 ,
        0.22346849, 0.18383774, 0.102064  , 0.15474983, 0.15190232,
        0.20478537, 0.176895