In [11]:
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.metrics import cohen_kappa_score
from scipy import stats

In [12]:
llm_ajt = pd.read_csv("../results.csv")
human_ajt = pd.read_csv("../human-results.csv")

print(llm_ajt.columns, llm_ajt.describe())
print(human_ajt.columns, human_ajt.describe())

Index(['model', 'stimulus_number', 'trial', 'utterance',
       'acceptability_rating'],
      dtype='object')        stimulus_number        trial  acceptability_rating
count      4590.000000  4590.000000           4590.000000
mean        129.500000     2.000000              5.435294
std          29.446656     0.816586              1.663857
min          79.000000     1.000000              1.000000
25%         104.000000     1.000000              5.000000
50%         129.500000     2.000000              6.000000
75%         155.000000     3.000000              7.000000
max         180.000000     3.000000              7.000000
Index(['participant', 'age', 'gender', 'education', 'aoa_english',
       'aoa_spanish', 'cultural_id', 'blp', 'blp_history_english',
       'blp_history_spanish', 'blp_use_english', 'blp_use_spanish',
       'blp_proficiency_english', 'blp_proficiency_spanish',
       'blp_attitudes_english', 'blp_attitudes_spanish', 'proficiency_english',
       'proficiency_span

In [7]:
def generate_pairwise_preferences(df, annotator_col, stimulus_col, rating_col):
    """
    Generate pairwise preferences for each annotator
    """
    preferences = []
    
    for annotator in df[annotator_col].unique():
        annotator_data = df[df[annotator_col] == annotator]
        
        # Get all pairs of stimuli rated by this annotator
        for i, row1 in annotator_data.iterrows():
            for j, row2 in annotator_data.iterrows():
                if i >= j:  # Avoid duplicates and self-comparisons
                    continue
                
                stimulus_i = row1[stimulus_col]
                stimulus_j = row2[stimulus_col]
                rating_i = row1[rating_col]
                rating_j = row2[rating_col]
                
                # Skip ties
                if rating_i == rating_j:
                    continue
                
                # Record preference
                if rating_i > rating_j:
                    preferences.append({
                        'annotator_id': annotator,
                        'stimulus_i': stimulus_i,
                        'stimulus_j': stimulus_j,
                        'preference': f"{stimulus_i}>{stimulus_j}"
                    })
                else:
                    preferences.append({
                        'annotator_id': annotator,
                        'stimulus_i': stimulus_i,
                        'stimulus_j': stimulus_j,
                        'preference': f"{stimulus_j}>{stimulus_i}"
                    })
    
    return pd.DataFrame(preferences)

In [8]:
def calculate_pairwise_agreement(preferences_df):
    """
    Calculate pairwise agreement within the group
    """
    # Find stimulus pairs that multiple annotators evaluated
    pair_counts = preferences_df.groupby(['stimulus_i', 'stimulus_j']).size()
    shared_pairs = pair_counts[pair_counts > 1].index
    
    agreements = []
    total_comparisons = 0
    
    for stimulus_i, stimulus_j in shared_pairs:
        pair_prefs = preferences_df[
            (preferences_df['stimulus_i'] == stimulus_i) & 
            (preferences_df['stimulus_j'] == stimulus_j)
        ]
        
        if len(pair_prefs) < 2:
            continue
            
        # Check agreement for all annotator pairs who rated this stimulus pair
        annotators = pair_prefs['annotator_id'].tolist()
        preferences = pair_prefs['preference'].tolist()
        
        for i in range(len(annotators)):
            for j in range(i + 1, len(annotators)):
                total_comparisons += 1
                if preferences[i] == preferences[j]:
                    agreements.append(1)
                else:
                    agreements.append(0)
    
    if total_comparisons == 0:
        return 0.0, 0
    
    agreement_rate = np.mean(agreements)
    return agreement_rate, total_comparisons

In [9]:
def compare_group_agreements(human_prefs, llm_prefs):
    """
    Compare group-level agreement scores
    """
    human_agreement, human_comps = calculate_pairwise_agreement(human_prefs)
    llm_agreement, llm_comps = calculate_pairwise_agreement(llm_prefs)
    
    print(f"Human Agreement: {human_agreement:.3f} ({human_comps} comparisons)")
    print(f"LLM Agreement: {llm_agreement:.3f} ({llm_comps} comparisons)")
    print(f"Difference: {llm_agreement - human_agreement:.3f}")
    
    return human_agreement, llm_agreement

In [16]:
from tqdm.auto import tqdm


def bootstrap_significance_test(human_prefs, llm_prefs, n_bootstrap=1000):
    """
    Step 4: Statistical significance testing via bootstrap
    """
    from tqdm import tqdm
    
    def bootstrap_agreement(prefs_df):
        # Resample annotators with replacement
        annotators = prefs_df['annotator_id'].unique()
        sampled_annotators = np.random.choice(annotators, size=len(annotators), replace=True)
        
        # Create bootstrap sample
        bootstrap_prefs = []
        for annotator in sampled_annotators:
            annotator_prefs = prefs_df[prefs_df['annotator_id'] == annotator].copy()
            bootstrap_prefs.append(annotator_prefs)
        
        if bootstrap_prefs:
            bootstrap_df = pd.concat(bootstrap_prefs, ignore_index=True)
            agreement, _ = calculate_pairwise_agreement(bootstrap_df)
            return agreement
        return 0.0
    
    # Generate bootstrap samples
    human_boots = []
    llm_boots = []
    
    print("Running bootstrap analysis...")
    for _ in tqdm(range(n_bootstrap), desc="Bootstrap iterations"):
        human_boots.append(bootstrap_agreement(human_prefs))
        llm_boots.append(bootstrap_agreement(llm_prefs))
    
    # Calculate difference distribution
    diff_boots = np.array(llm_boots) - np.array(human_boots)
    
    # Two-tailed p-value
    observed_diff = np.mean(llm_boots) - np.mean(human_boots)
    p_value = np.mean(np.abs(diff_boots) >= np.abs(observed_diff))
    
    print(f"Bootstrap p-value: {p_value:.3f}")
    print(f"95% CI for difference: [{np.percentile(diff_boots, 2.5):.3f}, {np.percentile(diff_boots, 97.5):.3f}]")
    
    return p_value, diff_boots

In [13]:
# Step 1: Generate pairwise preferences
human_prefs = generate_pairwise_preferences(
    human_ajt, 'participant', 'stimulus_number', 'rating'
)
llm_prefs = generate_pairwise_preferences(
    llm_ajt, 'model', 'stimulus_number', 'acceptability_rating'
)

In [14]:
# Step 2: Compare agreements
human_agreement, llm_agreement = compare_group_agreements(human_prefs, llm_prefs)

Human Agreement: 0.672 (16342 comparisons)
LLM Agreement: 0.840 (22027452 comparisons)
Difference: 0.168


In [17]:
# Step 3: Test significance
bootstrap_significance_test(human_prefs, llm_prefs)

Running bootstrap analysis...


Bootstrap iterations:   0%|▏                                                                                                               | 2/1000 [00:31<4:23:25, 15.84s/it]


KeyboardInterrupt: 