In [1]:
import pandas as pd
import numpy as np
from scipy import stats

from movie_pkg.llm_human_agreement import *

### Differences in proportion agreement
* here, we use proportion agreement as our primary alignment metric rather than Cohen’s Kappa because the data were highly unbalanced (i.e., events were far more often not described than described), a condition known to distort κ estimates.* moreover, because both mappings and omissions were central to our research question, we required a metric that would not be differentially biased toward either outcome under class imbalance (Byrt et al., 1993; Friend et al., 2023)* this measure could also be reported as prevalance-adjusted bias-adjusted Kappa (PABAK = prop.agreement/2)

In [None]:
base_prim_coder = '' # path to your primary coder's codes
base_sec_coder = '' # path to your secondary coder's codes
base_llm = '' # path to the llm-generated codes

In [3]:
human_llm_scores = []
human_human_scores = []

for subject in get_reliability_subjects():
    sub_hl_scores = []
    sub_hh_scores = []

    for movie in get_rel_subject_movies(subject):
        
        human_llm_score = compute_proportion_agreement_csv_tsv(
            f"{base_prim_coder}/{subject}_{movie}.csv",
            f"{base_llm}/temple{subject}/subj-temple{subject}_{movie}_event_coded.tsv",
            verbose=False, flexible_match=False, fuzzy=True) # allow fuzzy matching to account for differences in punctuation, capitalization, etc.
        
        human_human_score = compute_proportion_agreement_csv(
            f"{base_prim_coder}/{subject}_{movie}.csv",
            f"{base_sec_coder}/{subject}_{movie}.csv",
            verbose=False, flexible_match=False, fuzzy=True)
        
        sub_hl_scores.append(human_llm_score)  
        sub_hh_scores.append(human_human_score)  
        
    human_llm_scores.append(np.mean(sub_hl_scores))
    human_human_scores.append(np.mean(sub_hh_scores))

print()
print(f'Human-LLM agreement: {np.mean(human_llm_scores)}, range: {min(human_llm_scores)}-{max(human_llm_scores)}, SD: {np.std(human_llm_scores)}')
print(f'Human-Human agreement: {np.mean(human_human_scores)}, range: {min(human_human_scores)}-{max(human_human_scores)}, SD: {np.std(human_human_scores)}')


Human-LLM agreement: 0.8735606947697111, range: 0.75-0.9861111111111112, SD: 0.06916085144099864
Human-Human agreement: 0.9025744860785843, range: 0.8555327868852459-0.9508196721311475, SD: 0.0255615060270125


In [4]:
hh = np.array(human_human_scores)
hl = np.array(human_llm_scores)

res = stats.ttest_ind(hh, hl)

t_stat = res.statistic
p_val = res.pvalue
df = res.df

print(f't({df}) = {t_stat}, p = {p_val}')

t(26.0) = 1.4187697199983234, p = 0.1678431885499739


### Ensure no differences in automated coding performance based on subject's age

In [6]:
ages = np.array(get_rel_subject_ages())

r_h, p_val_h = stats.pearsonr(ages, hh)
r_l, p_val_l = stats.pearsonr(ages, hl)

print(f'human-human by age: r = {r_h}, p = {p_val_h}')
print(f'human-llm by age: r = {r_l}, p = {p_val_l}')

human-human by age: r = -0.3022042466671977, p = 0.29365740606795876
human-llm by age: r = -0.3804903709110294, p = 0.17957081454261997


### Characterize disagreements between human-human and human-llm coding

**Correlation between what movie events lead to disagreements:**

In [9]:
results = []

for sub in get_reliability_subjects():
    for movie in get_rel_subject_movies(subject):
        file1 = f"{base_prim_coder}/{sub}_{movie}.csv"
        file2 = f"{base_sec_coder}/{sub}_{movie}.csv"
        file3 = f"{base_llm}/temple{sub}/subj-temple{sub}_{movie}_event_coded.json"

        try:
            result = mismatch_correlation(str(file1), str(file2), str(file3), verbose=False)
            result.update({"subject": sub, "movie": movie})
            results.append(result)
        except Exception as e:
            print(f"Skipping subj-{sub}, movie-{movie}: {e}")

df = pd.DataFrame(results)
df_clean = df.dropna(subset=['fisher_z'])

# compute mean Fisher z and Pearson r
mean_fisher_z = df_clean['fisher_z'].mean()
mean_r = tanh(mean_fisher_z)

print(f"\nMean Fisher z: {mean_fisher_z:.3f}")
print(f"Mean Pearson r: {mean_r:.3f}")

# group-level significance test
fisher_zs = df_clean['fisher_z'].values
t_stat, p_val = ttest_1samp(fisher_zs, 0)
print(f"Group-level t = {t_stat:.3f}, p = {p_val:.4f}")


Mean Fisher z: 0.435
Mean Pearson r: 0.409
Group-level t = 9.306, p = 0.0000


**Mismatch distance: how far apart are events that human-human coders disagreed on from human-llm disagreements**

In [24]:
human_scores = []
llm_scores = []

for subject in get_reliability_subjects():
    sub_scores = []
    llm_sub_scores = []
    for movie in get_rel_subject_movies(subject):
        file1 = f"{base_prim_coder}/{subject}_{movie}.csv"
        file2 = f"{base_sec_coder}/{subject}_{movie}.csv"
        file3 = f"{base_llm}/temple{subject}/subj-temple{subject}_{movie}_event_coded.json"

        try:
            score = compute_human_human_mismatch_distance(file1, file2, verbose=False)
            #print("Human–Human average mismatch distance:", score)
            sub_scores.append(score)

            llm_score = compute_human_llm_mismatch_distance(file1, file3, verbose=False)
            llm_sub_scores.append(llm_score)

            
        except Exception as e:
            print(f"        Skipping missing: {subject}, {movie}: {e}")
            sub_scores.append(np.nan)
    human_scores.append(np.nanmean(sub_scores))
    llm_scores.append(np.nanmean(llm_sub_scores))


In [26]:
x = np.array(human_scores)
y = np.array(llm_scores)


r, p = pearsonr(x, y)
print(f"Pearson r = {r:.3f}, p = {p:.4f}")


Pearson r = 0.506, p = 0.0647
