# Summary

This notebook evaluates AI model performance against human annotators in classifying moral foundations using Haidt's Moral Foundations Theory. 

Key Components:
- Data: WandB experiment results
- Method: variant of Dawid-Skene's competence model implemented in TensorFlow to estimate annotator competence and consensus
- Metrics: True/false positive rates, balanced accuracy, and percentile rankings

Main Findings: The AI models achieve higher balanced accuracy than most human annotators, ranking in the 75th–100th percentile of the annotator pool.

# Data

Load WandB experiment results and predictions

In [None]:
import wandb
import pandas as pd
from pprint import pprint
api = wandb.Api()

# W&B run to analyse. Exactly one `run_id` assignment must be active, otherwise
# the cell fails with a NameError on a fresh kernel. The active default is the
# run whose config is shown in this cell's recorded output
# (llama4_maverick evaluated on morality-MFRC).
# run_id = "1f1usujv" # claude-4-sonnet MFTC
# run_id = "pckcakff"  # claude-4-sonnet MFTC
# run_id = "5kucsisw" # deepseek MFTC
# run_id = "yon5adbv" # deepseek MFTC
# run_id = "7krxewfl" # deepseek eMFD
# run_id = "jpsu9gfg" # deepseek MFRC
# run_id = "744jcvse" # claude-4-sonnet MFRC
# run_id = "d1kvjg6q" # claude-4-sonnet emfd
# run_id = "2fisp7sj" # llama4_maverick MFTC
run_id = "fq9kn2ok" # llama4_maverick MFRC
# run_id = "jv4exac1" # llama4_maverick emfd

project_name = "morality-llm"
run = api.run(f"{project_name}/{run_id}")
pprint(run.config)
artifact = api.artifact(f"{project_name}/run-{run_id}-predictions:latest")

# Convert the logged "predictions" W&B table into a pandas DataFrame.
table = artifact.get("predictions")
table = pd.DataFrame(table.data,columns=table.columns)

{'demo': 'false',
 'model_name': 'llama4_maverick',
 'prompt': 'You are an expert in moral psychology, classifying text according '
           "to Haidt's theory.\n"
           '            For each moral foundations, mark true if moral values '
           'from that foundation are expressed in the text, false if not '
           'expressed.\n'
           '\n'
           '            Answer only with a valid JSON in this format:\n'
           '            {\n'
           '                "care/harm": [true / false],\n'
           '                "fairness/cheating": [true / false],\n'
           '                "loyalty/betrayal": [true / false],\n'
           '                "authority/subversion": [true / false],\n'
           '                "sanctity/degradation": [true / false],\n'
           '            }\n'
           '            ',
 'random_state': 13,
 'sample': -1,
 'temperature': 0.3,
 'test_data': 'morality-MFRC'}


[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [None]:
import re

# The five moral foundations, in the order the prompt asks the model to answer them.
moral_targets = [
    'care',
    'fairness',
    'loyalty',
    'authority',
    'sanctity',
]

# Captures a true/false token that follows a colon, optionally wrapped in
# square brackets, ignoring letter case.
pattern = re.compile(r':\s*\[?(true|false)\]?', flags=re.IGNORECASE)

def extract_booleans(text):
    """Parse up to five booleans from ``: true`` / ``: false`` occurrences in ``text``.

    The text is lower-cased before matching, so the match is case-insensitive.
    Values are returned in order of appearance; matches after the fifth are ignored.
    """
    flags = []
    for found in re.finditer(r':\s*(true|false)', text.lower()):
        if len(flags) == 5:
            break
        flags.append(found.group(1) == 'true')
    return flags


# Parse every raw model response into five booleans, one per moral foundation.
# `pattern` must still be the compiled regex defined above (later cells rebind
# the name to a plain string). Because that regex is case-insensitive, a captured
# token may be 'True' or 'TRUE'; normalise the case before comparing, otherwise
# such answers are silently read as False.
y_pred = table['predictions'].str.findall(pattern).str[:5].apply(lambda x: [s.lower() == 'true' for s in x])
# y_pred = table['predictions'].apply(extract_booleans)
y_pred = pd.DataFrame(y_pred.tolist(), columns=moral_targets, index=table['index'])
y_pred.index.name = 'text_id'
# Share of missing values per foundation (responses with fewer than five parsed answers).
print("NaN",y_pred.isna().mean())
print(y_pred.shape)
y_pred.head()

In [None]:
from datasets import load_dataset
from IPython.display import display
# Human annotations for the corpus this run was evaluated on.
test_data = run.config['test_data']
ds_test = load_dataset(f"maciejskorski/{test_data}")['train']
annots = ds_test.to_pandas().set_index('text_id')
display(annots.head())

# Reshape to wide form: one row per (text_id, text), one column per annotator.
# 'first' keeps the first label when an annotator has several rows for a text.
annots = annots.pivot_table(
    values='label',
    index=['text_id', 'text'],
    columns='annotator',
    aggfunc='first',
)

In [None]:
# # annots = annots.join(y_pred["care"])

# target = 'sanctity'
# y_preds =  pd.concat([annots[t].str.lower().str.contains(target) for t in annots.columns],axis=1)
# y_preds = y_preds.join(y_pred[target].astype(bool),how='inner')
# y_preds = y_preds.astype(float)
# print(y_preds.shape)


# Flag, for every (text, annotator) label, whether it mentions any of the five
# foundations, then return to the wide texts x annotators layout as floats.
pattern = '|'.join(moral_targets)
y_preds = (
    annots.stack()
    .str.lower()
    .str.contains(pattern)
    .fillna(0)
    .unstack()
    .astype(float)
)
A = y_preds.values

In [None]:
import numpy as np
from sklearn.metrics import cohen_kappa_score
from itertools import combinations

from itertools import combinations
from sklearn.metrics import balanced_accuracy_score

def pabak(annotations):
    """Average pairwise PABAK (``2 * observed_agreement - 1``) across annotators.

    Parameters
    ----------
    annotations : array-like, shape [items, annotators]
        Labels given by each annotator; ``NaN`` marks an item the annotator
        did not label.

    Returns
    -------
    float
        Mean of ``2 * p_o - 1`` over all annotator pairs that share at least one
        item, where ``p_o`` is the fraction of shared items with equal labels.
        ``NaN`` when no pair of annotators has an item in common.
    """
    annotations = np.asarray(annotations, dtype=float)
    pair_scores = []
    for ann1, ann2 in combinations(range(annotations.shape[1]), 2):
        y1, y2 = annotations[:, ann1], annotations[:, ann2]
        # Only items labelled by both annotators are comparable.
        mask = ~(np.isnan(y1) | np.isnan(y2))
        if not mask.any():
            # No overlap: skip instead of averaging an empty slice, which would
            # emit a RuntimeWarning and contribute a NaN.
            continue
        po = np.mean(y1[mask] == y2[mask])
        pair_scores.append(2 * po - 1)

    if not pair_scores:
        return np.nan
    return np.mean(pair_scores)


# Inter-annotator agreement for each foundation, plus 'any' (a label that
# mentions at least one of the five foundations).
for target in moral_targets + ['any']:
    pattern = '|'.join(moral_targets) if target == 'any' else target

    y_preds = annots.stack().str.lower().str.contains(pattern).unstack().astype(float)
    A = y_preds.values
    score = pabak(A)

    print(target, score )

# A = y_preds.values
# outs = []

# for i in range(A.shape[1]):
#     mask = np.ones(A.shape[1])==1
#     mask[i] =  False
#     avg_kappa = pabak(A[:,mask])
#     outs.append( avg_kappa )
#     print(f"Average PABAK w/o {i}: {avg_kappa:.3f}")
# outs = np.array(outs)
# outs.mean()

# Dawid-Skene

Implement an annotator competence estimation model in TensorFlow, a variation of Dawid-Skene's algorithm.

## TensorFlow

In [None]:
import tensorflow as tf
import numpy as np
import tensorflow_probability as tfp
from tqdm import tqdm

def init(J, K, prior_alpha=None):
    """Create trainable parameters and priors for the annotator competence model.

    Parameters
    ----------
    J : int
        Number of annotators.
    K : int
        Number of classes.
    prior_alpha : sequence of float, optional
        Dirichlet concentration (length K) for the class-prevalence prior.
        When omitted, the module-level ``alpha`` is used, which is what this
        function always did; pass it explicitly to avoid depending on whatever
        value an earlier cell left in ``alpha``.

    Returns
    -------
    pi_logits : tf.Variable, shape [K]
        Class-prevalence logits, initialised with small random noise.
    theta_logits : tf.Variable, shape [J, K, K]
        Confusion-matrix logits (annotator x true class x observed class),
        initialised so each row is ``0.6 * identity + 0.4 / K``.
    class_prior : tfp.distributions.Dirichlet
        Prior over class prevalences.
    confusion_prior : tfp.distributions.Dirichlet
        Prior over confusion-matrix rows with concentration 1.8 on the diagonal
        and 1.2 elsewhere (3 x [0.6 / 0.4]).
    """
    concentration = alpha if prior_alpha is None else prior_alpha

    pi_logits = tf.Variable(tf.random.normal([K]) * 0.1, name='pi_logits')
    initial_theta = 0.6*tf.eye(K)  + 0.4/K
    theta_logits = tf.Variable(
        # small epsilon guards the log against exact zeros
        tf.math.log(tf.tile(initial_theta[None,:,:], [J,1,1]) + 1e-8),
        name='theta_logits'
    ) # annotator x true class x pred class
    class_prior = tfp.distributions.Dirichlet(concentration, name='pi_prior')
    confusion_alpha = tf.ones([J, K, K]) * 0.4
    diag_values = tf.fill([J, K], 0.6)  # Shape [J, K] for J annotators, K classes
    confusion_alpha = tf.linalg.set_diag(confusion_alpha, diag_values)
    confusion_prior = tfp.distributions.Dirichlet(3*confusion_alpha, name='confusion_priors')
    return pi_logits, theta_logits, class_prior, confusion_prior


def log_p(pi_logits, theta_logits, annot_ids):
    """
    Unnormalised log-posterior of a Dawid-Skene style competence model:
    data log-likelihood plus Dirichlet log-priors on class prevalences and
    annotator confusion matrices.

    Parameters:
    -----------
    pi_logits : tf.Variable, shape [K]
        Logits for class prevalence distribution π (before softmax normalization)
    theta_logits : tf.Variable, shape [J, K, K]
        Logits for annotator confusion matrices θ (before softmax normalization)
        θ[j,i,k] = P(annotator j labels class k | true class i)
    annot_ids : tf.SparseTensor, dense shape [N, J]
        One entry per observed (item, annotator) pair. The stored value is
        ``annotator * K + observed_class``, i.e. the row to fetch from the
        flattened confusion table built below.

    Returns:
    --------
    tf.Tensor : scalar
        log P(annotations | π, θ) + log P(π) + Σⱼ log P(θⱼ)

    Notes:
    ------
    The priors are read from the module-level ``class_prior`` and
    ``confusion_prior`` objects. The number of classes is taken from
    ``theta_logits`` itself rather than from global ``J`` / ``K``.

    Mathematical formulation:
    ------------------------
    log P(annotations) = Σᵢ log Σₖ π_k Πⱼ θ[j, k, yᵢⱼ]
    where yᵢⱼ is the annotation by annotator j on item i and the product runs
    over the annotators who labelled item i.
    """
    num_classes = theta_logits.shape[-1]

    log_pi = tf.nn.log_softmax(pi_logits)
    log_theta = tf.nn.log_softmax(theta_logits, axis=-1)  # [annotator, true class, observed class]
    pi = tf.math.exp(log_pi)  # [true class]
    theta = tf.math.exp(log_theta)

    # Flatten to a lookup table whose row index is annotator * K + observed class
    # and whose columns are the true classes.
    lookup = tf.reshape(tf.transpose(log_theta, [0, 2, 1]), (-1, num_classes))

    # Summing the looked-up rows over an item's annotations gives, per true class,
    # the log-probability of all its observed labels.
    item_log_lik = tf.nn.embedding_lookup_sparse(lookup, annot_ids, sp_weights=None, combiner='sum')  # [items, true class]
    item_log_lik += log_pi[None, :]
    total = tf.reduce_sum(tf.reduce_logsumexp(item_log_lik, axis=1))  # marginalise the true class, sum over items

    total += class_prior.log_prob(pi)
    total += tf.reduce_sum(confusion_prior.log_prob(theta))
    return total


# Optimiser shared by the graph-compiled training functions in this cell.
optimizer = tf.optimizers.Adam(1e-2)
max_iter = 2000


@tf.function()
def train_step(pi_logits, theta_logits, annot_ids):
    """Take one gradient step on the negative ``log_p`` and return that loss.

    Both ``pi_logits`` and ``theta_logits`` are updated in place through the
    module-level ``optimizer``.
    """
    params = [pi_logits, theta_logits]
    with tf.GradientTape() as tape:
        loss = -log_p(pi_logits, theta_logits, annot_ids)
    grads = tape.gradient(loss, params)
    optimizer.apply_gradients(zip(grads, params))
    return loss


@tf.function()
def train(pi_logits, theta_logits, annot_ids, max_iter=tf.constant(1)):
    """Run ``max_iter`` successive ``train_step`` updates within one graph call.

    Returns nothing; the variables are updated in place. The ``print`` is
    ordinary Python, so it fires when the function is traced rather than on
    every update.
    """
    print("tracing")
    for step in tf.range(max_iter):
        step_loss = train_step(pi_logits, theta_logits, annot_ids)

# --- Sanity check on synthetic data ---
# 10,000 binary items labelled by 5 simulated annotators; each label is flipped
# independently with probability 0.25, so every annotator's true accuracy is 0.75.
true_labels = tf.random.uniform([10000], 0, 2, dtype=tf.int32)
errors = tf.random.uniform([10000, 5]) < 0.25
annotations = tf.where(errors, 1 - true_labels[:, None], true_labels[:, None]).numpy()
 
N, J = annotations.shape
K = 2
alpha = [80,20]  # class-prior concentration picked up by init() as its default

# Encode the observations sparsely: entry (item, annotator) stores
# annotator * K + label, the row index log_p looks up in its confusion table.
valid_mask = ~np.isnan(annotations) 
n_coords, j_coords = np.where(valid_mask)
idxs = np.column_stack([n_coords, j_coords])
idx_values = j_coords * K + annotations[valid_mask].astype(int)
annot_ids = tf.SparseTensor(indices=idxs, values=idx_values, dense_shape=[N, J])

pi_logits, theta_logits, class_prior, confusion_prior = init(J,K)

# Warm-up: three calls with the default max_iter, i.e. one update each.
for _ in range(3):
    train(pi_logits, theta_logits, annot_ids)


# Use the first GPU when one is visible, otherwise fall back to the CPU so the
# cell also runs on machines without a GPU.
device = "/GPU:0" if tf.config.list_physical_devices("GPU") else "/CPU:0"
with tf.device(device):
    train(pi_logits, theta_logits, annot_ids, max_iter=tf.constant(2000))


# Competence = mean of the confusion-matrix diagonal (per-class accuracy averaged
# with equal class weights); it should recover the simulated 0.75.
theta = tf.nn.softmax(theta_logits, axis=-1)
competences = tf.reduce_sum(tf.ones(K)*1.0/K * tf.linalg.diag_part(theta), axis=1).numpy()
tf.debugging.assert_near(competences, 0.75, atol=2e-2)

In [None]:
from IPython.display import display
from scipy import stats
import altair as alt

# Bar colour per moral dimension; the key order also fixes the legend order
# when the dict is turned into a colour scale.
moral_colors = dict(
    authority='#6A4C93',
    care='#00B4A6',
    fairness='#3498DB',
    loyalty='#E74C3C',
    sanctity='#F39C12',
    any='gray',
)

results = dict()  # target -> summary statistics, filled by the loop below
charts = list()   # one layered chart per target, in loop order
# For every foundation (plus 'any'): add the AI model as one more annotator,
# fit the competence model, record summary statistics and draw a bar chart of
# each annotator's balanced accuracy with the AI highlighted.
for target in moral_targets + ['any']: # moral_targets+

    if target == 'any':
        y_pred_t = y_pred.any(axis=1)
        pattern = '|'.join(moral_targets)
    else:
        y_pred_t = y_pred[target].astype(bool)
        pattern = target
    y_preds = annots.stack().str.lower().str.contains(pattern).unstack()
    # y_preds =  annots.stack().str.lower().str.contains(target).unstack()
    y_pred_t.name = run.config['model_name']
    # The model's column is appended last, so index -1 below always refers to the AI.
    y_preds = y_preds.join(y_pred_t,how='inner')
    y_preds = y_preds.astype(float)
    annotations = y_preds.values

    N, J = annotations.shape
    K = int(np.nanmax(annotations))+1
    print(N,J,K)
    # Sparse encoding: entry (item, annotator) stores annotator * K + label.
    valid_mask = ~np.isnan(annotations) 
    n_coords, j_coords = np.where(valid_mask)
    idxs = np.column_stack([n_coords, j_coords])
    idx_values = j_coords * K + annotations[valid_mask].astype(int)
    annot_ids = tf.SparseTensor(indices=idxs, values=idx_values, dense_shape=[N, J])

    # NOTE: init() takes its class-prior concentration from the module-level
    # `alpha` left by the synthetic sanity check; only 'any' overrides it here.
    pi_logits, theta_logits, class_prior, confusion_prior = init(J,K)
    if target=='any':
        class_prior = tfp.distributions.Dirichlet([500,500],name='pi_prior')
    optimizer = tf.optimizers.Adam(1e-2, )

    # Re-defined on every iteration so that a fresh graph is traced for the new
    # variables and the new optimizer.
    @tf.function()
    def train_step(pi_logits, theta_logits, annot_ids):
        with tf.GradientTape() as tape:
            loss = -log_p(pi_logits, theta_logits, annot_ids)
        gradients = tape.gradient(loss, [pi_logits, theta_logits])
        optimizer.apply_gradients(zip(gradients, [pi_logits, theta_logits]))
        return loss

    # Use the first GPU when one is visible, otherwise fall back to the CPU.
    device = "/GPU:0" if tf.config.list_physical_devices("GPU") else "/CPU:0"
    with tf.device(device):
        for _ in tqdm(range(2000), total=2000):
            train_step(pi_logits, theta_logits, annot_ids, )    

    pi = tf.nn.softmax(pi_logits)
    theta = tf.nn.softmax(theta_logits, axis=-1)

    # competences = tf.reduce_sum(tf.ones(K)*1.0/K * tf.linalg.diag_part(theta), axis=1).numpy()

    # Binary case: class 1 = foundation present, class 0 = absent.
    tpr = theta[:,1,1].numpy()
    tnr = theta[:,0,0].numpy()

    # Balanced accuracy per annotator; doubles as the plotted "Accuracy".
    ba = (tpr+tnr)/2
    competences = ba
    ba_mean = ba.mean()

    # Percentile of the AI (last annotator) within all annotators, AI included.
    tpr_rank = stats.percentileofscore(tpr, tpr[-1])
    tnr_rank = stats.percentileofscore(tnr, tnr[-1])
    ba_rank = stats.percentileofscore(ba, ba[-1])
    
    results[target] = {
        'tpr_ai_rank': tpr_rank,
        'tnr_ai_rank': tnr_rank,
        'tpr_mean': tpr.mean(),
        'tnr_mean': tnr.mean(),
        'ba_mean': ba_mean,
        'ba_ai_rank': ba_rank,
        'tpr_ai': tpr[-1],
        'tnr_ai': tnr[-1], 
        'ba_ai': ba[-1],
        'pi_1': pi[1].numpy(),
    }
    
    acc = pd.Series(data=competences,index=y_preds.columns)
    acc.index.name = "Annotator"
    acc.name = "Accuracy"

    # Sort annotators by accuracy and replace their names with their rank position.
    df = acc.sort_values().reset_index()
    ai_pos = df[df['Annotator'] == acc.index[-1]].index[0]
    df['Annotator'] = range(len(df))
    df['Moral Dimension'] = target
    df['type'] = 'Annotator'  # for human annotators
    df.loc[ai_pos, 'type'] = 'AI'  # for AI row
    mean_acc = acc.mean()
    ai_percentile = stats.percentileofscore(df['Accuracy'], df.iloc[ai_pos]['Accuracy'])
    y_ticks = [0.5, mean_acc, 1.0]

    # Layers: bars, dashed rule and icon on the AI bar, its percentile label, and
    # a dashed horizontal rule at the mean accuracy.
    chart = alt.Chart(df).mark_bar(color=moral_colors[target]).encode(
        x=alt.X('Annotator:O', axis=alt.Axis(labels=False)),
        y=alt.Y('Accuracy:Q', axis=alt.Axis(values=y_ticks, format='.0%'))
    )
    chart = chart + alt.Chart(df.iloc[[ai_pos]]).mark_rule(
        color='black',  strokeDash=[2,2]
    ).encode(x='Annotator:O')
    chart = chart + alt.Chart(df.iloc[[ai_pos]]).mark_text(
        text='💻', fontSize=20, dy=-30, 
    ).encode(x='Annotator:O', y='Accuracy:Q')
    chart = chart + alt.Chart(df.iloc[[ai_pos]]).mark_text(
        text=f'{ai_percentile:.0f}th', 
        fontSize=12, 
        fontWeight='bold',
        dy=-10
    ).encode(x='Annotator:O', y='Accuracy:Q')
    chart = chart + alt.Chart().mark_rule(
        color=moral_colors[target], strokeDash=[2,2], strokeWidth=2,
    ).encode(y=alt.datum(mean_acc))
    chart = chart.properties(width=200, height=150)
    chart = chart.resolve_legend(color='shared')
    # chart = chart.facet(data=df, column='moral:N')

    # chart = chart.facet(data=df,column='moral:N')
    chart.show()
    charts.append( chart )

In [None]:
# Continue optimising the parameters left over from the last iteration of the
# loop above (target 'any'), now under a Dirichlet([10, 10]) class prior, and
# display the confusion matrix of the last annotator (the AI model).
class_prior = tfp.distributions.Dirichlet([10,10],name='pi_prior')
optimizer = tf.optimizers.Adam(1e-2, )

# Re-defined so that a fresh graph is traced for the new optimizer.
@tf.function()
def train_step(pi_logits, theta_logits, annot_ids):
    with tf.GradientTape() as tape:
        loss = -log_p(pi_logits, theta_logits, annot_ids)
    gradients = tape.gradient(loss, [pi_logits, theta_logits])
    optimizer.apply_gradients(zip(gradients, [pi_logits, theta_logits]))
    return loss

# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = "/GPU:0" if tf.config.list_physical_devices("GPU") else "/CPU:0"
with tf.device(device):
    for _ in tqdm(range(3000), total=3000):
        train_step(pi_logits, theta_logits, annot_ids, )    

pi = tf.nn.softmax(pi_logits)
theta = tf.nn.softmax(theta_logits, axis=-1)
theta[-1]

In [None]:
import wandb
# Re-open the analysed run (it must already exist) so the figure and statistics
# produced below are logged to it.
wandb.init(
    project='morality-llm',
    id=run_id,
    resume="must",
)
wandb.config

In [None]:
import os

# One colour scale shared by all panels, built from the moral_colors mapping.
color_scale = alt.Scale(
   domain=list(moral_colors.keys()),
   range=list(moral_colors.values())
)

# Attach the colour encoding to every panel; only the first one carries a legend.
for i, chart in enumerate(charts):
   legend = alt.Legend(orient='bottom') if i == 0 else None
   charts[i] = chart.encode(
       color=alt.Color('Moral Dimension:N', scale=color_scale, legend=legend)
   )

# 3 x 2 grid of the six per-target charts with a shared y axis.
grid = alt.vconcat(
   alt.hconcat(charts[0], charts[1],),
   alt.hconcat(charts[2], charts[3],),
   alt.hconcat(charts[4], charts[5] )
).resolve_scale(y='shared'
).resolve_axis(y='shared'
).resolve_legend(color='shared')

grid = grid.properties(
    title=alt.TitleParams(
        text=["Human vs AI Performance \n -- Mean Accuracy, -- 💻 AI System"],
        fontSize=14
    )
)

# Make sure the output folder exists; saving into a missing directory fails.
os.makedirs("figures", exist_ok=True)
grid.save("figures/accuracy.png",ppi=300)
grid.save("figures/accuracy.svg")
# wandb.summary.update({"accuracy": wandb.Image("figures/accuracy.png")})
# wandb.run.summary["accuracy"] = wandb.Image("figures/accuracy.png")
# Log the figure (PNG image + SVG artifact) and the per-target statistics table.
wandb.log({"accuracy": wandb.Image("figures/accuracy.png")})
wandb.log_artifact("figures/accuracy.svg", name="accuracy", type="plot")
wandb.log({"stats": wandb.Table(dataframe=pd.DataFrame(results).T.reset_index())})

grid.show()

In [None]:
# Re-export the grid as SVG (same call as near the end of the previous cell).
grid.save("figures/accuracy.svg")

In [None]:
import wandb
# Fetch version v0 of the logged "accuracy" plot artifact into a local folder.
# Note that this rebinds `run` to the object returned by wandb.init().
run = wandb.init()
artifact = run.use_artifact(
    'mskorski-university-of-warsaw/morality-llm/accuracy:v0',
    type='plot',
)
artifact_dir = artifact.download()

In [None]:
# Finish the active W&B run.
wandb.finish()

# Summarization

Aggregate results across multiple experiments.

## Accuracy

In [None]:
# Runs to summarise: one per model, all on the same corpus (keep one line active).
# run_ids = ['yon5adbv','2fisp7sj','pckcakff'] # MFTC
run_ids = ['jv4exac1','d1kvjg6q','7krxewfl'] # eMFD
# run_ids = ['jpsu9gfg','744jcvse','fq9kn2ok'] # MFRC

# Download each run's logged "stats" table and tag it with the run's model and dataset.
df = []
for run_id in tqdm(run_ids):
    run = api.run(f"{project_name}/{run_id}")
    artifact = api.artifact(f"{project_name}/run-{run_id}-stats:latest")
    table = artifact.get("stats").get_dataframe()
    table['model_name'] = run.config['model_name']
    table['data'] = run.config['test_data']
    df.append(table)

# The logged tables store the foundation name in a column called 'index'.
df = pd.concat(df).rename(columns={'index': 'foundation'})
df

In [None]:
# Balanced accuracy of each model and its percentile rank, per foundation.
pivot = df.pivot_table(
    values=['ba_ai_rank', 'ba_ai'],
    index='foundation',
    columns='model_name',
    aggfunc='first',
)

# Relabel the metric level positionally (presumably ba_ai -> 'Acc%' and
# ba_ai_rank -> 'Pct', assuming the level values are sorted; verify), and
# express accuracy in percent.
pivot.columns = pivot.columns.set_levels(['Acc%', 'Pct'], level=0)
pivot.loc[:, ('Acc%', slice(None))] = pivot.loc[:, ('Acc%', slice(None))] * 100

# Put the model on the outer level, then transpose: rows = (model, metric),
# columns = foundations.
pivot = pivot.swaplevel(0, 1, axis=1).sort_index(axis=1, level=0).T
pivot.index = pivot.index.set_names(["Annotator", "Metric"])

# Extra row with the mean balanced accuracy over all annotators, in percent.
row = df.groupby("foundation")["ba_mean"].mean()*100
pivot.loc[('Human','Avg Acc%'),:,] = row
pivot

In [None]:
# Emit the accuracy table as LaTeX, with numbers rounded to whole units.
print(pivot.to_latex(float_format="%.0f", escape=True))

## Errors

In [None]:
# Three runs (one per model) for each of the three corpora.
run_ids_mftc = ['yon5adbv','2fisp7sj','pckcakff'] # MFTC
run_ids_emfd = ['jv4exac1','d1kvjg6q','7krxewfl'] # eMFD
run_ids_mfrc = ['jpsu9gfg','744jcvse','fq9kn2ok'] # MFRC

run_ids = run_ids_mftc + run_ids_emfd + run_ids_mfrc

# Download each run's logged "stats" table and tag it with the run's model and dataset.
df = []
for run_id in tqdm(run_ids):
    run = api.run(f"{project_name}/{run_id}")
    artifact = api.artifact(f"{project_name}/run-{run_id}-stats:latest")
    table = artifact.get("stats").get_dataframe()
    table['model_name'] = run.config['model_name']
    table['data'] = run.config['test_data']
    df.append(table)

# The logged tables store the foundation name in a column called 'index'.
df = pd.concat(df).rename(columns={'index': 'foundation'})
df

  0%|          | 0/9 [00:00<?, ?it/s][34m[1mwandb[0m:   1 of 1 files downloaded.  
 11%|█         | 1/9 [00:01<00:12,  1.52s/it][34m[1mwandb[0m:   1 of 1 files downloaded.  
 22%|██▏       | 2/9 [00:03<00:10,  1.51s/it][34m[1mwandb[0m:   1 of 1 files downloaded.  
 33%|███▎      | 3/9 [00:04<00:09,  1.51s/it][34m[1mwandb[0m:   1 of 1 files downloaded.  
 44%|████▍     | 4/9 [00:06<00:07,  1.55s/it][34m[1mwandb[0m:   1 of 1 files downloaded.  
 56%|█████▌    | 5/9 [00:07<00:06,  1.54s/it][34m[1mwandb[0m:   1 of 1 files downloaded.  
 67%|██████▋   | 6/9 [00:09<00:04,  1.52s/it][34m[1mwandb[0m:   1 of 1 files downloaded.  
 78%|███████▊  | 7/9 [00:10<00:03,  1.55s/it][34m[1mwandb[0m:   1 of 1 files downloaded.  
 89%|████████▉ | 8/9 [00:12<00:01,  1.52s/it][34m[1mwandb[0m:   1 of 1 files downloaded.  
100%|██████████| 9/9 [00:13<00:00,  1.49s/it]


Unnamed: 0,foundation,tpr_ai_rank,tnr_ai_rank,tpr_mean,tnr_mean,ba_mean,ba_ai_rank,tpr_ai,tnr_ai,ba_ai,pi_1,model_name,data
0,care,83.333333,4.166667,0.503838,0.932504,0.718171,70.833333,0.848118,0.715231,0.781675,0.283195,deepseek-v3,morality-MFTC
1,fairness,66.666667,29.166667,0.571687,0.936933,0.75431,70.833333,0.713705,0.913012,0.813358,0.221594,deepseek-v3,morality-MFTC
2,loyalty,83.333333,12.5,0.534841,0.914832,0.724837,79.166667,0.771089,0.832729,0.801909,0.115926,deepseek-v3,morality-MFTC
3,authority,75.0,29.166667,0.468762,0.867069,0.667915,87.5,0.75371,0.88911,0.82141,0.187448,deepseek-v3,morality-MFTC
4,sanctity,83.333333,41.666667,0.420978,0.92692,0.673949,91.666667,0.732782,0.962425,0.847604,0.148356,deepseek-v3,morality-MFTC
5,any,75.0,29.166667,0.732798,0.715026,0.723912,54.166667,0.973736,0.565991,0.769863,0.63654,deepseek-v3,morality-MFTC
0,care,83.333333,4.166667,0.507586,0.931878,0.719732,62.5,0.800954,0.718333,0.759643,0.277201,llama4_maverick,morality-MFTC
1,fairness,79.166667,12.5,0.557706,0.93961,0.748658,70.833333,0.771988,0.885044,0.828516,0.228853,llama4_maverick,morality-MFTC
2,loyalty,95.833333,8.333333,0.530836,0.910353,0.720595,75.0,0.867138,0.722938,0.795038,0.118964,llama4_maverick,morality-MFTC
3,authority,75.0,29.166667,0.463266,0.867558,0.665412,79.166667,0.761056,0.82044,0.790748,0.189762,llama4_maverick,morality-MFTC


In [None]:
# Baseline rows: per dataset and foundation, the annotator-mean TPR/TNR averaged
# over the runs, stored under the same column names as the models' own rates so
# they can be plotted alongside them as a pseudo-model called 'Human'.
baseline_row = (
    df.groupby(['data', 'foundation'])[['tpr_mean', 'tnr_mean']]
    .mean()
    .reset_index()
    .assign(model_name='Human')
    .rename(columns={'tpr_mean': 'tpr_ai', 'tnr_mean': 'tnr_ai'})
)
df = pd.concat([df, baseline_row], ignore_index=True)

In [None]:
# Row filter that drops the aggregate 'any' target, keeping the individual foundations.
mask = df['foundation'] != 'any'

In [None]:
# Calculate FPR and FNR
# Calculate FPR and FNR
df['fpr'] = 1 - df['tnr_ai']
df['fnr'] = 1 - df['tpr_ai']
# Display names for the legend.
df['annotator'] = df['model_name'].map({
    'Human': 'Human Baseline',
    'claude-4-sonnet': 'Claude-4',
    'deepseek-v3': 'DeepSeek-V3', 
    'llama4_maverick': 'Llama4-Maverick',
})
# Keep only the corpus name, e.g. 'morality-MFTC' -> 'MFTC'.
df['data'] = df['data'].str.split("-").str[-1]
df_mask = df[mask]

# Scatter of FNR against FPR with a little random jitter to separate overlapping
# points. Both axes span [0, 0.7], matching the diagonal reference line below
# (the previous domain [0.7, 0.7] was a zero-width range).
chart = alt.Chart(df_mask).mark_point(size=200, stroke='white', strokeWidth=2, filled=True).encode(
   x=alt.X('jitter_x:Q', title='False Positive Rate', axis=alt.Axis(format='.0%'), scale=alt.Scale(domain=[0, 0.7])),
   y=alt.Y('jitter_y:Q', title='False Negative Rate', axis=alt.Axis(format='.0%'), scale=alt.Scale(domain=[0, 0.7])),
   color=alt.Color('foundation:N', scale=color_scale, legend=alt.Legend(title="Moral Dimension")),
   shape=alt.Shape('annotator:N',
               scale=alt.Scale(
                   domain=['Human Baseline', 'Claude-4', 'DeepSeek-V3', 'Llama4-Maverick'],
                   range=['circle', 'square', 'triangle-up', 'diamond']
               ),
               legend=alt.Legend(title="Model")),
   tooltip=['foundation', 'annotator', 'fpr:Q', 'fnr:Q']
).transform_calculate(
   jitter_x='datum.fpr + (random() - 0.5) * 0.01',
   jitter_y='datum.fnr + (random() - 0.5) * 0.01'
).properties(
   width=300, 
   height=200,
)

# Dashed diagonal (FPR = FNR) as a visual reference.
line = alt.Chart(df_mask).mark_line(color='gray', strokeDash=[2, 2]).encode(
    x=alt.X('value:Q', scale=alt.Scale(domain=[0, 0.7])),
    y=alt.Y('value:Q', scale=alt.Scale(domain=[0, 0.7]))
).transform_calculate(
    value='sequence(0, 0.8, 0.1)'
).transform_flatten(['value'])

# One panel per dataset, with shared colour and shape legends on top.
chart = alt.layer(line, chart).facet(
   column=alt.Column('data:N', title="Dataset")
).resolve_scale(
   color='shared',
   shape='shared'
).configure_facet(
   columns=3
).configure_legend(
   orient='top',
   columns=2,  # or however many columns you want
   symbolLimit=0  # removes symbol limit if you have many legend items
)

chart

In [None]:
# Swap column levels to put fpr/fnr at second level
pivot_df = df.pivot_table(
  index=['data', 'annotator'], 
  columns='foundation', 
  values=['fpr', 'fnr']
) * 100

# Swap the column levels and rename
pivot_df = pivot_df.swaplevel(0, 1, axis=1).sort_index(axis=1)
pivot_df.columns = pivot_df.columns.set_names(['Moral Dimension', 'Metric'])

# Rename FPR/FNR to uppercase
pivot_df = pivot_df.rename(columns={'fpr': 'FPR', 'fnr': 'FNR'}, level=1)

pivot_df = pivot_df.round(1)

latex_table = pivot_df.to_latex(
  multirow=True,
  multicolumn=True,
  escape=False,
  column_format='ll' + 'rr' * len(pivot_df.columns.get_level_values(0).unique()),
  float_format='%.1f'
)

pivot_df
# print(latex_table)

## Discrepancies

Case studies of AI-human disagreements: texts where the AI flagged moral content that the human annotators missed (human false negatives).

In [None]:
# Position of the first row whose text contains the phrase (argmax of the
# boolean match), mapped to its text_id; then print all annotation rows of that text.
idx = (
    pd.Series(ds_test['text'])
    .str.contains("Anyone think Macron")
    .argmax()
    .astype(int)
)
idx = ds_test.select([idx])['text_id'][0]
print(ds_test.to_pandas().set_index('text_id').loc[idx])

# Per text: number of foundations flagged by the model minus the number of human
# annotators whose label mentions any foundation; the largest gaps come first.
pattern = '|'.join(moral_targets)
y_true = annots.stack().str.lower().str.contains(pattern).unstack()
(y_pred.sum(axis=1) - y_true.sum(axis=1)).sort_values(ascending=False).head(20)

                                                      text   subreddit  \
text_id                                                                  
2002     Anyone think Macron should dispose of the Alge...  neoliberal   
2002     Anyone think Macron should dispose of the Alge...  neoliberal   
2002     Anyone think Macron should dispose of the Alge...  neoliberal   

                  bucket    annotator     annotation confidence label  
text_id                                                                
2002     French politics  annotator03      Non-Moral  Confident  none  
2002     French politics  annotator04  Thin Morality  Confident  none  
2002     French politics  annotator02      Non-Moral  Confident  none  


text_id  text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
12950    The RNC fully endorsed the Roy Moore campaign after they said they found the child molestation allegations credible                                                                                                                           