# Human Annotation Performance

Measure human annotator success in word level recovery.

In [1]:
# Copyright 2025 Luke Moffett
# Licensed under the Apache License, Version 2.0

import pandas as pd
import matplotlib.pyplot as plt
import os
import tqdm
import torch

from IPython.display import display
from clz_or_cls import datasets as corc_ds

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

plt.rcParams['figure.dpi'] = 300
%matplotlib inline

In [None]:
datasets = [
    ("legit_extended", "visual"),
    ("dces", "visual"),
    ("ices", "visual"),
    ("zeroe_noise", "typo"),
    ("zeroe_typo", "typo"),
    ("anthro_typo", "typo"),
    ("anthro_phonetic", "phonetic"),
    ("phonee", "phonetic"),
    ("zeroe_phonetic", "phonetic"),
    ('repeated', 'repeated')
]

class_map = {t[0]: t[1] for t in datasets}
class_map

{'legit_extended': 'visual',
 'dces': 'visual',
 'ices': 'visual',
 'zeroe_noise': 'typo',
 'zeroe_typo': 'typo',
 'anthro_typo': 'typo',
 'anthro_phonetic': 'phonetic',
 'phonee': 'phonetic',
 'zeroe_phonetic': 'phonetic',
 'repeated': 'repeated'}

In [3]:
from pathlib import Path

dfs = {}
prep_path = Path(os.environ['CORC_DATASETS_PREP_DIR'])
for i in range(0,5):
    dfs[i] = pd.read_csv(prep_path/'annotations'/'results'/f'PerturbationRecovery_{i}.csv')
    print(i, dfs[i].shape)

dfs.keys()

0 (501, 2)
1 (500, 2)
2 (500, 2)
3 (500, 2)
4 (500, 2)


dict_keys([0, 1, 2, 3, 4])

In [4]:
orig = {}
for i in range(5):
    orig[i] = pd.read_csv(prep_path/'annotations'/'selections'/f'group{i}_perturbations_full.csv')
    print(i, orig[i].shape)

orig.keys()

0 (501, 3)
1 (500, 3)
2 (500, 3)
3 (500, 3)
4 (500, 3)


dict_keys([0, 1, 2, 3, 4])

In [5]:
from nltk.metrics import edit_distance
keys = {}
for i in range(0,5):
    __answers = dfs[i]
    __orig = orig[i]

    __key = __answers.merge(__orig, left_index=True, right_index=True, how='inner')
    __key['match'] = (__key['Original Word'].str.lower().str.strip() == __key['clean'].str.lower().str.strip())

    __key['ascii'] = __key['clean'].apply(lambda x: x.isascii())
    __key['edit'] = __key.apply(lambda row: edit_distance(row['Original Word'] if type(row['Original Word']) == str else '', row['clean']), axis=1)

    for j in range(1, 4):
        __key[f'edit{j}'] = __key['edit'] <= j

    __key['class'] = __key['source'].apply(lambda x: class_map[x])
    keys[i] = __key

    print(i, __key.value_counts(['match'], normalize=True))

    # display(__key.groupby(['source']).value_counts(['match'], normalize=True))

0 match
True     0.682635
False    0.317365
dtype: float64
1 match
True     0.638
False    0.362
dtype: float64
2 match
True     0.62
False    0.38
dtype: float64
3 match
True     0.682
False    0.318
dtype: float64
4 match
True     0.652
False    0.348
dtype: float64


### Check repeated word performance as upper-bound

In [6]:
# excluding grâce since grace is also a valid English word
for i in range(0,5):
    __k = keys[i]
    score = __k[(__k['source'] == 'repeated') & (__k['clean'] != 'grâce')].value_counts(['match'], normalize=True)
    print(i, score)

0 match
True     1.0
dtype: float64
1 match
True     0.909091
False    0.090909
dtype: float64
2 match
True     0.954545
False    0.045455
dtype: float64
3 match
True     0.954545
False    0.045455
dtype: float64
4 match
True     0.954545
False    0.045455
dtype: float64


### Analyze Performance on Visual Legibility

In [7]:
dces = corc_ds.generated_df('dces', split='test')
ices = corc_ds.generated_df('ices', split='test')
legit = corc_ds.generated_df('legit_extended', split='test')

In [8]:
orig_with_references = {}
for i in range(0,5):
    orign_with_references = orig[i].merge(pd.concat([dces, ices, legit]), on=('clean', 'perturbed'), how='left')
    orig_with_references[i] =orign_with_references

orig_with_references.keys()

dict_keys([0, 1, 2, 3, 4])

In [9]:
leg_score = {}
for i in range(0,5):
    __answers = keys[i]
    __orig = orig_with_references[i]

    __key = __answers.merge(__orig, on=('clean', 'perturbed'), how='left').drop_duplicates(['clean', 'perturbed', 'Original Word'])
    __key['match'] = (__key['Original Word'].str.lower().str.strip() == __key['clean'].str.lower().str.strip())

    __key['ascii'] = __key['clean'].apply(lambda x: x.isascii())
    __key['edit'] = __key.apply(lambda row: edit_distance(row['Original Word'] if type(row['Original Word']) == str else '', row['clean']), axis=1)

    for j in range(1, 4):
        __key[f'edit{j}'] = __key['edit'] <= j

    __key['class'] = __key['source_x'].apply(lambda x: class_map[x])
    leg_score[i] = __key

    print(i, 'all', __key[(__key['class'] == 'visual')].value_counts(['match'], normalize=True))

0 all match
True     0.704403
False    0.295597
dtype: float64
1 all match
True     0.578616
False    0.421384
dtype: float64
2 all match
True     0.610063
False    0.389937
dtype: float64
3 all match
True     0.698113
False    0.301887
dtype: float64
4 all match
True     0.654088
False    0.345912
dtype: float64


In [10]:
all_responses = pd.concat((leg_score[i] for i in range(0,5)))

In [11]:
print(i, 'all', all_responses[(all_responses['class'] == 'visual')].value_counts(['match'], normalize=True))

for attack in ['ices', 'dces', 'legit_extended']:
    print(attack, all_responses[(all_responses['class'] == 'visual') & (all_responses['source_x'] == attack)].value_counts(['match'], normalize=True))

4 all match
True     0.649057
False    0.350943
dtype: float64
ices match
False    0.528302
True     0.471698
dtype: float64
dces match
True     0.841509
False    0.158491
dtype: float64
legit_extended match
True     0.633962
False    0.366038
dtype: float64


### Restimate the Threshold for Legible Perturbations

In [12]:
visual_responses = all_responses[all_responses['class'] == 'visual']
visual_responses

Unnamed: 0,Perturbed Word,Original Word,clean,perturbed,source_x,match,ascii,edit,edit1,edit2,edit3,class,source_y,legibility_score,legible
1,naₛty,nasty,nasty,naₛty,dces,True,True,0,True,True,True,visual,dces,2.880576,
7,ѕwiss,Swiss,swiss,ѕwiss,legit_extended,True,True,1,True,True,True,visual,legit_extended,4.787932,True
9,cutօ,cute,cute,cutօ,legit_extended,True,True,0,True,True,True,visual,legit_extended,0.110016,False
17,Ԩrieňdly,friendly,friendly,Ԩrieňdly,ices,True,True,0,True,True,True,visual,ices,1.148038,
21,zabeƳ,Gabe,zdnet,zabeƳ,ices,False,True,4,False,False,False,visual,ices,-0.170627,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524,ℂouȓts,courts,courts,ℂouȓts,dces,True,True,0,True,True,True,visual,dces,1.979183,
525,bθ♄al≮≀e౩,behalves,penalties,bθ♄al≮≀e౩,legit_extended,False,True,4,False,False,False,visual,legit_extended,-3.126042,False
528,5Ⰰὲᵰᵰiɼg,stirring,staffing,5Ⰰὲᵰᵰiɼg,legit_extended,False,True,3,False,False,True,visual,legit_extended,-2.776737,False
533,tǝsțe,taste,taste,tǝsțe,legit_extended,True,True,0,True,True,True,visual,legit_extended,3.888106,True


In [13]:
all_responses[(all_responses['class'] == 'visual')].shape

(795, 15)

### Are readers mispelling the words they are inputting?

In [14]:
tqdm.tqdm.pandas()
pd.options.display.float_format = '{:.2%}'.format

In [15]:
import enchant
b = enchant.Broker()
b.set_ordering("*","aspell,myspell")

en_dict_aspell = b.request_dict("en_US")

assert en_dict_aspell.provider.name == 'aspell'

def closest_for(sp_dict):
    def next_closest_word(word):
        if type(word) is float:
            return None
        suggestions = sp_dict.suggest(word)
        if len(suggestions) == 0:
            return None
        else:
            return suggestions[0]
        
    return next_closest_word

suggest_aspell = closest_for(en_dict_aspell)
all_responses['aspell'] = all_responses['Original Word'].progress_apply(lambda x: suggest_aspell(x))
all_responses['aspell_match'] = all_responses['aspell'].str.strip().str.lower() == all_responses['clean'].str.strip().str.lower()

  0%|          | 0/2498 [00:00<?, ?it/s]

100%|██████████| 2498/2498 [00:01<00:00, 1805.38it/s]


In [16]:
all_responses['match'].value_counts(normalize=True)

True    65.49%
False   34.51%
Name: match, dtype: float64

In [17]:
all_responses['aspell_match'].value_counts(normalize=True)

True    66.49%
False   33.51%
Name: aspell_match, dtype: float64

In [18]:
all_responses[~all_responses['aspell'].isna()].groupby('class').agg({'match': 'mean', 'aspell_match': 'mean'})

Unnamed: 0_level_0,match,aspell_match
class,Unnamed: 1_level_1,Unnamed: 2_level_1
phonetic,71.41%,72.17%
repeated,92.04%,93.81%
typo,58.51%,59.66%
visual,67.01%,68.05%


### Are annotators picking the right words, but wrong form?

In [19]:
# import these modules
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize(word):
    if type(word) is float:
        return None
    lemma = lemmatizer.lemmatize(word)
    if len(lemma) == 0:
        return None
    else:
        return lemma

all_responses['lemma_clean'] = all_responses['clean'].progress_apply(lemmatize)
all_responses['lemma'] = all_responses['Original Word'].progress_apply(lemmatize)
all_responses['lemma_match'] = all_responses['lemma_clean'].str.strip().str.lower() == all_responses['lemma'].str.strip().str.lower()
all_responses['lemma_match'].value_counts(normalize=True)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/users/lam135/nltk_data...
100%|██████████| 2498/2498 [00:01<00:00, 2005.11it/s]
100%|██████████| 2498/2498 [00:00<00:00, 89870.49it/s]


True    67.01%
False   32.99%
Name: lemma_match, dtype: float64

In [20]:
all_responses[~all_responses['lemma'].isna()].groupby('class').agg({'match': 'mean', 'lemma_match': 'mean'})

Unnamed: 0_level_0,match,lemma_match
class,Unnamed: 1_level_1,Unnamed: 2_level_1
phonetic,71.41%,73.70%
repeated,92.04%,90.27%
typo,58.51%,59.79%
visual,67.01%,68.57%


#### Analyze Individual Responses

In [21]:
__k = keys[2]

__k[__k['clean'] == 'grâce']['clean']

183    grâce
298    grâce
Name: clean, dtype: object

In [22]:
for i in range(0,5):
    __k = keys[i]
    print(i, __k[__k['ascii']].value_counts(['match'], normalize=True))

0 match
True    69.31%
False   30.69%
dtype: float64
1 match
True    64.97%
False   35.03%
dtype: float64
2 match
True    63.14%
False   36.86%
dtype: float64
3 match
True    69.45%
False   30.55%
dtype: float64
4 match
True    66.40%
False   33.60%
dtype: float64


In [23]:
all = pd.concat([keys[i] for i in range(0,5)])
all.shape

(2501, 12)

In [24]:
all.groupby(['class', 'source']).value_counts(['match'], normalize=True).unstack()

Unnamed: 0_level_0,match,False,True
class,source,Unnamed: 2_level_1,Unnamed: 3_level_1
phonetic,anthro_phonetic,13.96%,86.04%
phonetic,phonee,25.28%,74.72%
phonetic,zeroe_phonetic,48.68%,51.32%
repeated,repeated,8.70%,91.30%
typo,anthro_typo,50.19%,49.81%
typo,zeroe_noise,32.83%,67.17%
typo,zeroe_typo,45.66%,54.34%
visual,dces,15.85%,84.15%
visual,ices,52.83%,47.17%
visual,legit_extended,36.47%,63.53%


In [25]:
all.groupby(['class']).value_counts(['match'], normalize=True).unstack()

match,False,True
class,Unnamed: 1_level_1,Unnamed: 2_level_1
phonetic,29.31%,70.69%
repeated,8.70%,91.30%
typo,42.89%,57.11%
visual,35.05%,64.95%


In [26]:
all[all['source'] != 'repeated'].value_counts(['match'], normalize=True)

match
True    64.25%
False   35.75%
dtype: float64