# Select a Subset of Legit for Ad-Word

This subset is used for human annotation and for interacting with high-cost LLMs (ChatGPT, PaLM 2).

In [1]:
# Copyright 2025 Luke Moffett
# Licensed under the Apache License, Version 2.0

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import torch
import pathlib

from IPython.display import display
from clz_or_cls import datasets as corc_ds


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

plt.rcParams['figure.dpi'] = 300
%matplotlib inline

In [2]:
# Seed both global RNGs with the same value so the random ordering and
# sampling in the cells below are repeatable on a fresh kernel.
SEED = 123456
np.random.seed(SEED)
random.seed(SEED)

In [None]:
# Load the 'legit_extended' dataset through the project helper, passing
# do_perturb_legit=False.
# NOTE(review): presumably this returns a mapping of splits (the next cell
# indexes it with 'test' and calls .to_pandas()) — confirm against clz_or_cls.
legit = corc_ds.legit_extended(do_perturb_legit=False)

In [6]:
# Convert the test split to pandas and keep only the 'clean' column
# (as a one-column DataFrame, not a Series).
clean = legit['test'].to_pandas().loc[:, ['clean']]
clean

Unnamed: 0,clean
0,sterling
1,sterling
2,listings
3,listings
4,swingers
...,...
7493,bequeathed
7494,servicemen
7495,sanford
7496,disposition


In [7]:
# Collapse to one row per distinct word, keeping the first occurrence and its
# original index label. The explicit copy lets later cells add columns freely.
clean_unique = clean.drop_duplicates(keep='first').copy()
clean_unique

Unnamed: 0,clean
0,sterling
2,listings
4,swingers
6,productivity
8,reprints
...,...
7492,nostalgic
7493,bequeathed
7494,servicemen
7495,sanford


In [8]:
# Character count of every word; the selection below is stratified by this length.
clean_unique['clean_len'] = clean_unique['clean'].map(len)

In [9]:
# How many unique words exist for each word length, ordered by length.
len_counts = clean_unique['clean_len'].value_counts().sort_index()

# Words of each length allotted to a single group: the count is floor-divided
# by 9 (the number of groups built in the next cell) and then by 3.
# NOTE(review): the reason for the extra //3 is not visible here — confirm.
group_sizes = dict(len_counts // 9 // 3)
group_sizes

{4: 6, 5: 8, 6: 10, 7: 9, 8: 7, 9: 6, 10: 4, 11: 2, 12: 1, 13: 0, 14: 0}

In [10]:
# Randomly deal the unique words into 9 disjoint groups, stratified by word
# length: each group receives group_sizes[length] words of every length.
#
# Changes from the previous version of this cell:
#   * each slice is copied before being labelled, so the assignment no longer
#     writes through a slice of another frame (SettingWithCopyWarning);
#   * slices are collected in a list and concatenated once instead of growing
#     a DataFrame with pd.concat inside the loop;
#   * the unused enumerate() index was dropped.
#
# NOTE: this cell draws fresh random numbers and rebinds clean_unique, so
# re-running it without re-seeding produces a different selection.
clean_unique['rand'] = np.random.rand(len(clean_unique))
clean_unique = clean_unique.sort_values('rand').drop_duplicates('clean')
clean_unique['group'] = np.nan

group_slices = []
for str_len, group_clean in clean_unique.groupby('clean_len'):
    per_group = group_sizes[str_len]
    for i in range(9):
        # Rows are already in random order, so consecutive windows of
        # per_group rows form disjoint random samples for this length.
        group_clean_slice = group_clean.iloc[i * per_group:(i + 1) * per_group].copy()
        # The column was initialised with NaN, so it stays float (0.0 .. 8.0).
        group_clean_slice.loc[:, 'group'] = i
        group_slices.append(group_clean_slice)

word_group_df = pd.concat(group_slices)

# Sanity check: every group should contain the same number of words.
word_group_df.groupby('group').count()

Unnamed: 0_level_0,clean,clean_len,rand
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,53,53,53
1.0,53,53,53
2.0,53,53,53
3.0,53,53,53
4.0,53,53,53
5.0,53,53,53
6.0,53,53,53
7.0,53,53,53
8.0,53,53,53


In [11]:
# Root directory for prepared datasets; a KeyError is raised here if the
# CORC_DATASETS_PREP_DIR environment variable is not set.
prep_path = pathlib.Path(os.environ['CORC_DATASETS_PREP_DIR'])

# Persist the word-to-group assignment, including the index column.
# NOTE(review): unlike the other exports in this notebook, this write is not
# behind an "UNCOMMENT TO RECREATE" guard, so it overwrites the file on every run.
selection_csv = prep_path / 'annotations' / 'selections' / 'human_dataset_selection.csv'
word_group_df.to_csv(selection_csv)

In [12]:
# Spot-check the words that were assigned to one group (group 7).
word_group_df.loc[word_group_df['group'].eq(7)]

Unnamed: 0,clean,clean_len,rand,group
2942,sink,4,0.241471,7.0
1648,meta,4,0.243045,7.0
2662,kept,4,0.251353,7.0
6046,reed,4,0.255744,7.0
7467,memo,4,0.258588,7.0
3312,math,4,0.259505,7.0
2016,world,5,0.299514,7.0
590,blank,5,0.308961,7.0
7459,lange,5,0.31343,7.0
6254,frame,5,0.319982,7.0


# Generate Individual Datasets

In [14]:
# Names of the perturbation datasets passed to corc_ds.generated_df.
# The order matters: later cells index into this list when rotating datasets
# across word groups.
# NOTE(review): the saved outputs of this cell show identical head() rows for
# 'phonee' and 'zeroe_phonetic' — confirm these really are distinct generators.
datasets = [
    "legit_extended",
    "dces",
    "ices",
    "zeroe_noise",
    "zeroe_typo",
    "anthro_typo",
    "anthro_phonetic",
    "phonee",
    "zeroe_phonetic"
]

# Preview the first rows of each dataset's train split.
for dataset in datasets:
    preview = corc_ds.generated_df(dataset, split='train').head()
    display(preview)

Unnamed: 0,clean,perturbed,legible
0,distributed,ԁisʈriþuʈeԁ,True
1,distributed,dᒨstṙᒨḃuߙɛⅾ,False
2,specified,sᴅeດiƚieή,False
3,specified,spёcifie⍺,False
4,exec,e⨲e⊂,True


Unnamed: 0,clean,perturbed
0,distributed,ɗistrībuₜed
1,distributed,ḋìstȓi℔ᴝted
2,specified,speℂifiɝd
3,specified,śpeçi㎌ѝeđ
4,exec,exèç


Unnamed: 0,clean,perturbed
0,distributed,ɑiɜtrΊbuҨed
1,distributed,ôìsthiéՇteԀ
2,specified,speƃƚfiʙd
3,specified,ʚpeciͳӏeɹ
4,exec,exez


Unnamed: 0,clean,perturbed
0,distributed,distriuted
1,distributed,dseirbittud
2,specified,spceeiifd
3,specified,secpeiifd
4,exec,%ex$ec


Unnamed: 0,clean,perturbed
0,distributed,distribuited
1,distributed,distribuited
2,specified,apecifiee
3,specified,s'ecifiex
4,exec,sxev


Unnamed: 0,clean,perturbed
0,distributed,distrubited
1,distributed,distrubited
2,specified,spesified
3,specified,spesified
4,exec,exac


Unnamed: 0,clean,perturbed
0,distributed,Distributed
1,distributed,distributed
2,specified,specfied
3,specified,specified
4,exec,EXEC


Unnamed: 0,clean,perturbed
0,distributed,distteriebiuted
1,distributed,destribiutted
2,specified,speacifid
3,specified,spesifid
4,exec,ehxec


Unnamed: 0,clean,perturbed
0,distributed,distteriebiuted
1,distributed,destribiutted
2,specified,speacifid
3,specified,spesifid
4,exec,ehxec


In [19]:
# Build a small shuffled set of example perturbations: 20 random train rows
# from every dataset, concatenated and then put in random order.
#
# Fix: the first dataset's sample used to be replaced by `group_clean_slice`,
# a leftover variable from the word-grouping cell, so the first dataset was
# never represented and unrelated rows were mixed in. Every sample is now
# collected, and the frames are concatenated once instead of inside the loop.
sampled_frames = []
for dataset in datasets:
    # .sample() without random_state draws from numpy's global RNG,
    # which is seeded near the top of the notebook.
    this_df = corc_ds.generated_df(dataset, split='train').sample(20)
    sampled_frames.append(this_df)

aggregate_df = pd.concat(sampled_frames)

# Shuffle by sorting on a fresh random key.
aggregate_df['rand'] = np.random.rand(len(aggregate_df))
aggregate_df = aggregate_df.sort_values('rand')

# UNCOMMENT TO RECREATE
# aggregate_df[['clean', 'perturbed']].to_csv(prep_path/'annotations'/'selections'/'example_perturbations.csv', index=False)

In [24]:
# Build NUM_SUBSETS annotation subsets. In every subset each word group is
# paired with a different perturbation dataset, and the pairing is rotated by
# one position from subset to subset. The same `repeated_sample` rows are
# appended to every subset.
#
# Changes from the previous version of this cell (results and RNG draw order
# are unchanged): the unused enumerate() index was dropped, per-dataset frames
# are collected in a list and concatenated once per subset, and the in-place
# sorts were replaced by reassignment.
NUM_SUBSETS = 9

# Number of 'repeated' rows to add: 500 minus the total number of grouped
# words, so a subset in which every grouped word is found totals 500 rows.
repeated_len = 500 - word_group_df.groupby('group').count()['clean'].sum()

# One row per clean word from the 'repeated' test split, taken in random order.
repeated_sample = corc_ds.generated_df('repeated', split='test').drop_duplicates('clean')
repeated_sample['rand'] = np.random.rand(len(repeated_sample))
repeated_sample['source'] = 'repeated'
repeated_sample = repeated_sample.sort_values('rand').head(repeated_len)

# UNCOMMENT TO RECREATE
# repeated_sample.to_csv(prep_path/'annotations'/'selections'/'repeated_sample.csv', index=False)

# The rotation below pairs word groups with datasets one-to-one, so there
# must be exactly as many groups as datasets.
assert len(datasets) == word_group_df['group'].nunique()

for group_start in range(NUM_SUBSETS):
    subset_parts = []
    for ds_idx_root in range(len(datasets)):
        # Word group `ds_idx_root` takes the dataset shifted by `group_start`.
        ds_idx = (group_start + ds_idx_root) % len(datasets)
        dataset = datasets[ds_idx]
        this_df = corc_ds.generated_df(dataset, split='test')

        # Shuffle before de-duplicating so that a random perturbation is kept
        # for each clean word.
        this_df['rand'] = np.random.rand(len(this_df))
        this_df['source'] = dataset
        this_df = this_df.sort_values('rand').drop_duplicates(['clean'])

        # Keep only the words that belong to this word group.
        grouping = list(word_group_df[word_group_df['group'] == ds_idx_root]['clean'])
        this_df = this_df[this_df['clean'].isin(grouping)]
        subset_parts.append(this_df)

    # Append the shared repeated words, then shuffle the whole subset.
    aggregate_df = pd.concat(subset_parts + [repeated_sample])
    aggregate_df['rand'] = np.random.rand(len(aggregate_df))
    aggregate_df = aggregate_df.sort_values('rand')


    # UNCOMMENT TO RECREATE
    # aggregate_df[['clean', 'perturbed', 'source']].to_csv(prep_path/'annotations'/'selections'/f'group{group_start}_perturbations_full.csv', index=False)
    # aggregate_df['perturbed'].to_csv(prep_path/'annotations'/'selections'/f'group{group_start}_perturbations.csv', index=False, quoting=False, header=False)

# Split out Human Annotation Groups

In [23]:
# Re-read the exported "full" perturbation file for groups 0-4 so the
# clean-word column can be written out on its own (export is commented out).
# NOTE(review): only 5 of the generated groups are read here — confirm this is intended.
selections_dir = prep_path / 'annotations' / 'selections'
for group_id in range(5):
    __df = pd.read_csv(selections_dir / f'group{group_id}_perturbations_full.csv')

    # UNCOMMENT TO RECREATE
    # __df['clean'].to_csv(prep_path/'annotations'/'selections'/f'group{group_id}_clean.csv', index=False, quoting=False, header=False)