# Profile Ad-Word Datasets

In [1]:
# Copyright 2025 Luke Moffett
# Licensed under the Apache License, Version 2.0

import pandas as pd
import matplotlib.pyplot as plt
import torch

from clz_or_cls import datasets as corc_ds

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

plt.rcParams['figure.dpi'] = 300
%matplotlib inline

In [2]:
train_df, test_df, valid_df = [corc_ds.generated_df('visual+phonetic+typo_full', split) for split in ['train', 'test', 'valid']]

In [3]:
train_df['split'] = 'train'
test_df['split'] = 'test'
valid_df['split'] = 'valid'

In [None]:
[df[['clean']].drop_duplicates('clean').nunique() for df in [train_df, test_df, valid_df]]

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
valid_df.shape

In [None]:
train_df.drop_duplicates(['clean', 'perturbed']).shape

In [None]:
df = pd.concat([train_df, test_df, valid_df])
df.shape

In [None]:
df.drop_duplicates(['clean', 'perturbed']).shape

In [11]:
316661/392646

0.8064796279600455

In [None]:
df.drop_duplicates('perturbed').shape

In [14]:
datasets = [
    ("repeated", None),
    ("legit_extended", "visual"),
    ("dces", "visual"),
    ("ices", "visual"),
    ("zeroe_noise", "typo"),
    ("zeroe_typo", "typo"),
    ("anthro_typo", "typo"),
    ("anthro_phonetic", "phonetic"),
    ("phonee", "phonetic"),
    ("zeroe_phonetic", "phonetic")
]

class_map = {t[0]: t[1] for t in datasets}

df = None

for dataset, clazz in datasets:
    for split in ['train', 'test', 'valid']:
        ds = corc_ds.generated_df(dataset, split=split)
        ds['source'] = dataset
        ds['class'] = clazz
        ds['split'] = split

        if df is None:
            df = ds
        else:
            df = pd.concat([df, ds])

df.head(3)

Unnamed: 0,clean,perturbed,legible,legibility_score,source,class,split
0,distributed,distributed,True,1.0,repeated,,train
1,distributed,distributed,True,1.0,repeated,,train
2,specified,specified,True,1.0,repeated,,train


In [15]:
df_uniq = df.drop_duplicates(['clean', 'perturbed', 'source'])
df_unique_groups = df_uniq.groupby(['source', 'split']).nunique()

df_unique_groups

Unnamed: 0_level_0,Unnamed: 1_level_0,clean,perturbed,legible,legibility_score,class
source,split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
anthro_phonetic,test,96,170,0,0,1
anthro_phonetic,train,100,166,0,0,1
anthro_phonetic,valid,100,171,0,0,1
anthro_typo,test,96,157,0,0,1
anthro_typo,train,100,163,0,0,1
anthro_typo,valid,100,162,0,0,1
dces,test,96,200,0,0,1
dces,train,100,199,0,0,1
dces,valid,100,200,0,0,1
ices,test,96,200,0,0,1


In [16]:
df_unique_groups.reset_index().pivot(index=['split'], columns=['source'], values=['perturbed']).transpose()

Unnamed: 0_level_0,split,test,train,valid
Unnamed: 0_level_1,source,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
perturbed,anthro_phonetic,170,166,171
perturbed,anthro_typo,157,163,162
perturbed,dces,200,199,200
perturbed,ices,200,200,200
perturbed,legit_extended,194,193,197
perturbed,phonee,187,192,190
perturbed,repeated,96,100,100
perturbed,zeroe_noise,194,194,198
perturbed,zeroe_phonetic,187,192,190
perturbed,zeroe_typo,174,171,169


In [17]:
__df = df_unique_groups.reset_index()
print('unique')
__df[__df['source'] == 'repeated'][['source', 'split', 'clean']].transpose()

unique


Unnamed: 0,18,19,20
source,repeated,repeated,repeated
split,test,train,valid
clean,96,100,100


In [18]:
__df = df.groupby(['source', 'split']).count().reset_index()
print('total')
__df[__df['source'] == 'repeated'][['source', 'split', 'clean']].transpose()

total


Unnamed: 0,18,19,20
source,repeated,repeated,repeated
split,test,train,valid
clean,200,200,200


In [None]:
visual_ds = corc_ds.visual()
visual_ctx_ds = corc_ds.visual_ctx()
phonetic_ctx_ds = corc_ds.phonetic_ctx()
typo_ctx_ds = corc_ds.typo_ctx()

In [None]:
visual_ds['train']['clean'][:5]

['distributed', 'distributed', 'specified', 'specified', 'exec']

In [None]:
visual_ctx_ds['train']['clean'][:5], visual_ctx_ds['train']['perturbed'][:5]

(['distributed', 'distributed', 'specified', 'specified', 'exec'],
 ['method: "visual", word: "dİstriduțed"',
  'method: "visual", word: "distribu˵ed"',
  'method: "visual", word: "зpeσifiɘd"',
  'method: "visual", word: "ӭpecıքied"',
  'method: "visual", word: "exϲc"'])

In [None]:
phonetic_ctx_ds['train']['perturbed'][:5], typo_ctx_ds['train']['perturbed'][:5]

(['method: "phonetic", word: "distteriebiuted"',
  'method: "phonetic", word: "destribiutted"',
  'method: "phonetic", word: "speacifid"',
  'method: "phonetic", word: "spesifid"',
  'method: "phonetic", word: "ehxec"'],
 ['method: "typo", word: "disributed"',
  'method: "typo", word: "dstbtiierud"',
  'method: "typo", word: "sicefpied"',
  'method: "typo", word: "sefceiipd"',
  'method: "typo", word: "%e$xe<c"'])

In [None]:
vis_typ = corc_ds.geneterated_ds_ctx('visual+typo', splits=['train', 'test', 'valid'])
vis_typ

In [None]:
vis = corc_ds.geneterated_ds_ctx('visual', splits=['train', 'test', 'valid'], ctx={'visual': 'Strategy A', 'phonetic': 'Strategy B', 'typo': 'Strategy C'})

In [23]:
vis['train']['perturbed']

['method: "Strategy A", word: "ɑiɜtrΊbuҨed"',
 'method: "Strategy A", word: "ôìsthiéՇteԀ"',
 'method: "Strategy A", word: "speƃƚfiʙd"',
 'method: "Strategy A", word: "ʚpeciͳӏeɹ"',
 'method: "Strategy A", word: "exez"',
 'method: "Strategy A", word: "сxec"',
 'method: "Strategy A", word: "poɞsesз"',
 'method: "Strategy A", word: "ρбƃsesɛ"',
 'method: "Strategy A", word: "suıtɛ"',
 'method: "Strategy A", word: "suìfs"',
 'method: "Strategy A", word: "mɛcιԀɾa"',
 'method: "Strategy A", word: "montanƌ"',
 'method: "Strategy A", word: "Ǌetro"',
 'method: "Strategy A", word: "metɦo"',
 'method: "Strategy A", word: "ɿarker"',
 'method: "Strategy A", word: "parƙer"',
 'method: "Strategy A", word: "ċhemicaɨs"',
 'method: "Strategy A", word: "zheǌiŏals"',
 'method: "Strategy A", word: "sɛientɫfĭc"',
 'method: "Strategy A", word: "сcienƭific"',
 'method: "Strategy A", word: "Ԁeϧieо"',
 'method: "Strategy A", word: "ɑeȧIϵü"',
 'method: "Strategy A", word: "pɛlӌ"',
 'method: "Strategy A", word: "pò

In [None]:
full = corc_ds.geneterated_ds_ctx('visual+phonetic+typo_full', splits=['train', 'test', 'valid'], ctx={'visual': 'Strategy A', 'phonetic': 'Strategy B', 'typo': 'Strategy C'})

In [25]:
full['train']['perturbed'][-6:]

['method: "Strategy C", word: "di$s^close"',
 'method: "Strategy C", word: "dssolcie"',
 'method: "Strategy C", word: "surv`ivo<r"',
 'method: "Strategy C", word: "_sur<vivo:r"',
 'method: "Strategy C", word: "k,ids"',
 'method: "Strategy C", word: "kis"']

In [None]:
corc_ds.geneterated_ds_ctx('visual+typo+phonetic_full', splits=['train', 'test', 'valid'])