# Produce Base Rates for GNU Aspell

In [1]:
# Copyright 2025 Luke Moffett
# Licensed under the Apache License, Version 2.0

import pandas as pd
import matplotlib.pyplot as plt
import os
import tqdm
import torch
import pathlib
import enchant

from IPython.display import display
from clz_or_cls import datasets as corc_ds

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Enable the `progress_apply` variants on pandas objects (used later to
# show a progress bar while spell-checking every row).
tqdm.tqdm.pandas()

# Render figures at 300 dpi.
plt.rcParams['figure.dpi'] = 300
%matplotlib inline

In [None]:
# (dataset name, perturbation class) pairs whose test splits are evaluated.
datasets = [
    ("legit_extended", "visual"),
    ("dces", "visual"),
    ("ices", "visual"),
    ("zeroe_noise", "typo"),
    ("zeroe_typo", "typo"),
    ("anthro_typo", "typo"),
    ("anthro_phonetic", "phonetic"),
    ("phonee", "phonetic"),
    ("zeroe_phonetic", "phonetic"),
]

# Lookup from dataset name to its perturbation class.
class_map = dict(datasets)

# Load every test split, tag each row with where it came from, and
# concatenate once at the end. Concatenating inside the loop re-copies all
# previously accumulated rows on every iteration.
test_frames = []
for dataset, clazz in datasets:
    ds = corc_ds.generated_df(dataset, split='test')
    ds['source'] = dataset
    ds['class'] = clazz
    test_frames.append(ds)

# The per-dataset index values are kept as-is (no ignore_index) so the
# combined frame matches what incremental concatenation produced.
all_test_df = pd.concat(test_frames)

# Spot-check a random sample of the combined frame.
all_test_df.sample(25)

In [None]:
# Create a dedicated enchant broker so the provider ordering set below
# applies to the dictionary requested from it.
b = enchant.Broker()
# Ask for aspell first and myspell second; the "*" tag is presumably the
# wildcard for "all languages" (confirm against the pyenchant docs).
b.set_ordering("*","aspell,myspell")

# US-English dictionary obtained through the broker configured above.
en_dict_aspell = b.request_dict("en_US")

# Fail fast if the dictionary is backed by anything other than aspell,
# since the results of this notebook are reported as the aspell baseline.
assert en_dict_aspell.provider.name == 'aspell'

def closest_for(sp_dict):
    """Build a function that returns a dictionary's top suggestion for a word.

    Args:
        sp_dict: Object exposing ``suggest(word)`` that returns a sequence of
            candidate spellings, best candidate first (e.g. an enchant
            dictionary).

    Returns:
        A one-argument callable. Given a word it returns the first suggestion
        from ``sp_dict``, or ``None`` when there is no suggestion or the input
        is not a non-empty string.
    """
    def next_closest_word(word):
        # Missing cells (NaN/None) and empty strings cannot be spell-checked;
        # report "no suggestion" instead of letting the lookup raise and
        # abort a whole-column apply.
        if not isinstance(word, str) or not word:
            return None

        suggestions = sp_dict.suggest(word)
        return suggestions[0] if suggestions else None

    return next_closest_word

# Record aspell's top suggestion for every perturbed word. The suggestion
# function is passed directly; wrapping it in a lambda added nothing.
suggest_aspell = closest_for(en_dict_aspell)
all_test_df['aspell'] = all_test_df['perturbed'].progress_apply(suggest_aspell)
all_test_df

In [None]:
# Root directory for experiment outputs, taken from the CORC_RESULTS_DIR
# environment variable (a KeyError here means it is not set).
results_path = pathlib.Path(os.environ['CORC_RESULTS_DIR'])

# Create the target folder first so the write also works on a fresh
# results directory.
ad_word_dir = results_path/'ad-word'
ad_word_dir.mkdir(parents=True, exist_ok=True)
all_test_df.to_csv(ad_word_dir/'spellcheck_test.csv')

# Check Performance

In [6]:
# Reload the saved suggestions. Only empty fields are treated as missing:
# pandas' default NA markers include ordinary strings such as "null", "nan"
# and "NA", which would otherwise turn real words into NaN and make them
# count as incorrect in the accuracy computation.
all_test_df = pd.read_csv(
    results_path/'ad-word'/'spellcheck_test.csv',
    keep_default_na=False,
    na_values=[''],
)

In [None]:
# A prediction is correct when aspell's suggestion equals the clean word,
# ignoring case. Rows with a missing suggestion compare as not equal.
predicted = all_test_df['aspell'].str.lower()
expected = all_test_df['clean'].str.lower()
all_test_df['correct'] = predicted == expected

# Overall share of correct vs. incorrect rows.
all_test_df['correct'].value_counts(normalize=True, dropna=False)

In [None]:
# Accuracy per perturbation class, followed by an 'all' row computed over
# the whole test set.
class_accs = []
for clazz in list(all_test_df['class'].unique()) + ['all']:
    if clazz == 'all':
        selection = all_test_df
    else:
        selection = all_test_df[all_test_df['class'] == clazz]

    # The mean of a boolean column is the fraction of True values. Unlike
    # value_counts(normalize=True)[True], it yields 0.0 instead of raising
    # KeyError when a group contains no correct prediction.
    acc = selection['correct'].mean()
    class_accs.append((clazz, acc))

class_accs_df = pd.DataFrame(class_accs, columns=['class', 'acc'])
print('Spellcheck by Class')
class_accs_df.set_index('class')

In [None]:
# Accuracy per perturbation strategy (source dataset), labelled with the
# class that strategy belongs to.
strat_accs = []
for strategy in all_test_df['source'].unique():
    selection = all_test_df[all_test_df['source'] == strategy]
    clazz = class_map[strategy]

    # The mean of a boolean column is the fraction of True values. Unlike
    # value_counts(normalize=True)[True], it yields 0.0 instead of raising
    # KeyError when a strategy has no correct prediction.
    acc = selection['correct'].mean()
    strat_accs.append((clazz, strategy, acc))

strat_accs_df = pd.DataFrame(strat_accs, columns=['class', 'strategy', 'acc'])
print('Spellcheck by Strategy')
strat_accs_df.set_index(['class', 'strategy'])