In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Preparing data

In [3]:
from src.read_file_to_df import read_file_to_df

# Primary spelling test sets; these two are concatenated further down.
df1, df2 = (
    read_file_to_df(path)
    for path in ('../data/spell-testset1.txt', '../data/spell-testset2.txt')
)

# Extra corpora the author flagged as having "misleading incorrect strings".
# NOTE(review): none of the cells visible here reference df3/df4/df5 — confirm
# whether they are still needed.
df3, df4, df5 = (
    read_file_to_df(path)
    for path in ('../data/aspell.txt', '../data/birkbeck.txt', '../data/wikipedia.txt')
)

In [4]:
# Stack the two test sets row-wise and renumber the index from zero.
df_combined = pd.concat(
    objs=[df1, df2],
    ignore_index=True,
)

In [5]:
# Persist the combined set; index=False keeps the row index out of the file.
df_combined.to_csv(path_or_buf='../data/combined.csv', index=False)

In [6]:
# Reload the combined set from disk. The columns hold literal words, so pandas'
# default NA sentinels are switched off: otherwise entries such as "null",
# "nan", "NA" or "None" would come back as missing values instead of strings.
df_combined = pd.read_csv('../data/combined.csv', keep_default_na=False)

In [7]:
# Draw a 100-row random subset; the fixed random_state makes the draw repeatable.
df_sampled = df_combined.sample(
    n=100,
    random_state=42,
)

[PySpellChecker](https://pypi.org/project/pyspellchecker/) - Levenshtein distance

In [8]:
from spellchecker import SpellChecker
from tqdm import tqdm

spell = SpellChecker()


def get_correction(word):
    """Return PySpellChecker's suggestion for ``word``.

    If ``spell.unknown`` reports nothing for the word, it is returned
    unchanged. Otherwise the result of ``spell.correction`` is returned, or
    the placeholder string "incorrectWord" when that result is falsy.
    """
    # Guard clause: nothing flagged as unknown, so keep the input as-is.
    if not spell.unknown([word]):
        return word

    suggestion = spell.correction(word)
    return suggestion if suggestion else "incorrectWord"

# Register tqdm's pandas integration so Series gain `progress_apply`.
tqdm.pandas()

# Run every misspelling through PySpellChecker and store the result as a new column.
misspellings = df_sampled['incorrect']
df_sampled['py_spell_checker'] = misspellings.progress_apply(get_correction)


  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:01<01:05,  1.49it/s][A
 10%|█         | 10/100 [00:01<00:14,  6.02it/s][A
 14%|█▍        | 14/100 [00:02<00:15,  5.60it/s][A
 18%|█▊        | 18/100 [00:03<00:16,  5.09it/s][A
 19%|█▉        | 19/100 [00:05<00:27,  2.97it/s][A
 20%|██        | 20/100 [00:06<00:35,  2.28it/s][A
 37%|███▋      | 37/100 [00:07<00:08,  7.19it/s][A
 39%|███▉      | 39/100 [00:08<00:11,  5.53it/s][A
 53%|█████▎    | 53/100 [00:08<00:04,  9.59it/s][A
 59%|█████▉    | 59/100 [00:09<00:04,  9.04it/s][A
 62%|██████▏   | 62/100 [00:09<00:04,  7.97it/s][A
 69%|██████▉   | 69/100 [00:10<00:04,  7.62it/s][A
 80%|████████  | 80/100 [00:11<00:02,  9.68it/s][A
 82%|████████▏ | 82/100 [00:12<00:02,  8.21it/s][A
 87%|████████▋ | 87/100 [00:13<00:01,  7.79it/s][A
100%|██████████| 100/100 [00:13<00:00,  7.41it/s][A


In [9]:
# Reference spellings and the PySpellChecker predictions from the sampled frame.
y_true, y_pred = df_sampled['correct'], df_sampled['py_spell_checker']

In [10]:
# Exact-match scoring of predictions against the reference spellings.
# Each distinct label is its own class, so average='macro' is the unweighted
# mean of the per-label scores.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
# zero_division=0 matches precision/recall above. The score is the same as with
# the default ("warn" also yields 0), but the UndefinedMetricWarning raised for
# labels with no true or predicted samples is no longer emitted.
f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

In [11]:
# Report the four scores, one per line.
print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
    sep='\n',
)

Accuracy: 0.73
Precision: 0.5855855855855856
Recall: 0.5780780780780781
F1 Score: 0.5807807807807808


In [12]:
# Save the PySpellChecker scores as "Name: value" lines for the comparison cell.
with open('../data/py_spellchecker_metrics.txt', 'w') as f:
    f.writelines([
        f'Accuracy: {accuracy}\n',
        f'Precision: {precision}\n',
        f'Recall: {recall}\n',
        f'F1 Score: {f1}\n',
    ])

[Bhuvana/t5-base-spellchecker](https://huggingface.co/Bhuvana/t5-base-spellchecker)

In [13]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Tokenizer and model weights are both loaded from the same checkpoint name.
checkpoint = "Bhuvana/t5-base-spellchecker"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

In [14]:
def correct(inputs):
    """Generate a corrected version of ``inputs`` with the T5 spell-checker.

    The text is encoded with the module-level ``tokenizer``, one sequence is
    generated by the module-level ``model``, and that sequence is decoded back
    to a string with special tokens removed.

    NOTE(review): generation runs with ``do_sample=True`` and no seed is set in
    the visible cells, so repeated runs may return different corrections —
    confirm whether that is intended for an evaluation.
    """
    generation_options = dict(
        do_sample=True,
        max_length=50,
        top_p=0.99,
        num_return_sequences=1,
    )
    encoded = tokenizer.encode(inputs, return_tensors='pt')
    generated = model.generate(encoded, **generation_options)
    # Only one sequence is requested, so the first entry is the answer.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [15]:
# Register tqdm's pandas integration so Series gain `progress_apply`.
tqdm.pandas()

# Run every misspelling through the T5 model and store the result as a new column.
misspellings = df_sampled['incorrect']
df_sampled['base_spellchecker'] = misspellings.progress_apply(correct)


  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:01<01:10,  1.39it/s][A
  3%|▎         | 3/100 [00:01<00:56,  1.72it/s][A
  4%|▍         | 4/100 [00:02<00:49,  1.95it/s][A
  5%|▌         | 5/100 [00:02<00:48,  1.98it/s][A
  6%|▌         | 6/100 [00:03<00:50,  1.87it/s][A
  7%|▋         | 7/100 [00:03<00:44,  2.08it/s][A
  8%|▊         | 8/100 [00:04<00:40,  2.24it/s][A
  9%|▉         | 9/100 [00:04<00:42,  2.13it/s][A
 10%|█         | 10/100 [00:04<00:39,  2.27it/s][A
 11%|█         | 11/100 [00:05<00:36,  2.43it/s][A
 12%|█▏        | 12/100 [00:05<00:34,  2.52it/s][A
 13%|█▎        | 13/100 [00:06<00:43,  2.02it/s][A
 14%|█▍        | 14/100 [00:06<00:37,  2.30it/s][A
 15%|█▌        | 15/100 [00:07<00:37,  2.25it/s][A
 16%|█▌        | 16/100 [00:07<00:33,  2.54it/s][A
 17%|█▋        | 17/100 [00:07<00:29,  2.78it/s][A
 18%|█▊        | 18/100 [00:08<00:37,  2.19it/s][A
 19%|█▉        | 19/100 [00:08<00:39,  2.04it/s][A
 20%|██        | 20/100 [00:

In [19]:
# Reference spellings and the T5 model's predictions from the sampled frame.
y_true, y_pred = df_sampled['correct'], df_sampled['base_spellchecker']

In [20]:
# Exact-match scoring of the T5 predictions against the reference spellings.
# Each distinct label is its own class, so average='macro' is the unweighted
# mean of the per-label scores.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
# zero_division=0 matches precision/recall above. The score is the same as with
# the default ("warn" also yields 0), but the UndefinedMetricWarning raised for
# labels with no true or predicted samples is no longer emitted.
f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

In [21]:
# Save the T5 model's scores as "Name: value" lines for the comparison cell.
with open('../data/base_spellchecker_metrics.txt', 'w') as f:
    f.writelines([
        f'Accuracy: {accuracy}\n',
        f'Precision: {precision}\n',
        f'Recall: {recall}\n',
        f'F1 Score: {f1}\n',
    ])

In [22]:
# Read both saved metric reports back and place them side by side in a DataFrame.
with open('../data/py_spellchecker_metrics.txt', 'r') as f:
    # One stripped "Name: value" string per line of the report.
    py_spellchecker_metrics = [line.strip() for line in f]

with open('../data/base_spellchecker_metrics.txt', 'r') as f:
    base_spellchecker_metrics = [line.strip() for line in f]

# One column per spell checker; row i holds the i-th line of each report.
df_metrics = pd.DataFrame(
    data={
        'py_spellchecker': py_spellchecker_metrics,
        'base_spellchecker': base_spellchecker_metrics,
    }
)

In [23]:
df_metrics

Unnamed: 0,py_spellchecker,base_spellchecker
0,Accuracy: 0.73,Accuracy: 0.09
1,Precision: 0.5855855855855856,Precision: 0.05
2,Recall: 0.5780780780780781,Recall: 0.04351851851851852
3,F1 Score: 0.5807807807807808,F1 Score: 0.045370370370370366
