In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
from spellcheck.utils import *

In [340]:
from spellcheck.utils import tokens

# Reference sentences for the sentence-level correction benchmark, one proverb
# per line. Lines after the first keep their two-space indent inside the
# literal, so every element except the first starts with two spaces.
# The text must be spelled correctly because it is used as the expected result.
proverbs = ("""A little knowledge is a dangerous thing
  A man who is his own lawyer has a fool for his client
  All work and no play makes Jack a dull boy
  Better to remain silent and be thought a fool than to speak and remove all doubt
  Do unto others as you would have them do to you
  Early to bed and early to rise, makes a man healthy, wealthy and wise
  Fools rush in where angels fear to tread
  Genius is one percent inspiration, ninety-nine percent perspiration
  If you lie down with dogs, you will get up with fleas
  Lightning never strikes twice in the same place
  Power corrupts; absolute power corrupts absolutely
  Here today, gone tomorrow
  See no evil, hear no evil, speak no evil
  Sticks and stones may break my bones, but words will never hurt me
  Take care of the pence and the pounds will take care of themselves
  The bigger they are, the harder they fall
  The grass is always greener on the other side of the fence
  The more things change, the more they stay the same
  Those who do not learn from history are doomed to repeat it"""
  .splitlines())

# Re-join each proverb from its tokens, separated by single spaces.
proverbs = [" ".join(tokens(line)) for line in proverbs]

In [365]:
# Edit-distance limit shared by the whole notebook: it is passed as
# max_dictionary_edit_distance to both SpellCheckFast and SymSpell, and as the
# distance argument of SymSpell's lookup_compound helper.
max_edit_distance = 2

In [409]:
from spellcheck.spellcheck import SpellCheck, SpellCheckFast

# Word and bigram dictionary files handed to SpellCheckFast.
words_path = "spellcheck/data/train/symspellpy_words.txt"
bigrams_path = "spellcheck/data/train/symspellpy_bigrams.txt"

# The plain SpellCheck variant is not built here; it would be SpellCheck(words_path).
spell_check_fast = SpellCheckFast(
    words_path,
    bigrams_path,
    max_dictionary_edit_distance=max_edit_distance,
    prefix_length=7,
)

In [378]:
from symspellpy import SymSpell, Verbosity

# Reference SymSpell instance, built with the same edit distance and prefix
# length that SpellCheckFast is given, so the two can be compared.
sym_spell = SymSpell(max_dictionary_edit_distance=max_edit_distance, prefix_length=7)

# symspellpy's load_dictionary / load_bigram_dictionary signal a missing file by
# returning False instead of raising. Check the result so a wrong path fails
# here rather than silently benchmarking an empty dictionary.
for loader, dictionary_path in (
    (sym_spell.load_dictionary, "spellcheck/data/train/symspellpy_words.txt"),
    (sym_spell.load_bigram_dictionary, "spellcheck/data/train/symspellpy_bigrams.txt"),
):
    if not loader(dictionary_path, term_index=0, count_index=1, separator='\t'):
        raise FileNotFoundError(f"SymSpell dictionary file not found: {dictionary_path}")

def sym_spell_correct(word):
    """Return the term of the first suggestion ``sym_spell.lookup`` gives for ``word``.

    The lookup is made with ``Verbosity.TOP`` and ``include_unknown=True``.
    """
    top_hit = sym_spell.lookup(word, Verbosity.TOP, include_unknown=True)[0]
    return top_hit.term

def sym_spell_correct_compound(word):
    """Return the term of the first ``sym_spell.lookup_compound`` suggestion.

    ``word`` is forwarded unchanged together with the notebook-level
    ``max_edit_distance``.
    """
    top_hit = sym_spell.lookup_compound(word, max_edit_distance)[0]
    return top_hit.term


In [359]:
# Load the three misspelling test sets (wiki, TOEFL, aspell) in that order.
test_set_wiki, test_set_toefl, test_set_aspell = map(
    import_test_set,
    (
        "spellcheck/data/test/list_of_common_misspellings.txt",
        "spellcheck/data/test/toefl-spell.txt",
        "spellcheck/data/test/aspell-test.txt",
    ),
)

In [410]:
# Word-level benchmark: run every corrector over the wiki, TOEFL and aspell
# test sets, in that order.
# Each entry is (heading, correction function, vocabulary passed to spelltest).
# The vocabulary is presumably what spelltest uses for the "unknown" percentage
# in its output -- confirm against spelltest.
# Each corrector is paired with its OWN vocabulary: the SymSpell rows use
# sym_spell.words for all three test sets, including aspell.
word_benchmarks = [
    ("SpellCheckFast w/ correct:", spell_check_fast.correct, spell_check_fast._words),
    ("SpellCheckFast w/ correct_text_best:", spell_check_fast.correct_text_best, spell_check_fast._words),
    ("SymSpell w/ lookup:", sym_spell_correct, sym_spell.words),
    ("SymSpell w/ lookup_compound:", sym_spell_correct_compound, sym_spell.words),
]

for heading, correct_fn, vocabulary in word_benchmarks:
    print(heading)
    for test_set in (test_set_wiki, test_set_toefl, test_set_aspell):
        spelltest(test_set, correct_fn, vocabulary)

SpellCheckFast w/ correct:
82% of 4017 correct (4% unknown) at 3316 words per second 
72% of 6232 correct (8% unknown) at 5385 words per second 
50% of 547 correct (6% unknown) at 5954 words per second 
SpellCheckFast w/ correct_text_best:
82% of 4017 correct (3% unknown) at 1604 words per second 
76% of 6232 correct (4% unknown) at 2719 words per second 
51% of 547 correct (5% unknown) at 802 words per second 
SymSpell w/ lookup:
82% of 4017 correct (4% unknown) at 19644 words per second 
72% of 6232 correct (8% unknown) at 14935 words per second 
50% of 547 correct (6% unknown) at 12569 words per second 
SymSpell w/ lookup_compound:
80% of 4017 correct (3% unknown) at 677 words per second 
74% of 6232 correct (3% unknown) at 744 words per second 
49% of 547 correct (4% unknown) at 820 words per second 


In [425]:
# Pair each proverb with a corrupted copy: (text with spelling errors, original).
# NOTE(review): add_spelling_errors is presumably randomised; if so, seed its
# RNG beforehand to make the scores below reproducible -- confirm which RNG it uses.
proverbs_w_errors = [
    (add_spelling_errors(proverb, 0.2, segment=True), proverb)
    for proverb in proverbs
]

In [427]:
# Sentence-level benchmark on the corrupted proverbs.
# Each entry is (label, text printed after the label to align the scores,
# function mapping a corrupted sentence to its corrected form).
# The SymSpell entries use the shared max_edit_distance (via the existing
# sym_spell_correct_compound helper for the lookup row) instead of a literal 2,
# so they stay in step with the distance the SymSpell dictionary was built with.
sentence_benchmarks = [
    ("SpellCheckFast dumb:", "\t", spell_check_fast.correct_text_dumb),
    ("SpellCheckFast:", "\t\t", spell_check_fast.correct_text),
    ("SpellCheckFast better:", "\t", spell_check_fast.correct_text_better),
    ("SpellCheckFast best:", "\t", spell_check_fast.correct_text_best),
    ("Symspellpy lookup:", "\t", sym_spell_correct_compound),
    (
        "Symspellpy segment:",
        "\t",
        lambda p: sym_spell.word_segmentation(p, max_edit_distance).corrected_string,
    ),
]

for label, padding, corrector in sentence_benchmarks:
    print(label, end=padding)
    print(test_sentence_correction(corrector, proverbs_w_errors, False))

SpellCheckFast dumb:	85.71
SpellCheckFast:		86.95
SpellCheckFast better:	87.17
SpellCheckFast best:	87.18
Symspellpy lookup:	88.71
Symspellpy segment:	88.68
