In [3]:
import nltk
import numpy as np
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
def separate_words(tokens): #this function is responsable to get only words from the token list ["word", "?", "word", "."]
  word_list = []
  for token in tokens:
    if token.isalpha():
      word_list.append(token)
    else:
      continue
  return word_list

def normalize_words(list): #this function is responsable to set all characters in lower case
  word_list = []
  for word in list:
    word_list.append(word.lower())
  return word_list

def generate_words(word): #this function is responsable to genarate all possible words for all possible mistakes
  slices = []
  for i in range(len(word) + 1):
    slices.append((word[:i], word[i:]))
  new_words = insert_letter(slices)
  new_words += delete_character(slices)
  new_words += change_character(slices)
  new_words += change_pos(slices)
  return new_words

def generate_words_turbo(words): #this function is responsable to recall 'generate_words' and deal with mistaken 2 positions ahead
  new_words = []
  for word in words:
    new_words += generate_words(word)
  return new_words

def insert_letter(slices): # possible mistake (case 1): word misses a letter
  new_words = []
  letters = "abcdefghijklmnopqrstuvwxyzáàãâéèêíìîóòõôúùûç"
  for l_slice, r_slice in slices:
    for letter in letters:
      new_words.append( l_slice + letter + r_slice)
  return new_words

def delete_character(slices): # possible mistake (case 2): word has an additional letter
    new_words = []
    for l_slice, r_slice in slices:
        new_words.append(l_slice + r_slice[1:])
    return new_words

def change_character(slices): # possible mistake (case 3): word has a wrong letter
    new_words = []
    letters = "abcdefghijklmnopqrstuvwxyzáàãâéèêíìîóòõôúùûç"
    for l_slice, r_slice in slices:
        for letter in letters:
            new_words.append(l_slice + letter + r_slice[1:])
    return new_words

def change_pos(slices): # possible mistake (case 4): word has letter switched place with the next one
    new_words = []
    for l_slice, r_slice in slices:
        if len(r_slice) > 1:
            new_words.append(l_slice + r_slice[1] + r_slice[0] + r_slice[2:])
    return new_words

def probability(generated_word): #this function is responsable to calculate the frequency a word is an amount of words
    freq = nltk.FreqDist(word_list)
    word_count = len(word_list)
    return freq[generated_word]/word_count

def create_test_data(file): 
  test_words = []
  f = open(file, "r", encoding="utf8")
  for line in f:
    right_word, wrong_word = line.split()
    test_words.append((right_word, wrong_word))
  f.close()
  return test_words

def validate_corrector(test_words, vocabulary):
    number_test_words = len(test_words)
    got_right = 0
    unknow_words = 0
    for right_word, wrong_word in test_words:
        corrected_word = corrector(wrong_word)
        unknow_words += (right_word not in vocabulary)
        if corrected_word == right_word:
            got_right += 1
        else:
          print(wrong_word + '-' + corrector(wrong_word) + '-' + new_corrector(wrong_word))
    hit_rate = round(got_right * 100/number_test_words, 2)
    unknow_rate = round(unknow_words*100/number_test_words, 2)
    print(f'{hit_rate}% de palavras corrigidas a partir de {number_test_words} palavras, desconhecida é {unknow_rate}%')

def corrector(word): #this is the first version of the corrector
    generated_words = generate_words(word)
    right_word = max(generated_words, key = probability)
    return right_word

def new_corrector(word, list): #this is the corrector improved, should be faster
    vocabulary = np.array(list)
    generated_words = generate_words(word)
    turbo_words = generate_words_turbo(generated_words)
    whole = generated_words + turbo_words
    all_words = np.array(whole)
    print(whole)
    real_words = []
    for word in all_words:
        if word in vocabulary:
          real_words.append(word)
    right_word = max(real_words, key = probability)
    return right_word

In [19]:
with open("artigos.txt", "r", encoding="utf8") as file:
    artigos = file.read()
tokens = nltk.tokenize.word_tokenize(artigos)

word_list = separate_words(tokens)
word_list = normalize_words(word_list)
vocabulary = word_list.copy
test_words = create_test_data("palavras.txt")

In [29]:
corrected_word = new_corrector("lógicha", vocabulary)
print(corrected_word)

['alógicha', 'blógicha', 'clógicha', 'dlógicha', 'elógicha', 'flógicha', 'glógicha', 'hlógicha', 'ilógicha', 'jlógicha', 'klógicha', 'llógicha', 'mlógicha', 'nlógicha', 'ológicha', 'plógicha', 'qlógicha', 'rlógicha', 'slógicha', 'tlógicha', 'ulógicha', 'vlógicha', 'wlógicha', 'xlógicha', 'ylógicha', 'zlógicha', 'álógicha', 'àlógicha', 'ãlógicha', 'âlógicha', 'élógicha', 'èlógicha', 'êlógicha', 'ílógicha', 'ìlógicha', 'îlógicha', 'ólógicha', 'òlógicha', 'õlógicha', 'ôlógicha', 'úlógicha', 'ùlógicha', 'ûlógicha', 'çlógicha', 'laógicha', 'lbógicha', 'lcógicha', 'ldógicha', 'leógicha', 'lfógicha', 'lgógicha', 'lhógicha', 'liógicha', 'ljógicha', 'lkógicha', 'llógicha', 'lmógicha', 'lnógicha', 'loógicha', 'lpógicha', 'lqógicha', 'lrógicha', 'lsógicha', 'ltógicha', 'luógicha', 'lvógicha', 'lwógicha', 'lxógicha', 'lyógicha', 'lzógicha', 'láógicha', 'làógicha', 'lãógicha', 'lâógicha', 'léógicha', 'lèógicha', 'lêógicha', 'líógicha', 'lìógicha', 'lîógicha', 'lóógicha', 'lòógicha', 'lõógicha', 'lô

ValueError: max() arg is an empty sequence