In [89]:
from nltk.metrics.distance import edit_distance

class Autocorrect:
    """Suggest spelling corrections for a word from a reference corpus.

    The corpus is reduced to a frequency table mapping each distinct word to
    ``{"cnt": <occurrences>, "rel": <occurrences / total words>}``. Unknown
    words are compared against every distinct corpus word with
    ``edit_distance`` and the closest ones are returned.
    """

    def __init__(self, corpus):
        """Keep a reference to the corpus and precompute its frequency table.

        Args:
            corpus: Iterable of word tokens. It is traversed again by
                ``test_freq``, so it must support repeated iteration.
        """
        self._words = corpus
        self._word_freq = self.get_word_freq(corpus)

    def get_word_freq(self, corpus) -> dict:
        """Build the absolute/relative frequency table for ``corpus``.

        Args:
            corpus: Iterable of word tokens.

        Returns:
            Dict mapping each distinct word to ``{"cnt": int, "rel": float}``,
            in order of first occurrence. Empty for an empty corpus.
        """
        counts = {}
        total = 0
        for word in corpus:
            counts[word] = counts.get(word, 0) + 1
            total += 1
        # The total is taken from the pass itself rather than len(corpus),
        # so any iterable works and no second traversal is needed.
        return {
            word: {"cnt": cnt, "rel": cnt / total}
            for word, cnt in counts.items()
        }

    def checkWord(self, string, max_results=5, substitution_cost=2) -> list:
        """Return the corpus words closest to ``string``.

        Args:
            string: Word to look up.
            max_results: Maximum number of candidates to return (default 5).
            substitution_cost: Passed through to ``edit_distance`` as the
                cost of substituting one character (default 2).

        Returns:
            List of ``(distance, word, relative_frequency)`` tuples. If
            ``string`` occurs in the corpus, that single entry with distance
            0 is returned. Otherwise the ``max_results`` nearest words are
            returned in ascending tuple order.

        Raises:
            ValueError: If ``max_results`` is negative.
        """
        if max_results < 0:
            raise ValueError("max_results must be non-negative")

        known = self._word_freq.get(string)
        if known is not None:
            return [(0, string, known["rel"])][:max_results]

        candidates = [
            (
                edit_distance(string, word, substitution_cost=substitution_cost),
                word,
                stats["rel"],
            )
            for word, stats in self._word_freq.items()
        ]
        # Tuples order by distance first, then by the word itself. Words are
        # unique, so the relative frequency never influences the ranking.
        candidates.sort()
        return candidates[:max_results]

    def test_freq(self, string) -> int:
        """Count occurrences of ``string`` by scanning the stored corpus.

        This bypasses the cached frequency table, so it can be used to
        cross-check the table's ``"cnt"`` values.
        """
        return sum(1 for word in self._words if word == string)

In [94]:
from nltk.corpus import brown

# Build the corrector from one Brown corpus document.
ca01 = brown.words("ca01")
ac = Autocorrect(ca01)

# Sanity check: the relative frequency recomputed by hand should equal the
# cached "rel" value printed next to the raw count.
print(ac._word_freq["the"]["cnt"] / len(ca01))
print(ac._word_freq["the"]["cnt"], ac._word_freq["the"]["rel"], sep="   ")

# Example lookup; the cell displays the returned candidate list.
ac.checkWord("wherefore")

0.056645851917930416
127   0.056645851917930416


[(4, 'where', 0.00044603033006244426),
 (5, 'Before', 0.00044603033006244426),
 (5, 'before', 0.00044603033006244426),
 (5, 'were', 0.0008920606601248885),
 (6, 'There', 0.00044603033006244426)]

In [96]:
# Cross-check: the cached count for "were" should equal a fresh scan of the corpus.
print(ac._word_freq["were"]["cnt"], ac.test_freq("were"), sep="\n")

2
2
