In [2]:
import numpy as np
import pandas as pd
import textdistance 
from collections import Counter
import re

# File Opening and Cleaning (convert the file's encoding to UTF-8 first)

In [14]:
# Read the whole dictionary file and lowercase it so that counting is
# case-insensitive.
with open('Oxford English Dictionary.txt', 'r', encoding='utf-8') as f:
    data = f.read().lower()

# Raw string: in a plain literal `\w` is an invalid string escape, which Python
# warns about; r'\w+' passes the pattern to the regex engine unchanged.
# Each token is a maximal run of word characters (letters, digits, underscore).
words = re.findall(r'\w+', data)

In [15]:
# Peek at the first ten tokens to sanity-check the tokenisation.
print(words[:10])

['a', 'a', 'prefix', 'also', 'an', 'before', 'a', 'vowel', 'sound', 'not']


# make vocabulary

In [16]:
# Total number of tokens extracted from the text (duplicates included).
len(words)

741352

In [17]:
# Vocabulary: the set of distinct tokens found in the text.
V = set(words)

# build the frequency of those words

In [18]:
# Map each token to the number of times it occurs in `words`.
word_freq_dict = Counter(words)

In [19]:
# Show the ten most frequent tokens together with their counts.
word_freq_dict.most_common(10)

[('n', 29511),
 ('of', 27308),
 ('a', 25517),
 ('or', 22306),
 ('1', 16203),
 ('2', 16094),
 ('the', 13438),
 ('to', 13240),
 ('in', 12041),
 ('adj', 10212)]

# Relative Frequency of words
Now we want the probability of occurrence of each word, which is simply the word's relative frequency in the text:

The formula used to calculate the probability of a word in the provided code is:

Probability(word) = Frequency(word) / Total count of all words

In [20]:
# Corpus size: the sum of all per-word counts.
Total_words_freq = sum(word_freq_dict.values())

# Relative frequency of every word = its count divided by the corpus size.
probs = {
    token: count / Total_words_freq
    for token, count in word_freq_dict.items()
}

In [21]:
# Display the word -> relative-frequency mapping built above.
probs

{'a': 0.034419546989823996,
 'prefix': 0.00011060872567956922,
 'also': 0.0024023675662843023,
 'an': 0.004248993730373696,
 'before': 0.000369595010197585,
 'vowel': 5.395547594125328e-05,
 'sound': 0.0007405389072937012,
 'not': 0.002645167208019942,
 'without': 0.0006366746161067887,
 'amoral': 2.697773797062664e-06,
 'greek': 0.0027247515350332903,
 'aa': 2.697773797062664e-06,
 'abbr': 0.0010008740787102483,
 '1': 0.02185601441690317,
 'automobile': 6.74443449265666e-06,
 'association': 8.228210081041124e-05,
 '2': 0.021708985744963258,
 'alcoholics': 1.348886898531332e-06,
 'anonymous': 8.093321391187992e-06,
 '3': 0.007904477225393605,
 'anti': 3.1024398666220634e-05,
 'aircraft': 0.0002819173617930484,
 'aardvark': 1.348886898531332e-06,
 'n': 0.03980700126255814,
 'mammal': 8.228210081041124e-05,
 'with': 0.008164812396810153,
 'tubular': 1.8884416579438646e-05,
 'snout': 2.5628851072095308e-05,
 'and': 0.006687781242918344,
 'long': 0.000674443449265666,
 'tongue': 7.01421187

# Finding Similar Words
Now we rank the vocabulary by similarity to the input word, measured with the Jaccard index over the words' 2-grams (character bigrams, q = 2). We then return the 10 most similar words, ordered by similarity and, to break ties, by probability:

The Jaccard distance measures the dissimilarity between two sets by comparing the size of their intersection with the size of their union; the similarity used below is 1 minus that distance.

In [27]:
def autocorrect(word, top_n=10):
    """Suggest vocabulary words that are closest to a possibly misspelled word.

    The input is lowercased and looked up in the module-level ``probs``
    mapping (word -> relative frequency). If it is already a known word, a
    message is printed and nothing is suggested. Otherwise every vocabulary
    word is scored with the Jaccard similarity over 2-grams
    (``1 - Jaccard(qval=2).distance``) and the best matches are returned.

    Args:
        word: The word to check.
        top_n: Maximum number of suggestions to return (defaults to 10).

    Returns:
        ``None`` when ``word`` is already in the vocabulary; otherwise a
        DataFrame with columns ``Word``, ``Prob`` and ``Similarity`` holding
        the ``top_n`` rows sorted by similarity, then probability, both
        descending.
    """
    word = word.lower()
    if word in probs:
        print('the word is already there', word)
        return None

    # Build the measure once; it does not depend on the candidate word.
    jaccard = textdistance.Jaccard(qval=2)

    # Take both the words and their probabilities from `probs` so that every
    # similarity score is guaranteed to line up with its own row.
    candidates = pd.DataFrame({
        'Word': list(probs.keys()),
        'Prob': list(probs.values()),
    })
    candidates['Similarity'] = [
        1 - jaccard.distance(candidate, word) for candidate in candidates['Word']
    ]

    ranked = candidates.sort_values(['Similarity', 'Prob'], ascending=False)
    return ranked.head(top_n)
autocorrect('carrect')

Unnamed: 0,Word,Prob,Similarity
13577,carrel,1e-06,0.571429
1430,care,0.000117,0.5
4278,correct,9.4e-05,0.5
28847,hectare,5e-06,0.5
17928,rect,4e-06,0.5
45140,scarred,4e-06,0.5
31938,surrect,3e-06,0.5
38260,korrect,1e-06,0.5
50336,corrects,1e-06,0.444444
197,carry,0.000165,0.428571
