# **Keyboard-Auto-Suggestion-NLP-Python-Project**

In [1]:
pip install textdistance

Collecting textdistance
  Downloading textdistance-4.6.3-py3-none-any.whl.metadata (18 kB)
Downloading textdistance-4.6.3-py3-none-any.whl (31 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.6.3


In [2]:
import numpy as np
import re
import pandas as pd
import textdistance

In [3]:
from collections import Counter

## File Opening and Cleaning (change format to UTF-8)

In [4]:
# Read the whole corpus into memory and normalise it to lower case.
# The file handle is only needed for the read, so it is closed before tokenising.
with open('autocorrect book.txt', 'r', encoding='utf-8') as f:
    data = f.read().lower()

# Raw string: `\w` is a regex class, not a Python string escape, and a
# non-raw '\w' triggers an invalid-escape warning on newer interpreters.
word = re.findall(r'\w+', data)

# `words` is the token list used by the rest of the notebook (a separate list from `word`).
words = list(word)

In [5]:
# Preview the first ten tokens to sanity-check the tokenisation.
print(words[:10])

['the', 'project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or', 'the', 'whale']


## Make the vocabulary

In [6]:
# Total number of tokens in the corpus (duplicates included).
len(words)

222663

In [7]:
# Vocabulary: the distinct tokens that occur in `words`.
V = set(words)

## Build the frequency of those words

In [8]:
# Map each token to the number of times it occurs in `words`.
word_freq_dict = Counter(words)

In [9]:
# The ten most frequent tokens together with their counts.
word_freq_dict.most_common(10)

[('the', 14703),
 ('of', 6742),
 ('and', 6517),
 ('a', 4799),
 ('to', 4707),
 ('in', 4238),
 ('that', 3081),
 ('it', 2534),
 ('his', 2530),
 ('i', 2120)]

## Relative Frequency of words

Next we want the probability of occurrence of each word, which equals its relative frequency in the corpus:

The formula used to calculate the probability of a word in the provided code is:

Probability(word) = Frequency(word) / Total count of all words

In [10]:
# Total number of counted tokens: the denominator of every relative frequency.
Total_words_freq = sum(word_freq_dict.values())

# Relative frequency of each token: its own count divided by the total.
probs = {
    token: count / Total_words_freq
    for token, count in word_freq_dict.items()
}

In [11]:
# Display the word -> probability mapping.
# NOTE(review): this renders the entire dict; previewing a slice would keep the output short.
probs

{'the': 0.06603252448767868,
 'project': 0.0004086893646452262,
 'gutenberg': 0.0004221626404027611,
 'ebook': 4.4910919191783095e-05,
 'of': 0.030278941719100165,
 'moby': 0.0004041982727260479,
 'dick': 0.0004041982727260479,
 'or': 0.003579400259585113,
 'whale': 0.005524043060589321,
 'by': 0.005488114325235894,
 'herman': 1.796436767671324e-05,
 'melville': 1.796436767671324e-05,
 'this': 0.006462681271697588,
 'is': 0.007863901950481221,
 'for': 0.007383355115129142,
 'use': 0.0002200635040397372,
 'anyone': 2.694655151506986e-05,
 'anywhere': 7.185747070685296e-05,
 'at': 0.005995607712103043,
 'no': 0.002667708599991916,
 'cost': 1.796436767671324e-05,
 'and': 0.029268446037285047,
 'with': 0.00794474160502643,
 'almost': 0.000884745108078127,
 'restrictions': 8.98218383835662e-06,
 'whatsoever': 3.143764343424817e-05,
 'you': 0.004302466058572821,
 'may': 0.001145228439390469,
 'copy': 8.533074646438789e-05,
 'it': 0.011380426923197837,
 'give': 0.0004041982727260479,
 'away':

## Finding Similar Words

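Now we score every vocabulary word by its Jaccard similarity to the input word (1 − Jaccard distance). `textdistance.Jaccard()` is called with its default settings, so the words are compared by their individual characters rather than by 2-grams. We then return the 10 most similar words, ordered by similarity and then by probability: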

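The Jaccard distance measures the dissimilarity between two sets $A$ and $B$ by comparing their intersection and union: $d_J(A, B) = 1 - \frac{|A \cap B|}{|A \cup B|}$. The code below uses $1 - d_J$ as the similarity score.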

In [13]:
def autocorrect(word, top_n=10):
    """Suggest vocabulary words that resemble ``word``.

    Every word in ``probs`` is scored by its Jaccard similarity to the
    lower-cased input, computed as ``1 - textdistance.Jaccard().distance(...)``.
    Candidates are ranked by similarity, with ties broken by the word's
    probability (both descending).

    Parameters
    ----------
    word : str
        The word to look up; it is lower-cased before comparison.
    top_n : int, default 10
        Maximum number of suggestions to return.

    Returns
    -------
    pandas.DataFrame or None
        The ``top_n`` best candidates with columns ``Word``, ``Prob`` and
        ``Similarity``, best first. If the word is already in the vocabulary,
        a message is printed and ``None`` is returned instead.
    """
    word = word.lower()
    if word in probs:
        print('the word is already there', word)
        return None

    # One scorer for the whole scan instead of a new instance per candidate.
    jaccard = textdistance.Jaccard()

    # Words, probabilities and similarities are all derived from `probs`, so
    # the three columns cannot drift out of alignment, and an empty
    # vocabulary still yields a frame with the expected columns.
    candidates = pd.DataFrame({
        'Word': list(probs.keys()),
        'Prob': list(probs.values()),
        'Similarity': [1 - jaccard.distance(w, word) for w in probs],
    })
    return candidates.sort_values(['Similarity', 'Prob'], ascending=False).head(top_n)
# Example: 'hel' is not in the vocabulary, so the ranked suggestions are returned and displayed.
autocorrect('hel')

Unnamed: 0,Word,Prob,Similarity
3114,hole,0.000198,0.75
2969,help,0.000184,0.75
3031,held,0.000166,0.75
4473,helm,0.000157,0.75
1653,hell,7.6e-05,0.75
7989,heel,3.6e-05,0.75
355,he,0.008515,0.666667
6058,eh,8.5e-05,0.666667
15260,le,4e-06,0.666667
8,whale,0.005524,0.6
