In [4]:
import pandas as pd
import numpy as np
import textdistance
import re
from collections import Counter

# Start from an empty token list so the name exists before the file is read
words = []

# Load the corpus as lowercase text, then split it into runs of word characters
with open('autocorrect book.txt', 'r', encoding='utf-8') as f:
    data = f.read().lower()
words = re.findall(r'\w+', data)

# Occurrence count for every distinct token
words_freq_dict = Counter(words)

# Vocabulary: the distinct tokens seen in the corpus
V = set(words_freq_dict)

# Corpus size in tokens (identical to summing every per-word count)
Total = len(words)

# Relative frequency of each word within the corpus
probs = {word: count / Total for word, count in words_freq_dict.items()}

def autocorrect(input_word, top_n=10):
    """
    Check a word against the vocabulary and, if it is unknown, rank corrections.

    Candidates are scored with Jaccard similarity computed by
    ``textdistance.Jaccard(qval=2)`` and ordered by similarity, with the
    word's corpus probability breaking ties.

    Args:
        input_word (str): The input word to be checked. It is lowercased
            before the lookup.
        top_n (int): Maximum number of suggestions to return. Defaults to 10.

    Returns:
        str: A confirmation message when the lowercased word is in ``V``.
        DataFrame: Otherwise, at most ``top_n`` rows with the columns
            ``Word``, ``Prob`` and ``Similarity``, best match first.
    """
    input_word = input_word.lower()

    # Known word: nothing to correct
    if input_word in V:
        return f'Your word seems to be correct: {input_word}'

    # Build the metric once instead of once per vocabulary word
    jaccard = textdistance.Jaccard(qval=2)

    # One pass over probs yields word, probability and similarity together,
    # so the three columns are aligned by construction rather than by relying
    # on two separate dicts sharing the same iteration order.
    rows = [
        (word, prob, 1 - jaccard.distance(word, input_word))
        for word, prob in probs.items()
    ]
    df = pd.DataFrame(rows, columns=['Word', 'Prob', 'Similarity'])

    # Highest similarity first; more frequent words win ties
    return df.sort_values(['Similarity', 'Prob'], ascending=False).head(top_n)

def word_suggestion(input_word, top_n=10):
    """
    Suggest vocabulary words similar to the input word.

    Candidates are scored with Jaccard similarity computed by
    ``textdistance.Jaccard()`` using its default settings and ordered by
    similarity, with the word's corpus probability breaking ties.

    Args:
        input_word (str): The input word for which suggestions are sought.
            It is lowercased before the lookup.
        top_n (int): Maximum number of suggestions to return. Defaults to 10.

    Returns:
        str: A notice when the lowercased word is already a key of ``probs``.
        DataFrame: Otherwise, at most ``top_n`` rows with the columns
            ``Word``, ``Prob`` and ``Similarity``, best match first.
    """
    input_word = input_word.lower()

    # Word already in the corpus: no suggestions needed
    if input_word in probs:
        return f'The word is already there: {input_word}'

    jaccard = textdistance.Jaccard()

    # One pass over probs yields word, probability and similarity together,
    # so the three columns are aligned by construction rather than by relying
    # on two separate dicts sharing the same iteration order.
    rows = [
        (word, prob, 1 - jaccard.distance(word, input_word))
        for word, prob in probs.items()
    ]
    df = pd.DataFrame(rows, columns=['Word', 'Prob', 'Similarity'])

    # Highest similarity first; more frequent words win ties
    return df.sort_values(['Similarity', 'Prob'], ascending=False).head(top_n)

# Demo run: a misspelled word goes through autocorrect and a word fragment
# through word_suggestion, with a dashed rule between the two printouts.
autocorrect_result = autocorrect("helo")
print(autocorrect_result)

print('------------------------------------------')
suggestion_result = word_suggestion("hel")
print(suggestion_result)


          Word      Prob  Similarity
2969      help  0.000184    0.500000
3031      held  0.000166    0.500000
4473      helm  0.000157    0.500000
1653      hell  0.000076    0.500000
7989      heel  0.000036    0.500000
375   bachelor  0.000040    0.428571
6121     below  0.000234    0.400000
1560     wheel  0.000027    0.400000
2836     shelf  0.000022    0.400000
9257     shell  0.000018    0.400000
------------------------------------------
        Word      Prob  Similarity
3114    hole  0.000198    0.750000
2969    help  0.000184    0.750000
3031    held  0.000166    0.750000
4473    helm  0.000157    0.750000
1653    hell  0.000076    0.750000
7989    heel  0.000036    0.750000
355       he  0.008515    0.666667
6058      eh  0.000085    0.666667
15260     le  0.000004    0.666667
8      whale  0.005524    0.600000
