In [1]:
import nltk
from collections import Counter
from nltk import ngrams
from Levenshtein import distance
import string

# Download necessary NLTK data
nltk.download('words')
nltk.download('brown')

from nltk.corpus import words, brown

[nltk_data] Downloading package words to /home/nathalie/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package brown to /home/nathalie/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [2]:
# Load vocabulary and word frequencies
vocabulary = set(words.words())  # List of valid words
word_frequencies = Counter(brown.words())  # Word frequencies from the Brown corpus


In [3]:
len(vocabulary)

235892

In [4]:
len(word_frequencies)

56057

In [5]:
# Build an N-gram model using the Brown corpus
def build_ngram_model(corpus, n):
    """Build N-gram counts from a corpus."""
    tokens = [word.lower() for word in corpus]
    return Counter(ngrams(tokens, n))

In [6]:
# Create a bigram model
bigram_model = build_ngram_model(brown.words(), 2)

In [7]:
bigram_model

Counter({('of', 'the'): 9717,
         (',', 'and'): 6302,
         ('.', 'the'): 6081,
         ('in', 'the'): 6025,
         (',', 'the'): 3787,
         ('.', '``'): 3515,
         ('to', 'the'): 3484,
         ("''", '.'): 3332,
         (';', ';'): 2784,
         ('.', 'he'): 2660,
         ('on', 'the'): 2466,
         ('?', '?'): 2346,
         ('and', 'the'): 2246,
         ("''", ','): 2032,
         (',', 'but'): 1856,
         ('for', 'the'): 1852,
         ('.', 'it'): 1836,
         ('to', 'be'): 1718,
         ('at', 'the'): 1655,
         ('.', 'in'): 1619,
         ('with', 'the'): 1533,
         (',', 'he'): 1495,
         ('of', 'a'): 1472,
         ('it', 'is'): 1470,
         ('in', 'a'): 1414,
         ('from', 'the'): 1411,
         ('that', 'the'): 1379,
         ('by', 'the'): 1347,
         (',', 'a'): 1301,
         ('it', 'was'): 1296,
         ('.', 'but'): 1226,
         ('.', 'i'): 1197,
         (',', '``'): 1093,
         ('he', 'was'): 1088,
         ('

In [14]:
len(bigram_model)

436003

In [8]:
# Preprocessing text
def preprocess_text(text):
    """Remove punctuation and lowercase text."""
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator).lower()

In [9]:
# Generate suggestions for misspelled words
def generate_suggestions(word, vocabulary, word_frequencies, max_candidates=5):
    """Generate word suggestions based on edit distance and frequency."""
    suggestions = [
        (vocab_word, distance(word, vocab_word), word_frequencies[vocab_word])
        for vocab_word in vocabulary
    ]
    # Sort by edit distance (asc), then by frequency (desc)
    suggestions = sorted(suggestions, key=lambda x: (x[1], -x[2]))
    return [suggestion[0] for suggestion in suggestions[:max_candidates]]

In [10]:
# Calculate N-gram probability
def ngram_probability(word1, word2, bigram_counts):
    """Calculate probability of word2 following word1 using bigram counts."""
    bigram_count = bigram_counts.get((word1, word2), 0)
    unigram_count = sum(1 for bigram in bigram_counts if bigram[0] == word1)
    return bigram_count / unigram_count if unigram_count > 0 else 0

In [11]:
# Correct a single word based on context
def correct_word_with_context(prev_word, word, vocabulary, word_frequencies, bigram_model):
    """Correct a word considering the context (previous word)."""
    if word in vocabulary:
        return word  # Word is correct
    suggestions = generate_suggestions(word, vocabulary, word_frequencies)
    # Rank suggestions by bigram probability
    if prev_word:
        best_suggestion = max(
            suggestions, 
            key=lambda suggestion: ngram_probability(prev_word, suggestion, bigram_model)
        )
    else:
        best_suggestion = suggestions[0]  # No context, fallback to first suggestion
    return best_suggestion

In [12]:
# Full spellcheck function
def spellcheck(text, vocabulary, word_frequencies, bigram_model):
    """Correct an entire text using N-grams for context-aware suggestions."""
    text = preprocess_text(text)
    words = text.split()
    corrected_words = []
    for i, word in enumerate(words):
        prev_word = corrected_words[i - 1] if i > 0 else None
        corrected_word = correct_word_with_context(prev_word, word, vocabulary, word_frequencies, bigram_model)
        corrected_words.append(corrected_word)
    return " ".join(corrected_words)

In [13]:
# Example usage
input_text = "cheking speling is complicatek"
corrected_text = spellcheck(input_text, vocabulary, word_frequencies, bigram_model)
print("Original Text: ", input_text)
print("Corrected Text:", corrected_text)

Original Text:  cheking speling is complicatek
Corrected Text: choking spelling is complicated
