In [1]:
#!pip install gensim textblob

In [2]:
import spacy
import pandas as pd
from gensim import corpora, models
from textblob import TextBlob
from transformers import pipeline

In [3]:
homophones = {
    "accept": ["except"],
    "except": ["accept"],
    "affect": ["effect"],
    "effect": ["affect"],
    "allude": ["elude"],
    "elude": ["allude"],
    "altar": ["alter"],
    "alter": ["altar"],
    "arc": ["ark"],
    "ark": ["arc"],
    "bare": ["bear"],
    "bear": ["bare"],
    "beach": ["beech"],
    "beech": ["beach"],
    "berry": ["bury"],
    "bury": ["berry"],
    "billed": ["build"],
    "build": ["billed"],
    "blue": ["blew"],
    "blew": ["blue"],
    "board": ["bored"],
    "bored": ["board"],
    "brake": ["break"],
    "break": ["brake"],
    "buy": ["by", "bye"],
    "by": ["buy", "bye"],
    "bye": ["buy", "by"],
    "cell": ["sell"],
    "sell": ["cell"],
    "cent": ["scent", "sent"],
    "scent": ["cent", "sent"],
    "sent": ["cent", "scent"],
    "cite": ["site", "sight"],
    "site": ["cite", "sight"],
    "sight": ["cite", "site"],
    "complement": ["compliment"],
    "compliment": ["complement"],
    "coarse": ["course"],
    "course": ["coarse"],
    "dear": ["deer"],
    "deer": ["dear"],
    "die": ["dye"],
    "dye": ["die"],
    "fair": ["fare"],
    "fare": ["fair"],
    "flour": ["flower"],
    "flower": ["flour"],
    "for": ["four"],
    "four": ["for"],
    "hair": ["hare"],
    "hare": ["hair"],
    "heal": ["heel", "he'll"],
    "heel": ["heal", "he'll"],
    "he'll": ["heal", "heel"],
    "here": ["hear"],
    "hear": ["here"],
    "higher": ["hire"],
    "hire": ["higher"],
    "hole": ["whole"],
    "whole": ["hole"],
    "hour": ["our"],
    "our": ["hour"],
    "knight": ["night"],
    "night": ["knight"],
    "knot": ["not"],
    "not": ["knot"],
    "know": ["no"],
    "no": ["know"],
    "made": ["maid"],
    "maid": ["made"],
    "mail": ["male"],
    "male": ["mail"],
    "meat": ["meet", "mete"],
    "meet": ["meat", "mete"],
    "mete": ["meat", "meet"],
    "morning": ["mourning"],
    "mourning": ["morning"],
    "new": ["knew"],
    "knew": ["new"],
    "none": ["nun"],
    "nun": ["none"],
    "one": ["won"],
    "won": ["one"],
    "pair": ["pare", "pear"],
    "pare": ["pair", "pear"],
    "pear": ["pair", "pare"],
    "peace": ["piece"],
    "piece": ["peace"],
    "plain": ["plane"],
    "plane": ["plain"],
    "principal": ["principle"],
    "principle": ["principal"],
    "rain": ["reign", "rein"],
    "reign": ["rain", "rein"],
    "rein": ["rain", "reign"],
    "right": ["rite", "write"],
    "rite": ["right", "write"],
    "write": ["right", "rite"],
    "sea": ["see"],
    "see": ["sea"],
    "sew": ["so", "sow"],
    "so": ["sew", "sow"],
    "sow": ["sew", "so"],
    "stare": ["stair"],
    "stair": ["stare"],
    "stationary": ["stationery"],
    "stationery": ["stationary"],
    "steal": ["steel"],
    "steel": ["steal"],
    "tail": ["tale"],
    "tale": ["tail"],
    "there": ["their", "they're"],
    "their": ["there", "they're"],
    "they're": ["there", "their"],
    "threw": ["through"],
    "through": ["threw"],
    "to": ["too", "two"],
    "too": ["to", "two"],
    "two": ["to", "too"],
    "vain": ["vane", "vein"],
    "vane": ["vain", "vein"],
    "vein": ["vain", "vane"],
    "waste": ["waist"],
    "waist": ["waste"],
    "wait": ["weight"],
    "weight": ["wait"],
    "way": ["weigh", "whey"],
    "weigh": ["way", "whey"],
    "whey": ["way", "weigh"],
    "weak": ["week"],
    "week": ["weak"],
    "wear": ["where"],
    "where": ["wear"],
    "weather": ["whether"],
    "whether": ["weather"],
    "which": ["witch"],
    "witch": ["which"],
    "you're": ["your"],
    "your": ["you're"]
}


In [4]:
# additional_homophones = {
#     "canvas": ["canvass"],
#     "canvass": ["canvas"],
#     "cereal": ["serial"],
#     "serial": ["cereal"],
#     "chord": ["cord"],
#     "cord": ["chord"],
#     "council": ["counsel"],
#     "counsel": ["council"],
#     "currant": ["current"],
#     "current": ["currant"],
#     "dual": ["duel"],
#     "duel": ["dual"],
#     "gait": ["gate"],
#     "gate": ["gait"],
#     "grate": ["great"],
#     "great": ["grate"],
#     "guessed": ["guest"],
#     "guest": ["guessed"],
#     "holy": ["wholly"],
#     "wholly": ["holy"],
#     "idle": ["idol"],
#     "idol": ["idle"],
#     "leak": ["leek"],
#     "leek": ["leak"],
#     "lessen": ["lesson"],
#     "lesson": ["lessen"],
#     "miner": ["minor"],
#     "minor": ["miner"],
#     "naval": ["navel"],
#     "navel": ["naval"],
#     "pedal": ["peddle"],
#     "peddle": ["pedal"],
#     "peer": ["pier"],
#     "pier": ["peer"],
#     "profit": ["prophet"],
#     "prophet": ["profit"],
#     "root": ["route"],
#     "route": ["root"],
#     "sole": ["soul"],
#     "soul": ["sole"],
#     "stake": ["steak"],
#     "steak": ["stake"],
#     "suite": ["sweet"],
#     "sweet": ["suite"],
#     "vial": ["vile"],
#     "vile": ["vial"],
#     "wail": ["whale"],
#     "whale": ["wail"]
# }

# # Merge with existing homophones list
# homophones.update(additional_homophones)

In [5]:
# Load spaCy model for POS tagging and dependency parsing
nlp = spacy.load("en_core_web_sm")

# Initialize BERT fill-mask pipeline
fill_mask = pipeline('fill-mask', model='bert-base-cased')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Load spaCy model for POS tagging and dependency parsing
nlp = spacy.load("en_core_web_sm")

# Initialize BERT fill-mask pipeline
fill_mask = pipeline('fill-mask', model='bert-base-cased')

# Function to analyze sentence with spaCy (POS and dependency parsing)
def analyze_sentence(sentence):
    return nlp(sentence)

def identify_phrases_for_masking(doc, homophones):
    phrases_to_mask = []

    for token in doc:
        if token.text.lower() in homophones:
            # Start with the full subtree of the homophone
            start = token.left_edge.i
            end = token.right_edge.i

            # Expand or contract the phrase based on sentence structure
            # Handle compound sentences
            if token.dep_ in ['conj', 'ccomp']:
                start = min(start, token.head.left_edge.i)
                end = max(end, token.head.right_edge.i)

            # Include modifiers for nouns and verbs
            if token.pos_ in ['NOUN', 'PROPN', 'VERB']:
                # Include preceding modifiers like adjectives, determiners, auxiliaries, etc.
                while start > 0 and doc[start - 1].dep_ in ['amod', 'det', 'nummod', 'aux', 'advmod']:
                    start -= 1
                # Include objects or complements for verbs
                if token.pos_ == 'VERB' and end < len(doc) - 1 and doc[end + 1].dep_ in ['dobj', 'attr', 'prep']:
                    end = doc[end + 1].right_edge.i

            # Construct the phrase
            phrase = doc[start:end + 1].text
            phrases_to_mask.append((phrase, token.i))

    return phrases_to_mask

# Function to create multiple masked versions of the sentence
def create_masked_versions(doc, phrases_to_mask):
    masked_versions = []

    for phrase, phrase_index in phrases_to_mask:
        # Decide whether to mask the entire phrase or each word in the phrase
        # This decision can be based on the length of the phrase or other criteria
        if len(phrase.split()) > 2:  # Example condition: if phrase is longer than 2 words
            # Mask the entire phrase with a single [MASK] token
            masked_sentence = doc.text.replace(phrase, '[MASK]', 1)
            masked_versions.append((masked_sentence, phrase_index))
        else:
            # Mask each word in the phrase with a [MASK] token
            phrase_words = phrase.split()
            masked_sentence = [token.text_with_ws for token in doc]
            for i in range(len(masked_sentence)):
                if phrase_index <= i < phrase_index + len(phrase_words):
                    masked_sentence[i] = '[MASK]' + (' ' if masked_sentence[i].endswith(' ') else '')
            masked_versions.append((''.join(masked_sentence), phrase_index))

    return masked_versions

def custom_homophone_replacement(sentence, word, suggestion, index):
    # Example: Custom logic for "four" and "for"
    if word == "four" and suggestion != "for":
        # Additional checks can be added here to ensure contextually appropriate replacement
        return sentence.replace(word, "for", 1)
    return sentence.replace(word, suggestion, 1)

def sequential_masking_for_homophones(doc, phrases_to_mask):
    corrected_sentence = doc.text
    for phrase, index in sorted(phrases_to_mask, key=lambda x: x[1]):
        # Mask the current phrase
        masked_sentence = corrected_sentence.replace(phrase, '[MASK]', 1)
        
        # Get BERT's prediction for the masked sentence
        prediction = fill_mask(masked_sentence)[0]['token_str']
        
        # Replace [MASK] with the prediction in the corrected sentence
        corrected_sentence = corrected_sentence.replace('[MASK]', prediction, 1)

    return corrected_sentence

def analyze_context(sentence, surrounding_text):
    # Create a combined document of the surrounding text and the sentence
    combined_doc = nlp(surrounding_text + ' ' + sentence)

    # NER and Noun Chunks Extraction
    entities = [ent.text for ent in combined_doc.ents]
    noun_chunks = [chunk.text for chunk in combined_doc.noun_chunks]

    # Basic Context Analysis
    subjects = [token.text for token in combined_doc if token.dep_ in ['nsubj', 'nsubjpass']]
    themes = [chunk.lower() for chunk in noun_chunks if chunk.lower() not in subjects]

    # Advanced Topic Modeling (LDA) - # Tokenize the text
    tokenized_text = [token.text.lower() for token in combined_doc if not token.is_stop and not token.is_punct]
    
    # Create a dictionary representation of the documents
    dictionary = corpora.Dictionary([tokenized_text])
    
    # Convert dictionary to a bag of words corpus
    corpus = [dictionary.doc2bow(tokenized_text)]
    
    # Apply LDA
    lda = models.LdaModel(corpus, num_topics=1, id2word=dictionary, passes=10)
    lda_topics = lda.print_topics(num_words=3)

    # Sentiment Analysis
    sentiment = TextBlob(combined_doc.text).sentiment
    
    # Adding Part-of-Speech (POS) tagging to context analysis
    pos_tags = {token.text.lower(): token.pos_ for token in nlp(sentence)}

    # Combine All Context Elements
    combined_context = {
        "entities": entities,
        "subjects": subjects,
        "themes": themes,
        "lda_topics": lda_topics,
        "sentiment": sentiment,
        "pos_tags": pos_tags
    }

    return combined_context


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
surrounding_text = "The project was challenging but rewarding. The team worked hard."
sentence = "The new plan was to innovate and improve efficiency."
context = analyze_context(sentence, surrounding_text)
print("Context:", context)

Context: {'entities': [], 'subjects': ['project', 'team', 'plan'], 'themes': ['the project', 'the team', 'the new plan', 'efficiency'], 'lda_topics': [(0, '0.091*"team" + 0.091*"project" + 0.091*"worked"')], 'sentiment': Sentiment(polarity=0.2111742424242424, subjectivity=0.7490530303030303), 'pos_tags': {'the': 'DET', 'new': 'ADJ', 'plan': 'NOUN', 'was': 'AUX', 'to': 'PART', 'innovate': 'VERB', 'and': 'CCONJ', 'improve': 'VERB', 'efficiency': 'NOUN', '.': 'PUNCT'}}


In [8]:
def correct_homophones_advanced(sentence, surrounding_text=None):
    doc = analyze_sentence(sentence)
    
    # Identify phrases for masking
    phrases_to_mask = identify_phrases_for_masking(doc, homophones)
    
    # Create masked versions of the sentence
    masked_versions = create_masked_versions(doc, phrases_to_mask)

    table_data = []
    
    # Sequentially mask and process each homophone
    corrected_sentence = sequential_masking_for_homophones(doc, phrases_to_mask)
    # corrected_sentence = sentence

    for masked_sentence, index in masked_versions:
        # Use fill-mask pipeline and decide on replacements
        suggestion = fill_mask(masked_sentence)[0]['token_str']

        # Replace homophone in the original sentence if suggestion is different
        if suggestion.lower() != doc[index].text.lower():
            corrected_words = [t.text for t in doc]
            corrected_words[index] = suggestion
            new_corrected_sentence = ' '.join(corrected_words)
            if new_corrected_sentence != corrected_sentence:
                corrected_sentence = new_corrected_sentence
                table_data.append([sentence, doc[index].text, index, corrected_sentence])

    # Context analysis with surrounding text (if provided)
    if surrounding_text:
        context_elements = analyze_context(sentence, surrounding_text)
        # Extract elements from context
        entities = context_elements.get("entities", [])
        subjects = context_elements.get("subjects", [])
        themes = context_elements.get("themes", [])
        lda_topics = context_elements.get("lda_topics", [])
        sentiment = context_elements.get("sentiment", None)
        pos_tags = context_elements.get("pos_tags", {})

        # Context-sensitive decision making
        for masked_sentence, index in masked_versions:
            suggestion = fill_mask(masked_sentence)[0]['token_str']
            original_word = doc[index].text.lower()

            if suggestion.lower() != original_word:
                contextually_relevant = False

                # Check if the suggestion matches a subject or theme in the context
                if suggestion.lower() in subjects or suggestion.lower() in themes:
                    contextually_relevant = True
                
                # Additional check: Use POS tagging to ensure grammatical consistency
                if pos_tags.get(original_word) == pos_tags.get(suggestion.lower()):
                    contextually_relevant = True

                # Replace homophone only if it's contextually relevant
                if contextually_relevant:
                    corrected_words = [t.text for t in doc]
                    corrected_words[index] = suggestion
                    new_corrected_sentence = ' '.join(corrected_words)

                    if new_corrected_sentence != corrected_sentence:
                        corrected_sentence = new_corrected_sentence
                        table_data.append([sentence, original_word, index, corrected_sentence])


    # Create a DataFrame for the table
    table = pd.DataFrame(table_data, columns=['Original Sentence', 'Homophone', 'Position', 'Corrected Sentence'])

    return corrected_sentence, table

# Example usage
surrounding_text = "The team was discussing their upcoming project. They were excited about the new opportunities."
sentence = "The new plan was to knew the route."
corrected_sentence, corrections = correct_homophones_advanced(sentence, surrounding_text)
print("Corrected Sentence:", corrected_sentence)
print("Corrections:", corrections)

Corrected Sentence: The new plan was to perfect the route .
Corrections:                      Original Sentence Homophone  Position  \
0  The new plan was to knew the route.       new         1   
1  The new plan was to knew the route.        to         4   
2  The new plan was to knew the route.      knew         5   

                          Corrected Sentence  
0  The original plan was to knew the route .  
1       The new plan was he knew the route .  
2    The new plan was to perfect the route .  


In [9]:
surrounding_text = "She was always meticulous about her writing. Accuracy mattered to her."
sentence = "She wrote there report with great care."
corrected_sentence, table = correct_homophones_advanced(sentence, surrounding_text)
print("Corrected Sentence:", corrected_sentence)
print("Table:\n", table)

Corrected Sentence: She wrote the report with great care .
Table:
                          Original Sentence Homophone  Position  \
0  She wrote there report with great care.     there         2   

                       Corrected Sentence  
0  She wrote the report with great care .  


In [10]:
surrounding_text = "The baking competition is next week. Everyone is practicing their recipes."
sentence = "Flour power is essential four the baker, but he needs to knead the dough right."
corrected_sentence, table = correct_homophones_advanced(sentence, surrounding_text)
print("Corrected Sentence:", corrected_sentence)
print("Table:\n", table)

Corrected Sentence: Flour power is essential for the baker , but he needs to knead the dough out .
Table:
                                    Original Sentence Homophone  Position  \
0  Flour power is essential four the baker, but h...     Flour         0   
1  Flour power is essential four the baker, but h...      four         4   
2  Flour power is essential four the baker, but h...     right        15   

                                  Corrected Sentence  
0  This power is essential four the baker , but h...  
1  Flour power is essential to the baker , but he...  
2  Flour power is essential four the baker , but ...  


In [11]:
# # Function to correct homophones and generate output table
# def correct_homophones_and_generate_table(sentence, homophones):
#     words = sentence.split()
#     table_data = []
#     corrected_words = words.copy()

#     for i, word in enumerate(words):
#         lower_word = word.lower()
#         if lower_word in homophones:
#             print(f"Detected homophone: {word} at position {i}")
#             masked_sentence = ' '.join(words[:i] + ['[MASK]'] + words[i+1:])
#             print(f"Masked Sentence: {masked_sentence}")
#             suggestion = fill_mask(masked_sentence)[0]['token_str']
#             print(f"BERT's suggestion: {suggestion}")

#             corrected_words[i] = suggestion
#             table_data.append([sentence, word, i, ' '.join(corrected_words)])

#     table = pd.DataFrame(table_data, columns=['Original Sentence', 'Homophone', 'Position', 'Corrected Sentence'])
#     return table, ' '.join(corrected_words)

# # Example usage
# sentence = "I new the knew phone was expensive"
# table, corrected_sentence = correct_homophones_and_generate_table(sentence, homophones)

# print("Corrected Sentence:", corrected_sentence)
# print("\nTable:")
# print(table)