In [17]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, BertTokenizer, T5Tokenizer, BertForMaskedLM, logging
import torch
from rouge_score import rouge_scorer
import numpy as np
from spellchecker import SpellChecker

logging.set_verbosity_error()

##### Downloaded dataset

Subham Sahu, Yogesh Kumar Vishwakarma, Jeevanlal kori, Jitendra Singh Thakur . Evaluating performance of different grammar checking tools, International Journal of Advanced Trends in Computer Science and Engineering, Vol. 9 No. 2 pp. 2227 – 2233, April 2020.

In [2]:
dataset = pd.read_excel("GTD.xlsx")
dataset.head()

Unnamed: 0,Sr. no.,Correct Sentence,Errneous Sentence,Error Type,Error Subtype,Error Description,Grammarly,Ginger,ProWrittingAid,LanguageTools,After thr Deadline
0,1.0,The child who has a rash was just diagnosed wi...,The child who has a rash.,Sentence Structure Error,Fragment error,missing verb,YES,NO,NO,NO,NO
1,2.0,"Since the drugs have many side effects, the pa...",Since the drugs have many side effects.,Sentence Structure Error,Fragment error,missing verb,NO,NO,NO,YES,NO
2,3.0,"After the doctor performed the operation, the ...",The doctor performed the operation the patient...,Sentence Structure Error,Run-on error,missing conjunction,NO,NO,NO,NO,NO
3,4.0,"Although the doctor performed the operation, t...","The doctor performed the operation, the patien...",Sentence Structure Error,Run-on error,wrong comma,NO,NO,YES,NO,NO
4,5.0,We have hundreds of pages of reading to do; it...,"We have hundreds of pages of reading to do, it...",Sentence Structure Error,Run-on error,wrong comma,NO,NO,YES,NO,NO


In [3]:
dataset_downloaded = dataset[dataset['Error Type'] == "Spelling Error"]
dataset_downloaded = dataset_downloaded[["Correct Sentence", "Errneous Sentence", "Error Type"]]
dataset_downloaded = dataset_downloaded.dropna()

In [4]:
dataset_downloaded.head()

Unnamed: 0,Correct Sentence,Errneous Sentence,Error Type
200,He often quarrelled with his friends.,He often quarelled with his friends.,Spelling Error
201,We haven't ever been there.,We haven't ever beeen there.,Spelling Error
202,It was impossible to know his father's name.,It was impossibble to know his father's name.,Spelling Error
203,The two of us work late hours.,The two of us work lete hours.,Spelling Error
204,It was drizzling.,It was drizling.,Spelling Error


## My own test dataset

In [5]:
dataset_misspelled = [
    ("I hav a dreem to chase.", "I have a dream to chase."),
    ("She is a grea friend.", "She is a great friend."),
    ("It was an amazng experience.", "It was an amazing experience."),
    ("The weather is quite lovly today.", "The weather is quite lovely today."),
    ("He is definitly coming to the party.", "He is definitely coming to the party."),
    ("We should go to the beech this weekend.", "We should go to the beach this weekend."),
    ("This is a beautful painting.", "This is a beautiful painting."),
    ("Can you beleive it's already October.", "Can you believe it's already October."),
    ("I need to fix the leaky fauce.", "I need to fix the leaky faucet."),
    ("The quick brown fox jumps over the lazzy dog.", "The quick brown fox jumps over the lazy dog."),
    ("She enjoys readng books in her free time.", "She enjoys reading books in her free time."),
    ("My favorite fruits are bananas and appless.", "My favorite fruits are bananas and apples."),
    ("I like to drink coffe every morning.", "I like to drink coffee every morning."),
    ("He is a talented writter.", "He is a talented writer."),
    ("The flowers are blooming in the gardn.", "The flowers are blooming in the garden."),
    ("I always forget my umbrella in the rainny season.", "I always forget my umbrella in the rainy season."),
    ("They went to the restraunt for dinner.", "They went to the restaurant for dinner."),
    ("Her birthday is on the 25th of Decmber.", "Her birthday is on the 25th of December."),
    ("This cake is delicius.", "This cake is delicious."),
    ("I will send you the infromation.", "I will send you the information."),
    ("He has a great sense of humr.", "He has a great sense of humor.")
]


dataset_mine = pd.DataFrame(dataset_misspelled, columns=["Errneous Sentence", "Correct Sentence"])

## Pyspellchecker library

PySpellChecker is a Python library that uses the Levenshtein Distance algorithm to identify and suggest the most suitable replacement for incorrectly spelled words in a sentence.

In [6]:
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

def pySpellChecker(dataset):
    spell = SpellChecker()
    sum = 0
    rouge1 = []
    data_len = dataset.shape[0]
    for row in dataset.iterrows():
        text_split = row[1]['Errneous Sentence'].split()
        result = []
        for word in text_split:
            # Checks if word is spelled correctly and replaces it by a correct word
            result += [word if not spell.unknown([word]) else (spell.correction(word) if spell.correction(word) else word)]

        res_len = len(result)
        if res_len > 1:
            result[0] = result[0].capitalize()
        result = ' '.join(result)
        if res_len > 1 and result[-1] != ".":
            result += "."

        if result == row[1]['Correct Sentence']:
            sum += 1

        # Calculates ROUGE-1 score
        score = scorer.score(row[1]['Correct Sentence'], result)
        rouge1.append(score['rouge1'])

    f1 = np.mean([s.fmeasure for s in rouge1])
    print("Rouge F1 score: ", str(round(f1, 2)*100) + " %")

    print("Sentence accuracy: " + str(round(sum*100/data_len, 2)) + " %")

#### Testing on my own data

In [7]:
pySpellChecker(dataset_mine)

Rouge F1 score:  96.0 %
Sentence accuracy: 76.19 %


#### Testing on downloaded dataset

In [8]:
pySpellChecker(dataset_downloaded)

Rouge F1 score:  96.0 %
Sentence accuracy: 53.0 %



### Pros:
- Does not generate nonsensical words.
- Always ensures correct spelling.

### Cons:
- Ignores the sentence’s context, relying solely on the closest word by Levenshtein distance.
- Works well for individual word corrections but may struggle with contextual nuances.
- Although multiple candidate words are often available, the best one is not always selected.


### Examples of wrong prediction:

- sentence to check: It was drizling.
    - prediction: drizling -> drilling
    - label: drizling -> drizzling
- sentence to check: He is a talented writter.
    - prediction: writter -> written
    - label: writter -> writer

These examples show that even though the spelling is corrected, the resulting sentences may not make sense due to incorrect word choices.



## LLM

I am using an LLM (T5) fine-tuned for a grammar correction task from Hugging Face. T5-base is an encoder-decoder model developed by Google, designed for text-to-text generation tasks.

In [9]:
checkpoint = "vennify/t5-base-grammar-correction"

tokenizer_T5 = T5Tokenizer.from_pretrained(checkpoint)
model_TP = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

def LLM_spell_checker(dataset):
    sum = 0
    data_len = dataset.shape[0]
    rouge1 = []
    for row in dataset.iterrows():
        # Tokenizes the sentence that is than sent to the model
        input_ids = tokenizer_T5(row[1]['Errneous Sentence'], return_tensors="pt").input_ids
        size = len(input_ids[0])

        # Generates correct sentence based on the input
        output = model_TP.generate(input_ids=input_ids, max_new_tokens=size*1.2)
        result = tokenizer_T5.decode(output[0], skip_special_tokens=True)
        
        if result == row[1]['Correct Sentence']:
            sum += 1

        score = scorer.score(row[1]['Correct Sentence'], result)
        rouge1.append(score['rouge1'])

    f1 = np.mean([s.fmeasure for s in rouge1])
    print("Rouge F1 score: ", str(round(f1, 2)*100) + " %")

    print("Sentence accuracy: " + str(round(sum*100/data_len, 2)) + " %")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


#### Testing on my own data

In [10]:
LLM_spell_checker(dataset_mine)

Rouge F1 score:  97.0 %
Sentence accuracy: 76.19 %


#### Testing on downloaded dataset

In [11]:
LLM_spell_checker(dataset_downloaded)

Rouge F1 score:  94.0 %
Sentence accuracy: 43.0 %


#### Pros:
- Evaluates the context of the sentence to select the most appropriate word.

#### Cons:
- Occasionally substitutes words that fit the sentence better but do not match the intended correction, which can be problematic when only spelling accuracy is desired.


#### Examples of wrong prediction:

- sentence to check: She is a grea friend.
    - prediction: grea -> good
    - label: grea -> great
- sentence to check: The quick brown fox jumps over the lazzy dog.
    - prediction: lazzy -> lazzy
    - label: lazzy -> lazy

These examples highlight two key issues:
- The model sometimes fails to correct spelling mistakes.
- In other cases, it replaces misspelled words with synonyms, maintaining the meaning but diverging from the desired correction.

## Combination of LLM and pyspellchecker

I also experimented with combining both approaches to achieve better results. Specifically, I used PySpellChecker to identify misspelled words and generate candidate corrections. I then replaced the misspelled word with a [MASK] token and used a BERT language model to predict the best-fitting word from the set of candidates. The word with the highest score was selected as the final correction.

In [18]:

tokenizer_BERT = BertTokenizer.from_pretrained('bert-base-uncased')
model_BERT = BertForMaskedLM.from_pretrained('bert-base-uncased')

def Combination_spell_checker(dataset):
    sum = 0
    data_len = dataset.shape[0]
    rouge1 = []
    for row in dataset.iterrows():
        sentence = row[1]['Errneous Sentence']
        if len(sentence) > 1 and sentence[-1] in [".", "?", "!"]:
            sentence = sentence[:-1]
        text_split = sentence.split()
        
        result = []
        spell = SpellChecker()

        # For each word in the sentance checks the spelling using PySpellChecker and if it wrong uses BERT model to predict the correct replacement
        for index, word in enumerate(text_split):
            if not spell.unknown([word]):
                # Word is spelled correctly and can be appended to result
                result += [word]
            else:
                # Word is spelled incorrectly and similar words (Levensthein distance) are used in BERT model
                if spell.correction(word):
                    candidates = list(spell.candidates(word))
                    text_split_mask = list(text_split)

                    # Incorrectly spelled word is masked
                    text_split_mask[index] = "[MASK]"
                    sentence_mask = " ".join(text_split_mask)
                    input_ids = tokenizer_BERT(sentence_mask, return_tensors="pt").input_ids
                    mask_index = (input_ids == tokenizer_BERT.mask_token_id).nonzero(as_tuple=True)[1].item()
                
                    # Tokenization of candidates words
                    candidate_ids = [tokenizer_BERT(word, add_special_tokens=False).input_ids[0] for word in candidates]

                    with torch.no_grad():
                        outputs = model_BERT(input_ids)
                        logits = outputs.logits[0, mask_index]

                    # Only predictions of the candidate words are used
                    candidate_scores = {word: logits[token_id].item() for word, token_id in zip(candidates, candidate_ids)}

                    # The canditate word with highest score is appended to result
                    best_word = max(candidate_scores, key=candidate_scores.get) 
                    result += [best_word]
                else:
                    # PySpellChecker did not provide any replacements
                    result += [word]

        res_len = len(result)
        if res_len > 1:
            result[0] = result[0].capitalize()
        result = ' '.join(result)
        if res_len > 1 and result[-1] != ".":
            result += "."

        if result == row[1]['Correct Sentence']:
            sum += 1

        score = scorer.score(row[1]['Correct Sentence'], result)
        rouge1.append(score['rouge1'])

    f1 = np.mean([s.fmeasure for s in rouge1])
    print("Rouge F1 score: ", str(round(f1, 2)*100) + " %")

    print("Sentence accuracy: " + str(round(sum*100/data_len, 2)) + " %")

#### Testing on my own data

In [19]:
Combination_spell_checker(dataset_mine)

Rouge F1 score:  97.0 %
Sentence accuracy: 80.95 %


#### Testing on downloaded dataset

In [14]:
Combination_spell_checker(dataset_downloaded)

Rouge F1 score:  97.0 %
Sentence accuracy: 57.0 %


The result is marginally better than the previous two methods, but the improvement is minor.
#### Pros:
- The model understands context and modifies only misspelled words.
#### Cons:
- This model may have longer inference times.