In [4]:
import pandas as pd
import language_tool_python
from spellchecker import SpellChecker
from sklearn.metrics import precision_recall_fscore_support


In [5]:

def correct_spelling(text, spell):
    words = text.split()
    corrected_words = [spell.correction(word) if spell.correction(word) else word for word in words]
    return ' '.join(corrected_words)

def check_grammar(text, tool):
    matches = tool.check(text)
    return language_tool_python.utils.correct(text, matches)

def evaluate_predictions(y_true, y_pred):
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return precision, recall, f1

In [6]:
df = pd.read_csv('./training_true.csv', header= 0,
                        encoding= 'unicode_escape')
df.rename(columns={"Has_Correction":"label"},inplace=True)
train_df = df.sample(frac=0.5 , random_state=42)

In [7]:
train_df.head()

Unnamed: 0,Input,Output,label
70401,Four is presentation .,Fourth was presentation .,1
371361,I do n't often criticize but carry out .,I do n't often criticize but carry out .,0
93334,Thank you .,Thank you .,0
616375,"I 'm trying to learn English at the moment , I...",I 'm trying to learn English at the moment . I...,1
824780,I really appreciate your help .,I really appreciate your help .,0


In [9]:
spell = SpellChecker()
tool = language_tool_python.LanguageTool('en-US')

In [10]:
train_df['corrected_text'] = train_df['Input'].apply(lambda x: correct_spelling(x, spell))
train_df['final_text'] = train_df['corrected_text'].apply(lambda x: check_grammar(x, tool))

In [11]:
from sklearn.metrics import accuracy_score

In [12]:
train_df['predicted_label'] = train_df.apply(lambda row: 1 if row['final_text'] != row['Input'] else 0, axis=1)

In [13]:

precision, recall, f1 = evaluate_predictions(train_df['label'], train_df['predicted_label'])
accuracy = accuracy_score(train_df['label'], train_df['predicted_label'])
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy:", accuracy)
print(train_df[['Input', 'final_text', 'label', 'predicted_label']])

Precision: 0.563477431249436
Recall: 0.9533521655196484
F1 Score: 0.7083095298869995
Accuracy: 0.5878082434916037
                                                    Input  \
70401                              Four is presentation .   
371361           I do n't often criticize but carry out .   
93334                                         Thank you .   
616375  I 'm trying to learn English at the moment , I...   
824780                    I really appreciate your help .   
...                                                   ...   
588819  but he was like `` oh really . `` mannn that '...   
50153    I eat Tempura today , the Japanese famous food .   
918613                      But I want to talk fluently .   
578187                     It was really quickly for me !   
831315  So I rode a bicycle , I strongly pedaled with ...   

                                               final_text  label  \
70401                               Four is presentation.      1   
371361           