In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be opened with ordinary file paths.
# This import only exists inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

**Import the necessary libraries and download the NLTK tokenizer data**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
import nltk
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

# Download the NLTK tokenizer resources. This notebook calls word_tokenize
# and nltk.sent_tokenize, which presumably depend on the Punkt data.
# NOTE(review): both 'punkt' and 'punkt_tab' are requested, likely to cover
# older and newer NLTK releases - confirm against the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
file_path = '/content/drive/MyDrive/grammar10.txt'

# Each useful line of the corpus is "<integer label> <sentence>", separated by
# the first space. Parse every line into a [label, sentence] pair that is
# always two fields wide, so the DataFrame constructor below cannot fail on a
# column-count mismatch when lines carry a label but no sentence.
data = []
with open(file_path, 'r', encoding='utf-16') as file:
    for raw_line in file:  # stream the file instead of materialising readlines()
        record = raw_line.strip()
        if not record:
            # Blank lines (for example a trailing newline at end of file) would
            # otherwise yield an empty label and crash the int conversion.
            continue
        # partition() splits on the first space only and returns '' for the
        # sentence when the line holds nothing but a label.
        label, _, sentence = record.partition(" ")
        data.append([label, sentence])

data_frame = pd.DataFrame(data, columns=["category", "text"])
# Labels are stored as text in the file; the rest of the notebook compares
# them with integers (category == 1).
data_frame["category"] = data_frame["category"].astype(int)

# 'text' is always a string here (possibly empty), so no NaN filling is needed.
text_data = data_frame['text']
labels = data_frame['category']

In [None]:
# Hold out 20% of the labelled sentences for testing; the seed is fixed so the
# split is the same on every run.
X_train_data, X_test_data, y_train_labels, y_test_labels = train_test_split(
    text_data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Bag-of-words features built with NLTK's word tokenizer. token_pattern is set
# to None because a custom tokenizer is supplied.
vectorizer_instance = CountVectorizer(
    tokenizer=word_tokenize,
    token_pattern=None,
)

# The vocabulary is learned from the training split only; the test split is
# projected onto that same vocabulary.
X_train_vec = vectorizer_instance.fit_transform(X_train_data)
X_test_vec = vectorizer_instance.transform(X_test_data)

In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel support vector classifier on the bag-of-words training
# matrix; random_state is pinned to 42.
classifier_model = SVC(kernel='linear', random_state=42)
classifier_model.fit(X_train_vec, y_train_labels)

In [None]:
# Score the fitted classifier on the held-out split and report the fraction of
# sentences whose label was predicted correctly.
predictions = classifier_model.predict(X_test_vec)
model_accuracy = accuracy_score(
    y_true=y_test_labels,
    y_pred=predictions,
)
print(f"Accuracy: {model_accuracy:.2f}")

In [None]:
def _token_set(text):
    """Return the set of whitespace-separated tokens in ``text``."""
    return set(text.split())


def _jaccard_similarity(left, right):
    """Return the Jaccard overlap of two sets, or 0 when both are empty."""
    union = left | right
    return len(left & right) / len(union) if union else 0


def grammar_check(input_text, vectorizer, model, dataframe):
    """
    Check the grammar of a given input text and return the corrected text.

    The text is split into sentences with ``nltk.sent_tokenize``. Each sentence
    is vectorized and classified; a prediction of ``1`` means "correct" and the
    sentence is kept unchanged. Any other prediction replaces the sentence with
    the most similar sentence among the rows of ``dataframe`` whose
    ``category`` is ``1``, where similarity is the Jaccard overlap of
    whitespace-separated tokens. If no known-correct sentence shares a token
    with it, the original sentence is kept.

    Args:
        input_text: Text to check; may contain several sentences.
        vectorizer: Object with ``transform(list_of_str)`` producing model input.
        model: Object with ``predict(features)`` returning one label per row.
        dataframe: Frame with ``category`` and ``text`` columns. It is only
            read when at least one sentence is classified as incorrect.

    Returns:
        A ``(corrected_text, all_correct)`` tuple: the sentences joined with a
        single space, and ``True`` only if every sentence was classified as
        correct (also ``True`` for input that yields no sentences).
    """
    corrected_sentences = []
    all_correct = True
    # (text, token set) pairs for the known-correct sentences. Built lazily and
    # only once, instead of re-filtering the frame and re-splitting every
    # candidate for each incorrect sentence.
    correct_corpus = None

    for sentence in nltk.sent_tokenize(input_text):
        result = model.predict(vectorizer.transform([sentence]))[0]

        if result == 1:
            corrected_sentences.append(sentence)  # Keep the original sentence if correct
            continue

        all_correct = False
        if correct_corpus is None:
            correct_texts = dataframe[dataframe['category'] == 1]['text']
            correct_corpus = [(text, _token_set(text)) for text in correct_texts]

        # The input token set does not depend on the candidate, so compute it
        # once per sentence rather than once per comparison.
        input_set = _token_set(sentence)
        best_match = None
        highest_similarity = 0

        for correct_text, correct_set in correct_corpus:
            similarity_score = _jaccard_similarity(input_set, correct_set)
            # Strict '>' keeps the first candidate on ties and ignores
            # candidates with zero overlap.
            if similarity_score > highest_similarity:
                highest_similarity = similarity_score
                best_match = correct_text

        corrected_sentences.append(best_match if best_match else sentence)

    corrected_text = " ".join(corrected_sentences)
    return corrected_text, all_correct

In [None]:
# 5-fold cross-validated accuracy of the classifier, computed on the training
# split only; the mean over the folds is printed.
# NOTE(review): X_train_vec was vectorized once over the whole training split,
# so every fold shares the same vocabulary - confirm this is acceptable.
cross_val_scores = cross_val_score(classifier_model, X_train_vec, y_train_labels, cv=5, scoring='accuracy')
print(f"Cross-Validation Mean Accuracy: {cross_val_scores.mean():.2f}")

In [None]:
# Hyper-parameter grid for a random forest: 3 x 3 x 3 x 3 = 81 combinations,
# each evaluated with 3-fold cross-validation.
param_grid_values = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search_model = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_values,
    cv=3,
    n_jobs=-1,   # use every available core
    verbose=2,
)
grid_search_model.fit(X_train_vec, y_train_labels)

# Estimator refitted with the best-scoring parameter combination.
best_classifier_model = grid_search_model.best_estimator_

In [None]:
# Confusion matrix and per-class precision/recall/F1 on the held-out split.
# `predictions` comes from classifier_model (the SVC), so these numbers
# describe the SVC and not best_classifier_model from the grid search.
conf_matrix = confusion_matrix(y_test_labels, predictions)
print(conf_matrix)
print(classification_report(y_test_labels, predictions))

In [None]:
def test_new_sentence(input_sentence, vectorizer, model, dataframe):
    """
    Run grammar_check on ``input_sentence`` and print the verdict.

    Prints a confirmation when every sentence is classified as correct;
    otherwise prints a notice followed by the corrected text. Returns None.
    """
    fixed_text, is_clean = grammar_check(input_sentence, vectorizer, model, dataframe)

    if is_clean:
        print("The sentence is grammatically correct!")
        return

    print("The sentence has grammar issues. Corrected version:")
    print(fixed_text)

# Try the checker on one sample sentence, using the fitted vectorizer, the SVC
# model and the loaded corpus as the source of replacement sentences.
new_sentence = "මම බත් යමු"
test_new_sentence(new_sentence, vectorizer_instance, classifier_model, data_frame)


In [None]:
import ipywidgets as widgets
from IPython.display import display

# Multi-line text box in which the user types the text to check.
sentence_input_layout = widgets.Layout(width='80%', height='150px')
sentence_input = widgets.Textarea(
    description='Sentence:',
    placeholder='Enter a Sinhala sentence for grammar check...',
    value='',
    disabled=False,
    layout=sentence_input_layout,
)

# Output widget that receives the checker's messages.
output_area = widgets.Output()

def on_button_click(b):
    """
    Button callback: check the text in ``sentence_input`` and show the result.

    ``b`` is the clicked button passed in by ipywidgets; it is not used. The
    previous contents of ``output_area`` are cleared before new messages are
    printed into it.
    """
    typed_text = sentence_input.value

    # Nothing but whitespace: prompt the user and stop.
    if not typed_text.strip():
        with output_area:
            output_area.clear_output()
            print("Please enter a sentence to check.")
        return

    # Run the check before entering the output context, as the messages are
    # the only thing that should be routed into the widget.
    fixed_text, is_clean = grammar_check(typed_text, vectorizer_instance, classifier_model, data_frame)

    if is_clean:
        messages = ["The sentence is grammatically correct!"]
    else:
        messages = ["The sentence has grammar issues. Corrected version:", fixed_text]

    with output_area:
        output_area.clear_output()
        for message in messages:
            print(message)


# Button that triggers the grammar check.
check_button = widgets.Button(
    description="Check Grammar",
    disabled=False,
    button_style='success',
    tooltip="Click to check grammar"
)

# Register the callback, then render the text box, the button and the output
# area together in the cell output.
check_button.on_click(on_button_click)
display(sentence_input, check_button, output_area)
