In [2]:
import os
import re
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, TimeDistributed
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dropout, LayerNormalization


# Normalisation for Arabic text
def normalize_arabic(text):
    text = re.sub(r'[ًٌٍَُِّْـ]', '', text)
    text = re.sub(r'[إأآا]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ئ', 'ي', text)
    text = re.sub(r'ة', 'ه', text)
    return text

# Sentence splitter
def split_into_sentences(text):
    return [sentence.strip() for sentence in re.split(r'[.!؟\n]', text) if sentence.strip()]


In [3]:
# Arabic keyboard proximity map
arabic_keyboard = {
    'ا': ['أ', 'إ', 'ء', 'ى'],
    'ب': ['ن', 'ت'],
    'ت': ['ب', 'ن'],
    'ث': ['ت', 'س'],
    'ج': ['ح', 'خ'],
    'ح': ['ج', 'خ'],
    'خ': ['ح', 'ج'],
    'د': ['ذ'],
    'ذ': ['د', 'ر'],
    'ر': ['ذ', 'ز'],
    'ز': ['ر', 'س'],
    'س': ['ش', 'ص'],
    'ش': ['س', 'ص'],
    'ص': ['س', 'ش', 'ض'],
    'ض': ['ص', 'ط'],
    'ط': ['ض', 'ظ'],
    'ع': ['غ'],
    'غ': ['ع'],
    'ف': ['ق'],
    'ق': ['ف'],
    'ك': ['ل'],
    'ل': ['ك'],
    'م': ['ن'],
    'ن': ['م', 'ب'],
    'ه': ['ة'],
    'ة': ['ه'],
    'ي': ['ى', 'ب'],
    'ى': ['ي']
}

def keyboard_substitute(char):
    if char in arabic_keyboard:
        return random.choice(arabic_keyboard[char])
    return char

def insert_typo(word):
    if len(word) < 2:
        return word
    # typo_type = random.choice(['delete', 'insert', 'substitute'])
    typo_type = 'substitute'
    i = random.randint(0, len(word) - 1)
    arabic_chars = 'ابتثجحخدذرزسشصضطظعغفقكلمنهوي'

    if typo_type == 'delete':
        return word[:i] + word[i+1:]
    elif typo_type == 'insert':
        return word[:i] + random.choice(arabic_chars) + word[i:]
    elif typo_type == 'substitute':
        return word[:i] + keyboard_substitute(word[i]) + word[i+1:]
    return word

def corrupt_sentence(sentence, error_probability=0.1):
    words = sentence.split()
    corrupted = [insert_typo(w) if random.random() < error_probability else w for w in words]
    return ' '.join(corrupted)


In [5]:
# Load your .tsv dataset 
df = pd.read_csv('unbalanced_reviews.tsv', sep='\t', header=None)

# Column 4 contains the Arabic review text
texts = df[4].astype(str).tolist()

# Normalize and split into sentences
all_sentences = []
for text in texts:
    norm_text = normalize_arabic(text)
    all_sentences.extend(split_into_sentences(norm_text))

# Filter sentences by length
clean_sentences = [s for s in all_sentences if 5 < len(s) < 100]
clean_sentences = list(set(clean_sentences))  # Remove duplicates
noisy_sentences = [corrupt_sentence(s) for s in clean_sentences]

print(f"Clean samples: {len(clean_sentences)}")


Clean samples: 2182769


In [6]:
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(clean_sentences + noisy_sentences)

vocab_size = len(tokenizer.word_index) + 1
max_len = max(len(s) for s in clean_sentences)

def encode(sentences):
    sequences = tokenizer.texts_to_sequences(sentences)
    return pad_sequences(sequences, maxlen=max_len, padding='post')

X = encode(noisy_sentences)
Y = encode(clean_sentences)
Y = Y[..., None]  # Make shape (batch_size, sequence_length, 1)


In [21]:
embedding_dim = 300
lstm_units = 256
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, input_shape=(max_len,)),
    Bidirectional(LSTM(lstm_units, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(128, return_sequences=True)),
    TimeDistributed(Dense(vocab_size, activation='softmax'))
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


  super().__init__(**kwargs)


In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.models import load_model

# reducing learning rate when improvement decreases
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6
)

# stops if validation accuracy doesn't increase for 5 consecutive epochs 
early_stop = EarlyStopping(
    patience=5,
    restore_best_weights=True,
    monitor='val_accuracy',
    mode='max'
)

# saves the best model, even if it isn't the last one
checkpoint = ModelCheckpoint(
    filepath='/kaggle/working/best_model.keras',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
    verbose=1
)

# trains the model. epochs can be changed accordingly 
model.fit(
    X_train, Y_train,
    epochs=1,
    batch_size=64,
    validation_data=(X_test, Y_test),
    callbacks=[checkpoint, early_stop, reduce_lr]
)

[1m27285/27285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.9887 - loss: 0.0510
Epoch 1: val_accuracy improved from -inf to 0.99765, saving model to /kaggle/working/best_model.keras
[1m27285/27285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1765s[0m 64ms/step - accuracy: 0.9887 - loss: 0.0510 - val_accuracy: 0.9976 - val_loss: 0.0102 - learning_rate: 0.0010


<keras.src.callbacks.history.History at 0x7838fb15e550>

In [8]:
model = load_model('best_model.keras')

def decode_sequence(pred):
    pred_ids = np.argmax(pred, axis=-1)
    index_word = {v: k for k, v in tokenizer.word_index.items()}
    return ''.join([index_word.get(i, '') for i in pred_ids])

def autocorrect(input_text):
    input_text = normalize_arabic(input_text)
    seq = encode([input_text])
    pred = model.predict(seq)
    return decode_sequence(pred[0])


In [29]:
# Example test
sentences_with_typos = [
    "أنا أجب المدرسة",     # أحب → أجب
    "هو يقأ كتابًا",       # يقرأ → يقأ
    "الحو جميل اليوم",     # الجو → الحو
    "ذهبت إلى السىق",      # السوق → السىق
    "هي تطبخ الطعان",      # الطعام → الطعان
    "القط يحلس هناك",      # يجلس → يحلس
    "الولد يكتب الواخب",   # الواجب → الواخب
    "أين حخيبتي؟",         # حقيبتي → حخيبتي
    "أنا متعب قببلاً",     # قليلاً → قببلاً
    "نحن نلعب في الجديقة", # الحديقة → الجديقة
    "السماء زرقاء صافبة",  # صافية → صافبة
    "هل أكلت القبور؟",     # الفطور → القبور
    "أريد شرب الهاء",      # الماء → الهاء
    "السيارة سريعة جذاً",  # جداً → جذاً
    "الوقت متأحر الآن",    # متأخر → متأحر
    "أحب كرة القدن",       # القدم → القدن
    "هو يعمل بحد",         # بجد → بحد
    "الطفل نائك",          # نائم → نائك
    "أين المغاتيح؟",       # المفاتيح → المغاتيح
    "الشارع مزذحم"         # مزدحم → مزذحم
]
for test_input in sentences_with_typos:
    print("Noisy input:    ", test_input)
    print("Autocorrected:  ", autocorrect(test_input))




Noisy input:     أنا أجب المدرسة
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Autocorrected:   انا احب المدرسه
Noisy input:     هو يقأ كتابًا
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Autocorrected:   هو يقا كتابا
Noisy input:     الحو جميل اليوم
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Autocorrected:   الحو جميل اليوم
Noisy input:     ذهبت إلى السىق
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Autocorrected:   ذهبت الي السيق
Noisy input:     هي تطبخ الطعان
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Autocorrected:   هي تطبخ الطعان
Noisy input:     القط يحلس هناك
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Autocorrected:   القط يحلس هناك
Noisy input:     الولد يكتب الواخب
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Autocorrected:   الولد يكتب الواخب
Noisy input:     أين حخيبتي؟
[1m1/1[0m 

In [14]:
from difflib import SequenceMatcher

def sentence_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

scores = [sentence_similarity(clean, autocorrect(noisy)) for clean, noisy in zip(clean_sentences[:100], noisy_sentences[:100])]
print(f"Avg sentence similarity: {np.mean(scores):.4f}")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26

In [33]:
def evaluate_sentence_level_accuracy(clean_list, noisy_list):
    correct_count = 0
    total = len(clean_list)
    for clean, noisy in zip(clean_list, noisy_list):
        prediction = autocorrect(noisy)
        if prediction == clean:
            correct_count +=1
        print(clean)
        print(noisy)
    print(f"Sentence-level accuracy: {correct_count / total:.4f}")

evaluate_sentence_level_accuracy(clean_sentences[500:700], noisy_sentences[500:700])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
معامله الاخر لك التحيزات الخفيه ووسايل اخفاء الضعف التي نتقنها
معامله الاخر لك التحيزات الخفيه ووسايل اخفاء الضعف التي نتقنها
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
- التقافز ما بين اوتار الزمن مربك، رغم انه بدا مبرر قرب النهايه
- التقافز ما تين اوتار الزمن مربك، رغم انة بدا مبرر قرب النهايه
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
كتيب لطيف لم تسلبني ادبيته كما عاده احلام
كتيت لطيف لم تسلبني ادبيته كما عاده احلام
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
مش قوي بالمره
مش قوي بالمره
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
فالكتاب عملي جدا و ذو اسلوب مباشر
فالكتاب عملي جدا و ذو اسلوب مباشر
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
توكل سينا ابراهيم
تولل سىنا ابراهيم
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
?- كثره الايحاءات الجنسيه 

In [17]:
def evaluate_autocorrect_model(clean_sentences, noisy_sentences, verbose=True):
    assert len(clean_sentences) == len(noisy_sentences), "Mismatched input lengths."
    
    sentence_correct = 0
    word_accuracies = []
    
    for clean, noisy in zip(clean_sentences, noisy_sentences):
        predicted = autocorrect(noisy)
        
        if predicted == clean:
            sentence_correct += 1
        
        # Word-level accuracy
        clean_words = clean.split()
        predicted_words = predicted.split()
        correct_words = sum(1 for cw, pw in zip(clean_words, predicted_words) if cw == pw)
        word_accuracy = correct_words / max(len(clean_words), 1)
        word_accuracies.append(word_accuracy)

        # if verbose and predicted != clean:
        print("🔸 Noisy:     ", noisy)
        print("🔁 Predicted:", predicted)
        print("✅ Target:   ", clean)
        if predicted == clean and noisy != clean: 
            print("Corrected a mistake!")
        if predicted == clean and noisy == clean: 
            print("Correct, but no mistake was found")
        print()

    total = len(clean_sentences)
    print(f"\n📏 Sentence-level accuracy: {sentence_correct}/{total} = {sentence_correct/total:.2%}")
    print(f"📊 Avg. word-level accuracy: {np.mean(word_accuracies):.2%}")


In [9]:
sentence1 = "أنا أجب المدرصة"
sentence2 = "أنا أشعر بالحوع"
sentence3 = "الحديقة حميلة جدًا"
sentence4 = "أدرس اللغة العزبية"
sentence5 = "أحب أتغلم البرمجة"
sentence6 = "السماء ززقاء صافية"
sentence7 = "مغالجة اللغات الطبيعية"
print(autocorrect(sentence7))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
معالجه اللغات الطبيعيه


In [10]:
import tkinter as tk
# handles button press
def correct_sentence():
    input_text = entry.get()
    corrected = autocorrect(input_text)
    result_label.config(text="🔧 التصحيح: " + corrected)

root = tk.Tk()
root.title("Arabic Sentence Autocorrector")
root.geometry("500x200")

# Label
label = tk.Label(root, text="أدخل الجملة العربية:", font=("Arial", 14))
label.pack(pady=10)

# Text entry
entry = tk.Entry(root, font=("Arial", 14), justify='right')
entry.pack(fill='x', padx=20)

# Button
button = tk.Button(root, text="تصحيح", font=("Arial", 12), command=correct_sentence)
button.pack(pady=10)

# Output label
result_label = tk.Label(root, text="", font=("Arial", 14), fg="green", wraplength=480, justify='right')
result_label.pack(pady=10)

# Run GUI loop
root.mainloop()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
