In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import  packeges

In [16]:
import random

import re


import torch
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForMaskedLM, AdamW
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from torch.nn.functional import softmax
import matplotlib.pyplot as plt

## Read File

In [3]:
# Read the Arabic file
with open("/content/drive/My Drive/ara-eg_newscrawl-OSIAN_2018_10K-sentences.txt", "r", encoding="utf-8") as file:
    sentences = file.readlines()

# modification function
##### to simulate types of spelling mistakes

In [None]:
def uncorrect_word(word, word_list):
    modification_type = random.choice(["change_letter", "replace_word", "swap_letters", "add_letter"])

    if len(word) <= 1:
        return word


    if modification_type == "change_letter":
        position = random.randint(0, len(word) - 1)
        new_letter = random.choice([letter for letter in "ابتثجحخدذرزسشصضطظعغفقكلمنهوي" if letter != word[position]])
        modified_word = word[:position] + new_letter + word[position + 1:]

    elif modification_type == "replace_word":
        modified_word = random.choice(word_list)
        while modified_word == word:
            modified_word = random.choice(word_list)

    elif modification_type == "swap_letters":

        position1 = random.randint(0, len(word) - 1)
        position2 = random.randint(0, len(word) - 1)
        word_list = list(word)
        word_list[position1], word_list[position2] = word_list[position2], word_list[position1]
        modified_word = ''.join(word_list)

    elif modification_type == "add_letter":
        position = random.randint(0, len(word))
        new_letter = random.choice([letter for letter in "ابتثجحخدذرزسشصضطظعغفقكلمنهوي"])
        modified_word = word[:position] + new_letter + word[position:]

    return modified_word

In [None]:
# create spelling mistakes file (labels file)

for index, sentence in enumerate(sentences):
    words = sentence.strip().split()
    word_index = random.randint(0, len(words) - 1)
    word_to_uncorrect = words[word_index]
    uncorrected_word = uncorrect_word(word_to_uncorrect, words)
    words[word_index] = uncorrected_word
    modified_sentence = ' '.join(words)

     # Write the modified sentence back to the file
    with open("ara-eg_newscrawl-OSIAN_2018_10K/ara-eg_newscrawl-OSIAN_2018_10K-labels.txt", "a", encoding="utf-8") as modified_file:
        modified_file.write(modified_sentence.strip() + "\n")

# Preprocessing data

### Remove noisy data

In [4]:
def clean_sentence(sentence):
    # Remove leading digits and any whitespace
    cleaned_sentence = re.sub(r'^\d+\s*', '', sentence)
    # Remove any leading or trailing whitespace
    cleaned_sentence = cleaned_sentence.strip()
    return cleaned_sentence


#### Read Files and store them in list

In [5]:
# Read the file and store sentences in a list
y = []
with open("/content/drive/My Drive/ara-eg_newscrawl-OSIAN_2018_10K-sentences.txt", "r", encoding="utf-8") as file:
    for line in file:
        cleaned_line = clean_sentence(line)
        if cleaned_line:
            y.append(cleaned_line)

In [6]:

x = []
with open("/content/drive/My Drive/ara-eg_newscrawl-OSIAN_2018_10K-labels.txt", "r", encoding="utf-8") as file:
    for line in file:
        cleaned_line = clean_sentence(line)
        if cleaned_line:
            x.append(cleaned_line)

##### remove noisy data

In [7]:
def clean_sentence(sentence):
    # Remove English words using regular expression
    cleaned_sentence = re.sub(r'\b[a-zA-Z]+\b', '', sentence)
    # Remove numbers and signs using regular expression
    cleaned_sentence = re.sub(r'\d+|\W+', ' ', cleaned_sentence)
    # Remove punctuation marks
    cleaned_sentence = re.sub(r'[^\w\s]', '', cleaned_sentence)
    # Normalize diacritics (fatha, kasra, damma)
    cleaned_sentence = cleaned_sentence.replace('ً', '').replace('ٌ', '').replace('ٍ', '').replace('َ', '').replace('ُ', '').replace('ِ', '')
    # Remove special characters
    cleaned_sentence= re.sub(r'[^\u0600-\u06FF\s\d]', '', cleaned_sentence)
    # Remove numbers
    cleaned_sentence= re.sub(r'\d+', '', cleaned_sentence)
    # Normalize whitespace
    cleaned_sentence= re.sub(r'\s+', ' ', cleaned_sentence.strip())
    # Remove extra whitespace
    cleaned_sentence = ' '.join(cleaned_sentence.split())
    return cleaned_sentence


x = [clean_sentence(sentence) for sentence in x]

y = [clean_sentence(sentence) for sentence in y]

In [8]:
# show some of data
print(x[:5])
print(y[:5])

['مليارات و مليون درهم خلال تكل الفترة', 'معلومات خاطئة نعتقد للأسنان جيدة للأسنان معلومات خاطئة نعتقد أنها جيدة للأسنان', 'مسلسل يونس ولد بطولة عمرو سعد وسيعرض على قناة مصر دراما المستقبل السومرية', 'الدكتورة منى بنت عبدالله بن سعيد آل مشيط', 'صودا الخبز هو معجون طبيعي للأسنان حثي ينصح بخلط ربع ملعقة صغيرة من صودا الخبز مع الماء وغسل الأسنان ببه']
['مليارات و مليون درهم خلال تلك الفترة', 'معلومات خاطئة نعتقد أنها جيدة للأسنان معلومات خاطئة نعتقد أنها جيدة للأسنان', 'مسلسل يونس ولد فضة بطولة عمرو سعد وسيعرض على قناة مصر دراما المستقبل السومرية', 'الدكتورة منى بنت عبدالله بن سعيد آل مشيط', 'صودا الخبز هو معجون طبيعي للأسنان حيث ينصح بخلط ربع ملعقة صغيرة من صودا الخبز مع الماء وغسل الأسنان ببه']


### Data Preparation and Model Training

In [None]:

# Load the pre-trained AraBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabert")
model = AutoModelForMaskedLM.from_pretrained("aubmindlab/bert-base-arabert")

In [10]:
max_length = 32
# Tokenize the sentences
tokenized_x = tokenizer(x, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
tokenized_y = tokenizer(y, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

In [11]:
# Convert tokenized sentences to PyTorch tensors
input_ids = tokenized_x.input_ids
labels = tokenized_y.input_ids

# Create a PyTorch Dataset
dataset = TensorDataset(input_ids, labels)

In [12]:
# Define training parameters
learning_rate = 5e-5
num_epochs = 5
batch_size = 8
num_folds = 5

In [13]:
# Initialize KFold
kf = KFold(n_splits=num_folds, shuffle=True)
train_losses = []
val_losses = []
test_losses = []
train_accuracies = []
val_accuracies = []
test_accuracies = []

In [14]:
# Cross-validation loop
for fold, (train_index, test_index) in enumerate(kf.split(dataset)):
    print(f"Fold {fold+1}/{num_folds}")

    # Split dataset into train and test sets for this fold
    train_dataset = torch.utils.data.Subset(dataset, train_index)
    test_dataset = torch.utils.data.Subset(dataset, test_index)

    # Create DataLoaders for train and test sets
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Set device to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Define optimizer and loss function
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Train the model for this fold
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_preds = []
        train_labels = []
        for batch in train_dataloader:
            # Move batch to device
            batch = tuple(t.to(device) for t in batch)
            input_ids, labels = batch

            # Forward pass
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            # Calculate predictions and true labels
            preds = logits.argmax(dim=-1)
            train_preds.extend(preds.cpu().numpy())
            train_labels.extend(labels.cpu().numpy())

        # Convert the train_labels and train_preds lists to NumPy arrays
        train_labels = np.array(train_labels)
        train_preds = np.array(train_preds)

        # Calculate average training loss for the epoch
        train_avg_loss = train_loss / len(train_dataloader)

        # Calculate the accuracy score
        train_accuracy = accuracy_score(train_labels.ravel(), train_preds.ravel())
        print(f"Fold {fold+1}, Epoch {epoch+1}, Train Loss: {train_avg_loss}, Train Accuracy: {train_accuracy}")

    # Test for this fold
    model.eval()
    test_loss = 0
    test_preds = []
    test_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, labels = batch

            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            test_loss += loss.item()

            # Calculate predictions and true labels
            preds = logits.argmax(dim=-1)
            test_preds.extend(preds.cpu().numpy())
            test_labels.extend(labels.cpu().numpy())

    # Convert the test_labels and test_preds lists to NumPy arrays
    test_labels = np.array(test_labels)
    test_preds = np.array(test_preds)

    # Calculate average test loss for the fold
    test_avg_loss = test_loss / len(test_dataloader)

    # Calculate the accuracy score
    test_accuracy = accuracy_score(test_labels.ravel(), test_preds.ravel())
    print(f"Fold {fold+1}, Test Loss: {test_avg_loss}, Test Accuracy: {test_accuracy}")

    # Append metrics for this fold to the lists
    test_losses.append(test_avg_loss)
    test_accuracies.append(test_accuracy)

# Print average test loss and accuracy across all folds
print(f"Average Test Loss: {np.mean(test_losses)}, Average Test Accuracy: {np.mean(test_accuracies)}")


Fold 1/5




Fold 1, Epoch 1, Train Loss: 1.5150324387550353, Train Accuracy: 0.728828125
Fold 1, Epoch 2, Train Loss: 1.0375603317320348, Train Accuracy: 0.77244140625
Fold 1, Epoch 3, Train Loss: 0.7404165369048714, Train Accuracy: 0.82978515625
Fold 1, Epoch 4, Train Loss: 0.5283033225052058, Train Accuracy: 0.87189453125
Fold 1, Epoch 5, Train Loss: 0.39420810176059606, Train Accuracy: 0.89934375
Fold 1, Test Loss: 1.441913595199585, Test Accuracy: 0.79703125
Fold 2/5




Fold 2, Epoch 1, Train Loss: 0.5768548252135515, Train Accuracy: 0.8749296875
Fold 2, Epoch 2, Train Loss: 0.3409107153499499, Train Accuracy: 0.9164765625
Fold 2, Epoch 3, Train Loss: 0.25301454787235705, Train Accuracy: 0.93281640625
Fold 2, Epoch 4, Train Loss: 0.20617747255042196, Train Accuracy: 0.9429921875
Fold 2, Epoch 5, Train Loss: 0.18161973569821566, Train Accuracy: 0.94958984375
Fold 2, Test Loss: 0.355958178024739, Test Accuracy: 0.919796875
Fold 3/5




Fold 3, Epoch 1, Train Loss: 0.2243199434815906, Train Accuracy: 0.9429140625
Fold 3, Epoch 2, Train Loss: 0.15198227676935494, Train Accuracy: 0.95836328125
Fold 3, Epoch 3, Train Loss: 0.11981610896391794, Train Accuracy: 0.966875
Fold 3, Epoch 4, Train Loss: 0.08938672306574881, Train Accuracy: 0.9748828125
Fold 3, Epoch 5, Train Loss: 0.08262216855539009, Train Accuracy: 0.97708203125
Fold 3, Test Loss: 0.14384007084788755, Test Accuracy: 0.962671875
Fold 4/5




Fold 4, Epoch 1, Train Loss: 0.11850827761809342, Train Accuracy: 0.96832421875
Fold 4, Epoch 2, Train Loss: 0.08248254560399801, Train Accuracy: 0.9775703125
Fold 4, Epoch 3, Train Loss: 0.05369452366384212, Train Accuracy: 0.98499609375
Fold 4, Epoch 4, Train Loss: 0.06879480622487609, Train Accuracy: 0.98235546875
Fold 4, Epoch 5, Train Loss: 0.053868913089332636, Train Accuracy: 0.98588671875
Fold 4, Test Loss: 0.13480126849142834, Test Accuracy: 0.96490625
Fold 5/5




Fold 5, Epoch 1, Train Loss: 0.07498310002079234, Train Accuracy: 0.9806328125
Fold 5, Epoch 2, Train Loss: 0.06652058501041029, Train Accuracy: 0.98307421875
Fold 5, Epoch 3, Train Loss: 0.05092586832679808, Train Accuracy: 0.98691796875
Fold 5, Epoch 4, Train Loss: 0.03917282380303368, Train Accuracy: 0.98996875
Fold 5, Epoch 5, Train Loss: 0.037189616818795915, Train Accuracy: 0.99064453125
Fold 5, Test Loss: 0.037265214257058685, Test Accuracy: 0.99053125
Average Test Loss: 0.42275566536413967, Average Test Accuracy: 0.9269874999999999


#### correct sentences based on the predictions of a pre-trained language model

In [17]:
def auto_correct_sentence(model, tokenizer, sentence, device):

    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)

    # Move inputs to device
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

    # Forward pass through the model
    outputs = model(**inputs)
    logits = outputs.logits

    # Get probabilities using softmax
    probabilities = softmax(logits, dim=2)

    # Get the predicted labels
    _, predicted_labels = torch.max(probabilities, dim=2)

    # Decode predicted labels to tokens
    corrected_tokens = [tokenizer.decode(tokens, skip_special_tokens=True) for tokens in predicted_labels]

    # Join tokens to form corrected sentence
    corrected_sentence = " ".join(corrected_tokens)

    return corrected_sentence

# Example usage
def auto_correct_sentences(model, tokenizer, sentences, device):
    corrected_sentences = []
    for sentence in sentences:
        corrected_sentence = auto_correct_sentence(model, tokenizer, sentence, device)
        corrected_sentences.append(corrected_sentence)
    return corrected_sentences

# Example usage
sentences = ['مليارات و مليون درهم خلال تكل الفترة', 'معلومات خاطئة نعتقد للأسنان جيدة للأسنان معلومات خاطئة نعتقد أنها جيدة للأسنان', 'مسلسل يونس ولد بطولة عمرو سعد وسيعرض على قناة مصر دراما المستقبل السومرية', 'الدكتورة منى بنت عبدالله بن سعيد آل مشيط', 'صودا الخبز هو معجون طبيعي للأسنان حثي ينصح بخلط ربع ملعقة صغيرة من صودا الخبز مع الماء وغسل الأسنان بب']
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
corrected_sentences = auto_correct_sentences(model, tokenizer, sentences, device)
print("Original sentence:", sentences[:5])
print("Corrected sentence:", corrected_sentences[:5])


Original sentence: ['مليارات و مليون درهم خلال تكل الفترة', 'معلومات خاطئة نعتقد للأسنان جيدة للأسنان معلومات خاطئة نعتقد أنها جيدة للأسنان', 'مسلسل يونس ولد بطولة عمرو سعد وسيعرض على قناة مصر دراما المستقبل السومرية', 'الدكتورة منى بنت عبدالله بن سعيد آل مشيط', 'صودا الخبز هو معجون طبيعي للأسنان حثي ينصح بخلط ربع ملعقة صغيرة من صودا الخبز مع الماء وغسل الأسنان بب']
Corrected sentence: ['مليارات و مليون درهم خلال تلك الفترة', 'معلومات خاطئة نعتقد أنها جيدة للأسنان معلومات خاطئة نعتقد أنها جيدة للأسنانناتن', 'مسلسل يونس ولد فضة بطولة عمرو سعد وسيعرض على قناة مصر دراما السوم السوم', 'الدكتورة منى بنت عبدالله بن سعيد آل مشيط', 'صودا الخبز هو معجون طبيعي للأسنان حيث ينصح بخلط ربع ملعقة صغيرة من صودا الخبز مع الماء وحمر الأسنان بب ال']


In [None]:
# Cross-validation loop
for fold, (train_index, test_index) in enumerate(kf.split(dataset)):
    print(f"Fold {fold+1}/{num_folds}")

    # Split dataset into train and test sets for this fold
    train_dataset = torch.utils.data.Subset(dataset, train_index)
    test_dataset = torch.utils.data.Subset(dataset, test_index)

    # Create DataLoaders for train and test sets
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Set device to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Define optimizer and loss function
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Train the model for this fold
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_preds = []
        train_labels = []
        for batch in train_dataloader:
            # Move batch to device
            batch = tuple(t.to(device) for t in batch)
            input_ids, labels = batch

            # Forward pass
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

    # Evaluate the model on the test set for this fold
    model.eval()
    test_loss = 0
    test_preds = []
    test_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, labels = batch

            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss

            test_loss += loss.item()

            # Calculate predictions and true labels for accuracy calculation
            logits = outputs.logits
            preds = logits.argmax(dim=-1)
            test_preds.extend(preds.cpu().numpy())
            test_labels.extend(labels.cpu().numpy())

    # Convert the test_labels and test_preds lists to NumPy arrays
    test_labels = np.array(test_labels)
    test_preds = np.array(test_preds)

    # Calculate average test loss for the fold
    test_avg_loss = test_loss / len(test_dataloader)
    test_losses.append(test_avg_loss)

    # Calculate accuracy for the fold
    test_accuracy = accuracy_score(test_labels.ravel(), test_preds.ravel())
    test_accuracies.append(test_accuracy)

# Plotting
plt.figure(figsize=(10, 5))

plt.subplot(1, 2, 1)
plt.plot(range(1, num_folds + 1), test_losses, label='Test Loss')
plt.xlabel('Fold')
plt.ylabel('Loss')
plt.title('Test Loss Across Folds')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(range(1, num_folds + 1), test_accuracies, label='Test Accuracy')
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.title('Test Accuracy Across Folds')
plt.legend()

plt.tight_layout()
plt.show()