In [27]:
import re
from functools import lru_cache

@lru_cache(maxsize=None)
def remove_tashkeel(text):
    """
    Removes Arabic diacritics (tashkeel) and other non-letter marks from the text.
    This function targets a comprehensive set of Unicode characters associated with
    Arabic vocalization and annotation marks, including:
    - Standard tashkeel (Fatha, Damma, Kasra, Sukun, Shadda, Tanween).
    - Quranic annotation signs and small Kufic marks (e.g., small Fatha, Damma, Kasra).
    - Superscript Alef (الألف الخنجرية).
    - Other less common vowel signs and diacritics.

    Unicode ranges covered:
    - U+0610 to U+061A (Arabic Small Kufic Marks, Quranic annotation signs)
    - U+064B to U+065F (Standard Tashkeel and additional diacritics/vowel signs)
    - U+0670 (Arabic Letter Superscript Alef)
    - U+06D6 to U+06DC (Quranic Annoation Signs)
    - U+06DF to U+06E8 (Quranic Annoation Signs)
    - U+06EA to U+06ED (Quranic Annoation Signs)
    """
    return re.sub(r'[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E8\u06EA-\u06ED]', '', text)

@lru_cache(maxsize=None)
def normalize_aleft(text):
    """
    Normalizes different forms of the Arabic letter Alef (أ, إ, آ, ٱ) to a standard Alef (ا).
    This helps in standardizing text for consistent processing.
    """
    return re.sub(r'[أإآٱ]', 'ا', text)

@lru_cache(maxsize=None)
def remove_tatweel(text):
    """
    Removes the Tatweel (elongation) character (ـ) from the text.
    Tatweel is used to stretch words visually but carries no linguistic meaning in most NLP tasks.
    """
    return re.sub(r'[\u0640]', '', text)

@lru_cache(maxsize=None)
def remove_punctuation_and_non_arabic(text):
    """
    Removes punctuation and any non-Arabic characters, keeping only Arabic letters and spaces.
    This is a crucial step to clean text from irrelevant symbols.
    """
    return re.sub(r'[^\u0600-\u06FF\s]', '', text)

def preprocess_arabic_text(text):
    """
    Applies a series of preprocessing steps to Arabic text in a specific order:
    1.  `remove_tashkeel`: Eliminates all diacritics and Quranic annotation marks.
    2.  `normalize_aleft`: Converts various Alef forms to a single standard form.
    3.  `remove_tatweel`: Removes the Tatweel character.
    4.  `remove_punctuation_and_non_arabic`: Filters out any remaining punctuation or non-Arabic symbols.
    5.  `re.sub(r'\s+', ' ', text).strip()`: Normalizes multiple spaces into a single space
        and removes leading/trailing whitespace, ensuring clean word separation.

    Args:
        text (str): The input Arabic text string to be preprocessed.

    Returns:
        str: The preprocessed and cleaned Arabic text string.
    """
    text = remove_tashkeel(text)
    text = normalize_aleft(text)
    text = remove_tatweel(text)
    text = remove_punctuation_and_non_arabic(text)
    text = re.sub(r'\s+', ' ', text).strip() # Normalize spaces and strip leading/trailing whitespace
    return text

# --- Example Usage and File Operations ---

# Define file paths for input and output data.
# Ensure 'data.txt' exists in the same directory as this script,
# or provide its full path.
new_text_data_path = 'data.txt'
processed_file_path = 'processed_data.txt'

# Read, preprocess, and write text from the specified file.
try:
    # Open the input file for reading with UTF-8 encoding.
    with open(new_text_data_path, 'r', encoding='utf-8') as file:
        lines = file.readlines() # Read all lines into a list

    # Process each line using the preprocess_arabic_text function.
    # The lru_cache on sub-functions will optimize repeated processing of identical substrings.
    processed_lines = [preprocess_arabic_text(line) for line in lines]

    # Open the output file for writing with UTF-8 encoding.
    # Each processed line is written followed by a newline character.
    with open(processed_file_path, 'w', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in processed_lines)

    print("✅ Arabic text preprocessing complete. Output saved to 'processed_data.txt'.")

except FileNotFoundError:
    # Handle the case where the input file does not exist.
    print(f"Error: The input file '{new_text_data_path}' was not found. Please ensure it exists.")
except Exception as e:
    # Catch any other potential errors during file operations or processing.
    print(f"An unexpected error occurred during file processing: {e}")



✅ Arabic text preprocessing complete. Output saved to 'processed_data.txt'.


In [28]:
#finetuning fasttext model on a custom dataset
import fasttext

new_model = fasttext.train_supervised( input=processed_file_path,)

In [29]:
model_path = 'fasttext_model.bin'
new_model.save_model(model_path)

In [30]:
load_model = fasttext.load_model(model_path)

In [31]:
load_model.get_words()

['</s>',
 'الرحمن',
 'الرحيم',
 'عليهم',
 'بسم',
 'الله',
 'الحمد',
 'لله',
 'رب',
 'العلمين',
 'ملك',
 'يوم',
 'الدين',
 'اياك',
 'نعبد',
 'واياك',
 'نستعين',
 'اهدناالصرط',
 'المستقيم',
 'صرط',
 'الذين',
 'انعمت',
 'غير',
 'المغضوب',
 'ولا',
 'الضالين']

In [32]:
word = 'الرحمن'
word_vector = load_model.get_word_vector(word)
print(f"Vector for '{word}': {word_vector}")

Vector for 'الرحمن': [ 0.00238093  0.00509113  0.00831976 -0.00927015  0.00661152  0.00807286
 -0.00648016  0.00133566 -0.00343636 -0.00210862 -0.00070896 -0.00633463
 -0.00485001  0.00221098 -0.00606992 -0.00717415  0.00338688  0.00603667
 -0.00688208 -0.00057144 -0.00488543  0.00987955  0.00397523  0.00694702
 -0.00861237 -0.00616428 -0.00983707 -0.00120694 -0.00220774 -0.00728686
 -0.00501538 -0.00077216  0.00396778 -0.00586287 -0.00635476 -0.00211214
 -0.00715075  0.00431303 -0.00053761 -0.0027656  -0.0099938   0.00944217
 -0.00514806  0.00308559  0.00880401  0.00192514  0.00807715 -0.00336937
 -0.00391913  0.00350796 -0.00100431 -0.00181935  0.0023661   0.00489011
 -0.00905311  0.00352073 -0.00963604  0.0006935   0.00876348 -0.00243911
 -0.00015379  0.00351502  0.00568376  0.00787363  0.00454237 -0.00356971
 -0.00709526 -0.00525634 -0.00576154 -0.00568993  0.00156353  0.00505632
  0.0058047  -0.00733124 -0.0023909  -0.00252901 -0.00390956  0.00289598
  0.00645355  0.00565575  0.00

In [46]:
fine_tune_model = fasttext.train_supervised(
    input='processed_data.txt',
    dim=300,
    pretrainedVectors='C:/Users/user/Desktop/hasob_win/Transfer learning/MyCode/005 Text processing/cc.ar.300.vec'
)
model_path = 'fasttext_model.bin'
fine_tune_model.save_model(model_path)

ValueError: C:/Users/user/Desktop/hasob_win/Transfer learning/MyCode/005 Text processing/cc.ar.300.vec cannot be opened for loading!

In [None]:


word = 'الرحمن'
word_vector = load_model.get_word_vector(word)
print(f"Vector for '{word}': {word_vector}")

Vector for 'الرحمن': [ 0.00238093  0.00509113  0.00831976 -0.00927015  0.00661152  0.00807286
 -0.00648016  0.00133566 -0.00343636 -0.00210862 -0.00070896 -0.00633463
 -0.00485001  0.00221098 -0.00606992 -0.00717415  0.00338688  0.00603667
 -0.00688208 -0.00057144 -0.00488543  0.00987955  0.00397523  0.00694702
 -0.00861237 -0.00616428 -0.00983707 -0.00120694 -0.00220774 -0.00728686
 -0.00501538 -0.00077216  0.00396778 -0.00586287 -0.00635476 -0.00211214
 -0.00715075  0.00431303 -0.00053761 -0.0027656  -0.0099938   0.00944217
 -0.00514806  0.00308559  0.00880401  0.00192514  0.00807715 -0.00336937
 -0.00391913  0.00350796 -0.00100431 -0.00181935  0.0023661   0.00489011
 -0.00905311  0.00352073 -0.00963604  0.0006935   0.00876348 -0.00243911
 -0.00015379  0.00351502  0.00568376  0.00787363  0.00454237 -0.00356971
 -0.00709526 -0.00525634 -0.00576154 -0.00568993  0.00156353  0.00505632
  0.0058047  -0.00733124 -0.0023909  -0.00252901 -0.00390956  0.00289598
  0.00645355  0.00565575  0.00