In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Arabic_Specific_Text_Processing

# Diacritization

**Diacritization** is the process of adding vowel marks (diacritics) to Arabic text to clarify pronunciation and meaning. In Arabic, words are often written without vowels, and diacritics help distinguish words with similar roots but different meanings. For example, "عَلَم" can mean "flag" or "knowledge" depending on the diacritics.

## Key Points:

1. **Arabic Diacritics**: 
   - Short vowels (Fatha, Kasra, Damma), 
   - Sukun (no vowel), 
   - Shadda (doubling consonants).
   
2. **Importance**: 
   - **Pronunciation**: Diacritics guide the correct pronunciation of words.
   - **Meaning**: They help disambiguate words with similar spelling but different meanings.

3. **Diacritization Process**: 
   - **Automatic Diacritization**: Using NLP and machine learning models to predict and add diacritics.
   - **Applications**: Used in text-to-speech systems, educational tools, and search engines.

4. **Challenges**: 
   - Ambiguity in pronunciation based on context.
   - Scarcity of annotated data for diverse Arabic dialects.
   
Diacritization is essential for accurate interpretation, especially in automated systems, and improves understanding and clarity in Arabic text.


In [16]:
import re  # Importing the regular expressions (re) module, which provides functions for working with regular expressions.

# Define the function to remove diacritics from Arabic text
def remove_diacritics(text):
    # Compile a regular expression pattern to match Arabic diacritics (from Unicode range U+064B to U+0652)
    arabic_diacritics = re.compile(r'[\u064B-\u0652]')
    
    # Use re.sub() to replace all matches (diacritics) in the input text with an empty string (i.e., remove them)
    return re.sub(arabic_diacritics, '', text)

# Sample Arabic text with diacritics
text = "فَصْلُ الشّتاءِ جَاءَ، وَهَبَّتِ الرِّيَاحُ البَارِدَةُ، فَحَجَزَتِ السُّحُبُ السَّمَاءَ عَنِ الشَّمْسِ، وَنَزَلَتِ الأمْطَارُ تَغْسِلُ الأَرْضَ وَتُعِيدُ الْحَيَاةَ لِلزُّرُوعِ."

# Call the remove_diacritics function to remove diacritics from the text
result = remove_diacritics(text)

# Print the result without diacritics
print(result)
  


فصل الشتاء جاء، وهبت الرياح الباردة، فحجزت السحب السماء عن الشمس، ونزلت الأمطار تغسل الأرض وتعيد الحياة للزروع.


# Dialect Handling in NLP 

**Dialect Handling** refers to addressing the variations in language forms, particularly in languages with significant dialectal diversity, such as Arabic. In Arabic, multiple dialects exist across regions, each with distinct vocabulary, grammar, and pronunciation, which complicates NLP tasks like translation and sentiment analysis.

## Key Points:
- **Arabic Dialects**: Variations in vocabulary and structure exist between dialects (e.g., Egyptian vs. Levantine Arabic).
- **Challenges**: Differences in dialects make it difficult for models to process language effectively. Additionally, there is often a lack of dialect-specific training data.
- **Techniques**: Approaches like **Dialect Identification**, **Dialect-to-MSA Translation**, and **Multilingual Models** help tackle dialect challenges.
- **Applications**: Key applications include **Machine Translation**, **Sentiment Analysis**, **Text-to-Speech**, and **Speech Recognition**.
- **Advancements**: Models like **multilingual BERT** and datasets like **MADAR corpus** have improved the handling of dialects.
- **Challenges for NLP Models**: Dialectal differences can cause performance issues, and fine-tuning models on specific dialects irious fields.


In [15]:
pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [17]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Model name
model_name = "riotu-lab/Baligh"

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Define the translation function
def translate_to_msa(text):
    # Convert the input text into tokens (tokenization) with a maximum length of 512 and truncation
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
    
    # Generate the translation using the model (beam search is used to explore multiple possibilities)
    outputs = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
    
    # Decode the generated tokens back into readable text, skipping special tokens like padding
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example text in Egyptian Arabic (عامية)
text = """النهاردة كنت فاضي شوية، فقررت أروح الكافيه مع صحابي. الجلسة كانت حلوة، والجو كان لطيف. 
اتكلمنا عن حاجات زي شغلنا وأخبار الناس. بعد شوية، قررنا نتمشى في الحديقة. الجو كان جميل.

رجعنا البيت كل واحد على بيته. وأنا فكرت في اليوم واتمنيت لو كان عندي وقت أكتر علشان أخرج وأتمشى."""

# Translate the text to Modern Standard Arabic (MSA)
msa_translation = translate_to_msa(text)

# Print the original text (in Egyptian Arabic) and the translated text (in MSA)
print("النص بالعامية:", text)
print("النص بالفصحى:", msa_translation)


النص بالعامية: النهاردة كنت فاضي شوية، فقررت أروح الكافيه مع صحابي. الجلسة كانت حلوة، والجو كان لطيف. 
اتكلمنا عن حاجات زي شغلنا وأخبار الناس. بعد شوية، قررنا نتمشى في الحديقة. الجو كان جميل.

رجعنا البيت كل واحد على بيته. وأنا فكرت في اليوم واتمنيت لو كان عندي وقت أكتر علشان أخرج وأتمشى.
النص بالفصحى: كان وقتاً طيباً عندما قررت أن أذهب للتجوّل في مقهى. كان ذلك ممتعاً. بدأنا جولة حيث لم يكن لدينا وقت لنتحدث مع بعضنا البعض. كان لطيفاً أن أتحدث مع بعضنا البعض أثناء الجلسة. كان لطيفاً أن أتحدث مع بعضنا البعض أثناء الجلسة. لقد كنت مسروراً حقاً بأن بيننا فرصة طيبة لكي أذهب وأنام.


# English_Specific_Text_Processing

# Stemming

**Stemming** is the process of shortening words to their root form, so they all match the same base word. This helps in understanding different forms of a word as one.

## Key Points:
- **What it does**: Stemming reduces words like "running" and "runner" to the base word "run".
- **Why use it**: It helps machines focus on the core meaning of words without worrying about their different forms.
- **Common methods**: Two common algorithms are the **Porter Stemmer** and **Snowball Stemmer**.
- **Where it's used**: Stemming is useful in tasks like:
  - **Searching**: Helps find documents with similar meanings even if the words are different.
  - **Classifying text**: Reduces the number of word variations.
  - **Understanding sentiment**: Helps analyze the main feelings behind words.

## Pros:
- **Improves efficiency**: It makes the text simpler by reducing word forms.
- **Helps find related words**: Even different forms of a word can be matched together.

## Cons:
- **Can over-simplify**: It may change words too much, losing important meaning.
- **Not always accurate**: Sometimes it reduces words too much or chep the meaning intact.


In [18]:
pip install nltk


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [19]:
# Importing the necessary libraries
import nltk
from nltk.stem import PorterStemmer

# Downloading the punkt tokenizer models (for word tokenization)
nltk.download('punkt')

# Initializing the PorterStemmer object, which will be used to stem words
porter_stemmer = PorterStemmer()

# Defining a sample text for stemming
text = """
I love programming a lot and I like to learn new skills.
Programming helps me improve my abilities in problem-solving.
I believe that learning programming is important for my future.
"""

# Tokenizing the text into individual words
words = nltk.word_tokenize(text)

# Stemming each word using the Porter Stemmer
# This will reduce words like 'programming' to 'program', 'skills' to 'skill', etc.
stemmed_words = [porter_stemmer.stem(word) for word in words]

# Printing the original word and its corresponding stemmed version
for word, stemmed_word in zip(words, stemmed_words):
    print(f"{word} -> {stemmed_word}")



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
I -> I
love -> love
programming -> program
a -> a
lot -> lot
and -> and
I -> I
like -> like
to -> to
learn -> learn
new -> new
skills -> skill
. -> .
Programming -> program
helps -> help
me -> me
improve -> improv
my -> my
abilities -> abil
in -> in
problem-solving -> problem-solv
. -> .
I -> I
believe -> believ
that -> that
learning -> learn
programming -> program
is -> is
important -> import
for -> for
my -> my
future -> futur
. -> .


# Lemmatization in Natural Language Processing

**Lemmatization** is a process in natural language processing (NLP) where words are reduced to their base or root form. Unlike stemming, lemmatization takes into account the context and meaning of the word, providing more accurate and meaningful results.

### Key Points of Lemmatization:
1. **Context-Aware**:
   - Lemmatization considers the part of speech of a word (e.g., verb, noun) and the word's context to return the correct base form.

2. **Produces Meaningful Words**:
   - Lemmatization results in a proper base form (lemma) of a word, unlike stemming, which may generate non-existent or incorrect words.

3. **More Accurate**:
   - It returns the correct canonical form of the word. For example:
     - "better" → "good"
     - "running" → "run"
     - "flies" → "fly"

4. **Slower than Stemming**:
   - Lemmatization is computationally more expensive because it involves dictionary lookups and morphological analysis.


In [20]:
# Importing the spaCy library
import spacy

# Loading the pre-trained English language model
nlp = spacy.load("en_core_web_sm")  

# Sample text for lemmatization
text = """
The quick brown foxes were jumping over the lazy dogs while the cats ran along the street. The dogs are bigger than the cats,
but the cats are much faster. Even though the foxes were tired, they still managed to run across the park. 
The foxes had been jumping for hours, and they felt exhausted, but they were happy to have spent time in nature. 
"""

# Processing the text with spaCy to tokenize and analyze it
doc = nlp(text)

# Loop through each token (word) in the processed text (doc)
for token in doc:
    # Print the original word and its lemmatized form
    print(f"Original word: {token.text}, Lemma: {token.lemma_}")



Original word: 
, Lemma: 

Original word: The, Lemma: the
Original word: quick, Lemma: quick
Original word: brown, Lemma: brown
Original word: foxes, Lemma: fox
Original word: were, Lemma: be
Original word: jumping, Lemma: jump
Original word: over, Lemma: over
Original word: the, Lemma: the
Original word: lazy, Lemma: lazy
Original word: dogs, Lemma: dog
Original word: while, Lemma: while
Original word: the, Lemma: the
Original word: cats, Lemma: cat
Original word: ran, Lemma: run
Original word: along, Lemma: along
Original word: the, Lemma: the
Original word: street, Lemma: street
Original word: ., Lemma: .
Original word: The, Lemma: the
Original word: dogs, Lemma: dog
Original word: are, Lemma: be
Original word: bigger, Lemma: big
Original word: than, Lemma: than
Original word: the, Lemma: the
Original word: cats, Lemma: cat
Original word: ,, Lemma: ,
Original word: 
, Lemma: 

Original word: but, Lemma: but
Original word: the, Lemma: the
Original word: cats, Lemma: cat
Original word

# Handling Abbreviations 

Abbreviations are commonly used in natural language, and correctly handling them is important for improving the performance of various NLP tasks. This includes tasks like text classification, named entity recognition (NER), and machine translation. In this document, we will explore the methods to handle abbreviations in NLP.

## 1. Expanding Abbreviations

Expanding abbreviations into their full forms is one way to handle them. For example:
- "Dr." → "Doctor"
- "e.g." → "for example"
- "i.e." → "that is"
  
This can be done manually using a dictionary of common abbreviations or automatically using pre-trained models for domain-specific expansions.

In [None]:
# Dictionary containing common abbreviations and their full forms
abbreviations = {
    "Dr.": "Doctor",
    "Mr.": "Mister",
    "Mrs.": "Misses",
    "Ms.": "Miss",
    "St.": "Saint",
    "TV": "Television",
    "CEO": "Chief Executive Officer",
    "ATM": "Automated Teller Machine",
    "USA": "United States of America",
    "UK": "United Kingdom",
    "EU": "European Union",
    "ID": "Identification",
    "DOB": "Date of Birth",
    "FAQ": "Frequently Asked Questions",
    "RSVP": "Répondez s'il vous plaît",
    "P.S.": "Postscript",
    "AKA": "Also Known As",
    "BFF": "Best Friends Forever",
    "FYI": "For Your Information",
    "IMO": "In My Opinion",
    "LOL": "Laugh Out Loud",
    "BRB": "Be Right Back",
    "GTG": "Got To Go",
    "OMG": "Oh My God",
    "TMI": "Too Much Information",
    "YOLO": "You Only Live Once",
    "WTF": "What The F***",
    "TGIF": "Thank God It's Friday",
    "DIY": "Do It Yourself",
    "LMAO": "Laughing My Ass Off",
    "TBD": "To Be Determined",
    "TBC": "To Be Continued",
    "SMH": "Shaking My Head",
    "FOMO": "Fear Of Missing Out",
    "BRB": "Be Right Back",  # Duplicated entry
    "LMAO": "Laughing My Ass Off",  # Duplicated entry
    "B2B": "Business to Business",
    "B2C": "Business to Consumer",
    "CCTV": "Closed-Circuit Television",
    "MIA": "Missing In Action",
    "WIFI": "Wireless Fidelity",
    "NFT": "Non-Fungible Token",
    "SEO": "Search Engine Optimization",
    "URL": "Uniform Resource Locator",
    "IP": "Internet Protocol",
    "HTTP": "Hypertext Transfer Protocol",
    "DNS": "Domain Name System",
}

# Input text with several abbreviations
text = """In today’s tech-driven world, Wi-Fi has become a common feature everywhere, from homes to public places. 
The CEO of major companies uses the internet to keep track of everything, from the CRM system to daily reports. 
If you need to withdraw money from an ATM, you can simply enter your ID, and sometimes your DOB might be required for verification.

If you’re planning to travel to the USA or the UK, don’t forget to check the FAQ section on major companies' websites 
to confirm the required documents. If you're into technology, you’ve probably heard of NFTs and how to invest in them.

When browsing the internet, make sure you're using the correct URL to ensure you’re reaching the site you want. 
For secure browsing, always prefer HTTPS. SEO also plays a huge role in improving search engine rankings. 
And for keeping in touch with friends, you can always send a quick SMS or use OMG when you're feeling surprised.

But sometimes, one may feel FOMO (Fear of Missing Out) if they’re not keeping up with the news or experiencing 
TMI (Too Much Information), so it’s always good to check the CCTV to see what’s happening around you."""

# Loop through the dictionary to replace abbreviations with their full forms in the text
for abbrev, full_form in abbreviations.items():
    text = text.replace(abbrev, full_form)

# Print the text after replacing abbreviations
print(text)



# Advanced_Text_Processing

## Multilingual_Text_Handling

In [8]:
pip install googletrans

Note: you may need to restart the kernel to use updated packages.


In [7]:
import re
from googletrans import Translator  # Importing the Google Translator library

# The text containing both Arabic and English languages
text = """الصديق الأول: مرحبًا! How are you today? أتمنى أن تكون بخير.
الصديق الثاني: أهلاً! I'm fine, شكراً على سؤالك. How about you? كيف حالك أنت؟
الصديق الأول: الحمد لله، بخير. كان لدي يوم طويل في العمل، ولكن الآن أنا مرتاح. Do you have any plans for the weekend?
الصديق الثاني: في الواقع، لا، لكنني كنت أفكر في الخروج لتناول القهوة مع بعض الأصدقاء. Do you wanna join?
الصديق الأول: فكرة رائعة! Always love hanging out with friends. أين نلتقي؟
الصديق الثاني: دعنا نلتقي في المقهى على الزاوية، لديهم قهوة رائعة.
الصديق الأول: يبدو مثاليًا! سأكون هناك في الخامسة.
الصديق الثاني: رائع! أراك في الخامسة إذًا. Take care!"""

# A dictionary to store the translations of English words into Arabic
translation_dict = {}

# Function to extract English words from the text
def extract_english(text):
    # Use regular expressions to extract English words from the text
    english_words = re.findall(r'\b[a-zA-Z]+\b', text)
    return english_words

# Function to replace English words with their translations in the text
def replace_english_with_translation(text, translation_dict):
    # Loop through the translation dictionary and replace English words with their Arabic translations
    for eng_word, translated_word in translation_dict.items():
        text = re.sub(rf'\b{eng_word}\b', translated_word, text)
    return text

# Extract all English words from the input text
english_words = extract_english(text)

# Create an instance of the Translator
translator = Translator()

# Translate each English word to Arabic and add it to the translation dictionary
for word in english_words:
    # Translate each word from English to Arabic
    translation = translator.translate(word, src='en', dest='ar').text
    translation_dict[word] = translation

# Replace English words with their Arabic translations in the original text
translated_text = replace_english_with_translation(text, translation_dict)

# Print the original text and the translated text
print("Original text:")
print(text)
print("\nTranslated text:")
print(translated_text)



النص الأصلي:
الصديق الأول: مرحبًا! How are you today? أتمنى أن تكون بخير.
الصديق الثاني: أهلاً! I'm fine, شكراً على سؤالك. How about you? كيف حالك أنت؟
الصديق الأول: الحمد لله، بخير. كان لدي يوم طويل في العمل، ولكن الآن أنا مرتاح. Do you have any plans for the weekend?
الصديق الثاني: في الواقع، لا، لكنني كنت أفكر في الخروج لتناول القهوة مع بعض الأصدقاء. Do you wanna join?
الصديق الأول: فكرة رائعة! Always love hanging out with friends. أين نلتقي؟
الصديق الثاني: دعنا نلتقي في المقهى على الزاوية، لديهم قهوة رائعة.
الصديق الأول: يبدو مثاليًا! سأكون هناك في الخامسة.
الصديق الثاني: رائع! أراك في الخامسة إذًا. Take care!

النص بعد الترجمة:
الصديق الأول: مرحبًا! كيف نكون أنت اليوم? أتمنى أن تكون بخير.
الصديق الثاني: أهلاً! أنا'م بخير, شكراً على سؤالك. كيف عن أنت? كيف حالك أنت؟
الصديق الأول: الحمد لله، بخير. كان لدي يوم طويل في العمل، ولكن الآن أنا مرتاح. يفعل أنت يملك أي خطط ل ال عطلة نهاية الأسبوع?
الصديق الثاني: في الواقع، لا، لكنني كنت أفكر في الخروج لتناول القهوة مع بعض الأصدقاء. يفعل أنت 