## Data Cleaning

In [1]:
# import needed libraries
import pandas as pd 
import numpy as np 
import re
import string 
import nltk
import spacy

# To install the Arabic Model
# !python -m spacy download xx_ent_wiki_sm

# For arabic diacritics
# pip install pyarabic 
from pyarabic.araby import strip_tashkeel, normalize_hamza
from pyarabic.araby import is_tashkeel, is_arabicrange

# For converting emojis and emot
# pip install emoji
import emoji
# pip install emot
import emot

In [2]:
# Load the Arabic language model in spacy
nlp = spacy.load('xx_ent_wiki_sm')

In [3]:
# read data
data = pd.read_csv("../data/interim/feature_selected.csv")

In [4]:
data.shape

(17306, 10)

In [5]:
data.head()

Unnamed: 0,id,message,from_id,reply_to,year,month,day,hour,minute,second
0,409414,.,5774169366,-1,2023,3,7,13,48,25
1,409427,حواره للخارج من نابلس ؟,325418964,-1,2023,3,7,13,50,2
2,409428,سالك سالك,1827882971,409427,2023,3,7,13,50,9
3,409429,حوارة في عليها مستوطنين,5633332941,-1,2023,3,7,13,50,18
4,409430,وين,1332172310,409429,2023,3,7,13,50,26


### Remove Arabic diacritics

In [6]:
def remove_diacritics(text: str) -> str:
    """
    Remove diacritics from Arabic text and reshape it to proper Arabic script.

    Args:
        text (str): The input Arabic text.

    Returns:
        str: The Arabic text with diacritics removed and reshaped to proper Arabic script.
    """
    if isinstance(text, str):
        # Normalize the text
        text = normalize_hamza(text)
        # Remove diacritics using regex
        diacritic_pattern = re.compile("[ًٌٍَُِّْ]+")
        text_without_diacritics = diacritic_pattern.sub("", text)
        # Reshape the text to proper Arabic script
        reshaped_text = ""
        for i, char in enumerate(text_without_diacritics):
            if is_arabicrange(char):
                if is_tashkeel(char):
                    reshaped_text += char
                else:
                    reshaped_text += strip_tashkeel(char)
            else:
                reshaped_text += char
        return reshaped_text
    else:
        return text


In [7]:
remove_diacritics("مرحباً حوارة للأسف مغلقةٌ اليَوم")

'مرحبا حوارة للءسف مغلقة اليوم'

### Convert Emojis and Emoticons

In [8]:
def convert_emojis(text: str) -> str:
    """
    Convert emojis in text to their corresponding text representation.

    Args:
        text (str): The input text with emojis.

    Returns:
        str: The input text with emojis replaced by their corresponding text representation.
    """
    return emoji.demojize(text)

In [9]:
convert_emojis("🤡 ههه")

':clown_face: ههه'

In [10]:
def convert_emoticons(text: str) -> str:
    """
    Convert emoticons in text to their corresponding text representation.

    Args:
        text (str): The input text with emoticons.

    Returns:
        str: The input text with emoticons replaced by their corresponding text representation.
    """
    emo_obj = emot.EMOTICONS_EMO
    for emot1 in emo_obj:
        escaped_emot = re.escape(emot1)
        text = re.sub(u'({})'.format(escaped_emot), "_".join(emo_obj[emot1].replace(",","").split()), text)
    return text

In [11]:
convert_emoticons("^_^ وه")

'Joyful وه'

### Remove links

In [12]:
def remove_urls(text: str) -> str:
    """
    Remove URLs from text.

    Args:
        text (str): The input text with URLs.

    Returns:
        str: The input text with URLs removed.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [13]:
remove_urls("اللينك https://hellooos.com")

'اللينك '

### Remove Phone Numbers

In [14]:
def remove_phone_numbers(text: str) -> str:
    """
    Remove phone numbers from text.

    Args:
        text (str): The input text with phone numbers.

    Returns:
        str: The input text with phone numbers removed.
    """
    text = re.sub(r'(\+?\d{2,4}[ -]?)?\d{9,10}', '', text)
    return text

In [15]:
remove_phone_numbers("هاي, my phone number is +971-551234567. You can also reach me at 00971551234567 or 0551234567.")

'هاي, my phone number is . You can also reach me at  or .'

### Remove Punctuations

In [16]:
PUNCT_TO_REMOVE = string.punctuation + '؛،؟«»٪٫٬٭'
def remove_punctuations(text: str) -> str:
     """
     Remove punctuations from text.

     Args:
          text (str): The input text with punctuations.

     Returns:
          str: The input text with punctuations removed.
     """
     return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

In [17]:
remove_punctuations("حسنا، يا الهي! ؟ ? هذا رائع.")

'حسنا يا الهي   هذا رائع'

### Remove MultiMedia

In [18]:
def remove_multimedia(text: str) -> str:
    """
    Remove multimedia (images, videos, etc.) from text.

    Args:
        text (str): The input text with multimedia.

    Returns:
        str: The input text with multimedia removed.
    """
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'pic\.twitter\.com/\S+', '', text)
    text = re.sub(r'@(\w+)', '', text)
    return text

### Apply the cleaning process to the messages. and delete empty messages from the df

In [19]:
for i, row in data.iterrows():
    if isinstance(row["message"], float):  # check if value is float
        data.at[i, "message"] = ""  # convert float to empty string
    else:
        text = row["message"]
        # apply cleaning functions to the text
        text = remove_diacritics(text)
        text = convert_emojis(text)
        text = convert_emoticons(text)
        text = remove_urls(text)
        text = remove_phone_numbers(text)
        text = remove_punctuations(text)
        text = remove_multimedia(text)
        if text == "":  # check if cleaned text is empty
            data = data.drop(i)  # delete row if cleaned text is empty
        else:
            data.at[i, "message"] = text  # store the cleaned text back in the dataframe

In [20]:
data["message"]

1             حواره للخارج من نابلس 
2                          سالك سالك
3            حوارة في عليها مستوطنين
4                                وين
5        كيف طريق من طوباس لرام الله
                    ...             
17301                   كيف طريق صرة
17302                  و دوار كدوميم
17303                     شو وضع dso
17304                  جيت صرة سالكة
17305                 واد قانا سالك 
Name: message, Length: 16880, dtype: object