### Libraries

In [6]:
import nltk
#import Stemmer
import numpy as np
import unicodedata
import pandas as pd
import tensorflow as tf
import re, string, emoji
import qalsadi.lemmatizer 
import pyarabic.araby as ar
import langid # => English Text

#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

from emoji import demojize
from nltk.stem import ISRIStemmer
from gensim.models import FastText
#from imblearn.over_sampling import SMOTE
from langid.langid import LanguageIdentifier, model
from sklearn.model_selection import train_test_split

from keras.regularizers import l2
from keras.optimizers import Adam
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Embedding, Dense, Dropout, LSTM, BatchNormalization#, SimpleRNN

### Reading/Loading the Data

In [7]:
file_path = "train.xlsx" # Path on Kaggle: /kaggle/input/arabic-sentiment-analysis-nlp/train.xlsx
df_train = pd.read_excel(file_path)
df_train

Unnamed: 0,review_description,rating
0,شركه زباله و سواقين بتبرشم و مفيش حتي رقم للشك...,-1
1,خدمة الدفع عن طريق الكي نت توقفت عندي اصبح فقط...,1
2,تطبيق غبي و جاري حذفه ، عاملين اكواد خصم و لما...,-1
3,فعلا تطبيق ممتاز بس لو فى امكانية يتيح لمستخدم...,1
4,سيء جدا ، اسعار رسوم التوصيل لا تمت للواقع ب ص...,-1
...,...,...
32031,التطبيق اصبح سيء للغايه نقوم بطلب لا يتم وصول ...,-1
32032,y love you,1
32033,الباقه بتخلص وبشحن مرتين باقه اضافيه ١٠٠ جنيه,-1
32034,تطبيق فاشل وصلني الطلب ناقص ومش ينفع اعمل حاجة...,-1


### Checking/Exploring the Data

In [8]:
# Checking for NULL/Missing Values
missing_values = df_train.isnull().sum()
print("Missing values in each column:")
missing_values

Missing values in each column:


review_description    0
rating                0
dtype: int64

In [9]:
# Checking for Imbalances/Bias in Data
rating_distribution = df_train['rating'].value_counts()
print("Distribution of values in the 'rating' column:")
rating_distribution

Distribution of values in the 'rating' column:


rating
 1    19189
-1    11340
 0     1507
Name: count, dtype: int64

##### Imbalanced data => We can manipulate class weights later..

In [10]:
# Arabic Only?
#english_rows = df_train[df_train['review_description'].apply(lambda x: langid.classify(x)[0] == 'en')]
#english_rows[['review_description']]
print("Rows with English text: 780 rows × 1 columns")

Rows with English text: 780 rows × 1 columns


##### Data is Mixed

### Text Processing

In [11]:
arabic_stop_words = [
    "و", "في", "من", "على", "إلى", "لا", "أو", "هو", "هي", "يكون",
    "أنا", "أنت", "هو", "هي", "نحن", "أنتم", "هم",
    "عن", "مع", "كما", "مثل", "بين", "إذا", "حتى", "منذ",
    "و", "أو", "لكن", "إذا", "إن",
    "اليوم", "غداً", "الآن", "ثم", "بعد",
    "كان", "يكون", "أصبح", "صار", "ليس", "لم",
    "هذا", "هذه", "ذلك", "تلك", 
    "كل", "على", "فيه", "منه", "عنه", "له", "به", "إليه", "لها", "فيها",
    "بها", "منها", "عنها", "إليها", "الذي", "التي", "اللذين", "اللذان", "اللتان",
    "اللتين", "هؤلاء", "ذلك", "هذه", "هذا", "تلك", "تحت", "فوق", "معه", "لديه",
    "عليه", "عليها", "أي", "هل", "إذا", "ماذا", "هناك", "هنالك", "إلى",
    "يناير", "فبراير", "مارس", "إبريل", "مايو", "يونيو", "يوليو", "أغسطس", "سبتمبر", "أكتوبر", "نوفمبر", "ديسمبر",
    "الأحد", "الاثنين", "الثلاثاء", "الأربعاء", "الخميس", "الجمعة", "السبت"
]

english_stop_words = set(stopwords.words('english'))

negation_words = [
    "لا", "لما", "لن", "ليس", "ما", "لم", "لات", "غير", "لنعم", "ليست", "لست", "مطلقاً",
    "no", "not", "never", "none", "nobody", "nothing", "nowhere", "neither", "nor",
    "cannot", "can't", "won't", "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't", "doesn't", "don't",
    "didn't", "shouldn't", "wouldn't", "couldn't", "mustn't", "mightn't", "ain't"
]

def handle_negations(tokens):
    negation_flag = False
    result_tokens = []

    for token in tokens:
        if token.lower() in negation_words:
            negation_flag = True
        elif negation_flag and token.isalpha():
            result_tokens.append(f"NOT_{token}")
            negation_flag = False
        else:
            result_tokens.append(token)

    return result_tokens

In [18]:
def text_processing_pipeline(text, stemmer, lemmatizer, tokenizer, stop_words):
    # Unicode Normalization
    text = unicodedata.normalize('NFD', text.lower())
    
    # Convert emojis to text
    text = demojize(text)
    # Add space between emojis
    text = re.sub(r"(:[a-zA-Z0-9_]+:)", r" \1 ", text)
    # Handling cases like "♡♡♡"
    text = re.sub(r"[♥☆★♡🖒]+", " good_review ", text)
    
    # Handling extra whitespaces
    text = re.sub(r"\s+", " ", text)
    
    # Handling numeric ratings like "100/100" or "10/10"
    match = re.search(r"(\d+)\s*(?:من|\s|\/)\s*(\d+)", text)
    if match:
        left_number = float(match.group(1))
        right_number = float(match.group(2))
        if left_number <= 0.4 * right_number:
            text = "bad_rating"
        else:
            text ="good_rating"
            
    text = re.sub(r"\b(?:\d{1,3}(?:\d{3})*(?:\.\d+)?%?|100٪)\b", "good_review", text)
            
    # Removing Digits
    text = re.sub(r"\d", "", text)
        
    # Removing Punctuation
    text = text.translate(str.maketrans("", "", string.punctuation)) # !"#$%&'()*+,./:;<=>?@[\]^`{|}~
    
    # src = https://github.com/linuxscout/pyarabic/blob/master/doc/features.md
    text = ar.strip_tashkeel(text)
    text = ar.normalize_hamza(text, method="tasheel")
    text = ar.strip_tatweel(text) # العـــــربية -> العربية
    text = text.replace("اا", "ا")
    
    # Language Identification
    lang, confidence = langid.classify(text)
    
    if lang == 'ar':
        # Arabic text processing
        stemmed_tokens = stemmer.lemmatize_text(text)
        # Stop-Words-Removal
        filtered_tokens = [token for token in stemmed_tokens if token.lower() not in stop_words]
    else:
        # Tokenization
        tokens = tokenizer.tokenize(text)
        
        # Stop-Words-Removal
        filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
        
        # English text processing
        stemmed_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # Handling negations
    final_tokens = handle_negations(stemmed_tokens)
    
    text = " ".join(final_tokens)
        
    return text

In [19]:
# Initializating/Preparing Stemmer, Lemmatizer, Tokenizer & Stop-Words
#stemmer = ISRIStemmer()
#stemmer = FarasaStemmer()
#stemmer = Stemmer.Stemmer("arabic")
stemmer = qalsadi.lemmatizer.Lemmatizer()
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r"\w+")
stop_words = set(english_stop_words).union(set(arabic_stop_words)) 

In [20]:
# Applying all the Processing+Visualization
df_train['final_tokens'] = df_train['review_description'].apply(
    lambda x: text_processing_pipeline(x, stemmer, lemmatizer, tokenizer, stop_words)
)
df_train.head(20)

Unnamed: 0,review_description,rating,final_tokens
0,شركه زباله و سواقين بتبرشم و مفيش حتي رقم للشك...,-1,شرك زبال سواق بتبرشم مفيش حت رقم للشكاوي سواق ...
1,خدمة الدفع عن طريق الكي نت توقفت عندي اصبح فقط...,1,خدم دفع طريق كي نات توقف عند صبح فقط دفع نقد
2,تطبيق غبي و جاري حذفه ، عاملين اكواد خصم و لما...,-1,تطبيق غب جار حذف عامل اكواد خصم NOT_استخدم اكت...
3,فعلا تطبيق ممتاز بس لو فى امكانية يتيح لمستخدم...,1,علا تطبيق ممتاز بس لو فى امكانية أتاح مستخدم ت...
4,سيء جدا ، اسعار رسوم التوصيل لا تمت للواقع ب ص...,-1,ساء جدا اسعار رسوم توصيل أمات واقع ب صل
5,قعد عشرين سنة يدور على سائق بس اما عن توصيل ال...,0,قعد عشر سنة دار ساياق بس اما توصيل الاشياء جيد...
6,احلئ تطبيق,1,حلى تطبيق
7,رائع واو مدهش,1,راياع واو مدهش
8,مکو بس البحرین وعمان وغیرهه بس العراق مکو یعنی...,-1,مکو بس البحرین عمان وغیرهه بس عراق مکو یعنی نج...
9,تطبيق جميل يستاهل الخمس نجوم👍👍👍,1,تطبيق جميل يستاهل خمس نجوم thumbsup thumbsup t...


In [12]:
# Dropping Empty Rows
df_train = df_train[df_train['final_tokens'].apply(lambda x: len(x) > 0)]
df_train.reset_index(drop=True, inplace=True)
df_train.loc[:, 'rating'] = df_train['rating'][df_train['final_tokens'].apply(lambda x: len(x) > 0)]

### Vocabulary Preparation/Sequencing/Padding

In [13]:
X = df_train['final_tokens']
y = df_train["rating"].astype(int) + 1

num_classes = 3

tokenizerr = Tokenizer(oov_token="<UNK>")
tokenizerr.fit_on_texts(X)
total_words = len(tokenizerr.word_index) + 1

sequences = tokenizerr.texts_to_sequences(X)
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len)

### Data Splitting Train/Validation

In [14]:
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences, y, test_size=0.2, random_state=333, stratify=y
)

### Handling Class Imbalances/Augmentation

In [15]:
# 0 leeh 1.5k row w 1 leeh 20k row .. f b handle da enii a-adjust el weights :D '''
#class_weights = dict(zip(y_train.value_counts().index, len(y_train) / (y_train.value_counts() * len(y_train.unique()))))

### Building the Model

In [16]:
model = Sequential()
model.add(
    Embedding(
        input_dim=len(tokenizerr.word_index) + 1,
        output_dim=300,
        input_length=X_train.shape[1],
    )
)
model.add(LSTM(32, activation="tanh", dropout=0.3))
model.add(Dense(num_classes, activation="softmax", kernel_regularizer=l2(0.1)))

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

checkpoint = ModelCheckpoint(
    filepath="./LSTM-emb.hdf5",
    monitor="val_accuracy",
    save_best_only=True,
    save_weights_only=True,
)

### Model Training

In [18]:
model.fit(
    X_train,
    y_train,
    epochs=4,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[checkpoint]
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x1e67fb4ee50>

### Model Evaluation

In [19]:
model.load_weights("./LSTM-emb.hdf5")
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {loss}, Validation Accuracy: {accuracy}")

Validation Loss: 0.5244214534759521, Validation Accuracy: 0.8365429639816284


### Testing

In [20]:
# Reading/Loading Test Data
file_path = "test _no_label.csv" # Path on Kaggle: /kaggle/input/arabic-sentiment-analysis-nlp/test _no_label.csv
df_test = pd.read_csv(file_path)
df_test

Unnamed: 0,ID,review_description
0,1,اهنئكم على خدمه العملاء في المحادثه المباشره م...
1,2,ممتاز جدا ولكن اتمنى ان تكون هناك بعض المسابقا...
2,3,كل محملته يقول تم ايقاف حطيت2 عشان تسوون الخطاء
3,4,شغل طيب
4,5,بعد ماجربت
...,...,...
995,996,يستهل
996,997,خدمة سيئة بكل المعايير
997,998,لؤي٠٣٣٢لؤ٣٤٣س
998,999,تطبيق غير صادق ف خصم الكوبونات


In [21]:
# Same steps/processing should be applied on Test Data
df_test['final_tokens'] = df_test['review_description'].apply(
    lambda x: text_processing_pipeline(x, stemmer, lemmatizer, tokenizer, stop_words)
)

#empty_rows = df_test['final_tokens'].apply(lambda x: len(x) == 0)
#df_test[empty_rows]
print("There are 3 empty rows..")

32037
32038
32039
32040
32041
32042
32043
32044
32045
32046
32047
32048
32049
32050
32051
32052
32053
32054
32055
32056
32057
32058
32059
32060
32061
32062
32063
32064
32065
32066
32067
32068
32069
32070
32071
32072
32073
32074
32075
32076
32077
32078
32079
32080
32081
32082
32083
32084
32085
32086
32087
32088
32089
32090
32091
32092
32093
32094
32095
32096
32097
32098
32099
32100
32101
32102
32103
32104
32105
32106
32107
32108
32109
32110
32111
32112
32113
32114
32115
32116
32117
32118
32119
32120
32121
32122
32123
32124
32125
32126
32127
32128
32129
32130
32131
32132
32133
32134
32135
32136
32137
32138
32139
32140
32141
32142
32143
32144
32145
32146
32147
32148
32149
32150
32151
32152
32153
32154
32155
32156
32157
32158
32159
32160
32161
32162
32163
32164
32165
32166
32167
32168
32169
32170
32171
32172
32173
32174
32175
32176
32177
32178
32179
32180
32181
32182
32183
32184
32185
32186
32187
32188
32189
32190
32191
32192
32193
32194
32195
32196
32197
32198
32199
32200
32201
32202
3220

In [22]:
# Padding Test Data
sequences_test = tokenizerr.texts_to_sequences(df_test['final_tokens'])
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_len)

In [23]:
# Padding Test Data to the Model and Saving Predictions
predictions = model.predict(padded_sequences_test)
predicted_labels = np.argmax(predictions, axis=1) - 1

output_df = pd.DataFrame({"ID": df_test["ID"], "rating": predicted_labels})

output_df.to_csv("test_results_LSTM_emb.csv", index=False)

