In [1]:
import unicodedata
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import FastText
from joblib import dump
from nltk.stem import ISRIStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras import models, regularizers
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding, LSTM, Dropout
from tensorflow.keras.models import Sequential


In [2]:
# Read the labelled training reviews from the Excel workbook and display them.
file_path = "train.xlsx"
df_train = pd.read_excel(io=file_path)
df_train

Unnamed: 0,review_description,rating
0,شركه زباله و سواقين بتبرشم و مفيش حتي رقم للشك...,-1.0
1,خدمة الدفع عن طريق الكي نت توقفت عندي اصبح فقط...,1.0
2,تطبيق غبي و جاري حذفه ، عاملين اكواد خصم و لما...,-1.0
3,فعلا تطبيق ممتاز بس لو فى امكانية يتيح لمستخدم...,1.0
4,سيء جدا ، اسعار رسوم التوصيل لا تمت للواقع ب ص...,-1.0
...,...,...
32031,التطبيق اصبح سيء للغايه نقوم بطلب لا يتم وصول ...,-1.0
32032,y love you,1.0
32033,الباقه بتخلص وبشحن مرتين باقه اضافيه ١٠٠ جنيه,-1.0
32034,تطبيق فاشل وصلني الطلب ناقص ومش ينفع اعمل حاجة...,-1.0


In [3]:
# Print the column dtypes and non-null counts of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32036 entries, 0 to 32035
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   review_description  32036 non-null  object 
 1   rating              32036 non-null  float64
dtypes: float64(1), object(1)
memory usage: 500.7+ KB


In [4]:
# Number of missing values in each column.
df_train.isna().sum()

review_description    0
rating                0
dtype: int64

In [5]:
# Hand-written Arabic stop words. text_cleaning() merges this list with NLTK's
# English stop words and converts it to a set, so the repeated entries below
# are harmless.
# NOTE(review): the list contains negation words ("لا", "ليس", "لم"); removing
# them may discard sentiment cues - confirm this is intended.
# NOTE(review): matching is exact, so spellings without hamza (e.g. "ان", "انا")
# presumably slip through - confirm whether text normalisation is wanted.
arabic_stop_words = [
    # common particles and pronouns
    "و", "في", "من", "على", "إلى", "لا", "أو", "هو", "هي", "يكون",
    # personal pronouns
    "أنا", "أنت", "هو", "هي", "نحن", "أنتم", "هم",
    # prepositions and connectives
    "عن", "مع", "كما", "مثل", "بين", "إذا", "حتى", "منذ",
    # conjunctions
    "و", "أو", "لكن", "إذا", "إن",
    # time words
    "اليوم", "غداً", "الآن", "ثم", "بعد",
    # auxiliary verbs and negations
    "كان", "يكون", "أصبح", "صار", "ليس", "لم",
    # demonstratives
    "هذا", "هذه", "ذلك", "تلك", 
    # prepositions with attached pronouns, relative pronouns, question words
    "كل", "على", "فيه", "منه", "عنه", "له", "به", "إليه", "لها", "فيها",
    "بها", "منها", "عنها", "إليها", "الذي", "التي", "اللذين", "اللذان", "اللتان",
    "اللتين", "هؤلاء", "ذلك", "هذه", "هذا", "تلك", "تحت", "فوق", "معه", "لديه",
    "عليه", "عليها", "أي", "هل", "إذا", "ماذا", "هناك", "هنالك", "إلى",
    # month names
    "يناير", "فبراير", "مارس", "إبريل", "مايو", "يونيو", "يوليو", "أغسطس", "سبتمبر", "أكتوبر", "نوفمبر", "ديسمبر",
    # weekday names
    "الأحد", "الاثنين", "الثلاثاء", "الأربعاء", "الخميس", "الجمعة", "السبت"
]

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import string


# Built once when this cell runs rather than once per review: reading the NLTK
# English stop-word corpus and compiling the tokenizer pattern for every row is
# pure overhead. Re-run this cell after editing `arabic_stop_words`.
_STOP_WORDS = (
    frozenset(stopwords.words("english"))
    | frozenset(string.punctuation)
    | frozenset(arabic_stop_words)
)
_WORD_TOKENIZER = RegexpTokenizer(r"\w+")


def text_cleaning(text, stemmer=ISRIStemmer(), lemmatizer=WordNetLemmatizer()):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps: lower-case the text, split it into ``\\w+`` tokens (which drops
    punctuation and emoji), remove English/Arabic stop words, then stem every
    remaining token.

    Parameters
    ----------
    text : str
        Raw review text. Missing values (None/NaN) are treated as an empty
        review and any other non-string value is converted with ``str`` so a
        stray numeric cell does not abort the whole ``apply``.
    stemmer : object with a ``stem(word)`` method
        Stemmer applied to each kept token (ISRI Arabic stemmer by default).
    lemmatizer : WordNetLemmatizer
        Currently unused; kept so existing callers that pass it keep working.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ("" when nothing is left).
    """
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    tokens = _WORD_TOKENIZER.tokenize(text.lower())
    kept_tokens = [token for token in tokens if token.casefold() not in _STOP_WORDS]

    return " ".join(str(stemmer.stem(token)) for token in kept_tokens)


# Keep the cleaned text in a separate column so the raw reviews stay available.
cleaned_reviews = df_train["review_description"].apply(text_cleaning)
df_train["new review_description"] = cleaned_reviews
df_train.head(20)

Unnamed: 0,review_description,rating,new review_description
0,شركه زباله و سواقين بتبرشم و مفيش حتي رقم للشك...,-1.0,شرك زبل سوق رشم فيش حتي رقم شكو سوق يسب يمش مي...
1,خدمة الدفع عن طريق الكي نت توقفت عندي اصبح فقط...,1.0,خدم دفع طرق الك نت وقف عند اصبح فقط دفع نقد
2,تطبيق غبي و جاري حذفه ، عاملين اكواد خصم و لما...,-1.0,طبق غبي جري حذف عمل كود خصم لما خدم كتر مرة عم...
3,فعلا تطبيق ممتاز بس لو فى امكانية يتيح لمستخدم...,1.0,فعل طبق متز بس لو فى مكن يتح خدم طبق ان تطع غي...
4,سيء جدا ، اسعار رسوم التوصيل لا تمت للواقع ب ص...,-1.0,سيء جدا سعر رسم وصل تمت وقع ب صله
5,قعد عشرين سنة يدور على سائق بس اما عن توصيل ال...,0.0,قعد عشر سنة يدر سئق بس اما وصل شيء جيد جدا
6,احلئ تطبيق,1.0,حلئ طبق
7,رائع واو مدهش,1.0,رئع واو دهش
8,مکو بس البحرین وعمان وغیرهه بس العراق مکو یعنی...,-1.0,مکو بس حری وعم غیر بس عرق مکو یعنی نجم وحد علی...
9,تطبيق جميل يستاهل الخمس نجوم👍👍👍,1.0,طبق جمل تهل خمس نجم


In [7]:
# The vocabulary is capped at the 10,000 most frequent terms, matching the
# (32036, 10000) matrix recorded in this cell's output and the shapes used by
# the later cells (the code previously said 1000, which no longer reproduced
# those outputs).
# NOTE(review): the vectorizer is fitted on every row before the
# train/validation split, so validation text influences the vocabulary and IDF
# weights - consider fitting on the training portion only.
tfidf = TfidfVectorizer(analyzer="word", max_features=10000, use_idf=True)
X = tfidf.fit_transform(df_train["new review_description"])

terms = tfidf.get_feature_names_out()
# Densify only the rows that are displayed; converting the full sparse matrix
# to a dense DataFrame just for a preview costs gigabytes of memory.
X1_print = pd.DataFrame(X[:5].toarray(), columns=terms)
print(X.shape)
X1_print.head()

(32036, 10000)


Unnamed: 0,00,000,00000⁰00مو,000omr,007,00l8l90l,011,0155_,064ssr100xtjuenezqg2ftbkoa6vg91pi21273620,08,...,ﻹعل,ﻻحظ,ﻻزم,ﻻسباب,ﻻسف,ﻻصل,ﻻطﻻق,ﻻعل,ﻻنه,ﻻيتم
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Dense copy of the TF-IDF features for Keras. It is rebuilt from the fitted
# vectorizer instead of calling X.toarray() on itself, so re-running this cell
# works even after X has already been turned into an ndarray.
X = tfidf.transform(df_train["new review_description"]).toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
# Ratings as a standalone NumPy array (copied, so later edits never touch df_train).
y = df_train["rating"].to_numpy(copy=True)
y

array([-1.,  1., -1., ..., -1., -1.,  1.])

In [14]:
# Hold out 20% of the rows for validation, stratified by rating so both parts
# keep the same class balance. This reproduces the 25,628 / 6,408 split shown
# in this cell's recorded output. An integer test_size is an absolute row
# count, so the previous value of 1000 would have held out only 1,000 rows.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("X_train shape :", X_train.shape)
print("y_train shape :", y_train.shape)
print("X_val shape :", X_val.shape)
print("y_val shape :", y_val.shape)

X_train shape : (25628, 10000)
y_train shape : (25628,)
X_val shape : (6408, 10000)
y_val shape : (6408,)


In [15]:
# Size of the softmax output layer used by both models.
num_classes = 3

# Insert a length-1 middle axis: (samples, features) -> (samples, 1, features),
# i.e. every TF-IDF vector is fed to the recurrent layers as a single time step.
X_train_reshaped = np.expand_dims(X_train, axis=1)
X_val_reshaped = np.expand_dims(X_val, axis=1)

# Shift the labels up by one (-1 -> 0, 0 -> 1, 1 -> 2) so they can serve as
# class indices for sparse_categorical_crossentropy.
y_train_shifted = 1 + y_train
y_val_shifted = 1 + y_val

In [16]:
# SimpleRNN baseline: one recurrent layer over the single-time-step input,
# dropout, two tanh dense layers (the first L2-regularised) and a softmax head.
model = Sequential(
    [
        # input_shape is (timesteps, features); timesteps is 1 here.
        SimpleRNN(64, activation="relu", input_shape=(1, X_train.shape[1])),
        Dropout(0.6),
        Dense(
            32,
            activation="tanh",
            kernel_regularizer=regularizers.l2(0.01),
            bias_regularizer=regularizers.l2(0.01),
        ),
        Dense(16, activation="tanh"),
        Dense(num_classes, activation="softmax"),
    ]
)

# Integer class labels -> sparse categorical cross-entropy.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Write the weights to disk only when validation accuracy improves.
checkpoint1 = ModelCheckpoint(
    "./models_hdf5/RNN.hdf5",
    monitor="val_accuracy",
    save_weights_only=True,
    save_best_only=True,
)

In [17]:
# Train the model
model.fit(
    X_train_reshaped,
    y_train_shifted,
    epochs=10,
    batch_size=32,
    validation_data=(X_val_reshaped, y_val_shifted),
    callbacks=[checkpoint1],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x229d6bdc7f0>

In [19]:
# Restore the best checkpointed weights before scoring.
# NOTE(review): the checkpoint was selected on this same validation split
# (monitor="val_accuracy"), so the reported accuracy is likely optimistic.
model.load_weights("./models_hdf5/RNN.hdf5")

loss, accuracy = model.evaluate(x=X_val_reshaped, y=y_val_shifted)
print(f"Validation Loss: {loss}, Validation Accuracy: {accuracy}")

Validation Loss: 0.49415090680122375, Validation Accuracy: 0.8314606547355652


In [30]:
# LSTM variant: one LSTM layer with input dropout, two tanh dense layers and a
# softmax head. No input shape is declared, so the model is built on first fit.
model2 = Sequential(
    [
        LSTM(128, activation="tanh", dropout=0.5),
        Dense(68, activation="tanh"),
        Dense(32, activation="tanh"),
        Dense(num_classes, activation="softmax"),
    ]
)

# Adam with an explicit learning rate of 0.01.
model2.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    metrics=["accuracy"],
)

# Write the weights to disk only when validation accuracy improves.
checkpoint2 = ModelCheckpoint(
    "./models_hdf5/LSTM.hdf5",
    monitor="val_accuracy",
    save_weights_only=True,
    save_best_only=True,
)

In [31]:
# Train the model
model2.fit(
    X_train_reshaped,
    y_train_shifted,
    epochs=10,
    batch_size=32,
    validation_data=(X_val_reshaped, y_val_shifted),
    callbacks=[checkpoint2],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x22983d6dc10>

In [32]:
# Restore the best checkpointed weights before scoring.
# NOTE(review): the checkpoint was selected on this same validation split
# (monitor="val_accuracy"), so the reported accuracy is likely optimistic.
model2.load_weights("./models_hdf5/LSTM.hdf5")

loss, accuracy = model2.evaluate(x=X_val_reshaped, y=y_val_shifted)
print(f"Validation Loss: {loss}, Validation Accuracy: {accuracy}")

Validation Loss: 0.51282799243927, Validation Accuracy: 0.8213170766830444


In [33]:
# Read the unlabelled test reviews (the space in the file name is intentional).
file_path = "test _no_label.csv"
df_test = pd.read_csv(filepath_or_buffer=file_path)
df_test

Unnamed: 0,ID,review_description
0,1,اهنئكم على خدمه العملاء في المحادثه المباشره م...
1,2,ممتاز جدا ولكن اتمنى ان تكون هناك بعض المسابقا...
2,3,كل محملته يقول تم ايقاف حطيت2 عشان تسوون الخطاء
3,4,شغل طيب
4,5,بعد ماجربت
...,...,...
995,996,يستهل
996,997,خدمة سيئة بكل المعايير
997,998,لؤي٠٣٣٢لؤ٣٤٣س
998,999,تطبيق غير صادق ف خصم الكوبونات


In [34]:
# Keep the untouched text in its own column so this cell can be re-run safely:
# cleaning already-stemmed text a second time would silently change the
# features fed to the models. "review_description" still ends up holding the
# cleaned text, which is what the following cells read.
if "raw review_description" not in df_test.columns:
    df_test["raw review_description"] = df_test["review_description"]

df_test["review_description"] = df_test["raw review_description"].apply(text_cleaning)
df_test

Unnamed: 0,ID,review_description
0,1,هنئ خدم عملاء حدث بشر ما قصرو الل وفق يعط الف عفي
1,2,متز جدا ولكن منى ان تكون بعض سبق جئز طلب سعد ف...
2,3,حمل يقل تم يقف حطيت2 عشن تسو خطء
3,4,شغل طيب
4,5,اجرب
...,...,...
995,996,سهل
996,997,خدم سيئ بكل معايير
997,998,لؤي٠٣٣٢لؤ٣٤٣س
998,999,طبق غير صدق ف خصم كوبو


In [35]:
# Vectorise the cleaned test reviews with the already-fitted TF-IDF vocabulary,
# densify, then add the length-1 time axis the recurrent models were trained on.
X_test = tfidf.transform(df_test["review_description"]).toarray()
X_test_reshaped = np.expand_dims(X_test, axis=1)

In [36]:
# Per-class softmax scores for every test review, one row per review.
# model2 is the LSTM network and model is the SimpleRNN network.
y_pred_lstm = model2.predict(X_test_reshaped)
y_pred_rnn = model.predict(X_test_reshaped)



In [37]:
def return_prediction(y_pred):
    """Turn rows of class scores into rating labels.

    For each row the position of the largest score is looked up: position 2
    becomes rating 1, position 1 becomes rating 0 and any other position
    becomes rating -1. This undoes the ``+ 1`` shift applied to the labels
    before training.

    Parameters
    ----------
    y_pred : iterable of 1-D score vectors
        One vector of class scores per sample (e.g. the output of ``predict``).

    Returns
    -------
    list of int
        One rating (-1, 0 or 1) per input row, in the same order.
    """
    # Winning index -> rating; every index not listed here falls back to -1.
    rating_by_index = {2: 1, 1: 0}
    return [rating_by_index.get(int(np.argmax(scores)), -1) for scores in y_pred]


# Convert both models' score matrices into -1 / 0 / 1 rating lists.
prediction_lstm = return_prediction(y_pred_lstm)
prediction_rnn = return_prediction(y_pred_rnn)

# Both lists should have one entry per test review.
print(len(prediction_lstm), len(prediction_rnn))

1000 1000


In [38]:
# Save the LSTM predictions as an (ID, rating) table.
# to_csv does not create missing folders, so make sure ./results exists first;
# otherwise the export fails on a fresh checkout.
Path("./results").mkdir(parents=True, exist_ok=True)
results_df = pd.DataFrame({'ID': df_test['ID'], 'rating': prediction_lstm})
results_df.to_csv('./results/test_results_LSTM.csv', index=False)
results_df

Unnamed: 0,ID,rating
0,1,1
1,2,1
2,3,-1
3,4,1
4,5,-1
...,...,...
995,996,1
996,997,-1
997,998,1
998,999,-1


In [39]:
# Save the SimpleRNN predictions as an (ID, rating) table.
# to_csv does not create missing folders, so make sure ./results exists first;
# otherwise the export fails on a fresh checkout.
Path("./results").mkdir(parents=True, exist_ok=True)
results_df = pd.DataFrame({'ID': df_test['ID'], 'rating': prediction_rnn})
results_df.to_csv('./results/test_results_rnn.csv', index=False)
results_df

Unnamed: 0,ID,rating
0,1,1
1,2,1
2,3,-1
3,4,1
4,5,1
...,...,...
995,996,1
996,997,-1
997,998,1
998,999,-1


In [40]:
# Boolean mask: True where the LSTM and the SimpleRNN predict the same rating.
comparison_result = np.asarray(prediction_lstm) == np.asarray(prediction_rnn)
print("The equal values are : ", np.count_nonzero(comparison_result))

The equal values are :  910


In [41]:
# Positions where the two models disagree (a one-element tuple of index arrays).
not_equal_idx = np.nonzero(np.logical_not(comparison_result))
print(f"Indexes where the values are not equal: {not_equal_idx}")


Indexes where the values are not equal: (array([  4,  14,  20,  26,  36,  58,  80,  88, 103, 110, 119, 125, 127,
       180, 222, 228, 248, 258, 259, 261, 263, 264, 284, 295, 296, 312,
       320, 325, 331, 336, 360, 364, 381, 391, 398, 410, 427, 442, 459,
       481, 483, 490, 494, 499, 506, 508, 514, 517, 542, 569, 580, 591,
       592, 601, 607, 623, 626, 630, 635, 648, 658, 669, 673, 678, 696,
       709, 733, 754, 787, 788, 795, 802, 826, 831, 835, 847, 852, 857,
       872, 878, 893, 897, 899, 913, 920, 931, 935, 969, 970, 982],
      dtype=int64),)


In [None]:
# Show every review on which the models disagree, with both predicted ratings.
# Column position 1 of df_test is "review_description", which holds the
# cleaned text at this point.
for idx in not_equal_idx[0]:
    print(
        f"Text: {df_test.iloc[idx, 1]}, RNN: {prediction_rnn[idx]}, LSTM: {prediction_lstm[idx]}"
    )

Text: يا ليت وصل سلط عمن, RNN: 1, LSTM: -1
Text: طلب يتز وبج ماك شهر تمر عرب خله دي نور دنا نزل ا, RNN: 1, LSTM: -1
Text: طلب اخر وقت ومف تبع درة, RNN: 1, LSTM: -1
Text: رئع درج كبر تخل اني جوع ومف احد عشن يرح شري اقم اخذ جول طلب اول, RNN: 1, LSTM: -1
Text: بطل يجي عند رمز لام لما طلب ولا حدا برن شو شكل, RNN: 1, LSTM: -1
Text: فضل ليه افش حفظ نيا ركز رجء ضعه خطه, RNN: 1, LSTM: -1
Text: حدث, RNN: 1, LSTM: -1
Text: لو م هينفع حد سعد ف شكل بروموكود حجز رحل همسح رنمج, RNN: -1, LSTM: 1
Text: فضل رنمج ما ترد ما شهي, RNN: 1, LSTM: -1
Text: موعجييييييييييييبة ولا طلعو, RNN: -1, LSTM: 1
Text: اقم نجم لأن مايعرض طعم, RNN: 1, LSTM: -1
Text: طبق روووووعة قيم 100000 نجم, RNN: -1, LSTM: 1
Text: حاج طور نفس طبق لبد ان تكون شرط لجد على طعم نشر عرف دقق وجب وأض عدم همل صور وقع همل جعل طبق تخلف وأض علم ما فئد حدد وقع كان طعم خدم تصل وصف, RNN: 1, LSTM: -1
Text: طبق لسي هربو بس عيز ءكد نكم مش حرم, RNN: 1, LSTM: -1
Text: زعجتو بدع طفل وهذا الل خلا الغ تبع, RNN: -1, LSTM: 1
Text: طبق جمل بس كنت عيز عرف انا ن