In [1]:
import random 
import numpy as np
import torch
import tensorflow as tf 
seed = 42
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")    

In [2]:
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    # Lowercase
    text = text.lower()
    # Remove links
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove usernames
    text = re.sub(r'@\w+', '', text)

    # Replace underscores with space
    text = text.replace('_', ' ')

    
    # Remove hashtag symbol but keep the word
    text = re.sub(r'#', '', text)

    # Remove emojis (basic unicode emoji pattern)
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\u2600-\u26FF"          # miscellaneous symbols
                               u"\u2700-\u27BF"          # dingbats
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # Remove numbers and punctuations
    del_chars = string.punctuation + "0123456789" + "،؛؟ـ“”"
    text = ''.join(char for char in text if char not in del_chars)

    # Strip extra whitespace
    text = text.strip()

    # Tokenization and stopword removal
    stop_words = set(stopwords.words('arabic'))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    return ' '.join(tokens)

In [4]:
import pandas as pd

# read the CSV file arabic sentiment analysis dataset
df = pd.read_csv('./data/Arabic Sentiment Analysis Dataset - SS2030.csv',sep=";", encoding='utf-8', on_bad_lines='skip')

df

Unnamed: 0,text,Sentiment
0,حقوق المرأة 💚💚💚 https://t.co/Mzf90Ta5g1,1
1,RT @___IHAVENOIDEA: حقوق المرأة في الإسلام. ht...,1
2,RT @saud_talep: Retweeted لجنة التنمية بشبرا (...,1
3,RT @MojKsa: حقوق المرأة التي تضمنها لها وزارة ...,1
4,RT @abm112211: ولي امر الزوجة او ولي الزوجة او...,1
...,...,...
4247,#غرد_بحبك_لمحمد_بن_سلمان ❤️,1
4248,#غرد_بحبك_لمحمد_بن_سلمان \n محمدبن سلمان احبه ...,1
4249,#غرد_بحبك_لمحمد_بن_سلمان \n الله يحفظك يا ذخر ...,1
4250,#غرد_بحبك_لمحمد_بن_سلمان \n \n الله يحفظه ويحم...,1


In [5]:
# Apply it to your dataframe
df['cleaned_text'] = df['text'].apply(preprocess_text)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4252 entries, 0 to 4251
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   text          4252 non-null   object
 1   Sentiment     4252 non-null   int64 
 2   cleaned_text  4252 non-null   object
dtypes: int64(1), object(2)
memory usage: 99.8+ KB


In [7]:
df

Unnamed: 0,text,Sentiment,cleaned_text
0,حقوق المرأة 💚💚💚 https://t.co/Mzf90Ta5g1,1,حقوق المرأة
1,RT @___IHAVENOIDEA: حقوق المرأة في الإسلام. ht...,1,rt حقوق المرأة الإسلام
2,RT @saud_talep: Retweeted لجنة التنمية بشبرا (...,1,rt retweeted لجنة التنمية بشبرا زال التسجيل مس...
3,RT @MojKsa: حقوق المرأة التي تضمنها لها وزارة ...,1,rt حقوق المرأة تضمنها وزارة العدل
4,RT @abm112211: ولي امر الزوجة او ولي الزوجة او...,1,rt ولي امر الزوجة او ولي الزوجة او ولي المراة ...
...,...,...,...
4247,#غرد_بحبك_لمحمد_بن_سلمان ❤️,1,غرد بحبك لمحمد بن سلمان
4248,#غرد_بحبك_لمحمد_بن_سلمان \n محمدبن سلمان احبه ...,1,غرد بحبك لمحمد بن سلمان محمدبن سلمان احبه الله...
4249,#غرد_بحبك_لمحمد_بن_سلمان \n الله يحفظك يا ذخر ...,1,غرد بحبك لمحمد بن سلمان الله يحفظك ذخر الوطن و...
4250,#غرد_بحبك_لمحمد_بن_سلمان \n \n الله يحفظه ويحم...,1,غرد بحبك لمحمد بن سلمان الله يحفظه ويحميه ويقو...


In [8]:
X = df['cleaned_text'].values
y = df['Sentiment'].values


In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_padding = pad_sequences(X_seq,maxlen=128 ,padding='post')
X_padding.shape

(4252, 128)

In [10]:
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 23556


In [16]:
import numpy as np

np.unique(y)

array([0, 1], dtype=int64)

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test_val, y_train, y_test_val = train_test_split(X_padding, y, test_size=0.3, random_state=seed, stratify=y)
X_test,X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=seed, stratify=y_test_val)

In [19]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout
from keras.losses import BinaryCrossentropy

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=300, input_length=128))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Assuming binary classification, change to 'softmax' for multi-class

# Compile the model
model.compile(optimizer='adam',
              loss=BinaryCrossentropy(),
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(X_val, y_val))


Epoch 1/10




[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 150ms/step - accuracy: 0.6483 - loss: 0.6152 - val_accuracy: 0.8511 - val_loss: 0.3420
Epoch 2/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 143ms/step - accuracy: 0.9286 - loss: 0.1956 - val_accuracy: 0.8746 - val_loss: 0.4128
Epoch 3/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 141ms/step - accuracy: 0.9927 - loss: 0.0279 - val_accuracy: 0.8715 - val_loss: 0.5470
Epoch 4/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 143ms/step - accuracy: 0.9979 - loss: 0.0095 - val_accuracy: 0.8793 - val_loss: 0.6164
Epoch 5/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 147ms/step - accuracy: 0.9939 - loss: 0.0158 - val_accuracy: 0.8809 - val_loss: 0.5807
Epoch 6/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 148ms/step - accuracy: 0.9953 - loss: 0.0193 - val_accuracy: 0.8840 - val_loss: 0.5288
Epoch 7/10
[1m93/93[0m [32m━━━

In [20]:
eval =model.evaluate(X_test, y_test)
print(f"Test Loss: {eval[0]}, Test Accuracy: {eval[1]}")

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.8458 - loss: 1.0129
Test Loss: 1.0796666145324707, Test Accuracy: 0.8354231715202332
