In [2]:
import fasttext
import fasttext.util
fasttext.util.download_model('ar', if_exists='ignore')  # 
ft = fasttext.load_model('cc.ar.300.bin')

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz


In [7]:
import fasttext

# Load the binary model
model = fasttext.load_model('cc.ar.300.bin')

# Get vector for a word
word_vector = model.get_word_vector("كلمة")  # Replace with any Arabic word

print(word_vector)


[ 1.15856072e-02 -4.86626327e-02  2.67462805e-02  6.43483251e-02
  1.54979438e-01 -4.81307730e-02  3.12162098e-02  2.80798487e-02
  2.06149798e-02 -1.22332178e-01  3.65159474e-03 -6.15036115e-02
  2.28887852e-02  1.02174781e-01  6.18360899e-02  6.00447953e-02
  4.79197428e-02  4.96810079e-02 -1.48373721e-02  8.60449523e-02
 -1.20737165e-01  5.29689640e-02 -1.19802587e-01  1.51716378e-02
  1.61436141e-01  2.32068747e-02 -6.02198057e-02  1.45112768e-01
 -5.14922440e-02  6.14693314e-02 -6.02710247e-02  2.02061757e-02
 -4.28374074e-02  3.94743681e-02 -3.56632471e-02 -3.01022232e-02
 -3.54948491e-02 -2.08179727e-02  9.90509242e-02  2.57957517e-03
  2.83871852e-02  1.56411771e-02  2.99154036e-02  2.48067882e-02
  4.78997752e-02  7.03856200e-02 -1.06203228e-01 -5.09965234e-02
  1.31247099e-04  4.30895165e-02  1.59014329e-01  1.01993106e-01
  4.66434434e-02  1.46038048e-02 -1.33927315e-01 -1.99756008e-02
  5.91958985e-02  1.10708416e-01 -7.65340626e-02 -3.63989249e-02
  9.87872705e-02 -1.08713

In [8]:
len(model.get_words())

2000000

In [9]:
import random 
import numpy as np
import torch
import tensorflow as tf 
seed = 42
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")    

In [10]:
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    # Lowercase
    text = text.lower()
    # Remove links
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove usernames
    text = re.sub(r'@\w+', '', text)

    # Replace underscores with space
    text = text.replace('_', ' ')

    
    # Remove hashtag symbol but keep the word
    text = re.sub(r'#', '', text)

    # Remove emojis (basic unicode emoji pattern)
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\u2600-\u26FF"          # miscellaneous symbols
                               u"\u2700-\u27BF"          # dingbats
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # Remove numbers and punctuations
    del_chars = string.punctuation + "0123456789" + "،؛؟ـ“”"
    text = ''.join(char for char in text if char not in del_chars)

    # Strip extra whitespace
    text = text.strip()

    # Tokenization and stopword removal
    stop_words = set(stopwords.words('arabic'))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    return ' '.join(tokens)

In [12]:
import pandas as pd

# read the CSV file arabic sentiment analysis dataset
df = pd.read_csv('./data/Arabic Sentiment Analysis Dataset - SS2030.csv',sep=";", encoding='utf-8', on_bad_lines='skip')

df

Unnamed: 0,text,Sentiment
0,حقوق المرأة 💚💚💚 https://t.co/Mzf90Ta5g1,1
1,RT @___IHAVENOIDEA: حقوق المرأة في الإسلام. ht...,1
2,RT @saud_talep: Retweeted لجنة التنمية بشبرا (...,1
3,RT @MojKsa: حقوق المرأة التي تضمنها لها وزارة ...,1
4,RT @abm112211: ولي امر الزوجة او ولي الزوجة او...,1
...,...,...
4247,#غرد_بحبك_لمحمد_بن_سلمان ❤️,1
4248,#غرد_بحبك_لمحمد_بن_سلمان \n محمدبن سلمان احبه ...,1
4249,#غرد_بحبك_لمحمد_بن_سلمان \n الله يحفظك يا ذخر ...,1
4250,#غرد_بحبك_لمحمد_بن_سلمان \n \n الله يحفظه ويحم...,1


In [13]:
# Apply it to your dataframe
df['cleaned_text'] = df['text'].apply(preprocess_text)

In [14]:
X = df['cleaned_text'].values
y = df['Sentiment'].values

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_padding = pad_sequences(X_seq,maxlen=128 ,padding='post')
X_padding.shape

(4252, 128)

In [16]:
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 23556


In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test_val, y_train, y_test_val = train_test_split(X_padding, y, test_size=0.3, random_state=seed, stratify=y)
X_test,X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=seed, stratify=y_test_val)

In [18]:
embedding_dim = 300 

embedding_matrix = np.zeros((vocab_size, embedding_dim))

In [19]:
word_index = tokenizer.word_index
for word, i in word_index.items():
    embedding_vector = model.get_word_vector(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [20]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout
from keras.losses import BinaryCrossentropy

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=300,weights= embedding_matrix,trainable= False ,input_length=128))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Assuming binary classification, change to 'softmax' for multi-class

# Compile the model
model.compile(optimizer='adam',
              loss=BinaryCrossentropy(),
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(X_val, y_val))




Epoch 1/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 103ms/step - accuracy: 0.6448 - loss: 0.6275 - val_accuracy: 0.8229 - val_loss: 0.3797
Epoch 2/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 93ms/step - accuracy: 0.8001 - loss: 0.4134 - val_accuracy: 0.8370 - val_loss: 0.4574
Epoch 3/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 91ms/step - accuracy: 0.8159 - loss: 0.3987 - val_accuracy: 0.8730 - val_loss: 0.3321
Epoch 4/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 92ms/step - accuracy: 0.8376 - loss: 0.3585 - val_accuracy: 0.8730 - val_loss: 0.3015
Epoch 5/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 91ms/step - accuracy: 0.8700 - loss: 0.3157 - val_accuracy: 0.8777 - val_loss: 0.3035
Epoch 6/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 87ms/step - accuracy: 0.8642 - loss: 0.3048 - val_accuracy: 0.8793 - val_loss: 0.2983
Epoch 7/10
[1m93/93[0m [32m━━

In [23]:
eval = model.evaluate(X_test, y_test)
print(f"Test Loss: {eval[0]}, Test Accuracy: {eval[1]}")

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.8353 - loss: 0.4012
Test Loss: 0.4560665488243103, Test Accuracy: 0.8213165998458862


In [24]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout
from keras.losses import BinaryCrossentropy

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=300,weights= embedding_matrix,trainable= True ,input_length=128))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Assuming binary classification, change to 'softmax' for multi-class

# Compile the model
model.compile(optimizer='adam',
              loss=BinaryCrossentropy(),
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(X_val, y_val))


Epoch 1/10




[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 155ms/step - accuracy: 0.6749 - loss: 0.5805 - val_accuracy: 0.8339 - val_loss: 0.3604
Epoch 2/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 168ms/step - accuracy: 0.9221 - loss: 0.2169 - val_accuracy: 0.8730 - val_loss: 0.3815
Epoch 3/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 156ms/step - accuracy: 0.9812 - loss: 0.0679 - val_accuracy: 0.8966 - val_loss: 0.3895
Epoch 4/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 168ms/step - accuracy: 0.9936 - loss: 0.0240 - val_accuracy: 0.8918 - val_loss: 0.4053
Epoch 5/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 155ms/step - accuracy: 0.9964 - loss: 0.0104 - val_accuracy: 0.8887 - val_loss: 0.4774
Epoch 6/10
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 143ms/step - accuracy: 0.9982 - loss: 0.0070 - val_accuracy: 0.8856 - val_loss: 0.5279
Epoch 7/10
[1m93/93[0m [32m━━━

In [27]:
eval = model.evaluate(X_test, y_test)
print(f"Test Loss: {round(eval[1],5)}, Test Accuracy: {round(eval[1],2)}")

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.8580 - loss: 0.8661
Test Loss: 0.84639, Test Accuracy: 0.85
