In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import json
import requests
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import qalsadi.lemmatizer
import numpy as np
import fasttext
import fasttext.util
from sklearn.metrics import mean_squared_error

## Chargement de Donnees

In [2]:
# Sample Arabic text
df = pd.read_csv('textes.csv')
df.head()

Unnamed: 0,text
0,أكدت صحيفة واشنطن بوست اليوم الثلاثاء أن إسرائ...
1,وصل اليوم الثلاثاء، إلى مدينة قم المقدسة لدى ا...
2,يؤدي التعب والإرهاق الناتج عن الرحلات المتتالي...
3,شاعر ناهض النظام الناصري، ولم يهادن نظام الساد...
4,قال وزير الخارجية الأميركي أنتوني بلينكن -اليو...


## Score data with topic "Gaza"

In [3]:
texts = [text for text in df['text']]

query = " ".join(["غزة حماس", "إسرائيل", "الاحتلال", "الصراع", "القتلى", "الجرحى", "الهدنة", "المقاومة", "قصف", "ضربات", "مقاتلين"])


# Vectorize the Texts
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts + [query])

# Calculate Relevance Scores
query_vector = tfidf_matrix[-1]
text_vectors = tfidf_matrix[:-1]

# Compute cosine similarity between the query and each text
cosine_similarities = cosine_similarity(query_vector, text_vectors).flatten()

# Normalize Scores
normalized_scores = (cosine_similarities - cosine_similarities.min()) / (cosine_similarities.max() - cosine_similarities.min()) * 10

df['score']  = normalized_scores
df.head(6)

Unnamed: 0,text,score
0,أكدت صحيفة واشنطن بوست اليوم الثلاثاء أن إسرائ...,3.117674
1,وصل اليوم الثلاثاء، إلى مدينة قم المقدسة لدى ا...,0.0
2,يؤدي التعب والإرهاق الناتج عن الرحلات المتتالي...,0.0
3,شاعر ناهض النظام الناصري، ولم يهادن نظام الساد...,2.003642
4,قال وزير الخارجية الأميركي أنتوني بلينكن -اليو...,2.882009
5,أثار فيديو سقوط أحد الفلسطينيين أسفل عجلات شاح...,3.038653


## Pre-traitement des donnees 

In [4]:
def preprocessing(texte):

    # Supprimer les caractères spéciaux
    texte = re.sub(r'[^\w\s]', '', texte)

    # Supprimer les retours à la ligne
    texte = re.sub(r'\n', ' ', texte)

    # Supprimer les espaces doubles
    texte = re.sub(r'\s+', ' ', texte)

    return texte

## lemmatization

In [5]:
def lemmatization(text):
    lemmer = qalsadi.lemmatizer.Lemmatizer()
    return lemmer.lemmatize_text(text, return_pos=False)

## Stop Word

In [6]:
def stopWord(text):
    arabic_stopwords = set(stopwords.words('arabic'))
    filtered_tokens = [word for word in text if word not in arabic_stopwords]
    return filtered_tokens


In [7]:
def process_text(text):
    text = preprocessing(text).split()
    text = stopWord(text)
    return text

In [8]:
df['text_cleaned'] = df['text'].apply(process_text)

In [9]:
df.head()

Unnamed: 0,text,score,text_cleaned
0,أكدت صحيفة واشنطن بوست اليوم الثلاثاء أن إسرائ...,3.117674,"[أكدت, صحيفة, واشنطن, بوست, اليوم, الثلاثاء, إ..."
1,وصل اليوم الثلاثاء، إلى مدينة قم المقدسة لدى ا...,0.0,"[وصل, اليوم, الثلاثاء, مدينة, قم, المقدسة, الإ..."
2,يؤدي التعب والإرهاق الناتج عن الرحلات المتتالي...,0.0,"[يؤدي, التعب, والإرهاق, الناتج, الرحلات, المتت..."
3,شاعر ناهض النظام الناصري، ولم يهادن نظام الساد...,2.003642,"[شاعر, ناهض, النظام, الناصري, ولم, يهادن, نظام..."
4,قال وزير الخارجية الأميركي أنتوني بلينكن -اليو...,2.882009,"[قال, وزير, الخارجية, الأميركي, أنتوني, بلينكن..."


## Data embendded

In [10]:
ft = fasttext.load_model('cc.ar.300.bin\cc.ar.300.bin')

In [11]:
def vectorize_sentence(sentence):
    word_vectors = [ft.get_word_vector(word) for word in sentence]
    # Calculer la moyenne des vecteurs de mots
    sentence_vector = np.mean(word_vectors, axis=0)
    return sentence_vector


In [12]:
df['text_vectorized'] = df['text_cleaned'].apply(vectorize_sentence)

In [13]:
df.head()

Unnamed: 0,text,score,text_cleaned,text_vectorized
0,أكدت صحيفة واشنطن بوست اليوم الثلاثاء أن إسرائ...,3.117674,"[أكدت, صحيفة, واشنطن, بوست, اليوم, الثلاثاء, إ...","[0.012120141, 0.041061074, -0.016316786, 0.038..."
1,وصل اليوم الثلاثاء، إلى مدينة قم المقدسة لدى ا...,0.0,"[وصل, اليوم, الثلاثاء, مدينة, قم, المقدسة, الإ...","[-0.006007621, -0.0028243384, -0.0027309244, 0..."
2,يؤدي التعب والإرهاق الناتج عن الرحلات المتتالي...,0.0,"[يؤدي, التعب, والإرهاق, الناتج, الرحلات, المتت...","[-0.02321628, 0.0063483943, -0.011471126, 0.03..."
3,شاعر ناهض النظام الناصري، ولم يهادن نظام الساد...,2.003642,"[شاعر, ناهض, النظام, الناصري, ولم, يهادن, نظام...","[-0.0022376042, 0.028561369, 0.012915291, 0.04..."
4,قال وزير الخارجية الأميركي أنتوني بلينكن -اليو...,2.882009,"[قال, وزير, الخارجية, الأميركي, أنتوني, بلينكن...","[0.022290997, 0.023376264, -0.02258841, 0.0464..."


In [14]:
df.to_csv('textes_vectorization.csv', index=False)

In [15]:
y = df['score'].values
X = np.array(df['text_vectorized'].tolist())

In [16]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## RNN

In [18]:
class TextDataset(Dataset):
    def __init__(self, texts, scores):
        self.texts = [torch.tensor(text, dtype=torch.float32).unsqueeze(0) for text in texts]
        self.scores = torch.tensor(scores, dtype=torch.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.scores[idx]



In [19]:
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [20]:
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out


input_size = len(X_train[0])
hidden_size = 64
output_size = 1

model = SimpleRNN(input_size, hidden_size, output_size)


In [21]:
from torch import optim

In [22]:
criterion = nn.MSELoss()  # Si vous faites une régression, sinon utilisez CrossEntropyLoss pour la classification
optimizer = optim.Adam(model.parameters(), lr=0.001)


  from .autonotebook import tqdm as notebook_tqdm


In [23]:
num_epochs = 100
def train(train_loader,test_loader,model,num_epochs,optimizer,criterion):
    for epoch in range(num_epochs):
        model.train()
        for texts, scores in train_loader:
            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs.squeeze(), scores)
            loss.backward()
            optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    model.eval()
    with torch.no_grad():
        test_loss = 0
        for texts, scores in test_loader:
            outputs = model(texts)
            loss = criterion(outputs.squeeze(), scores)
            test_loss += loss.item()

        print(f'Test Loss: {test_loss / len(test_loader):.4f}')
    return outputs


In [24]:
outputs = train(train_loader,test_loader,model,100,optimizer,criterion)

Epoch [1/100], Loss: 18.7548
Epoch [2/100], Loss: 1.0405
Epoch [3/100], Loss: 1.4478
Epoch [4/100], Loss: 5.3831
Epoch [5/100], Loss: 1.3429
Epoch [6/100], Loss: 0.3651
Epoch [7/100], Loss: 0.3930
Epoch [8/100], Loss: 0.4668
Epoch [9/100], Loss: 0.6961
Epoch [10/100], Loss: 0.4449
Epoch [11/100], Loss: 16.6715
Epoch [12/100], Loss: 1.1875
Epoch [13/100], Loss: 0.4123
Epoch [14/100], Loss: 0.6041
Epoch [15/100], Loss: 4.3253
Epoch [16/100], Loss: 4.3701
Epoch [17/100], Loss: 0.7145
Epoch [18/100], Loss: 0.5718
Epoch [19/100], Loss: 1.1308
Epoch [20/100], Loss: 4.7605
Epoch [21/100], Loss: 1.7571
Epoch [22/100], Loss: 0.4038
Epoch [23/100], Loss: 0.6347
Epoch [24/100], Loss: 1.0951
Epoch [25/100], Loss: 4.1749
Epoch [26/100], Loss: 0.4867
Epoch [27/100], Loss: 0.3954
Epoch [28/100], Loss: 0.4173
Epoch [29/100], Loss: 0.2913
Epoch [30/100], Loss: 0.2755
Epoch [31/100], Loss: 0.3656
Epoch [32/100], Loss: 0.4857
Epoch [33/100], Loss: 0.4929
Epoch [34/100], Loss: 0.4178
Epoch [35/100], Loss:

In [25]:
mean_squared_error(outputs,y_test)

0.7233337499079836

## LSTM

In [26]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

In [27]:
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

In [28]:
model = Sequential()
model.add(LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1))

  super().__init__(**kwargs)


In [29]:
model.compile(optimizer='RMSProp', loss='mse')

In [30]:
history = model.fit(X_train, y_train, epochs=89, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/89
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 178ms/step - loss: 3.6723 - val_loss: 1.3951
Epoch 2/89
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 3.2950 - val_loss: 1.3188
Epoch 3/89
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 2.9765 - val_loss: 1.2534
Epoch 4/89
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 3.5816 - val_loss: 1.2122
Epoch 5/89
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 2.8352 - val_loss: 1.1635
Epoch 6/89
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 3.2330 - val_loss: 1.1414
Epoch 7/89
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 2.1663 - val_loss: 1.1000
Epoch 8/89
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - loss: 3.2346 - val_loss: 1.0749
Epoch 9/89
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [31]:
loss = model.evaluate(X_test, y_test)
print('Test loss:', loss)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 0.6205
Test loss: 0.6204981207847595


## GRU

In [32]:
import numpy as np
from keras.models import Sequential
from keras.layers import GRU, Dense

In [33]:
model = Sequential()
model.add(GRU(64, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1))

In [34]:
model.compile(optimizer='adam', loss='mse')

In [35]:
history = model.fit(X_train, y_train, epochs=66, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/66
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 177ms/step - loss: 3.0113 - val_loss: 1.3397
Epoch 2/66
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 3.7316 - val_loss: 1.2593
Epoch 3/66
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 3.7614 - val_loss: 1.1944
Epoch 4/66
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 2.6925 - val_loss: 1.1415
Epoch 5/66
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 2.3915 - val_loss: 1.0964
Epoch 6/66
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 2.5983 - val_loss: 1.0643
Epoch 7/66
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 3.3378 - val_loss: 1.0444
Epoch 8/66
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 3.2658 - val_loss: 1.0316
Epoch 9/66
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [36]:
loss = model.evaluate(X_test, y_test)
print('Test loss:', loss)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 0.6878
Test loss: 0.6877862215042114
