In [1]:
import sys
import os
# Put the parent of the working directory on the import path (at most once);
# presumably this is what lets the project-local imports below resolve.
path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(path) == 0:
    sys.path.append(path)
import pandas as pd
from constants import HAWAJEZ
from utils.helpers import is_talking_about_it



In [2]:
# Load the dataset for the hajez classifier from CSV.
data_path = "../data/data_for_hajez_classifier.csv"
df = pd.read_csv(data_path)

In [3]:
# Check null values.
# Count the gaps BEFORE imputing: filling first makes this report always show
# zero, which hides any real missing data.
null_counts = df.isnull().sum()

# Impute only the free-text column with an empty string. The flag columns are
# deliberately left untouched: writing '' into them would turn numeric columns
# into mixed-type object columns.
df['full_text'] = df['full_text'].fillna('')

null_counts


full_text                0
message_is_question      0
reply_is_question        0
is_giving_information    0
dtype: int64

In [4]:
# Preview the first row to confirm the columns loaded as expected.
df.head(1)

Unnamed: 0,full_text,message_is_question,reply_is_question,is_giving_information
0,الدي سي اوه مشاكل ؟؟,1,0,0


In [5]:
def get_hajez_name(row):
    """Return the hajez label for one dataset row.

    A row is labelled "no_hajez" when it gives no information
    (``is_giving_information == 0``) or when the message is a question
    (``message_is_question == 1``). Otherwise the label is the first entry of
    ``HAWAJEZ`` for which ``is_talking_about_it(row['full_text'], entry)`` is
    truthy, or "no_hajez" if no entry matches.
    """
    if row['is_giving_information'] == 0 or row["message_is_question"] == 1:
        return "no_hajez"
    # HAWAJEZ order decides which name wins when several match: first hit is returned.
    matches = (name for name in HAWAJEZ if is_talking_about_it(row['full_text'], name))
    return next(matches, "no_hajez")

In [6]:
# Derive the label for every row (row-wise apply) and store it as a new column.
row_labels = df.apply(get_hajez_name, axis="columns")
df['hajez_name'] = row_labels

In [7]:
# Show 10 random rows to spot-check the derived hajez_name labels.
# No random_state is passed, so a different set of rows appears on every run.
df.sample(10)

Unnamed: 0,full_text,message_is_question,reply_is_question,is_giving_information,hajez_name
31010,شو وضع يستسهار؟؟,1,0,0,no_hajez
7708,دير شرف سالكة؟,1,0,0,no_hajez
28376,حدا ياكدلنا دير شرف انزل؟ لهسه دير شرف سالكة,0,0,1,دير شرف
7999,حاجز حواره مسكر,0,0,1,حوارة
70809,شو وضع صره للخارج ازمة وتفتيش,0,1,1,صره
57029,حد عندو علم ايش اقتحام,0,1,1,no_hajez
72198,فش اشي,1,0,0,no_hajez
33268,اشي عزعتره واقفين جيش بنص شارع نمره ضفه بحولو ...,0,1,0,no_hajez
39041,لمربعة,0,0,0,no_hajez
63872,استنفار كبير لقوات الاحتلال حوارة,0,0,0,no_hajez


In [8]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Concatenate, Input, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model

In [9]:
# Tokenize the Arabic text: learn the vocabulary from every message, then map
# each message to its sequence of integer word ids.
texts = df['full_text']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

In [10]:
# Pad every sequence to the length of the longest one
max_sequence_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

In [11]:
# Display the padded sequence length (token count of the longest message).
max_sequence_length

205

In [71]:
# Binary flag columns that are fed to the model alongside the text
flag_columns = ['message_is_question', 'reply_is_question', 'is_giving_information']
binary_features = df[flag_columns].values

# Target: one-hot encoding of the hajez label, one column per distinct label
hajez_one_hot = pd.get_dummies(df['hajez_name'])
target = hajez_one_hot.values

In [72]:
# Split the data into training and testing sets (80% / 20%).
# A fixed seed keeps the partition -- and every metric computed on it --
# reproducible across kernel restarts; without it each run evaluates on a
# different test set.
SPLIT_SEED = 42
X_train_text, X_test_text, X_train_binary, X_test_binary, y_train, y_test = train_test_split(
    padded_sequences, binary_features, target, test_size=0.2, random_state=SPLIT_SEED
)

In [13]:
# RNN model hyper-parameters
num_classes = target.shape[1]      # one output unit per one-hot label column
vocab_size = 1 + len(word_index)   # one slot more than the number of known words
embedding_dim = 100


In [14]:
# Input layers: padded token ids for the text branch, three flags for the binary branch
text_input = Input(
    name='text_input',
    shape=(max_sequence_length,),
    dtype='int32',
)
binary_input = Input(
    name='binary_input',
    shape=(3,),
    dtype='float32',
)

In [15]:
# Text branch: embedding followed by two stacked bidirectional LSTMs.
embedding_layer = Embedding(vocab_size, embedding_dim, input_length=max_sequence_length)
text_embed = embedding_layer(text_input)

# First recurrent layer keeps the full sequence so the second one can consume it.
sequence_encoder = Bidirectional(LSTM(256, return_sequences=True))
text_lstm = sequence_encoder(text_embed)

# Second recurrent layer returns only its final output.
summary_encoder = Bidirectional(LSTM(256))
text_lstm2 = summary_encoder(text_lstm)

In [16]:
# Join the text encoding with the binary flags into a single feature vector
branches = [text_lstm2, binary_input]
merged = Concatenate()(branches)

In [17]:
# Output head: dropout on the merged features, then a softmax over the label classes
dropout = Dropout(rate=0.5)(merged)
output = Dense(units=num_classes, activation='softmax')(dropout)

In [18]:
# Assemble the two-input functional model
model_inputs = [text_input, binary_input]
model = Model(inputs=model_inputs, outputs=output)

In [19]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [20]:
# Train the model; the held-out split is passed as validation data.
train_inputs = {'text_input': X_train_text, 'binary_input': X_train_binary}
validation_inputs = {'text_input': X_test_text, 'binary_input': X_test_binary}

model.fit(
    train_inputs,
    y_train,
    batch_size=1024,
    epochs=20,
    validation_data=(validation_inputs, y_test),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x22f8971b408>

In [21]:
# Persist the trained model and the tokenizer fitted above; both are needed
# together to run predictions later.
import os
import joblib

# Create the output directory first so the writes below do not fail when it is missing.
os.makedirs('../models', exist_ok=True)

# save the model
model.save('../models/hajez_classifier.h5')
# save the tokenizer
joblib.dump(tokenizer, '../models/hajez_tokenizer.pkl')


['../models/hajez_tokenizer.pkl']

In [73]:
# Hand-written sample used to try out the trained model.
sample_text = "  سالك سيبسي بسيب سيب سيبسش صرة"
sample_message_is_question, sample_reply_is_question, sample_is_giving_information = 0, 0, 1

# Text branch input: same tokenizer and padding length as used for training
sample_sequence = tokenizer.texts_to_sequences([sample_text])
sample_padded_sequence = pad_sequences(sample_sequence, maxlen=max_sequence_length)

# Binary branch input: a single row holding the three flags
sample_flags = [sample_message_is_question, sample_reply_is_question, sample_is_giving_information]
sample_binary_features = np.array([sample_flags])

# Class scores for the sample, and the position of the highest one
prediction = model.predict([sample_padded_sequence, sample_binary_features])
predicted_index = np.argmax(prediction)

# Class positions follow the column order of the one-hot encoding of hajez_name
label_columns = pd.get_dummies(df['hajez_name']).columns
index_to_category = dict(enumerate(label_columns))
predicted_category = index_to_category[predicted_index]

print("Predicted category:", predicted_category)


Predicted category: صرة


In [12]:
# Map each class position to its label, following the one-hot column order of hajez_name
index_to_category = dict(enumerate(pd.get_dummies(df['hajez_name']).columns))


In [13]:
# Display the class-index -> label mapping.
index_to_category

{0: 'no_hajez',
 1: 'الحمرا',
 2: 'الطنيب',
 3: 'العيزرية',
 4: 'الفندق',
 5: 'الكونتينر',
 6: 'المربعه',
 7: 'بيت ايل',
 8: 'بيت فوريك',
 9: 'بيتا',
 10: 'تل',
 11: 'جبع',
 12: 'جيت',
 13: 'حزما',
 14: 'حوارة',
 15: 'حومش',
 16: 'دي سي او',
 17: 'دير شرف',
 18: 'زعترة',
 19: 'سلمان',
 20: 'شافي شمرون',
 21: 'صرة',
 22: 'صره',
 23: 'عراق بورين',
 24: 'عناتا',
 25: 'عورتا',
 26: 'عوفرا',
 27: 'عين يبرود',
 28: 'قرني شمرون',
 29: 'قلنديا',
 30: 'كدوميم ',
 31: 'يتسهار'}