# Load FAQs Dataset

In [40]:
import pandas as pd
import os

Questions_PATH="./"

def load_dataset(dataset_path=Questions_PATH):
    csv_path = os.path.join(dataset_path, "faqs.csv")
    return pd.read_csv(csv_path)

data_original = load_dataset()
data = load_dataset()

# Dataset Preprocessing - Input

In [42]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk import word_tokenize
from nltk.stem.isri import ISRIStemmer
from nltk.corpus import stopwords
from string import punctuation

data['stemmed_question'] = ''

# Define stemmer
stemmer = ISRIStemmer()

# Define stopwords
stopwords = set(stopwords.words('arabic') + list(punctuation))

# Prepare data with stemming and remove stopwords
def preprocess(text):
    words = word_tokenize(str(text))
    words = [word for word in words if word not in stopwords]
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)

for index, row in data.iterrows():
    data['stemmed_question'][index] = preprocess(row['question'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Dataset Preprocessing - Output

In [43]:
from sklearn.preprocessing import OneHotEncoder

label_cat = data[['answer']]

cat_encoder = OneHotEncoder()
label_cat_1hot = cat_encoder.fit_transform(label_cat).toarray()
label_cat_1hot

array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])

# Shuffle and Split into train and test sets

In [44]:
import numpy as np

def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

sentences = data['stemmed_question'].to_numpy()
X, Y = unison_shuffled_copies(sentences, label_cat_1hot)

training_size = int(len(X) * 0.8)
train_X, test_X = X[:training_size], X[training_size:]
train_Y, test_Y = Y[:training_size], Y[training_size:]

print(len(train_X))
print(len(train_Y))

print(len(test_X))
print(len(test_Y))

8
8
3
3


# Sequening and Padding

In [45]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define tokenizer
oov_tok = "<OOV>"
tokenizer = Tokenizer(oov_token=oov_tok)

# Fit tokenizer on data
tokenizer.fit_on_texts(X)
word_index = tokenizer.word_index

# Convert data to sequences
train_X_seq = tokenizer.texts_to_sequences(train_X)
test_X_seq = tokenizer.texts_to_sequences(test_X)

# Pad sequences
trunc_type='post'
padding_type='post'
max_length = 100   #max([len(seq) for seq in sequences])

train_X_pad = pad_sequences(train_X_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)
test_X_pad = pad_sequences(test_X_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(test_X)
print(test_X_seq)
print(train_X)
print(test_X_pad)

['قيم قبل الي شرك نصة' 'الب سوق توفر نصة'
 'يمكن سجل كثر حسب نفس شخص علي نصة']
[[20, 21, 22, 23, 2], [24, 25, 26, 2], [3, 6, 27, 7, 28, 29, 4, 2]]
['يمك طلع علي عمل سبق' 'نصة' 'يمكن سجل ورد علي نصة' 'يمكن طلع فصل حسب'
 'علن' 'خبر' 'يمكن سجل حسب بدن رقم هتف' 'يمكن طلع علي فصل وثق طبع']
[[20 21 22 23  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]
 [24 25 26  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]
 [ 3  6 27  7 28 29  4  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  

# Build Deep Learning Model

In [48]:
import tensorflow as tf
from tensorflow.keras.regularizers import l2
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.optimizers import Adamax


# Define model architecture
out_dim = 64
lstm_dim = 64
dense_dim = 64
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=100, output_dim=out_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_dim)),
    tf.keras.layers.Dense(dense_dim, activation='relu'),
    tf.keras.layers.Dense(11, activation='softmax')
])

# Compile model
opt = Adamax(learning_rate=0.03, beta_1=0.8, beta_2=0.9999)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 100, 64)           6400      
                                                                 
 bidirectional_10 (Bidirecti  (None, 128)              66048     
 onal)                                                           
                                                                 
 dense_20 (Dense)            (None, 64)                8256      
                                                                 
 dense_21 (Dense)            (None, 11)                715       
                                                                 
Total params: 81,419
Trainable params: 81,419
Non-trainable params: 0
_________________________________________________________________


# Train & Validate Model

In [49]:
# Define hyperparameters
batch_size = 64
epochs = 100

# Train model
model.fit(train_X_pad, train_Y, batch_size=batch_size, epochs=epochs)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f3b019cdac0>

# Testing

In [51]:
sen = preprocess('ما هي المنصة')
train_X_seq = tokenizer.texts_to_sequences([sen])
train_X_pad = pad_sequences(train_X_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(train_X_pad)
result = model.predict(train_X_pad)
print(result)
print(cat_encoder.categories_[0][result.argmax()])
print(result.max())

[[2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[[1.2645840e-15 4.7119960e-12 1.9210051e-09 5.1304269e-12 1.9900085e-08
  6.0202225e-11 5.2022700e-05 9.9988377e-01 2.3839291e-07 6.3859712e-05
  5.8871339e-09]]
هي بوابة رقمية تحتوى علي العديد من الخدمات الالكترونية
0.9998838
