In [169]:
import pandas as pd
import os
import nltk
nltk.download('stopwords')
from nltk.stem.isri import ISRIStemmer
from nltk import word_tokenize
from string import punctuation

# Load FAQs
Questions_PATH="./"

def load_dataset(dataset_path=Questions_PATH):
    csv_path = os.path.join(dataset_path, "faqs.csv")
    return pd.read_csv(csv_path)

data_original = load_dataset()
data = load_dataset()

# Define stemmer
stemmer = ISRIStemmer()

# Define stopwords
stopwords = set(nltk.corpus.stopwords.words('arabic') + list(punctuation))

# Prepare data with stemming and remove stopwords
def preprocess(text):
    words = word_tokenize(str(text))
    words = [word for word in words if word not in stopwords]
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)

for index, row in data.iterrows():
    data['question'][index] = preprocess(row['question'])
    data['answer'][index] = preprocess(row['answer'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [170]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l2
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.optimizers import Adamax

# Define tokenizer
tokenizer = Tokenizer()

# Fit tokenizer on data
tokenizer.fit_on_texts(data)

# Convert data to sequences
sequences = tokenizer.texts_to_sequences(data)

# Pad sequences
trunc_type='post'
padding_type='post'
max_len = max([len(seq) for seq in sequences])
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)

# Define model architecture
out_dim = 64
lstm_dim = 64
dense_dim = 64
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=out_dim, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_dim)),
    tf.keras.layers.Dense(dense_dim, activation='relu'),
    tf.keras.layers.Dense(1, activation='softmax')
])

# Compile model
opt = Adamax(learning_rate=0.03, beta_1=0.8, beta_2=0.9999)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

# Define hyperparameters
batch_size = 64
epochs = 40

# Train model
model.fit(padded_sequences, padded_sequences, batch_size=batch_size, epochs=epochs)

Model: "sequential_69"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_69 (Embedding)    (None, 1, 64)             192       
                                                                 
 bidirectional_69 (Bidirecti  (None, 128)              66048     
 onal)                                                           
                                                                 
 dense_138 (Dense)           (None, 64)                8256      
                                                                 
 dense_139 (Dense)           (None, 1)                 65        
                                                                 
Total params: 74,561
Trainable params: 74,561
Non-trainable params: 0
_________________________________________________________________
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoc

<keras.callbacks.History at 0x7fae2b2b1b80>

In [172]:
sen = preprocess('يعني ايه اعلانات')
train_X_seq = tokenizer.texts_to_sequences([sen])
train_X_pad = tf.keras.preprocessing.sequence.pad_sequences(train_X_seq, maxlen=max_len, padding=padding_type, truncating=trunc_type)

result = model.predict(train_X_pad)
print(result)
print(data_original.answer[result.argmax()])


[[1.]]
هي بوابة رقمية تحتوى علي العديد من الخدمات الالكترونية
