In [5]:
PATH = r'df_preprocces.csv'

In [6]:
#Load libraries 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
import re
from tensorflow.keras.callbacks import EarlyStopping





In [10]:
# normalize some char
def normalize_arabic(text):
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    return text

In [11]:
#Read dataframe
df = pd.read_csv(PATH)

In [13]:
df['name'] = df['name'].apply(lambda x: normalize_arabic(x)) #apply normalize

In [14]:
df.shape

(31734, 3)

In [15]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,name,status
13089,9186,عدي مبارك زين,1.0
19501,29692,عشني نصر علاء ادين,0.0
21425,3377,سبيعه سعيد جمال,1.0
5001,28853,خزسنه ناهيه هاءي,0.0
21796,23235,نهاف ائاويد عبدالرحمن,0.0
26771,26459,خلف اشبم وهب,0.0
29226,20476,سفيان بمي عباده,0.0
17960,27148,ايهم مجاهد اخلقص,0.0
17078,27314,مقك ميساء وسيمظ,0.0
8479,18273,سفيهن عطا ناجده,0.0


In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    df['name'], df['status'], test_size=0.15, random_state=1000) #Split train and test

In [17]:
#preprocces
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df['name'].values)

X_train_tok = tokenizer.texts_to_sequences(X_train)
X_test_tok = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

print(X_train_tok[2])
print(X_train[2])

[683, 707, 522]
عليان روكن جليل


In [18]:
maxlen = 3
X_train = pad_sequences(X_train_tok, padding='post',maxlen=maxlen,truncating='post')
X_test = pad_sequences(X_test_tok, padding='post', maxlen=maxlen,truncating='post')
vocab_size

20636

In [19]:
embedding_dim = 32

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(LSTM(128, activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(.4))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(.5))


model.add(layers.Dense(1, activation='sigmoid'))




model_checkpoint_callback = ModelCheckpoint(
    save_weights_only=False,
    monitor='val_loss',
    mode='max',
    save_best_only=True,
    filepath= 'name.h5')

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 32)             660352    
                                                                 
 lstm (LSTM)                 (None, 128)               82432     
                                                                 
 flatten (Flatten)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 128)               8320      
                                                                 
 dropout_1 (Dropout)         (None, 128)               0

In [None]:
history = model.fit(X_train, y_train,
                    epochs=10,
                    validation_data=(X_test, y_test),
                    batch_size=32,
                    callbacks=[model_checkpoint_callback])
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print(f"Training Accuracy: {accuracy}")
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print(f"Testing Accuracy:  {accuracy}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

In [None]:
# Saving the objects:
with open('tokenizer.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump(tokenizer, f)