In [32]:
# CSV file that pd.read_csv loads into the working DataFrame.
PATH = "df_preprocces.csv"

In [33]:
#Load libraries 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
import re
from tensorflow.keras.callbacks import EarlyStopping





In [34]:
# normalize some char
def normalize_arabic(text):
    """Collapse common Arabic spelling variants into one canonical letter.

    The substitutions are applied in order:
      * every character in the class [إأآا] becomes ا
      * ى becomes ي
      * ة becomes ه
      * گ becomes ك

    Args:
        text: The string to normalize.

    Returns:
        A new string with the substitutions applied.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [35]:
# Read the CSV at PATH into the working DataFrame
df = pd.read_csv(filepath_or_buffer=PATH)

In [36]:
# Run normalize_arabic over every value of the 'name' column and store the result back
df['name'] = df['name'].apply(normalize_arabic)

In [37]:
# Display the (rows, columns) size of the loaded frame
df.shape

(31734, 3)

In [38]:
# Display 10 randomly chosen rows; no random_state is passed, so the rows differ between runs
df.sample(10)

Unnamed: 0.1,Unnamed: 0,name,status
13934,30725,طءهر ساعد ناجده,0.0
6169,22509,مئمون عاليه نشمي,0.0
26351,12744,ساره جمعان صدام,1.0
12205,26044,نزيه ايثم لوث,0.0
23176,21946,حفصه كذاري انبهاج,0.0
22770,16643,ديمه منههي هطير,0.0
23397,15312,نصر اغلب زيدان,1.0
27541,17759,وليفه رئيف بطاح,0.0
22376,17867,تيسير برج زكيه,0.0
2360,26784,مبخلت خطاب المثني,0.0


In [39]:
# Split names (features) and status (labels) into train and test parts:
# 15% of the rows go to the test part, and the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['name'],
    df['status'],
    test_size=0.15,
    random_state=1000,
)

In [40]:
# Preprocess: turn every name into a sequence of integer word ids.
# NOTE: the tokenizer is fitted on all of df['name'] (train and test rows together),
# so the vocabulary also contains the words that only occur in the test split.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df['name'].values)

X_train_tok = tokenizer.texts_to_sequences(X_train)
X_test_tok = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

# Show one training sample in both forms (token ids, then raw text).
# X_train is a Series that still carries the shuffled DataFrame index labels, so it
# has to be indexed by position (.iloc) to line up with the positional X_train_tok
# list. X_train[2] would look up index *label* 2 instead: a different row, and a
# KeyError whenever that label ended up in the test split.
print(X_train_tok[2])
print(X_train.iloc[2])

[683, 707, 522]
عليان روكن جليل


In [41]:
# Bring every token sequence to exactly `maxlen` entries, padding and truncating
# at the end ('post'). X_train / X_test are rebound from the text Series to the
# padded arrays here.
maxlen = 3
pad_options = dict(maxlen=maxlen, padding='post', truncating='post')
X_train = pad_sequences(X_train_tok, **pad_options)
X_test = pad_sequences(X_test_tok, **pad_options)
vocab_size

20636

In [42]:
# Model architecture:
# embedding -> LSTM -> flatten -> two dense/dropout stages -> single sigmoid output

embedding_dim = 32

model = Sequential([
    layers.Embedding(input_dim=vocab_size,
                     output_dim=embedding_dim,
                     input_length=maxlen),
    LSTM(128, activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(.4),
    layers.Dense(128, activation='relu'),
    layers.Dropout(.5),
    layers.Dense(1, activation='sigmoid'),
])




# Checkpoint the full model (not just the weights) to 'name.h5' whenever the
# monitored metric improves. The metric is a loss, so "improves" means "gets
# smaller": mode must be 'min'. With mode='max' the callback only overwrote the
# file when the validation loss got worse, i.e. it kept the worst model.
model_checkpoint_callback = ModelCheckpoint(
    filepath='name.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False)

# Configure training (Adam optimizer, binary cross-entropy loss, accuracy metric)
# and print the layer-by-layer summary.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 3, 32)             660352    
                                                                 
 lstm_3 (LSTM)               (None, 128)               82432     
                                                                 
 flatten_3 (Flatten)         (None, 128)               0         
                                                                 
 dense_9 (Dense)             (None, 64)                8256      
                                                                 
 dropout_6 (Dropout)         (None, 64)                0         
                                                                 
 dense_10 (Dense)            (None, 128)               8320      
                                                                 
 dropout_7 (Dropout)         (None, 128)              

In [43]:
# Training
# NOTE: the test split is also passed as validation data, so the checkpoint
# callback's 'val_loss' is computed on the same rows that are reported as
# "Testing Accuracy" below.
history = model.fit(
    X_train,
    y_train,
    batch_size=512,
    epochs=6,
    validation_data=(X_test, y_test),
    callbacks=[model_checkpoint_callback],
)

# Accuracy of the in-memory model on each split
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print(f"Training Accuracy: {accuracy}")

loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print(f"Testing Accuracy:  {accuracy}")


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Training Accuracy: 0.9918066263198853
Testing Accuracy:  0.9252257943153381


In [44]:
# Persist the fitted tokenizer so the same word -> id mapping can be reloaded later
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [46]:
%%time

# Testing
name = ['محمد علي السيد']
x = tokenizer.texts_to_sequences(normalize_arabic(name[0]).split("-*-"))
x = pad_sequences(x, padding='post',maxlen=3,truncating='post')
model.predict(x)[0][0]

CPU times: total: 46.9 ms
Wall time: 297 ms


0.99323195

In [47]:
# Save the in-memory model. This is the same path the ModelCheckpoint callback
# writes to ('name.h5'), so the file it checkpointed during training is overwritten.
model.save("name.h5")