In [5]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split

from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM
from keras.callbacks import EarlyStopping
import keras

In [6]:
data = pd.read_csv('../Data/final_names.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,name,status
0,118425,أسارير اعتماد رصينة,0.0
1,109231,سامح مزتور شروق,0.0
2,167416,سصر مصطفى أغاريد,0.0
3,59137,أمينة رمزي مدلج,1.0
4,132777,عبدالمغعم فائقة رائد,0.0


In [7]:
data.shape

(197626, 3)

In [8]:
x= data['name']
y=data['status']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1000)

In [9]:
tok = Tokenizer(oov_token="<OOV>")
#fit the data on the text
tok.fit_on_texts(x)
# Tokenize the data 
X_train_tokenized = tok.texts_to_sequences(X_train)
X_test_tokenized = tok.texts_to_sequences(X_test)
# add one for the oov token 
vocab_size = len(tok.word_index) + 1  

max_len = 3
embdding_dim=20

In [64]:
# print(embdding_dim)

20


In [10]:
X_train = pad_sequences(X_train_tokenized, padding='post',maxlen=max_len,truncating='post')
X_test = pad_sequences(X_test_tokenized, padding='post', maxlen=max_len,truncating='post')

In [11]:
dropout=0.4
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size,output_dim=embdding_dim,input_length=max_len))
model.add(LSTM(128, activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(dropout))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))



In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 20)             873440    
                                                                 
 lstm (LSTM)                 (None, 128)               76288     
                                                                 
 flatten (Flatten)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 3

In [13]:
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    save_weights_only=False,
    monitor='val_loss',
    save_best_only=True,
    filepath= 'model.h5')

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [17]:
history = model.fit(X_train, y_train,
                    epochs=20,
                    validation_data=(X_test, y_test),
                    batch_size=32,
                    callbacks=[model_checkpoint_callback])
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training Accuracy: 0.9783
Testing Accuracy:  0.9291


In [71]:
# import re 
# def normalize_arabic(text):
#     text = re.sub("[إأآا]", "ا", text)
#     text = re.sub("ى", "ي", text)
#     text = re.sub("ة", "ه", text)
#     text = re.sub("گ", "ك", text)
#     return text

# test the model before saving it 

In [78]:
name = ['ياسمين أحمد محمود']
one = tok.texts_to_sequences(name)
y = pad_sequences(one, padding='post',maxlen=3,truncating='post')
y

array([[569, 170, 318]], dtype=int32)

In [79]:
model.predict(y)



array([[0.3930118]], dtype=float32)

# save the tokenizer into pickle file 

In [33]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tok, handle)

In [None]:
# model.save('name.h5')

# Retest the model 

In [40]:
# loading
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer_1 = pickle.load(handle)

In [43]:
my_model = keras.models.load_model('name.h5')



In [80]:
testing = ['محمد أحمد محمود']
test1= tokenizer_1.texts_to_sequences(testing)
test = pad_sequences(test1, padding='post',maxlen=3,truncating='post')
test

array([[520, 170, 318]], dtype=int32)

In [81]:
my_model.predict(test)



array([[0.78517634]], dtype=float32)

In [82]:
testing = ['محمد احمد محمود']
test1= tokenizer_1.texts_to_sequences(testing)
test = pad_sequences(test1, padding='post',maxlen=3,truncating='post')
test

array([[520,   1, 318]], dtype=int32)

In [83]:
my_model.predict(test)



array([[0.07445119]], dtype=float32)