In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split

from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM
from keras.callbacks import EarlyStopping
import keras
import re

In [2]:
data = pd.read_csv('../Data/final_names.csv')
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,name,status
0,0,118425,أسارير اعتماد رصينة,0.0
1,1,109231,سامح مزتور شروق,0.0
2,2,167416,سصر مصطفى أغاريد,0.0
3,3,59137,أمينة رمزي مدلج,1.0
4,4,132777,عبدالمغعم فائقة رائد,0.0


In [3]:
data.shape

(197626, 4)

In [4]:
# import re 
def normalize_arabic(text):
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    return text

In [5]:
data['name'] = data['name'].apply(lambda x:normalize_arabic(x) )

In [6]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,name,status
0,0,118425,اسارير اعتماد رصينه,0.0
1,1,109231,سامح مزتور شروق,0.0
2,2,167416,سصر مصطفي اغاريد,0.0
3,3,59137,امينه رمزي مدلج,1.0
4,4,132777,عبدالمغعم فائقه رائد,0.0


In [7]:
x= data['name']
y=data['status']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=1000)

In [8]:
tok = Tokenizer(oov_token="<OOV>")
#fit the data on the text
tok.fit_on_texts(x)
# Tokenize the data 
X_train_tokenized = tok.texts_to_sequences(X_train)
X_test_tokenized = tok.texts_to_sequences(X_test)
# add one for the oov token 
vocab_size = len(tok.word_index) + 1  

max_len = 3
embdding_dim=20

In [9]:
X_train = pad_sequences(X_train_tokenized, padding='post',maxlen=max_len,truncating='post')
X_test = pad_sequences(X_test_tokenized, padding='post', maxlen=max_len,truncating='post')

In [10]:
dropout=0.4
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size,output_dim=embdding_dim,input_length=max_len))
model.add(LSTM(128, activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(dropout))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))



In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 20)             806900    
                                                                 
 lstm (LSTM)                 (None, 128)               76288     
                                                                 
 flatten (Flatten)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 3

In [12]:
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    save_weights_only=False,
    monitor='val_loss',
    save_best_only=True,
    filepath= 'model.h5')

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [13]:
history = model.fit(X_train, y_train,
                    epochs=20,
                    validation_data=(X_test, y_test),
                    batch_size=64,
                    callbacks=[model_checkpoint_callback])
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training Accuracy: 0.9723
Testing Accuracy:  0.9272


In [14]:
name = ['ياسمين أحمد محمود']
one = tok.texts_to_sequences(name)
y = pad_sequences(one, padding='post',maxlen=3,truncating='post')
y

array([[578,   1, 328]], dtype=int32)

In [15]:
import pickle

# saving
with open('normalized_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tok, handle)

In [16]:
model.save('normalized_model.h5')

In [17]:
# loading
with open('normalized_tokenizer.pickle', 'rb') as handle:
    tokenizer_1 = pickle.load(handle)

In [18]:
my_model = keras.models.load_model('normalized_model.h5')



In [19]:
testing = ['ياسمين احمد محمود']
test1= tokenizer_1.texts_to_sequences(testing)
test = pad_sequences(test1, padding='post',maxlen=3,truncating='post')
test

array([[578, 177, 328]], dtype=int32)

In [20]:
my_model.predict(test)



array([[0.60952985]], dtype=float32)