In [14]:
# imports
import tensorflow as tf
import keras.layers as layers
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import re
import string
import numpy as np
import pandas as pd

# Dataset

In [2]:
df = pd.read_csv("data/dataset.csv")
print(df.shape)
df.sample(10)

(20000, 2)


Unnamed: 0,Name,class
5734,رعد كامل عون,1.0
5827,أنسام شهاب مسلم,1.0
15643,العطاء إقتباسات محبب,0.0
17998,للاج رمداغ مرح,0.0
417,كنزى عواد عبيدة,1.0
2153,سيرين كلاب ساري,1.0
1725,آيات باسق نظمي,1.0
1423,نوار ظفير عبدالحكيم,1.0
2465,إكليل كاسر عبدالظاهر,1.0
7920,بارع زياد عبدالرشيد,1.0


In [3]:
# max number of words per name
df['Name'].str.split().str.len().max()

4

## Document Preprocessing

In [4]:
# adding start and end tokens
names = ['<SOS> ' + x + ' <EOS>' for x in df['Name'].tolist()]
corpus = " ".join(names)
labels = df['class'].tolist()
corpus[:1000]

'<SOS> نصر ساعف ياسر <EOS> <SOS> إيمان أخطب قطامي <EOS> <SOS> سعيد عبدالرشيد ضياء <EOS> <SOS> ولهانة صادر جرير <EOS> <SOS> منتظر داني زاكي <EOS> <SOS> منيبة أحنف رافد <EOS> <SOS> مأمون عبودة أزهر <EOS> <SOS> بلقيس برهان مغيث <EOS> <SOS> عماد عبدالرحيم أدهم <EOS> <SOS> هنوف ماهر منير <EOS> <SOS> عبدالعليم صارم نديم <EOS> <SOS> أمل رأفت نصوح <EOS> <SOS> أشهب قسام صلاح <EOS> <SOS> ديمة مأمون راسم <EOS> <SOS> منيع صبيح فلوح <EOS> <SOS> قنوت فواز ساجر <EOS> <SOS> مرحب سليم فوزي <EOS> <SOS> بائقة هواري عزب <EOS> <SOS> خاتم ملحم عفيف <EOS> <SOS> مهرة مسلمة محجن <EOS> <SOS> حصيف خزعل نوري <EOS> <SOS> راجحة غياث سيار <EOS> <SOS> مسفر هيثم غافر <EOS> <SOS> رزان قاهر تيم <EOS> <SOS> بديع بليغ عبدالشكور <EOS> <SOS> أسارير فهد زياد <EOS> <SOS> فوزي ساجد وسام <EOS> <SOS> عفاف تليد صلاح الدين <EOS> <SOS> ساعد فارض سعدون <EOS> <SOS> ميار فتحي راضي <EOS> <SOS> ركين حافظ تغلب <EOS> <SOS> توليب معين نواس <EOS> <SOS> وهبة علاء رسمي <EOS> <SOS> عزة فاضل جواد <EOS> <SOS> ناجي قطامي بهاء <EOS> <SOS> لارا حمد

## Tokenization 

In [5]:
tokenize = tf.keras.preprocessing.text.Tokenizer(oov_token="<OOV>", filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
tokenize.fit_on_texts(names)
x = tokenize.texts_to_sequences(names)
max_len = 6
x = tf.keras.utils.pad_sequences(x, maxlen=max_len, padding='post')
total_words = len(tokenize.word_index)+1
print(f"Total Words: {total_words}")
print("Example of a Sequence")
print(x[:3])
print("Some Tokens")
print(list(tokenize.word_index)[:10])

Total Words: 5096
Example of a Sequence
[[  2  34 814  86   3   0]
 [  2 934 287 403   3   0]
 [  2 177 815 151   3   0]]
Some Tokens
['<OOV>', '<sos>', '<eos>', 'الدين', 'بكر', 'صلاح', 'معين', 'ناصر', 'نجم', 'نظمي']


## Splitting data and making dataset 

In [6]:
x_train, x_valid, y_train, y_valid = train_test_split(x, df['class'].values, test_size=.2, random_state=23)
print("Train and Validation sizes")
len(x_train), len(x_valid), len(y_train), len(y_valid)

Train and Validation sizes


(16000, 4000, 16000, 4000)

In [7]:
ds_train = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(128)
ds_valid = tf.data.Dataset.from_tensor_slices((x_valid, y_valid)).batch(128)

2022-12-14 09:33:32.993449: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-14 09:33:33.118736: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-14 09:33:33.119025: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-14 09:33:33.120002: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropri

In [8]:
for x,y in ds_train.take(1):
  print(x[:2], y[:2])

tf.Tensor(
[[   2 2874  518  615    3    0]
 [   2 4163 2980 2468    3    0]], shape=(2, 6), dtype=int32) tf.Tensor([1. 0.], shape=(2,), dtype=float64)


# Build and Train a Model: 

In [9]:
vocab_size = total_words
embedding_dim = 128
sequence_length = max_len
rnn_units = 512
batch_size=64

In [15]:
model = tf.keras.Sequential([
  layers.Embedding(vocab_size, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1)],
  )

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         652288    
                                                                 
 dropout (Dropout)           (None, None, 128)         0         
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 652,417
Trainable params: 652,417
Non-trainable params: 0
__________________________________________________

In [16]:
model.compile(loss=tf.losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy())

In [17]:
epochs = 10
history = model.fit(
    ds_train,
    validation_data=ds_valid,
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Test Examples 

In [18]:
#new
example = '<sos> ' + "معاذ طه عوض" + ' <eos>'
example_vect = tokenize.texts_to_sequences([example])
tf.nn.sigmoid(model.predict(example_vect)).numpy()



array([[0.9642587]], dtype=float32)

In [19]:
#new
example = "باسم وحيد السيد"
example = '<sos> ' + example + ' <eos>'
example_vect = tokenize.texts_to_sequences([example])
tf.nn.sigmoid(model.predict(example_vect)).numpy()



array([[0.87165827]], dtype=float32)

In [20]:
#new
example = "شسي شسي شسي"
example = '<sos> ' + example + ' <eos>'
example_vect = tokenize.texts_to_sequences([example])
tf.nn.sigmoid(model.predict(example_vect)).numpy()



array([[0.12584206]], dtype=float32)

In [21]:
#new
example = "باسمم وحةد السد"
example = '<sos> ' + example + ' <eos>'
example_vect = tokenize.texts_to_sequences([example])
tf.nn.sigmoid(model.predict(example_vect)).numpy()



array([[0.12584206]], dtype=float32)

In [22]:
#new
example = "مريم محمد محمد"
example = '<sos> ' + example + ' <eos>'
example_vect = tokenize.texts_to_sequences([example])
tf.nn.sigmoid(model.predict(example_vect)).numpy()



array([[0.99351066]], dtype=float32)

In [23]:
#new
example = "محمد مريم محمد"
example = '<sos> ' + example + ' <eos>'
example_vect = tokenize.texts_to_sequences([example])
tf.nn.sigmoid(model.predict(example_vect)).numpy()



array([[0.99351066]], dtype=float32)

In [24]:
#new
example = "مريم محمد الرمال"
example = '<sos> ' + example + ' <eos>'
example_vect = tokenize.texts_to_sequences([example])
tf.nn.sigmoid(model.predict(example_vect)).numpy()



array([[0.6659465]], dtype=float32)

In [25]:
#new
example = "محمد طه عوض"
example = '<sos> ' + example + ' <eos>'
example_vect = tokenize.texts_to_sequences([example])
tf.nn.sigmoid(model.predict(example_vect)).numpy()



array([[0.96503353]], dtype=float32)

In [26]:
#new
example = "زيادد عبدالرحمنت محمد"
example = '<sos> ' + example + ' <eos>'
example_vect = tokenize.texts_to_sequences([example])
tf.nn.sigmoid(model.predict(example_vect)).numpy()



array([[0.75257754]], dtype=float32)

In [28]:
#new
example = "محمد طه عوض لاشين"
example = '<sos> ' + example + ' <eos>'
example_vect = tokenize.texts_to_sequences([example])
tf.nn.sigmoid(model.predict(example_vect)).numpy()



array([[0.9415555]], dtype=float32)

# Saving and loading the trained model 

In [29]:
model.save('models/base_model', )

INFO:tensorflow:Assets written to: models/base_model/assets


In [31]:
new_model = tf.keras.models.load_model('models/base_model')
new_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         652288    
                                                                 
 dropout (Dropout)           (None, None, 128)         0         
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 652,417
Trainable params: 652,417
Non-trainable params: 0
__________________________________________________

## Saving and loading the tokenizer

In [34]:
import pickle

# saving
with open('models/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenize, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
with open('models/tokenizer.pickle', 'rb') as handle:
    new_tokenizer = pickle.load(handle)

In [35]:
#new
example = "محمد طه عوض لاشين"
example = '<sos> ' + example + ' <eos>'
example_vect = new_tokenizer.texts_to_sequences([example])
tf.nn.sigmoid(new_model.predict(example_vect)).numpy()



array([[0.9415555]], dtype=float32)