## Imports

In [2]:
!pip install arabert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting arabert
  Downloading arabert-1.0.1-py3-none-any.whl (179 kB)
[K     |████████████████████████████████| 179 kB 4.6 MB/s 
[?25hCollecting emoji==1.4.2
  Downloading emoji-1.4.2.tar.gz (184 kB)
[K     |████████████████████████████████| 184 kB 55.7 MB/s 
[?25hCollecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Collecting PyArabic
  Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[K     |████████████████████████████████| 126 kB 44.0 MB/s 
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.4.2-py3-none-any.whl size=186469 sha256=44c8b16336e4ae2ea00ad47f394126917f129ec721664c80eb03ec980496083a
  Stored in directory: /root/.cache/pip/wheels/71/4d/3c/cada364d4ea0026deee7208dee1e61bcebd20aa2ae5dc154ba
Successfully built emoji
Installing collected packages: 

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 5.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 65.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 43.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [1]:
import pandas as pd 
import numpy as np
import random
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from arabert.preprocess import ArabertPreprocessor

## Load Datasets

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# load arabic names
real_arabic_names = pd.read_csv('drive/MyDrive/Names web dataset/Arabic_names.csv')
# load male names
male = pd.read_csv('drive/MyDrive/Names web dataset/mnames.txt', sep='\n', header=None, names=['Name'])
# load female names
female = pd.read_csv('drive/MyDrive/Names web dataset/fnames.txt', sep='\n', header=None, names=['Name'])

In [4]:
# view first rows of real arabic names
real_arabic_names.head()

Unnamed: 0,Name,Gender
0,ابتسام,F
1,ابتهاج,F
2,ابتهال,F
3,اجتهاد,F
4,ازدهار,F


In [5]:
# view last 5 rows of arabic names
real_arabic_names.tail()

Unnamed: 0,Name,Gender
1400,وهبة,M
1401,ياسر,M
1402,يحيى,M
1403,يزيد,M
1404,يعقوب,M


In [6]:
# view first 5 rows of male names
male.head()

Unnamed: 0,Name
0,أبان
1,أبو بكر
2,أبو حمزة
3,أحد
4,أحمد


In [7]:
# view last 5 rows of male names
male.tail()

Unnamed: 0,Name
635,وفيق
636,وهاب
637,وهبة
638,ياسر
639,يحيى


In [8]:
# view first 5 rows of female names
female.head()

Unnamed: 0,Name
0,آيات
1,آلاء
2,آمال
3,آمنة
4,آيات


In [9]:
# view last 5 rows of female names
female.tail()

Unnamed: 0,Name
411,وميض
412,وهبة
413,يارا
414,ياسمين
415,يسرا


In [10]:
# add gender column to male and female names
male['Gender'] = 'M'
female['Gender'] = 'F'

In [11]:
# merge male names with female names
names = pd.concat([male,female])
# merge the result with arabic names
real_names = pd.concat([real_arabic_names,names])

In [12]:
# add label column to the arabic names datasets that identifies that these names are real names
real_names['Label'] = 'Real'

In [13]:
# shape of the result of merging all real arabic names datasets
real_names.shape

(2461, 3)

In [14]:
def check_duplicates(data, text):
  df = data
  # check to see if there is duplicates in rows
  print('Number of duplicated names are: ', df['Name'].duplicated().sum())
  # create mask
  duplicated_names_mask_fake = df['Name'].duplicated()
  # new arabic names dataframe without any duplicated names
  df = df[~ duplicated_names_mask_fake]
  print('The shape of ', text, 'after deleting duplicates is:', df.shape)
  # reset index
  df = df.reset_index(drop=True)
  return df

In [15]:
# check and remove duplicates
names = check_duplicates(real_names, 'real names')

Number of duplicated names are:  909
The shape of  real names after deleting duplicates is: (1552, 3)


In [16]:
names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1552 entries, 0 to 1551
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    1552 non-null   object
 1   Gender  1552 non-null   object
 2   Label   1552 non-null   object
dtypes: object(3)
memory usage: 36.5+ KB


In [17]:
names.describe()

Unnamed: 0,Name,Gender,Label
count,1552,1552,1552
unique,1552,2,1
top,ابتسام,M,Real
freq,1,929,1552


## Data generation

In [18]:
arabic_alphabet = ['ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 
                     'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي', 'أ', 'ؤ', 
                     'ة', 'آ', 'ء', 'ئ', 'إ']

In [19]:
class DataGenerator:
    
    def __init__(self, alpha):
        self.alphabet = alpha
    
    def generate(self, mode, num_of_words, name, gender):
        generated_data = dict()
        generated_names = []
        generated_gender = []
        generated_label = ['Fake'] * num_of_words
        i = 0
        while i < num_of_words:
            random_index = random.randint(0, (len(name) - 1))
            if mode == 'replace_char':
                alpha_random_index = random.randint(0, (len(self.alphabet) - 1))
                fake_name = name.replace(name[random_index], self.alphabet[alpha_random_index])
            elif mode == 'delete_char':
                fake_name = name.replace(name[random_index], "")
            elif mode == 'add_char':
                alpha_random_index = random.randint(0, (len(self.alphabet) - 1))
                fake_name = name[:random_index] + self.alphabet[alpha_random_index] + name[random_index:]
            if fake_name in generated_names:
                continue
            generated_names.append(fake_name)
            generated_gender.append(gender)
            i = i + 1
        generated_data['Name'] = generated_names
        generated_data['Gender'] = generated_gender
        generated_data['Label'] = generated_label        
        return generated_data
    
    def generate_full_name(self, mode, data, num_of_names):
        generated_data = dict()
        generated_first_last_names = []
        generated_full_names = []
        generated_gender = []
        generated_label = []
        # first name and second name
        names = list(data['Name'].values)
        genders = list(data['Gender'].values)
        for name, gender in zip(names, genders):
          counter1 = 0
          while True:
            if counter1 >= num_of_names:
              break
            index2 = random.randint(0,(len(names) - 1))
            if genders[index2] == 'F':
              continue
            first_last_name = name + ' ' + names[index2]
            generated_first_last_names.append(first_last_name)
            generated_gender.append(gender)
            if mode == 'Real':
              generated_label.append('Correct')
            elif mode == 'Fake':
              generated_label.append('Incorrect')
            counter1 = counter1 + 1
        # full name
        for name in  generated_first_last_names:
          while True:
            index2 = random.randint(0,(len(names) - 1))
            if genders[index2] == 'F':
              continue
            else:
              break
          full_name = name + ' ' + names[index2]
          generated_full_names.append(full_name)
        generated_data['Name'] = generated_full_names
        generated_data['Gender'] = generated_gender
        generated_data['Label'] = generated_label  
        return pd.DataFrame.from_dict(generated_data)

In [20]:
# create object of the DataGenerator class
generator = DataGenerator(arabic_alphabet)

For each name in the **_Arabic Names Dataset_**, we will generate **two** fake names by deleting a random character from the original name at random position, then generate **six** fake names by adding a  wrong character to the original name at random position, and **four** fake names by replacing a random single character from the original name with another wrong random character

**IMPORTANT NOTE**, we will not consider all possibilities of generating a fake name from an original name, as this will lead us to huge fake names maybe reach to more than **500,000!** fake name, and that will lead to **Imbalanced Dataset Problem**, So as a start it will be enough to generate **12x** fake names from the real names dataset

In [21]:
# generate 2 fake names by delete a char
f_names_deleted_char = names.apply(lambda x: generator.generate(mode='delete_char', num_of_words=2, 
                                                                name=x['Name'], gender=x['Gender']), axis=1)
# generate 4 fake names by replace a char
f_names_replaced_char = names.apply(lambda x: generator.generate(mode='replace_char', num_of_words=4, 
                                                                name=x['Name'], gender=x['Gender']), axis=1)
# generate 6 fake names by adding a wrong char
f_names_adding_wrong_char = names.apply(lambda x: generator.generate(mode='add_char', num_of_words=6, 
                                                                name=x['Name'], gender=x['Gender']), axis=1)

In [22]:
# this function is used to re-represent the output of the generator to make it one dictionary instead of a 
# dictionary for each name, to convert the dictionary to a dataframe
def one_dict(multi_dict):
    dic = dict()
    name = []
    gender = []
    label = []
    for row in multi_dict:
        for names in row['Name']:
            name.append(names)
        for genders in row['Gender']:
            gender.append(genders)
        for labels in row['Label']:
            label.append(labels)  
    dic['Name'] = name
    dic['Gender'] = gender
    dic['Label'] = label
    return dic

In [23]:
# convert the output of the generator to one dictionary containing all names
deleted_char_dict = one_dict(f_names_deleted_char)
replaced_char_dict = one_dict(f_names_replaced_char)
added_char_dcit = one_dict(f_names_adding_wrong_char)
# convert the dictionaries into dataframe form
deleted_char_df = pd.DataFrame.from_dict(deleted_char_dict)
replaced_char_df = pd.DataFrame.from_dict(replaced_char_dict)
added_char_df = pd.DataFrame.from_dict(added_char_dcit)
# concatenate the 3 dataframe to form the Fake Names Dataset
f_names = pd.concat([added_char_df, replaced_char_df, deleted_char_df],ignore_index=True)

In [24]:
# view first 5 rows of fake names
f_names.head()

Unnamed: 0,Name,Gender,Label
0,ابتمسام,F,Fake
1,ابعتسام,F,Fake
2,ابتساام,F,Fake
3,ابتسازم,F,Fake
4,ابتصسام,F,Fake


In [25]:
# view last 5 rows of fake names
f_names.tail()

Unnamed: 0,Name,Gender,Label
18619,مناح,F,Fake
18620,ينا,F,Fake
18621,مين,F,Fake
18622,نصاح,F,Fake
18623,نصاة,F,Fake


In [26]:
f_names.shape

(18624, 3)

In [27]:
f_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18624 entries, 0 to 18623
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    18624 non-null  object
 1   Gender  18624 non-null  object
 2   Label   18624 non-null  object
dtypes: object(3)
memory usage: 436.6+ KB


In [28]:
f_names.describe()

Unnamed: 0,Name,Gender,Label
count,18624,18624,18624
unique,17843,2,1
top,سام,M,Fake
freq,7,11148,18624


In [29]:
# check and remove duplicates
f_names2 = check_duplicates(f_names, 'fake names')

Number of duplicated names are:  781
The shape of  fake names after deleting duplicates is: (17843, 3)


In [30]:
# check to see if there is a fake name exist in real names
# False indicates that this name will be deleted from the fake names dataset
# True indicates that this name will remain exist in the fake names dataset
fake_real_mask = [False if name in names['Name'] else True for name in f_names2['Name']]
print('Number of fake names that exist in real arabic names', len([fake_real_mask == False]))
fake_names = f_names2[fake_real_mask]

Number of fake names that exist in real arabic names 1


We have our real arabic names dataset and generated fake names dataset from our real arabic names, We will now create a **Full Name Dataset** consists of 3 names separated by space

We will create **Two Datasets**:
1. The first is full names with real arabic names, by generating **random 162 name** for each name in the real arabic names dataset
2. the second is full names with fake arabic names, by generating **random 15 name** for each name in the fake arabic names dataset

In [31]:
# full names with real arabic names
full_names_real = generator.generate_full_name(mode='Real', data=names,num_of_names=162)
# full names with fake arabic names
full_names_fake = generator.generate_full_name(mode='Fake', data=fake_names,num_of_names=15)

In [56]:
# combine the two dataset together into one
full_names = pd.concat([full_names_real,full_names_fake])
# shuffle the data
full_names = full_names.sample(frac=1)
# reset index
full_names = full_names.reset_index(drop=True)

In [33]:
# view first 5 rows of the full_names dataset
full_names.head()

Unnamed: 0,Name,Gender,Label
0,جلاء ثواذب رمال,F,Incorrect
1,شاسفع قفاضل كصهيب,M,Incorrect
2,زيءنة عبدالحمصيد مرحآ,F,Incorrect
3,ظبافرة حكيعم خرقان,F,Incorrect
4,حضسين ضاةي أقمير,M,Incorrect


In [34]:
# view last 5 rows of the full_names dataset
full_names.tail()

Unnamed: 0,Name,Gender,Label
519064,مححسن مرؤشدي غان,M,Incorrect
519065,صديق مكين عامر,M,Correct
519066,هان تنظمي رتسيم,M,Incorrect
519067,خازضن طاع مقبخل,M,Incorrect
519068,كحيم كلم عبدشلرشيد,M,Incorrect


In [35]:
full_names.shape

(519069, 3)

In [36]:
full_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 519069 entries, 0 to 519068
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Name    519069 non-null  object
 1   Gender  519069 non-null  object
 2   Label   519069 non-null  object
dtypes: object(3)
memory usage: 11.9+ MB


In [37]:
full_names.describe()

Unnamed: 0,Name,Gender,Label
count,519069,519069,519069
unique,519050,2,2
top,نايف نوفل حسني,M,Incorrect
freq,2,309168,267645


In [38]:
# check and remove duplicates
full_names = check_duplicates(full_names, 'full names')

Number of duplicated names are:  19
The shape of  full names after deleting duplicates is: (519050, 3)


## Core Model

For the Core Model part i will use a **Pretrained Model** that is **AraBERT** and it is an Arabic pretrained language model based on Google's BERT architechture but for  Arabic Language Understanding

We will use the **Tokenizer** of **AraBERT** to convert our full names into tokens after that ids that represent each word to make data ready for my own Deep Learning Model

In [41]:
# load the model, tokenizer and preprocessor
model_name = 'aubmindlab/bert-base-arabertv02'
tokenizer =  AutoTokenizer.from_pretrained(model_name)
preprocessor = ArabertPreprocessor(model_name=model_name)

Downloading:   0%|          | 0.00/381 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/825k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Prepare the data for the **Model**

In [57]:
# drop the gender column as it wouldn't be useful anymore
full_names = full_names.drop(columns=['Gender'])
# divide data into x that represents features and y that represent target
X = full_names.drop(columns=['Label'])
y = full_names[['Label']]
# divide the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [58]:
# reset index for all data
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

This method apply all transofrmation and preprocessing for the names data, as it converts it to **Tokens** and convert tokens into **IDs** and **Encodes** the labels of the data by converting them to **One Hot Encoding** return to us the data ready to be fed into my own Deep Learning Model

In [59]:
def transform_data(X,y):
  # apply preprocessor on the X data
  X = X['Name'].apply(lambda x: preprocessor.preprocess(x))
  # convert back to dataframe and reset_index
  X = pd.DataFrame(X,columns=['Name'])
  # tokenize X data
  X = [tokenizer.tokenize(name ,max_length=3, truncation=True) for name in X['Name'].tolist()]
  # convert tokens in X to ids
  X = [tokenizer.convert_tokens_to_ids(name) for name in X]
  # convert back to dataframe
  X = pd.DataFrame({'Name':X},columns=['Name'])
  # encode labels in y data
  y = pd.get_dummies(y['Label'])
  return X, y

In [60]:
# transofrm X_train and y_train data
X_train , y_train = transform_data(X_train,y_train)
# transform X_test and y_test data
X_test , y_test = transform_data(X_test,y_test)

In [61]:
X_train

Unnamed: 0,Name
0,"[50175, 11224, 43757]"
1,"[10110, 19370, 18344]"
2,"[110, 26656, 16738]"
3,"[4355, 192, 4526]"
4,"[630, 1082, 15742]"
...,...
441203,"[3418, 253, 1958]"
441204,"[27235, 3218, 138]"
441205,"[28370, 9335, 4673]"
441206,"[4771, 20627, 1821]"
