## Imports

In [42]:
import pandas as pd 
from sklearn.model_selection import train_test_split
%run utils.ipynb import *

## Load Datasets

In [43]:
loader = DataLoader()

In [44]:
# load arabic names
real_arabic_names = loader.load_data("input", "Arabic_names.csv")
# load male names
male = loader.load_data("input", "mnames.txt", header=None, names=['Name'])
# load female names
female = loader.load_data("input", "fnames.txt", header=None, names=['Name'])

In [45]:
# view first rows of real arabic names
real_arabic_names.head()

Unnamed: 0,Name,Gender
0,ابتسام,F
1,ابتهاج,F
2,ابتهال,F
3,اجتهاد,F
4,ازدهار,F


In [46]:
# view last 5 rows of arabic names
real_arabic_names.tail()

Unnamed: 0,Name,Gender
1400,وهبة,M
1401,ياسر,M
1402,يحيى,M
1403,يزيد,M
1404,يعقوب,M


In [47]:
# view first 5 rows of male names
male.head()

Unnamed: 0,Name
0,أبان
1,أبو بكر
2,أبو حمزة
3,أحد
4,أحمد


In [48]:
# view last 5 rows of male names
male.tail()

Unnamed: 0,Name
635,وفيق
636,وهاب
637,وهبة
638,ياسر
639,يحيى


In [49]:
# view first 5 rows of female names
female.head()

Unnamed: 0,Name
0,آيات
1,آلاء
2,آمال
3,آمنة
4,آيات


In [50]:
# view last 5 rows of female names
female.tail()

Unnamed: 0,Name
411,وميض
412,وهبة
413,يارا
414,ياسمين
415,يسرا


In [51]:
# add gender column to male and female names
male['Gender'] = 'M'
female['Gender'] = 'F'

In [52]:
# merge male names with female names
names = pd.concat([male,female])
# merge the result with arabic names
real_names = pd.concat([real_arabic_names,names])

In [53]:
# add label column to the arabic names datasets that identifies that these names are real names
real_names['Label'] = 'Real'

In [54]:
# shape of the result of merging all real arabic names datasets
real_names.shape

(2461, 3)

In [56]:
# check and remove duplicates
duplicator = DataDuplicateChecker()
names = duplicator.check_duplicates(real_names, 'real names')

Number of duplicated names are:  909
The shape of  real names after deleting duplicates is: (1552, 3)


In [57]:
names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1552 entries, 0 to 1551
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    1552 non-null   object
 1   Gender  1552 non-null   object
 2   Label   1552 non-null   object
dtypes: object(3)
memory usage: 36.5+ KB


In [58]:
names.describe()

Unnamed: 0,Name,Gender,Label
count,1552,1552,1552
unique,1552,2,1
top,ابتسام,M,Real
freq,1,929,1552


In [59]:
## Data generation

In [60]:
# create object of the DataGenerator class
generator = DataGenerator()

For each name in the **_Arabic Names Dataset_**, we will generate **two** fake names by deleting a random character from the original name at random position, then generate **six** fake names by adding a  wrong character to the original name at random position, and **four** fake names by replacing a random single character from the original name with another wrong random character

**IMPORTANT NOTE**, we will not consider all possibilities of generating a fake name from an original name, as this will lead us to huge fake names maybe reach to more than **1,000,000!** fake name, So as a start it will be enough to generate **12x** fake names from the real names dataset

In [61]:
# generate 2 fake names by delete a char
f_names_deleted_char = names.apply(lambda x: generator.generate_fake_names_delete_char(num_of_words=2, name=x['Name'],
                                                                gender=x['Gender']), axis=1)
# generate 4 fake names by replace a char
f_names_replaced_char = names.apply(lambda x: generator.generate_fake_names_replace_char(num_of_words=4, name=x['Name'],
                                                                gender=x['Gender']), axis=1)
# generate 6 fake names by adding a wrong char
f_names_adding_wrong_char = names.apply(lambda x: generator.generate_fake_names_add_char(num_of_words=6, name=x['Name'],
                                                                gender=x['Gender']), axis=1)

In [62]:
# convert the output of the generator to one dictionary containing all names
deleted_char_df = generator.flatten_dict(f_names_deleted_char)
replaced_char_df = generator.flatten_dict(f_names_replaced_char)
added_char_df = generator.flatten_dict(f_names_adding_wrong_char)
# concatenate the 3 dataframe to form the Fake Names Dataset
f_names = pd.concat([added_char_df, replaced_char_df, deleted_char_df],ignore_index=True)

In [63]:
# view first 5 rows of fake names
f_names.head()

Unnamed: 0,Name,Gender,Label
0,ابتسوام,F,Fake
1,ابتظسام,F,Fake
2,اآبتسام,F,Fake
3,اخبتسام,F,Fake
4,ابتسغام,F,Fake


In [64]:
# view last 5 rows of fake names
f_names.tail()

Unnamed: 0,Name,Gender,Label
18619,ناحة,F,Fake
18620,ميا,F,Fake
18621,مين,F,Fake
18622,نصاة,F,Fake
18623,صاحة,F,Fake


In [65]:
f_names.shape

(18624, 3)

In [66]:
f_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18624 entries, 0 to 18623
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    18624 non-null  object
 1   Gender  18624 non-null  object
 2   Label   18624 non-null  object
dtypes: object(3)
memory usage: 436.6+ KB


In [67]:
f_names.describe()

Unnamed: 0,Name,Gender,Label
count,18624,18624,18624
unique,17846,2,1
top,ري,M,Fake
freq,8,11148,18624


In [69]:
# check and remove duplicates
f_names2 = duplicator.check_duplicates(f_names, 'fake names')

Number of duplicated names are:  778
The shape of  fake names after deleting duplicates is: (17846, 3)


In [70]:
# check to see if there is a fake name exist in real names
# False indicates that this name will be deleted from the fake names dataset
# True indicates that this name will remain exist in the fake names dataset
fake_real_mask = [False if name in names['Name'] else True for name in f_names2['Name']]
print('Number of fake names that exist in real arabic names', len([fake_real_mask == False]))
fake_names = f_names2[fake_real_mask]

Number of fake names that exist in real arabic names 1


We have our real arabic names dataset and generated fake names dataset from our real arabic names, We will now create a **Full Name Dataset** consists of 3 names separated by space

We will create **Two Datasets**:
1. The first is full names with real arabic names, by generating **random 324 name** for each name in the real arabic names dataset
2. the second is full names with fake arabic names, by generating **random 30 name** for each name in the fake arabic names dataset

There is a difference in number of names in each dataset and that is because the dataframe of real names is smaller than by **12X** from the dataframe that contains fake names, so above numbers will give us balanced dataset of full names with equal number of examples for each class (Correct and Incorrect)

And if we choose to generate same number of names in the two dataset this will lead to **Imbalanced Dataset Problem**, and a memory problem may occur, so above numbers are enough

In [72]:
# full names with real arabic names
full_names_real = generator.generate_full_name(full_name_type='Real', data=names,num_of_names=324)
# full names with fake arabic names
full_names_fake = generator.generate_full_name(full_name_type='Fake', data=fake_names,num_of_names=30)

In [73]:
# combine the two dataset together into one
full_names = pd.concat([full_names_real,full_names_fake])
# shuffle the data
full_names = full_names.sample(frac=1)
# reset index
full_names = full_names.reset_index(drop=True)

In [74]:
# view first 5 rows of the full_names dataset
full_names.head()

Unnamed: 0,Name,Gender,Label
0,مؤيد عاكف شافع,M,Correct
1,بسوام صخديق ناصج,M,Incorrect
2,رعشاد وهف شاب,M,Incorrect
3,إككيك أشلرم عبدالعظيذم,F,Incorrect
4,أنظار صمصام عبدالغفور,F,Correct


In [75]:
# view last 5 rows of the full_names dataset
full_names.tail()

Unnamed: 0,Name,Gender,Label
1038223,هحانئة فاددي تشيبة,F,Incorrect
1038224,داطنيا كلصيب سخلم,F,Incorrect
1038225,ابتهايل ذيبال قغلب,F,Incorrect
1038226,أمئامة دنير لأيوب,F,Incorrect
1038227,جامخ ضيجغم الخضأر,M,Incorrect


In [76]:
full_names.shape

(1038228, 3)

In [77]:
full_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1038228 entries, 0 to 1038227
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   Name    1038228 non-null  object
 1   Gender  1038228 non-null  object
 2   Label   1038228 non-null  object
dtypes: object(3)
memory usage: 23.8+ MB


In [78]:
full_names.describe()

Unnamed: 0,Name,Gender,Label
count,1038228,1038228,1038228
unique,1038141,2,2
top,أزهر مبروك صاعد,M,Incorrect
freq,2,618606,535380


In [79]:
# check and remove duplicates
full_names = duplicator.check_duplicates(full_names, 'full names')

Number of duplicated names are:  87
The shape of  full names after deleting duplicates is: (1038141, 3)


Prepare the data for the **Model**

In [81]:
# drop the gender column as it wouldn't be useful anymore
full_names = full_names.drop(columns=['Gender'])
# divide data into x that represents features and y that represent target
X = full_names.drop(columns=['Label'])
y = full_names[['Label']]
# divide the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [82]:
# reset index for all data
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

Save the data to csv files to be used in the next steps

In [83]:
# save the data
train_data = pd.concat([X_train,y_train],axis=1)
test_data = pd.concat([X_test,y_test],axis=1)
train_data.to_csv('C:/Users/mahmo/Desktop/Name-Verification-Model/input/processed_data/train_data.csv', index=False)
test_data.to_csv('C:/Users/mahmo/Desktop/Name-Verification-Model/input/processed_data/test_data.csv', index=False)