## Imports

In [43]:
import pandas as pd 
import numpy as np
import random

## Load Datasets

In [44]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
# load arabic names
real_arabic_names = pd.read_csv('drive/MyDrive/Names web dataset/Arabic_names.csv')
# load male names
male = pd.read_csv('drive/MyDrive/Names web dataset/mnames.txt', sep='\n', header=None, names=['Name'])
# load female names
female = pd.read_csv('drive/MyDrive/Names web dataset/fnames.txt', sep='\n', header=None, names=['Name'])

In [46]:
# view first rows of real arabic names
real_arabic_names.head()

Unnamed: 0,Name,Gender
0,ابتسام,F
1,ابتهاج,F
2,ابتهال,F
3,اجتهاد,F
4,ازدهار,F


In [47]:
# view last 5 rows of arabic names
real_arabic_names.tail()

Unnamed: 0,Name,Gender
1400,وهبة,M
1401,ياسر,M
1402,يحيى,M
1403,يزيد,M
1404,يعقوب,M


In [48]:
# view first 5 rows of male names
male.head()

Unnamed: 0,Name
0,أبان
1,أبو بكر
2,أبو حمزة
3,أحد
4,أحمد


In [49]:
# view last 5 rows of male names
male.tail()

Unnamed: 0,Name
635,وفيق
636,وهاب
637,وهبة
638,ياسر
639,يحيى


In [50]:
# view first 5 rows of female names
female.head()

Unnamed: 0,Name
0,آيات
1,آلاء
2,آمال
3,آمنة
4,آيات


In [51]:
# view last 5 rows of female names
female.tail()

Unnamed: 0,Name
411,وميض
412,وهبة
413,يارا
414,ياسمين
415,يسرا


In [52]:
# add gender column to male and female names
male['Gender'] = 'M'
female['Gender'] = 'F'

In [53]:
# merge male names with female names
names = pd.concat([male,female])
# merge the result with arabic names
real_names = pd.concat([real_arabic_names,names])

In [54]:
# add label column to the arabic names datasets that identifies that these names are real names
real_names['Label'] = 'Real'

In [55]:
# shape of the result of merging all real arabic names datasets
real_names.shape

(2461, 3)

In [56]:
def check_duplicates(data, text):
  df = data
  # check to see if there is duplicates in rows
  print('Number of duplicated names are: ', df['Name'].duplicated().sum())
  # create mask
  duplicated_names_mask_fake = df['Name'].duplicated()
  # new arabic names dataframe without any duplicated names
  df = df[~ duplicated_names_mask_fake]
  print('The shape of ', text, 'after deleting duplicates is:', df.shape)
  # reset index
  df = df.reset_index(drop=True)
  return df

In [57]:
# check and remove duplicates
names = check_duplicates(real_names, 'real names')

Number of duplicated names are:  909
The shape of  real names after deleting duplicates is: (1552, 3)


In [58]:
names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1552 entries, 0 to 1551
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    1552 non-null   object
 1   Gender  1552 non-null   object
 2   Label   1552 non-null   object
dtypes: object(3)
memory usage: 36.5+ KB


In [59]:
names.describe()

Unnamed: 0,Name,Gender,Label
count,1552,1552,1552
unique,1552,2,1
top,ابتسام,M,Real
freq,1,929,1552


## Data generation

In [60]:
arabic_alphabet = ['ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 
                     'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي', 'أ', 'ؤ', 
                     'ة', 'آ', 'ء', 'ئ', 'إ']

In [61]:
class DataGenerator:
    
    def __init__(self, alpha):
        self.alphabet = alpha
    
    def generate(self, mode, num_of_words, name, gender):
        generated_data = dict()
        generated_names = []
        generated_gender = []
        generated_label = ['Fake'] * num_of_words
        i = 0
        while i < num_of_words:
            random_index = random.randint(0, (len(name) - 1))
            if mode == 'replace_char':
                alpha_random_index = random.randint(0, (len(self.alphabet) - 1))
                fake_name = name.replace(name[random_index], self.alphabet[alpha_random_index])
            elif mode == 'delete_char':
                fake_name = name.replace(name[random_index], "")
            elif mode == 'add_char':
                alpha_random_index = random.randint(0, (len(self.alphabet) - 1))
                fake_name = name[:random_index] + self.alphabet[alpha_random_index] + name[random_index:]
            if fake_name in generated_names:
                continue
            generated_names.append(fake_name)
            generated_gender.append(gender)
            i = i + 1
        generated_data['Name'] = generated_names
        generated_data['Gender'] = generated_gender
        generated_data['Label'] = generated_label        
        return generated_data
    
    def generate_full_name(self, mode, data, num_of_names):
        generated_data = dict()
        generated_first_last_names = []
        generated_full_names = []
        generated_gender = []
        generated_label = []
        # first name and second name
        names = list(data['Name'].values)
        genders = list(data['Gender'].values)
        for name, gender in zip(names, genders):
          counter1 = 0
          while True:
            if counter1 >= num_of_names:
              break
            index2 = random.randint(0,(len(names) - 1))
            if genders[index2] == 'F':
              continue
            first_last_name = name + ' ' + names[index2]
            generated_first_last_names.append(first_last_name)
            generated_gender.append(gender)
            if mode == 'Real':
              generated_label.append('Correct')
            elif mode == 'Fake':
              generated_label.append('Incorrect')
            counter1 = counter1 + 1
        # full name
        for name in  generated_first_last_names:
          while True:
            index2 = random.randint(0,(len(names) - 1))
            if genders[index2] == 'F':
              continue
            else:
              break
          full_name = name + ' ' + names[index2]
          generated_full_names.append(full_name)
        generated_data['Name'] = generated_full_names
        generated_data['Gender'] = generated_gender
        generated_data['Label'] = generated_label  
        return pd.DataFrame.from_dict(generated_data)

In [62]:
# create object of the DataGenerator class
generator = DataGenerator(arabic_alphabet)

For each name in the **_Arabic Names Dataset_**, we will generate **two** fake names by deleting a random character from the original name at random position, then generate **six** fake names by adding a  wrong character to the original name at random position, and **four** fake names by replacing a random single character from the original name with another wrong random character

**IMPORTANT NOTE**, we will not consider all possibilities of generating a fake name from an original name, as this will lead us to huge fake names maybe reach to more than **500,000!** fake name, and that will lead to **Imbalanced Dataset Problem**, So as a start it will be enough to generate **12x** fake names from the real names dataset

In [63]:
# generate 2 fake names by delete a char
f_names_deleted_char = names.apply(lambda x: generator.generate(mode='delete_char', num_of_words=2, 
                                                                name=x['Name'], gender=x['Gender']), axis=1)
# generate 4 fake names by replace a char
f_names_replaced_char = names.apply(lambda x: generator.generate(mode='replace_char', num_of_words=4, 
                                                                name=x['Name'], gender=x['Gender']), axis=1)
# generate 6 fake names by adding a wrong char
f_names_adding_wrong_char = names.apply(lambda x: generator.generate(mode='add_char', num_of_words=6, 
                                                                name=x['Name'], gender=x['Gender']), axis=1)

In [64]:
# this function is used to re-represent the output of the generator to make it one dictionary instead of a 
# dictionary for each name, to convert the dictionary to a dataframe
def one_dict(multi_dict):
    dic = dict()
    name = []
    gender = []
    label = []
    for row in multi_dict:
        for names in row['Name']:
            name.append(names)
        for genders in row['Gender']:
            gender.append(genders)
        for labels in row['Label']:
            label.append(labels)  
    dic['Name'] = name
    dic['Gender'] = gender
    dic['Label'] = label
    return dic

In [65]:
# convert the output of the generator to one dictionary containing all names
deleted_char_dict = one_dict(f_names_deleted_char)
replaced_char_dict = one_dict(f_names_replaced_char)
added_char_dcit = one_dict(f_names_adding_wrong_char)
# convert the dictionaries into dataframe form
deleted_char_df = pd.DataFrame.from_dict(deleted_char_dict)
replaced_char_df = pd.DataFrame.from_dict(replaced_char_dict)
added_char_df = pd.DataFrame.from_dict(added_char_dcit)
# concatenate the 3 dataframe to form the Fake Names Dataset
f_names = pd.concat([added_char_df, replaced_char_df, deleted_char_df],ignore_index=True)

In [66]:
# view first 5 rows of fake names
f_names.head()

Unnamed: 0,Name,Gender,Label
0,اآبتسام,F,Fake
1,اابتسام,F,Fake
2,ابتوسام,F,Fake
3,ابلتسام,F,Fake
4,ابإتسام,F,Fake


In [67]:
# view last 5 rows of fake names
f_names.tail()

Unnamed: 0,Name,Gender,Label
18619,ناحة,F,Fake
18620,منا,F,Fake
18621,ميا,F,Fake
18622,صاحة,F,Fake
18623,ناحة,F,Fake


In [68]:
f_names.shape

(18624, 3)

In [69]:
f_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18624 entries, 0 to 18623
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    18624 non-null  object
 1   Gender  18624 non-null  object
 2   Label   18624 non-null  object
dtypes: object(3)
memory usage: 436.6+ KB


In [70]:
f_names.describe()

Unnamed: 0,Name,Gender,Label
count,18624,18624,18624
unique,17887,2,1
top,ري,M,Fake
freq,7,11148,18624


In [71]:
# check and remove duplicates
f_names2 = check_duplicates(f_names, 'fake names')

Number of duplicated names are:  737
The shape of  fake names after deleting duplicates is: (17887, 3)


In [72]:
# check to see if there is a fake name exist in real names
# False indicates that this name will be deleted from the fake names dataset
# True indicates that this name will remain exist in the fake names dataset
fake_real_mask = [False if name in names['Name'] else True for name in f_names2['Name']]
print('Number of fake names that exist in real arabic names', len([fake_real_mask == False]))
fake_names = f_names2[fake_real_mask]

Number of fake names that exist in real arabic names 1


We have our real arabic names dataset and generated fake names dataset from our real arabic names, We will now create a **Full Name Dataset** consists of 3 names separated by space

We will create **Two Datasets**:
1. The first is full names with real arabic names, by generating **random 162 name** for each name in the real arabic names dataset
2. the second is full names with fake arabic names, by generating **random 15 name** for each name in the fake arabic names dataset

In [73]:
# full names with real arabic names
full_names_real = generator.generate_full_name(mode='Real', data=names,num_of_names=162)
# full names with fake arabic names
full_names_fake = generator.generate_full_name(mode='Fake', data=fake_names,num_of_names=15)

In [74]:
# combine the two dataset together into one
full_names = pd.concat([full_names_real,full_names_fake])
# shuffle the data
full_names = full_names.sample(frac=1)
# reset index
full_names = full_names.reset_index(drop=True)

In [75]:
# view first 5 rows of the full_names dataset
full_names.head()

Unnamed: 0,Name,Gender,Label
0,عاركف أسامط قداسمة,M,Incorrect
1,براسخة خغلب إزهاب,F,Incorrect
2,مريثاء صآدح رشيتق,F,Incorrect
3,وليفة عبدالرب عبدالمنان,F,Correct
4,نإجوان ملج أدهش,M,Incorrect


In [76]:
# view last 5 rows of the full_names dataset
full_names.tail()

Unnamed: 0,Name,Gender,Label
519724,هنان عازث معن,M,Incorrect
519725,باضل عيسأ خمينس,M,Incorrect
519726,ظعينة رسول صادح,F,Correct
519727,رممضان رغسخ فتاي,M,Incorrect
519728,وديع أدهم قائد,M,Correct


In [77]:
full_names.shape

(519729, 3)

In [78]:
full_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 519729 entries, 0 to 519728
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Name    519729 non-null  object
 1   Gender  519729 non-null  object
 2   Label   519729 non-null  object
dtypes: object(3)
memory usage: 11.9+ MB


In [79]:
full_names.describe()

Unnamed: 0,Name,Gender,Label
count,519729,519729,519729
unique,519714,2,2
top,أشهم عقيل أكرم,M,Incorrect
freq,2,309633,268305


In [80]:
# check and remove duplicates
full_names = check_duplicates(full_names, 'full names')

Number of duplicated names are:  15
The shape of  full names after deleting duplicates is: (519714, 3)
