## Imports

In [1]:
import pandas as pd 
import numpy as np
import random

## Load Datasets

In [2]:
# load arabic names
real_arabic_names = pd.read_csv('Names web dataset/Arabic_names.csv')
# load male names
male = pd.read_csv('Names web dataset/mnames.txt', sep='\n', header=None, names=['Name'])
# load female names
female = pd.read_csv('Names web dataset/fnames.txt', sep='\n', header=None, names=['Name'])

In [3]:
# view first rows of real arabic names
real_arabic_names.head()

Unnamed: 0,Name,Gender
0,ابتسام,F
1,ابتهاج,F
2,ابتهال,F
3,اجتهاد,F
4,ازدهار,F


In [4]:
# view last 5 rows of arabic names
real_arabic_names.tail()

Unnamed: 0,Name,Gender
1400,وهبة,M
1401,ياسر,M
1402,يحيى,M
1403,يزيد,M
1404,يعقوب,M


In [5]:
# view first 5 rows of male names
male.head()

Unnamed: 0,Name
0,أبان
1,أبو بكر
2,أبو حمزة
3,أحد
4,أحمد


In [6]:
# view last 5 rows of male names
male.tail()

Unnamed: 0,Name
635,وفيق
636,وهاب
637,وهبة
638,ياسر
639,يحيى


In [7]:
# view first 5 rows of female names
female.head()

Unnamed: 0,Name
0,آيات
1,آلاء
2,آمال
3,آمنة
4,آيات


In [8]:
# view last 5 rows of female names
female.tail()

Unnamed: 0,Name
411,وميض
412,وهبة
413,يارا
414,ياسمين
415,يسرا


In [9]:
# add gender column to male and female names
male['Gender'] = 'M'
female['Gender'] = 'F'

In [10]:
# merge male names with female names
names = pd.concat([male,female])
# merge the result with arabic names
real_names = pd.concat([real_arabic_names,names])

In [11]:
# add label column to the arabic names datasets that identifies that these names are real names
real_names['Label'] = 'Real'

In [12]:
# shape of the result of merging all real arabic names datasets
real_names.shape

(2461, 3)

In [13]:
# check to see if there is duplicates in rows
print('Number of duplicated names are: ', real_names['Name'].duplicated().sum())
# create mask
duplicated_names_mask = real_names['Name'].duplicated()
# new arabic names dataframe without any duplicated names
names = real_names[~ duplicated_names_mask]
print('The shape of the final arabic names is', names.shape)

Number of duplicated names are:  909
The shape of the final arabic names is (1552, 3)


In [14]:
names.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1552 entries, 0 to 385
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    1552 non-null   object
 1   Gender  1552 non-null   object
 2   Label   1552 non-null   object
dtypes: object(3)
memory usage: 48.5+ KB


In [15]:
names.describe()

Unnamed: 0,Name,Gender,Label
count,1552,1552,1552
unique,1552,2,1
top,إيمان,M,Real
freq,1,929,1552


## Data generation

In [16]:
arabic_alphabet = ['ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 
                     'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي', 'أ', 'ؤ', 
                     'ة', 'آ', 'ء', 'ئ', 'إ']

In [67]:
class DataGenerator:
    
    def __init__(self, alpha):
        self.alphabet = alpha
    
    def generate(self, mode, num_of_words, name, gender):
        generated_data = dict()
        generated_names = []
        generated_gender = []
        generated_label = ['Fake'] * num_of_words
        i = 0
        while i < num_of_words:
            random_index = random.randint(0, (len(name) - 1))
            if mode == 'replace_char':
                alpha_random_index = random.randint(0, (len(self.alphabet) - 1))
                fake_name = name.replace(name[random_index], self.alphabet[alpha_random_index])
            elif mode == 'delete_char':
                fake_name = name.replace(name[random_index], "")
            elif mode == 'add_char':
                alpha_random_index = random.randint(0, (len(self.alphabet) - 1))
                fake_name = name[:random_index] + self.alphabet[alpha_random_index] + name[random_index:]
            if fake_name in generated_names:
                continue
            generated_names.append(fake_name)
            generated_gender.append(gender)
            i = i + 1
        generated_data['Name'] = generated_names
        generated_data['Gender'] = generated_gender
        generated_data['Label'] = generated_label        
        return generated_data
    
    def generate_full_name(self, real, fake):
        pass

In [65]:
# create object of the DataGenerator class
generator = DataGenerator(arabic_alphabet)

For each name in the **_Arabic Names Dataset_**, we will generate **two** fake names by deleting a random character from the original name at random position, then generate **six** fake names by adding a  wrong character to the original name at random position, and **four** fake names by replacing a random single character from the original name with another wrong random character

**IMPORTANT NOTE**, we will not consider all possibilities of generating a fake name from an original name, as this will lead us to huge fake names maybe reach to more than **500,000!** fake name, and that will lead to **Imbalanced Dataset Problem**, So as a start it will be enough to generate **12x** fake names from the real names dataset

In [66]:
# generate 2 fake names by delete a char
f_names_deleted_char = names.apply(lambda x: generator.generate(mode='delete_char', num_of_words=2, 
                                                                name=x['Name'], gender=x['Gender']), axis=1)
# generate 4 fake names by replace a char
f_names_replaced_char = names.apply(lambda x: generator.generate(mode='replace_char', num_of_words=4, 
                                                                name=x['Name'], gender=x['Gender']), axis=1)
# generate 6 fake names by adding a wrong char
f_names_adding_wrong_char = names.apply(lambda x: generator.generate(mode='add_char', num_of_words=6, 
                                                                name=x['Name'], gender=x['Gender']), axis=1)

In [68]:
# this function is used to re-represent the output of the generator to make it one dictionary instead of a 
# dictionary for each name, to convert the dictionary to a dataframe
def one_dict(multi_dict):
    dic = dict()
    name = []
    gender = []
    label = []
    for row in multi_dict:
        for names in row['Name']:
            name.append(names)
        for genders in row['Gender']:
            gender.append(genders)
        for labels in row['Label']:
            label.append(labels)  
    dic['Name'] = name
    dic['Gender'] = gender
    dic['Label'] = label
    return dic

In [81]:
# convert the output of the generator to one dictionary containing all names
deleted_char_dict = one_dict(f_names_deleted_char)
replaced_char_dict = one_dict(f_names_replaced_char)
added_char_dcit = one_dict(f_names_adding_wrong_char)
# convert the dictionaries into dataframe form
deleted_char_df = pd.DataFrame.from_dict(deleted_char_dict)
replaced_char_df = pd.DataFrame.from_dict(replaced_char_dict)
added_char_df = pd.DataFrame.from_dict(added_char_dcit)
# concatenate the 3 dataframe to form the Fake Names Dataset
fake_names = pd.concat([added_char_df, replaced_char_df, deleted_char_df],ignore_index=True)

In [85]:
# view first 5 rows of fake names
fake_names.head()

Unnamed: 0,Name,Gender,Label
0,ابتسازم,F,Fake
1,ابتسارم,F,Fake
2,آابتسام,F,Fake
3,ابتسزام,F,Fake
4,ابتساجم,F,Fake


In [86]:
# view last 5 rows of fake names
fake_names.tail()

Unnamed: 0,Name,Gender,Label
18619,منحة,F,Fake
18620,ميا,F,Fake
18621,منا,F,Fake
18622,نصحة,F,Fake
18623,نصاة,F,Fake


In [87]:
fake_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18624 entries, 0 to 18623
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    18624 non-null  object
 1   Gender  18624 non-null  object
 2   Label   18624 non-null  object
dtypes: object(3)
memory usage: 436.6+ KB


In [88]:
fake_names.describe()

Unnamed: 0,Name,Gender,Label
count,18624,18624,18624
unique,17844,2,1
top,مير,M,Fake
freq,8,11148,18624
