## Imports

In [100]:
#!pip install arabert

In [101]:
#!pip install transformers

In [2]:
#!pip install datasets

In [116]:
import pandas as pd 
import numpy as np
import random
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from arabert.preprocess import ArabertPreprocessor
from transformers import Trainer
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score
from datasets import Dataset
from datasets import Value
from datasets import ClassLabel
from datasets import Features
from datasets import Sequence

## Load Datasets

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# load arabic names
real_arabic_names = pd.read_csv('drive/MyDrive/Names web dataset/Arabic_names.csv')
# load male names
male = pd.read_csv('drive/MyDrive/Names web dataset/mnames.txt', sep='\n', header=None, names=['Name'])
# load female names
female = pd.read_csv('drive/MyDrive/Names web dataset/fnames.txt', sep='\n', header=None, names=['Name'])

In [5]:
# view first rows of real arabic names
real_arabic_names.head()

Unnamed: 0,Name,Gender
0,ابتسام,F
1,ابتهاج,F
2,ابتهال,F
3,اجتهاد,F
4,ازدهار,F


In [6]:
# view last 5 rows of arabic names
real_arabic_names.tail()

Unnamed: 0,Name,Gender
1400,وهبة,M
1401,ياسر,M
1402,يحيى,M
1403,يزيد,M
1404,يعقوب,M


In [7]:
# view first 5 rows of male names
male.head()

Unnamed: 0,Name
0,أبان
1,أبو بكر
2,أبو حمزة
3,أحد
4,أحمد


In [8]:
# view last 5 rows of male names
male.tail()

Unnamed: 0,Name
635,وفيق
636,وهاب
637,وهبة
638,ياسر
639,يحيى


In [9]:
# view first 5 rows of female names
female.head()

Unnamed: 0,Name
0,آيات
1,آلاء
2,آمال
3,آمنة
4,آيات


In [10]:
# view last 5 rows of female names
female.tail()

Unnamed: 0,Name
411,وميض
412,وهبة
413,يارا
414,ياسمين
415,يسرا


In [11]:
# add gender column to male and female names
male['Gender'] = 'M'
female['Gender'] = 'F'

In [12]:
# merge male names with female names
names = pd.concat([male,female])
# merge the result with arabic names
real_names = pd.concat([real_arabic_names,names])

In [13]:
# add label column to the arabic names datasets that identifies that these names are real names
real_names['Label'] = 'Real'

In [14]:
# shape of the result of merging all real arabic names datasets
real_names.shape

(2461, 3)

In [15]:
def check_duplicates(data, text):
  df = data
  # check to see if there is duplicates in rows
  print('Number of duplicated names are: ', df['Name'].duplicated().sum())
  # create mask
  duplicated_names_mask_fake = df['Name'].duplicated()
  # new arabic names dataframe without any duplicated names
  df = df[~ duplicated_names_mask_fake]
  print('The shape of ', text, 'after deleting duplicates is:', df.shape)
  # reset index
  df = df.reset_index(drop=True)
  return df

In [16]:
# check and remove duplicates
names = check_duplicates(real_names, 'real names')

Number of duplicated names are:  909
The shape of  real names after deleting duplicates is: (1552, 3)


In [17]:
names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1552 entries, 0 to 1551
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    1552 non-null   object
 1   Gender  1552 non-null   object
 2   Label   1552 non-null   object
dtypes: object(3)
memory usage: 36.5+ KB


In [18]:
names.describe()

Unnamed: 0,Name,Gender,Label
count,1552,1552,1552
unique,1552,2,1
top,ابتسام,M,Real
freq,1,929,1552


## Data generation

In [19]:
arabic_alphabet = ['ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 
                     'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي', 'أ', 'ؤ', 
                     'ة', 'آ', 'ء', 'ئ', 'إ']

In [20]:
class DataGenerator:
    
    def __init__(self, alpha):
        self.alphabet = alpha
    
    def generate(self, mode, num_of_words, name, gender):
        generated_data = dict()
        generated_names = []
        generated_gender = []
        generated_label = ['Fake'] * num_of_words
        i = 0
        while i < num_of_words:
            random_index = random.randint(0, (len(name) - 1))
            if mode == 'replace_char':
                alpha_random_index = random.randint(0, (len(self.alphabet) - 1))
                fake_name = name.replace(name[random_index], self.alphabet[alpha_random_index])
            elif mode == 'delete_char':
                fake_name = name.replace(name[random_index], "")
            elif mode == 'add_char':
                alpha_random_index = random.randint(0, (len(self.alphabet) - 1))
                fake_name = name[:random_index] + self.alphabet[alpha_random_index] + name[random_index:]
            if fake_name in generated_names:
                continue
            generated_names.append(fake_name)
            generated_gender.append(gender)
            i = i + 1
        generated_data['Name'] = generated_names
        generated_data['Gender'] = generated_gender
        generated_data['Label'] = generated_label        
        return generated_data
    
    def generate_full_name(self, mode, data, num_of_names):
        generated_data = dict()
        generated_first_last_names = []
        generated_full_names = []
        generated_gender = []
        generated_label = []
        # first name and second name
        names = list(data['Name'].values)
        genders = list(data['Gender'].values)
        for name, gender in zip(names, genders):
          counter1 = 0
          while True:
            if counter1 >= num_of_names:
              break
            index2 = random.randint(0,(len(names) - 1))
            if genders[index2] == 'F':
              continue
            first_last_name = name + ' ' + names[index2]
            generated_first_last_names.append(first_last_name)
            generated_gender.append(gender)
            if mode == 'Real':
              generated_label.append('Correct')
            elif mode == 'Fake':
              generated_label.append('Incorrect')
            counter1 = counter1 + 1
        # full name
        for name in  generated_first_last_names:
          while True:
            index2 = random.randint(0,(len(names) - 1))
            if genders[index2] == 'F':
              continue
            else:
              break
          full_name = name + ' ' + names[index2]
          generated_full_names.append(full_name)
        generated_data['Name'] = generated_full_names
        generated_data['Gender'] = generated_gender
        generated_data['Label'] = generated_label  
        return pd.DataFrame.from_dict(generated_data)

In [21]:
# create object of the DataGenerator class
generator = DataGenerator(arabic_alphabet)

For each name in the **_Arabic Names Dataset_**, we will generate **two** fake names by deleting a random character from the original name at random position, then generate **six** fake names by adding a  wrong character to the original name at random position, and **four** fake names by replacing a random single character from the original name with another wrong random character

**IMPORTANT NOTE**, we will not consider all possibilities of generating a fake name from an original name, as this will lead us to huge fake names maybe reach to more than **500,000!** fake name, and that will lead to **Imbalanced Dataset Problem**, So as a start it will be enough to generate **12x** fake names from the real names dataset

In [22]:
# generate 2 fake names by delete a char
f_names_deleted_char = names.apply(lambda x: generator.generate(mode='delete_char', num_of_words=2, 
                                                                name=x['Name'], gender=x['Gender']), axis=1)
# generate 4 fake names by replace a char
f_names_replaced_char = names.apply(lambda x: generator.generate(mode='replace_char', num_of_words=4, 
                                                                name=x['Name'], gender=x['Gender']), axis=1)
# generate 6 fake names by adding a wrong char
f_names_adding_wrong_char = names.apply(lambda x: generator.generate(mode='add_char', num_of_words=6, 
                                                                name=x['Name'], gender=x['Gender']), axis=1)

In [23]:
# this function is used to re-represent the output of the generator to make it one dictionary instead of a 
# dictionary for each name, to convert the dictionary to a dataframe
def one_dict(multi_dict):
    dic = dict()
    name = []
    gender = []
    label = []
    for row in multi_dict:
        for names in row['Name']:
            name.append(names)
        for genders in row['Gender']:
            gender.append(genders)
        for labels in row['Label']:
            label.append(labels)  
    dic['Name'] = name
    dic['Gender'] = gender
    dic['Label'] = label
    return dic

In [24]:
# convert the output of the generator to one dictionary containing all names
deleted_char_dict = one_dict(f_names_deleted_char)
replaced_char_dict = one_dict(f_names_replaced_char)
added_char_dcit = one_dict(f_names_adding_wrong_char)
# convert the dictionaries into dataframe form
deleted_char_df = pd.DataFrame.from_dict(deleted_char_dict)
replaced_char_df = pd.DataFrame.from_dict(replaced_char_dict)
added_char_df = pd.DataFrame.from_dict(added_char_dcit)
# concatenate the 3 dataframe to form the Fake Names Dataset
f_names = pd.concat([added_char_df, replaced_char_df, deleted_char_df],ignore_index=True)

In [25]:
# view first 5 rows of fake names
f_names.head()

Unnamed: 0,Name,Gender,Label
0,ابتسالم,F,Fake
1,ابتسارم,F,Fake
2,اهبتسام,F,Fake
3,ابهتسام,F,Fake
4,ايبتسام,F,Fake


In [26]:
# view last 5 rows of fake names
f_names.tail()

Unnamed: 0,Name,Gender,Label
18619,منحة,F,Fake
18620,منا,F,Fake
18621,ينا,F,Fake
18622,نصاح,F,Fake
18623,صاحة,F,Fake


In [27]:
f_names.shape

(18624, 3)

In [28]:
f_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18624 entries, 0 to 18623
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    18624 non-null  object
 1   Gender  18624 non-null  object
 2   Label   18624 non-null  object
dtypes: object(3)
memory usage: 436.6+ KB


In [29]:
f_names.describe()

Unnamed: 0,Name,Gender,Label
count,18624,18624,18624
unique,17848,2,1
top,سام,M,Fake
freq,7,11148,18624


In [30]:
# check and remove duplicates
f_names2 = check_duplicates(f_names, 'fake names')

Number of duplicated names are:  776
The shape of  fake names after deleting duplicates is: (17848, 3)


In [31]:
# check to see if there is a fake name exist in real names
# False indicates that this name will be deleted from the fake names dataset
# True indicates that this name will remain exist in the fake names dataset
fake_real_mask = [False if name in names['Name'] else True for name in f_names2['Name']]
print('Number of fake names that exist in real arabic names', len([fake_real_mask == False]))
fake_names = f_names2[fake_real_mask]

Number of fake names that exist in real arabic names 1


We have our real arabic names dataset and generated fake names dataset from our real arabic names, We will now create a **Full Name Dataset** consists of 3 names separated by space

We will create **Two Datasets**:
1. The first is full names with real arabic names, by generating **random 162 name** for each name in the real arabic names dataset
2. the second is full names with fake arabic names, by generating **random 15 name** for each name in the fake arabic names dataset

In [32]:
# full names with real arabic names
full_names_real = generator.generate_full_name(mode='Real', data=names,num_of_names=162)
# full names with fake arabic names
full_names_fake = generator.generate_full_name(mode='Fake', data=fake_names,num_of_names=15)

In [63]:
# combine the two dataset together into one
full_names = pd.concat([full_names_real,full_names_fake])
# shuffle the data
full_names = full_names.sample(frac=1)
# reset index
full_names = full_names.reset_index(drop=True)

In [34]:
# view first 5 rows of the full_names dataset
full_names.head()

Unnamed: 0,Name,Gender,Label
0,آاهدة غشاري مبسفر,F,Incorrect
1,البراء ساهر جبر,M,Correct
2,طافر عريض كؤسب,M,Incorrect
3,ذصابر أضصيل حاؤفظ,M,Incorrect
4,أريام غصاب باهر,F,Correct


In [35]:
# view last 5 rows of the full_names dataset
full_names.tail()

Unnamed: 0,Name,Gender,Label
519139,ريماس رافل زيان,F,Correct
519140,وفيقة ياسر حماد,F,Correct
519141,جوان أصيل عارف,F,Correct
519142,خليوي هادف حماد,M,Correct
519143,ؤرسيمة لطام حاؤرب,F,Incorrect


In [36]:
full_names.shape

(519144, 3)

In [37]:
full_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 519144 entries, 0 to 519143
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Name    519144 non-null  object
 1   Gender  519144 non-null  object
 2   Label   519144 non-null  object
dtypes: object(3)
memory usage: 11.9+ MB


In [38]:
full_names.describe()

Unnamed: 0,Name,Gender,Label
count,519144,519144,519144
unique,519129,2,2
top,حسان شهم عوض,M,Incorrect
freq,2,309093,267720


In [39]:
# check and remove duplicates
full_names = check_duplicates(full_names, 'full names')

Number of duplicated names are:  15
The shape of  full names after deleting duplicates is: (519129, 3)


## Core Model

For the Core Model part i will use a **Pretrained Model** that is **AraBERT** and it is an Arabic pretrained language model based on Google's BERT architechture but for  Arabic Language Understanding

Prepare the data for the **Model**

In [64]:
# drop the gender column as it wouldn't be useful anymore
full_names = full_names.drop(columns=['Gender'])
# divide data into x that represents features and y that represent target
X = full_names.drop(columns=['Label'])
y = full_names[['Label']]
# divide the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
# divide the train data into train and validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

In [67]:
# reset index for all data
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

In [41]:
# load the model, tokenizer and preprocessor
model_name = 'aubmindlab/bert-large-arabertv02'
tokenizer =  AutoTokenizer.from_pretrained(model_name)
arabert = AutoModel.from_pretrained(model_name)
preprocessor = ArabertPreprocessor(model_name=model_name)

Some weights of the model checkpoint at aubmindlab/bert-large-arabertv02 were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [69]:
# apply preprocessor on the train, validation and test data
X_train = X_train['Name'].apply(lambda x: preprocessor.preprocess(x))
X_val = X_val['Name'].apply(lambda x: preprocessor.preprocess(x))
X_test = X_test['Name'].apply(lambda x: preprocessor.preprocess(x))

In [70]:
# convert them back to dataframe and reset_index
X_train = pd.DataFrame(X_train,columns=['Name'])
X_val = pd.DataFrame(X_val,columns=['Name'])
X_test = pd.DataFrame(X_test,columns=['Name'])

In [73]:
# tokenize train data
X_train = [tokenizer.tokenize(name ,max_length=3, truncation=True) for name in X_train['Name'].tolist()]
# tokenize validation data
X_val = [tokenizer.tokenize(name ,max_length=3, truncation=True) for name in X_val['Name'].tolist()]
# tokenize the test data
X_test = [tokenizer.tokenize(name ,max_length=3, truncation=True) for name in X_test['Name'].tolist()]

In [74]:
# convert tokens in train, validation and test to number representation
X_train = [tokenizer.convert_tokens_to_ids(name) for name in X_train]
X_val = [tokenizer.convert_tokens_to_ids(name) for name in X_val]
X_test = [tokenizer.convert_tokens_to_ids(name) for name in X_test]

In [95]:
df_train = pd.concat([pd.DataFrame({'Name':X_train}, columns=['Name']),y_train],axis=1)
df_val = pd.concat([pd.DataFrame({'Name':X_val}, columns=['Name']),y_val],axis=1)
df_test = pd.concat([pd.DataFrame({'Name':X_test}, columns=['Name']),y_test],axis=1)

In [114]:
df_train['Name'].dtype

dtype('O')

In [117]:
# create datasets to be passed to Trainer
features = Features({'Name': Sequence(feature=Value('int64')),'Label': ClassLabel(num_classes=2, names=['Correct','Incorrect'])})
train_dataset = Dataset.from_pandas(df_train,features)
val_dataset = Dataset.from_pandas(df_val,features)
test_dataset = Dataset.from_pandas(df_test,features)

In [119]:
def compute_metrics(eval_prediction):
  acc = accuracy_score(eval_prediction.label_ids, eval_prediction.predictions)
  return {'accuracy': acc}

In [120]:
# arguments
training_arguments = TrainingArguments(
  output_dir = 'drive/MyDrive/',
  num_train_epochs= 10,
  per_device_train_batch_size=8,
  per_device_eval_batch_size=8,
  evaluation_strategy = 'epoch'
)

trainer = Trainer(
    model = arabert,
    args = training_arguments,
    train_dataset = train_dataset,
    eval_dataset= val_dataset,
    tokenizer = tokenizer,
    compute_metrics= compute_metrics,
)

# train the model
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `BertModel.forward` and have been ignored: Name, Label. If Name, Label are not expected by `BertModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 0
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 468860
  Number of trainable parameters = 369423360


IndexError: ignored