## Imports

In [7]:
import numpy as np
import pandas as pd
import random
import os
from transformers import AutoTokenizer
from tensorflow.keras.utils import pad_sequences

## Model Evaluation Class

In [2]:
class ModelEvaluation:
    """Helpers for inspecting a trained two-output classifier: training curves and predictions."""

    def __init__(self):
        pass

    def plot_curves(self, history):
        """
        Plots the loss and accuracy curves for a model.

        Parameters:
        - history: A history object returned by a model during training. Its ``history``
          attribute must be a mapping with the keys 'loss', 'val_loss', 'accuracy'
          and 'val_accuracy'.

        Returns:
        - None. The curves are drawn on a new Matplotlib figure with two side-by-side axes.
          The figure is not explicitly shown here; a notebook backend renders it automatically.
        """
        # Imported here because the file's import cell does not bring `plt` into scope.
        import matplotlib.pyplot as plt

        fig, axes = plt.subplots(nrows=1, ncols=2)
        # dimensions of figure
        fig.set_figheight(6)
        fig.set_figwidth(14)
        # one panel per metric: (key in history.history, panel title)
        panels = (
            ('loss', 'Model loss per epoch '),
            ('accuracy', 'Model accuracy per epoch '),
        )
        for ax, (metric, title) in zip(axes, panels):
            ax.plot(history.history[metric], label='Train')
            ax.plot(history.history['val_' + metric], label='Validation')
            ax.set_xlabel('epoch')
            ax.set_ylabel(metric)
            ax.set_title(title)
            ax.legend()

    def predictions(self, model, X_test):
        """
        Makes predictions using a model.

        Parameters:
        - model: A model object exposing ``predict(X_test)``. The result is indexed as a
          2-D array whose column 0 is the score for "correct" and column 1 the score
          for "incorrect".
        - X_test: The input data to use for predictions (passed to ``model.predict`` unchanged).

        Returns:
        - A dictionary containing the predictions. The dictionary has the following structure:
        {
        'Correct': integer array, 1 where column 0 is strictly greater than column 1,
        'Incorrect': integer array, 1 where column 1 is strictly greater than column 0
        }
        When both scores are equal, the sample gets 0 under both keys.
        """
        # perform predictions
        pred = model.predict(X_test)
        # the model is more confident that the full name is correct than that it is incorrect
        correct_pred = (pred[:, 0] > pred[:, 1]).astype(int)
        # the model is more confident that the full name is incorrect than that it is correct
        incorrect_pred = (pred[:, 1] > pred[:, 0]).astype(int)
        # put predictions into a dictionary under two keys correct and incorrect
        return {'Correct': correct_pred, 'Incorrect': incorrect_pred}

## Data Transformer Class

In [3]:
class DataTransformer:
    """Turns names into fixed-length token-id sequences using a pretrained tokenizer."""

    def __init__(self, model_name):
        # Loads the tokenizer identified by `model_name` via AutoTokenizer.from_pretrained.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def transform_data(self, X, y=None, max_len=50):
        """
        This function is used to transform the data into a format that can be used by the model.
        It removes spaces from each name, splits it into characters, converts the characters
        into ids with the tokenizer and pads/truncates each sequence (at the end) to `max_len`.
        It will also encode the labels into one-hot encoding.
        The caller's `X` is left untouched.
        Parameters:
        - X: A Pandas DataFrame with a 'Name' column, or a single name (string).
        - y: A Pandas DataFrame with a 'Label' column, or None when there are no labels.
        - max_len: Length every id sequence is padded/truncated to (default 50).
        Returns:
        - X alone when y is None, otherwise the tuple (X, y), where
        - X: the padded id sequences as returned by pad_sequences.
        - y: the one-hot labels as the Pandas DataFrame returned by pd.get_dummies.
        """
        if not isinstance(X, pd.DataFrame):
            # a single name was passed: wrap it in a one-row frame
            X = pd.DataFrame({'Name': [X]})

        def to_ids(name):
            # drop spaces, split into characters, then map each character to its id
            return self.tokenizer.convert_tokens_to_ids(list(name.replace(" ", "")))

        # Build a new Series instead of assigning back into X['Name'], which would
        # overwrite the names in the caller's DataFrame with id lists.
        token_ids = X['Name'].apply(to_ids)
        # pad / truncate at the end so every row has exactly max_len ids
        X = pad_sequences(token_ids, maxlen=max_len, padding='post', truncating='post')
        if y is None:
            return X
        # encode labels in y data
        y = pd.get_dummies(y['Label'])
        return X, y

## Data Duplicate Checker Class

In [4]:
class DataDuplicateChecker:
    """Reports and removes rows whose 'Name' value has already appeared."""

    def __init__(self):
        pass

    def check_duplicates(self, data, text):
        """
        Drops rows with a repeated 'Name', keeping the first occurrence of each name.

        Parameters:
        - data: A Pandas DataFrame with a 'Name' column. It is not modified.
        - text: A label for the dataset, used only in the printed summary.

        Returns:
        - A new Pandas DataFrame without the duplicated names, with a fresh 0..n-1 index.
        """
        # True for every row whose name was already seen earlier in the frame
        repeated = data['Name'].duplicated()
        print('Number of duplicated names are: ', repeated.sum())
        unique_rows = data[~repeated]
        print('The shape of ', text, 'after deleting duplicates is:', unique_rows.shape)
        return unique_rows.reset_index(drop=True)

## Data Generator Class

In [5]:
class DataGenerator:
    """Generates synthetic name data: single-character corruptions of a name and three-part full names."""

    def __init__(self):
        # Arabic letters used as replacement / insertion characters.
        self.alphabet = ['ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 
                     'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي', 'أ', 'ؤ', 
                     'ة', 'آ', 'ء', 'ئ', 'إ']
    
    def flatten_dict(self, multi_dict):
        """
        Flattens a list of dictionaries into a single DataFrame.

        Parameters:
        - multi_dict: A list of dictionaries. Each dictionary must have the following structure:
        {
        'Name': list of names (strings),
        'Gender': list of genders (strings),
        'Label': list of labels (strings)
        }

        Returns:
        - A Pandas DataFrame with the columns 'Name', 'Gender' and 'Label', holding the
          concatenation of the corresponding lists in input order.
        """
        name = []
        gender = []
        label = []
        for row in multi_dict:
            name.extend(row['Name'])
            gender.extend(row['Gender'])
            label.extend(row['Label'])
        return pd.DataFrame({
            'Name': name,
            'Gender': gender,
            'Label': label
        })

    def _pick_fakes(self, candidates, num_of_words, name, gender):
        # Shared tail of the fake-name generators: de-duplicate the candidate variants,
        # drop any variant equal to the real name, and randomly pick `num_of_words` of them.
        unique = [c for c in dict.fromkeys(candidates) if c != name]
        count = max(num_of_words, 0)
        if count > len(unique):
            # Failing fast here avoids retrying forever for variants that do not exist.
            raise ValueError(
                f"cannot generate {num_of_words} distinct fake names from {name!r}: "
                f"only {len(unique)} distinct variants exist"
            )
        return {
            'Name': random.sample(unique, count),
            'Gender': [gender] * count,
            'Label': ['Fake'] * count
        }

    def generate_fake_names_replace_char(self, num_of_words, name, gender):
        """
        Generates fake names by replacing the character at ONE position of the given name
        with a letter from the alphabet. Variants identical to the real name are never returned.

        Parameters:
        - num_of_words: An integer representing the number of fake names to generate.
        - name: A string representing the base name.
        - gender: A string representing the gender of the name; copied to every generated row.

        Returns:
        - A dictionary containing the generated names, genders, and labels. The dictionary has the following structure:
        {
            'Name': list of generated names (strings, all distinct),
            'Gender': list of genders (strings),
            'Label': list of labels (always 'Fake')
        }

        Raises:
        - ValueError: if fewer than num_of_words distinct variants exist.
        """
        # Slice around the index so only that position changes (str.replace would change
        # every occurrence of the character).
        candidates = (
            name[:i] + letter + name[i + 1:]
            for i in range(len(name))
            for letter in self.alphabet
        )
        return self._pick_fakes(candidates, num_of_words, name, gender)

    def generate_fake_names_delete_char(self, num_of_words, name, gender):
        """
        Generates fake names by deleting the character at ONE position of the given name.

        Parameters:
        - num_of_words: An integer representing the number of fake names to generate.
        - name: A string representing the base name.
        - gender: A string representing the gender of the name; copied to every generated row.

        Returns:
        - A dictionary containing the generated names, genders, and labels. The dictionary has the following structure:
        {
            'Name': list of generated names (strings, all distinct),
            'Gender': list of genders (strings),
            'Label': list of labels (always 'Fake')
        }

        Raises:
        - ValueError: if fewer than num_of_words distinct variants exist
          (there are at most len(name) of them).
        """
        # Slice around the index so only that single character is removed.
        candidates = (name[:i] + name[i + 1:] for i in range(len(name)))
        return self._pick_fakes(candidates, num_of_words, name, gender)

    def generate_fake_names_add_char(self, num_of_words, name, gender):
        """
        Generates fake names by inserting one letter from the alphabet before one of the
        characters of the given name (a letter is never appended after the last character).

        Parameters:
        - num_of_words: An integer representing the number of fake names to generate.
        - name: A string representing the base name.
        - gender: A string representing the gender of the name; copied to every generated row.

        Returns:
        - A dictionary containing the generated names, genders, and labels. The dictionary has the following structure:
        {
            'Name': list of generated names (strings, all distinct),
            'Gender': list of genders (strings),
            'Label': list of labels (always 'Fake')
        }

        Raises:
        - ValueError: if fewer than num_of_words distinct variants exist.
        """
        candidates = (
            name[:i] + letter + name[i:]
            for i in range(len(name))
            for letter in self.alphabet
        )
        return self._pick_fakes(candidates, num_of_words, name, gender)

    def generate_full_name(self, full_name_type, data, num_of_names):
        """
        Generates three-part full names based on the given data. Every name in `data` is used
        as a first name; the second and third parts are drawn at random from the names whose
        gender is not 'F'.

        Parameters:
        - full_name_type: A string representing the mode of generation. Can be 'Real'
          (rows are labelled 'Correct') or 'Fake' (rows are labelled 'Incorrect').
        - data: A Pandas DataFrame containing names and genders. The DataFrame must have the following structure:
        {
            'Name': list of names (strings),
            'Gender': list of genders (strings)
        }
        - num_of_names: An integer representing the number of full names to generate
          for EACH name in `data`.

        Returns:
        - A Pandas DataFrame containing the generated full names, genders, and labels. The DataFrame has the following structure:
        {
            'Name': list of generated full names (strings),
            'Gender': gender of the first name of each row,
            'Label': list of labels (strings)
        }

        Raises:
        - ValueError: if full_name_type is not 'Real' or 'Fake', if num_of_names is smaller
          than 1, or if `data` has rows but none with a gender other than 'F'.
        """
        label_by_type = {'Real': 'Correct', 'Fake': 'Incorrect'}
        if full_name_type not in label_by_type:
            raise ValueError("full_name_type must be 'Real' or 'Fake'")
        if num_of_names < 1:
            raise ValueError("num_of_names must be at least 1")
        label = label_by_type[full_name_type]

        names = list(data['Name'].values)
        genders = list(data['Gender'].values)
        # Only names whose gender is not 'F' may be used as second / third part.
        eligible = [n for n, g in zip(names, genders) if g != 'F']
        if names and not eligible:
            # Without this check, drawing a non-'F' name would never succeed.
            raise ValueError("data contains no name with a gender other than 'F'")

        generated_full_names = []
        generated_gender = []
        for name, gender in zip(names, genders):
            for _ in range(num_of_names):
                second = random.choice(eligible)
                third = random.choice(eligible)
                generated_full_names.append(name + ' ' + second + ' ' + third)
                generated_gender.append(gender)
        return pd.DataFrame({
            'Name': generated_full_names,
            'Gender': generated_gender,
            'Label': [label] * len(generated_full_names)
        })

## Data Loader Class

In [8]:
class DataLoader:
    """Reads CSV files stored as <base_dir>/<dir_name>/<file_name>."""

    # Project root used when no base_dir is given (kept so existing callers behave as before).
    DEFAULT_BASE_DIR = "C:/Users/mahmo/Desktop/Name-Verification-Model"

    def __init__(self, base_dir=None):
        """
        Parameters:
        - base_dir: Root directory that contains the data directories. Defaults to
          DEFAULT_BASE_DIR when omitted.
        """
        self.base_dir = self.DEFAULT_BASE_DIR if base_dir is None else base_dir

    def load_data(self, dir_name, file_name, **kwargs):
        """
        Loads a CSV file into a DataFrame.

        Parameters:
        - dir_name: Name of the directory under base_dir that holds the file.
        - file_name: Name of the CSV file.
        - **kwargs: Extra keyword arguments forwarded to pd.read_csv.

        Returns:
        - The Pandas DataFrame returned by pd.read_csv.
        """
        relative_path = os.fspath(self.base_dir) + "/" + dir_name + "/" + file_name
        # os.path.join ignores the cwd prefix when base_dir is absolute,
        # and resolves a relative base_dir against the current working directory.
        file_path = os.path.join(os.getcwd(), relative_path)
        return pd.read_csv(file_path, **kwargs)