In [842]:
from presidio_evaluator.data_generator import PresidioDataGenerator
import pandas as pd
from presidio_evaluator.data_generator.faker_extensions import (
    FakerSpansResult,
    RecordsFaker,
    NationalityProvider,
    OrganizationProvider,
    AgeProvider,
    AddressProviderNew,
    PhoneNumberProviderNew,
)
import pprint
import numpy as np
from collections import Counter
from presidio_evaluator import InputSample
from typing import Dict, List
import tqdm
from presidio_evaluator.validation import split_dataset, save_to_json
from datetime import date


In [843]:
# Read the sentence templates that generate_fake_data consumes further down.
templates_file_path = './data/train_templates.txt'
sentence_templates = PresidioDataGenerator.read_template_file(templates_file_path)

In [844]:
# CSV export from FakeNameGenerator.com (per the file name); read into a DataFrame in the next cell.
fake_name_generator_file = 'data/FakeNameGenerator.com_3000.csv'


In [845]:
# Load the records and keep only the rows whose "NameSet" value is in `nationalities`.
# NOTE(review): `nationalities` is not defined in any cell visible here — this cell only
# works if the name is already in the kernel; define it before this cell.
fake_name_generator_df = pd.read_csv(fake_name_generator_file)
fake_name_generator_df = fake_name_generator_df[fake_name_generator_df["NameSet"].isin(nationalities)]

In [847]:
# NOTE(review): this instance is not used in this cell, and `data_generator` is rebound
# a few cells below with the custom faker — confirm it is still needed.
data_generator = PresidioDataGenerator()
# Called on the class rather than on the instance above; the returned frame replaces the loaded one.
fake_name_generator_df = PresidioDataGenerator.update_fake_name_generator_df(fake_name_generator_df)

In [848]:
# Faker backed by the prepared records, extended with three of the imported custom providers.
# NOTE(review): NationalityProvider and AgeProvider are imported but not registered here,
# although the translator further down has "nationality" and "age" keys — confirm intended.
fake = RecordsFaker(records=fake_name_generator_df)
fake.add_provider(OrganizationProvider)
fake.add_provider(AddressProviderNew)
fake.add_provider(PhoneNumberProviderNew)



In [849]:
# Generator that uses the custom faker built above.
# NOTE(review): lower_case_ratio=0.5 presumably lower-cases about half of the samples — confirm.
data_generator = PresidioDataGenerator(
    custom_faker=fake, lower_case_ratio=0.5
)

## Add aliases for classes not available in the Faker library

In [850]:
# Make the `credit_card_number` provider reachable under the name `credit_card`
# (presumably the placeholder name used in the templates — confirm).
data_generator.add_provider_alias(
    provider_name="credit_card_number", new_name="credit_card"
)

## Generate the fake data

In [None]:
# Materialise the generated samples straight away so they can be indexed and iterated
# more than once by the cells below.
fake_records = list(
    data_generator.generate_fake_data(templates=sentence_templates, n_samples=180)
)

# Peek at the first generated sample.
pprint.pprint(fake_records[0])

## Basic Data Statistics 

In [None]:
# How many generated samples came from each template.
count_per_template_id = Counter(sample.template_id for sample in fake_records)
records_per_template = list(count_per_template_id.values())

print(f"Total: {sum(records_per_template)}")
print(f"Avg # of records per template: {np.mean(records_per_template)}")
print(f"Median # of records per template: {np.median(records_per_template)}")
print(f"Std: {np.std(records_per_template)}")

In [None]:
# Tally entity types over every span of every generated sample.
count_per_entity = Counter(
    span.type for record in fake_records for span in record.spans
)

count_per_entity

## Translate entity types (Optional)

In [854]:
# Maps the entity type found on generated spans / template placeholders (keys)
# to the label written to the output dataset (values). Several keys share one label,
# e.g. every name variant becomes "PERSON" and every address part "STREET_ADDRESS".
translator = {
    "iban": "IBAN_CODE",
    "company": "ORGANIZATION",
    "organization": "ORGANIZATION",
    "name_female": "PERSON",
    "address": "STREET_ADDRESS",
    "country": "GPE",
    "state": "GPE",
    "credit_card": "CREDIT_CARD",
    "city": "GPE",
    "street_name": "STREET_ADDRESS",
    "building_number": "STREET_ADDRESS",
    "name": "PERSON",
    "last_name": "PERSON",
    "last_name_male": "PERSON",
    "last_name_female": "PERSON",
    "first_name": "PERSON",
    "first_name_male": "PERSON",
    "first_name_female": "PERSON",
    "phone_number": "PHONE_NUMBER",
    "email": "EMAIL_ADDRESS",
    "date_time": "DATE_TIME",
    "date_of_birth": "DATE_TIME",
    "day_of_week": "DATE_TIME",
    "name_male": "PERSON",
    "prefix_male": "TITLE",
    "prefix_female": "TITLE",
    "prefix": "TITLE",
    "nationality": "NRP",
    "first_name_nonbinary": "PERSON",
    "postcode": "STREET_ADDRESS",
    "secondary_address": "STREET_ADDRESS",
    "job": "TITLE",
    "state_abbr": "GPE",
    "age": "AGE",
}

def update_entity_types(dataset:List["FakerSpansResult"], entity_mapping:Dict[str,str]):
    """Replace entity types in place using a translator dictionary.

    Args:
        dataset: Samples exposing `spans` (objects with a writable `type`)
            and a `template` string.
        entity_mapping: Source entity type -> target entity type.

    A span whose type is not a key of `entity_mapping` keeps its current type.
    This makes the function safe to run again on an already translated dataset
    (re-executing the notebook cell) instead of raising KeyError half-way through.
    """

    for sample in dataset:
        # update entity types on spans; unknown / already translated types are left as they are
        for span in sample.spans:
            span.type = entity_mapping.get(span.type, span.type)
        # update entity types on the template string
        for key, value in entity_mapping.items():
            sample.template = sample.template.replace("{{" + key + "}}", "{{" + value + "}}")

# Relabel fake_records in place (span types and template strings) using the translator.
update_entity_types(fake_records, entity_mapping=translator)

# Convert the fake records into input samples containing text utterances, spans and templates

In [856]:
# Build one InputSample per generated record, requesting the "BIO" scheme;
# tqdm wraps the iteration to show progress.
input_samples = [
    InputSample.from_faker_spans_result(faker_spans_result=fake_record, scheme="BIO")
    for fake_record in tqdm.tqdm(fake_records)
]

100%|██████████| 180/180 [00:01<00:00, 168.29it/s]


In [218]:
# Split the samples into two parts using the ratios below.
# NOTE(review): the second part is named `test` here but is written to val_*.json in the save cell.
TRAIN_TEST_RATIOS = [0.7,0.3]
train,test = split_dataset(input_samples, TRAIN_TEST_RATIOS)

# Save Dataset

In [857]:
# Date stamp in "%b-%d-%Y" form (e.g. "Apr-03-2023") embedded in both output file names.
DATE_DATE = date.today().strftime("%b-%d-%Y")

# `test` is the second split from the cell above; it is stored under the "val" name.
save_to_json(train, "./data/train_{}.json".format(DATE_DATE))
save_to_json(test, "./data/val_{}.json".format(DATE_DATE))



# Tests

In [867]:
# Read back the two files written by the save cell, using the same DATE_DATE stamp for both
# so the train and validation frames always come from the same generation run.
# (Hard-coded dated names break on a fresh run and can pair files from different runs.)
train_data = pd.read_json("./data/train_{}.json".format(DATE_DATE))
val_data = pd.read_json("./data/val_{}.json".format(DATE_DATE))

In [869]:
def entity_from_span(data, field="entity_type"):
    """
    Collect one field from every span of a dataset, in row order.
    Args:
        data: Object exposing a `spans` attribute that iterates over per-sample
            collections of span mappings (e.g. the DataFrame loaded from the saved JSON).
        field: Key to read from each span mapping.
    Returns:
        Flat list holding the `field` value of each span; samples with no spans add nothing.
    """
    return [
        span_info[field]
        for sample_spans in data.spans
        if len(sample_spans) > 0
        for span_info in sample_spans
    ]

In [886]:
def count_entity_types(data: pd.DataFrame) -> None:
    """
    Print the number of entities of each type in a given dataset, most common first.
    Args:
        data: Dataset containing entities of different types generated from the templates.
            For a DataFrame, the `spans` column is read and each span is expected to be a
            mapping with an "entity_type" key (the field `entity_from_span` reads by default).
            Any other iterable is treated as records exposing `.spans` whose items have `.type`.
    """
    count_per_entity = Counter()
    if isinstance(data, pd.DataFrame):
        # Iterating a DataFrame yields its column labels, not its rows,
        # so the spans have to be taken from the `spans` column explicitly.
        for spans in data["spans"]:
            for span in spans:
                count_per_entity[span["entity_type"]] += 1
    else:
        # Object-style records (e.g. the in-memory generated samples).
        for record in data:
            for span in record.spans:
                count_per_entity[span.type] += 1

    print(count_per_entity.most_common())

In [870]:
# Occurrences of each entity value in the train and validation frames.
# NOTE(review): neither counter is read by any later cell visible here.
train_entity_count = Counter(entity_from_span(train_data, "entity_value"))
val_entity_count = Counter(entity_from_span(val_data, "entity_value"))

In [882]:
def test_unique_entity_count(data: pd.DataFrame) -> None:
    """
    Check that no entity value occurs more than once in the dataset.
    The repeated values (possibly an empty list) are printed before the check.
    Args:
        data: Dataset containing entities (DataFrame)
    Raises:
        AssertionError: if at least one entity value occurs more than once.
    """
    occurrences = Counter(entity_from_span(data, "entity_value"))
    repeated = [value for value, times in occurrences.items() if times > 1]
    print(repeated)
    # Total occurrences equal the number of distinct values only when nothing repeats.
    assert sum(occurrences.values()) == len(occurrences)
    

    

In [880]:
def test_no_duplicate_entity(train: pd.DataFrame, val: pd.DataFrame) -> None:
    """
    Test to ensure no entity value is repeated across the two datasets.
    Args:
        train: Training dataset with all the utterances generated by the templates (DataFrame)
        val: Validation dataset with all the utterances generated by the templates (DataFrame)
    Raises:
        AssertionError: if any entity value occurs in both datasets; the message lists them.
    """
    train_entities = entity_from_span(train, "entity_value")
    val_entities = entity_from_span(val, "entity_value")
    intersection = set(train_entities) & set(val_entities)
    # The assert message has to be a value: `print(...)` evaluates to None, which left the
    # AssertionError without any detail about the offending entities.
    assert not intersection, f"Entity values present in both datasets: {intersection}"

In [873]:
def test_missing_classes(train: pd.DataFrame, val: pd.DataFrame) -> None:
    """
    Test to ensure every entity type of the training dataset also occurs in the validation dataset.
    Args:
        train: Training dataset with all the utterances generated by the templates (DataFrame)
        val: Validation dataset with all the utterances generated by the templates (DataFrame)
    Raises:
        AssertionError: if a training entity type is absent from the validation dataset;
            the message lists the missing types.
    """
    train_classes = set(entity_from_span(train, "entity_type"))
    val_classes = set(entity_from_span(val, "entity_type"))
    # Informational output, in the same order as before: validation, shared, training.
    print(val_classes)
    print(train_classes & val_classes)
    print(train_classes)
    # Same condition as comparing the intersection size with the training set size,
    # but the failure now names what is missing instead of what is shared.
    missing = train_classes - val_classes
    assert not missing, f"Entity types missing from the validation dataset: {missing}"
    

    

# Run Tests

In [None]:
# Each check raises AssertionError on failure, so the first failing check stops this cell.
test_unique_entity_count(train_data)
test_unique_entity_count(val_data)
test_no_duplicate_entity(train_data, val_data)
test_missing_classes(train_data, val_data)