In [None]:
import datetime
import pprint
from collections import Counter
from pathlib import Path
from typing import Dict, List

import numpy as np
import pandas as pd
import tqdm

from presidio_evaluator import InputSample
from presidio_evaluator.data_generator import PresidioDataGenerator, PresidioFakeRecordGenerator

# Generate fake PII data using Presidio's data generator

Presidio's data generator is based on the [Python Faker tool](https://faker.readthedocs.io/en/master/)
and allows you to generate a synthetic dataset from sentence templates.
It features wrappers for Faker which allow you to sample from existing sources of fake data.

Example templates:

> I live at {{address}}

> You can email me at {{email}}. Thanks, {{first_name}}

> What's your last name? It's {{last_name}}

> Every time I see you falling I get down on my knees and pray


### Simple example
This uses all the default values to generate 10 samples based on three templates

In [None]:
# Templates for the simple example; the {{...}} placeholders are filled in
# with generated values.
sentence_templates = [
    "My name is {{name}}",
    "Please send it to {{address}}",
    "I just moved to {{city}} from {{country}}",
]

# Generator built with default settings. The results are wrapped in list()
# so that individual records can be indexed below.
data_generator = PresidioDataGenerator()
fake_records = list(
    data_generator.generate_fake_data(templates=sentence_templates, n_samples=10)
)

# Show the generated text and the spans of the first sample
print(fake_records[0].fake)
print(fake_records[0].spans)

## Generate a full dataset

In this example we use the `PresidioFakeRecordGenerator` which extends the `PresidioDataGenerator` to:
1. Accept more types of entities by adding more providers to Faker (see [Faker's documentation](https://faker.readthedocs.io/en/master/index.html#how-to-create-a-provider))
2. Handle records of multiple PII entities per fake person for a more realistic dataset
3. Translate the generated entity types to match Presidio's

We then save the new dataset in json and CONLL03 formats.

In [None]:
# Settings for the full dataset run
number_of_samples = 1500
lower_case_ratio = 0.05
locale = 'en'

# Today's date (full month name, day, year), embedded in the output file names
cur_time = f"{datetime.date.today():%B_%d_%Y}"

# Output paths for the JSON dump and the tab-separated CoNLL-style dump
output_file = f"../data/generated_size_{number_of_samples}_date_{cur_time}.json"
output_conll = f"../data/generated_size_{number_of_samples}_date_{cur_time}.tsv"

The `PresidioFakeRecordGenerator` loads [FakeNameGenerator](https://www.fakenamegenerator.com/) data to extend the set of fake values
and creates a `RecordsFaker` which returns a fake person record (with multiple values) instead of one value,
allowing dependencies between values belonging to the same fake person
(e.g. name = Michael Smith with the email michael.smith@gmail.com).

The `fake_name_generator_file` is included in the presidio_evaluator package and can be sourced from https://www.fakenamegenerator.com/order.php

> Note: by using the lower-level `PresidioDataGenerator` and `RecordsFaker` classes, you can create fake records for multiple name sets, allowing you to adapt the fake data to the real data if needed.

In [None]:
# Build the record generator from the locale and lower-case ratio configured above
record_generator = PresidioFakeRecordGenerator(locale, lower_case_ratio)

In [None]:
# Preview the first rows of the records held by the generator's Faker object.
# NOTE(review): this reaches into the private `_data_generator` attribute.
pd.DataFrame(record_generator._data_generator.faker.records).head()

`PresidioFakeRecordGenerator` adds additional providers by default, which are not included in the Faker package.
These can be found in `presidio_evaluator.data_generator.faker_extensions.providers`

In [None]:
# Import the additional providers by name (instead of `import *`) so it is
# obvious where each one comes from and nothing else leaks into the namespace.
from presidio_evaluator.data_generator.faker_extensions.providers import (
    AddressProviderNew,
    AgeProvider,
    IpAddressProvider,
    NationalityProvider,
    OrganizationProvider,
    PhoneNumberProviderNew,
    ReligionProvider,
    UsDriverLicenseProvider,
)

# The bare names below simply list the providers with a short description;
# the last one is also rendered as the cell output.
IpAddressProvider  # Both IPv4 and IPv6 IP addresses
NationalityProvider  # Read countries + nationalities from file
OrganizationProvider  # Read organization names from file
UsDriverLicenseProvider  # Read US driver license numbers from file
AgeProvider  # Age values (unavailable in Faker)
AddressProviderNew  # Extend the default address formats
PhoneNumberProviderNew  # Extend the default phone number formats
ReligionProvider  # Read religions from file

`PresidioFakeRecordGenerator.PROVIDER_ALIASES` can be extended to add additional provider aliases for when templates have a different entity name than the Faker object

In [None]:
# Show the current entity aliases (e.g. if faker supports "name" but templates contain "person").
PresidioFakeRecordGenerator.PROVIDER_ALIASES

Generate data

In [None]:
# Generate the full dataset (this rebinds `fake_records` from the simple example)
# and pretty-print the first record as a quick sanity check.
fake_records = record_generator.generate_new_fake_records(num_samples=number_of_samples)
pprint.pprint(fake_records[0])

#### Verify randomness of dataset

In [None]:
# Number of generated records for each template id
count_per_template_id = Counter(sample.template_id for sample in fake_records)

# Materialise the per-template counts once and reuse them for every statistic
records_per_template = list(count_per_template_id.values())

print(f"Total: {sum(records_per_template)}")
print(f"Avg # of records per template: {np.mean(records_per_template)}")
print(f"Median # of records per template: {np.median(records_per_template)}")
print(f"Std: {np.std(records_per_template)}")

#### Which entities did we generate?

In [None]:
# Tally the type of every span across all generated records in one pass
count_per_entity = Counter(
    span.type for record in fake_records for span in record.spans
)

count_per_entity

In [None]:
import json
import dataclasses
def get_json(result) -> dict:
    """Summarise one generated record as a plain dictionary.

    Despite the name, the return value is a ``dict``, not a JSON string;
    only the spans are serialised to JSON.

    Args:
        result: A generated record exposing ``fake``, ``spans`` (an iterable
            of dataclass instances), ``template`` and ``template_id``.

    Returns:
        A dict with the keys ``fake``, ``spans``, ``template`` and
        ``template_id``, where ``spans`` is a JSON-encoded list with one
        dict per span.
    """
    spans_json = json.dumps([dataclasses.asdict(span) for span in result.spans])
    return dict(
        fake=result.fake,
        spans=spans_json,
        template=result.template,
        template_id=result.template_id,
    )

In [None]:
# Sanity check: number of generated records
len(fake_records)

In [None]:
# Print the dictionary form (see get_json) of the first ten records
for record in fake_records[:10]:
    print(get_json(record))

#### Tokenize and transform the fake samples to a list of `InputSample` objects (Common data structure for this package)

In [None]:
%%time
input_samples = [
    InputSample.from_faker_spans_result(faker_spans_result=fake_record)
    for fake_record in tqdm.tqdm(fake_records)
]

#### Save as json

In [None]:
# Write the InputSample list to the JSON path configured earlier
InputSample.to_json(dataset=input_samples, output_file=output_file)

In [None]:
# Display the path the JSON dataset was written to
output_file

#### Create a CONLL like data frame

In [None]:
# Build the CoNLL-like dataset from the input samples
conll = InputSample.create_conll_dataset(input_samples)

In [None]:
# Save the CoNLL-like data as a tab-separated file
conll.to_csv(output_conll, sep="\t")

### Next steps

- Evaluate Presidio using this fake data. [Sample](4_Evaluate_Presidio_Analyzer.ipynb)
- Split to train/test/validation while ensuring sentences originating from the same template are all on the same subset. [Sample](3_Split_by_pattern_#.ipynb)
- Conduct a small exploratory data analysis on the generated data. [Sample](2_PII_EDA.ipynb)

#### Copyright notice:


Data generated for evaluation was created using Fake Name Generator.

Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/) 
are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC.