In [1]:
# Install Presidio via pip if it is not installed yet.
# Uncomment the lines below; `%pip` installs into the environment of the running kernel.

# %pip install presidio-analyzer
# %pip install presidio-evaluator

In [None]:
import datetime
import pprint
from collections import Counter
from pathlib import Path
from typing import Dict, List

import pandas as pd
import numpy as np

from presidio_evaluator import InputSample
from presidio_evaluator.data_generator import PresidioSentenceFaker

# Generate fake PII data using the Presidio Sentence Faker

The Presidio Sentence Faker enables you to generate a synthetic dataset from sentence templates.
Example templates:

> I live at {{address}}

> You can email me at {{email}}. Thanks, {{first_name}}

> What's your last name? It's {{last_name}}

> Every time I see you falling I get down on my knees and pray

### Simple example
This uses the default generator to create 10 samples based on three templates

In [3]:
# Templates for the simple example; each {{placeholder}} is filled with a fake value.
sentence_templates = [
    "My name is {{name}}",
    "Please send it to {{address}}",
    "I just moved to {{city}} from {{country}}",
]


# Build a sentence faker that uses only the templates above, then draw 10 samples.
sentence_faker = PresidioSentenceFaker(
    "en_US",
    sentence_templates=sentence_templates,
    lower_case_ratio=0.05,
)
fake_sentence_results = sentence_faker.generate_new_fake_sentences(10)

# Show the masked form and the spans of the first sample
first_sample = fake_sentence_results[0]
print(first_sample.masked)
print(first_sample.spans)

Using default entity providers
Using default entity mapping between the entities in the templates and the ones in the output dataset
Using default provider aliases


Sampling: 100%|██████████| 10/10 [00:00<00:00, 4370.89it/s]

I just moved to {{GPE}} from {{GPE}}
[Span(type: GPE, value: Spain, char_span: [45: 50]), Span(type: GPE, value: Valverde de Valdelacasa, char_span: [16: 39])]





## Generate a full dataset

In this example we generate a large dataset with multiple entity types and save it in JSON and CoNLL-2003 formats.
This uses the default sentence templates included in this package.

In [4]:
# Configuration for the full dataset generated below.
number_of_samples = 1500
lower_case_ratio = 0.05
# NOTE(review): the visible cells pass "en_US" literally when building the faker
# rather than reading this variable - confirm which locale is intended.
locale = "en"

# Date stamp (e.g. "January_08_2025") embedded in both output file names.
today = datetime.date.today()
cur_time = format(today, "%B_%d_%Y")

# JSON and CoNLL-style (tab-separated) output locations.
output_file = f"../data/generated_size_{number_of_samples}_date_{cur_time}.json"
output_conll = f"../data/generated_size_{number_of_samples}_date_{cur_time}.tsv"

The `PresidioSentenceFaker` is based on the Faker library. It loads [FakeNameGenerator](https://www.fakenamegenerator.com/) data by default
to extend the set of fake values and creates a `SentenceFaker` 
which returns a fake person record (with multiple values) instead of one value,
allowing dependencies between values belonging to the same fake person
(e.g. name = Michael Smith with the email michael.smith@gmail.com).

`FakeNameGenerator.com_3000.csv` is included in this package and can be sourced from https://www.fakenamegenerator.com/order.php

In [5]:
# Build the faker with the default sentence templates. The lower-case ratio is read
# from the configuration cell instead of repeating the literal 0.05 here.
sentence_faker = PresidioSentenceFaker("en_US", lower_case_ratio=lower_case_ratio)

Using default entity providers
Using default entity mapping between the entities in the templates and the ones in the output dataset
Using default provider aliases


In [6]:
# Preview the first rows of the fake-person records held by the underlying sentence faker.
# NOTE(review): `_sentence_faker` is a private attribute of PresidioSentenceFaker.
person_records = sentence_faker._sentence_faker.records
pd.DataFrame(person_records).head()

Unnamed: 0,number,gender,nationality,prefix,first_name,middle_initial,last_name,street_name,city,state_abbr,...,company,domain_name,person,name,first_name_female,first_name_male,prefix_female,prefix_male,last_name_female,last_name_male
0,1,female,Czech,Mrs.,Marie,J,Hamanová,P.O. Box 255,Kangerlussuaq,QE,...,Simple Solutions,MarathonDancing.gl,Marie Hamanová,Marie Hamanová,Marie,,Mrs.,,Hamanová,
1,2,female,French,Ms.,Patricia,G,Desrosiers,Avenida Noruega 42,Vila Real,VR,...,Formula Gray,LostMillions.com.pt,Patricia Desrosiers,Patricia Desrosiers,Patricia,,Ms.,,Desrosiers,
2,3,female,American,Ms.,Debra,O,Neal,1659 Hoog St,Brakpan,GA,...,Dahlkemper's,MediumTube.co.za,Debra Neal,Debra Neal,Debra,,Ms.,,Neal,
3,4,male,French,Mr.,Peverell,C,Racine,183 Epimenidou Street,Limassol,LI,...,Quickbiz,ImproveLook.com.cy,Peverell Racine,Peverell Racine,,Peverell,,Mr.,,Racine
4,5,female,Slovenian,Mrs.,Iolanda,S,Tratnik,Karu põik 61,Pärnu,PR,...,Dubrow's Cafeteria,PostTan.com.ee,Iolanda Tratnik,Iolanda Tratnik,Iolanda,,Mrs.,,Tratnik,


`PresidioSentenceFaker` adds additional providers by default, which are not included in the Faker package.
These can be found in `presidio_evaluator.data_generator.faker_extensions.providers`

It is possible to create providers for additional entity types by extending Faker's `BaseProvider` class, 
and calling `add_provider` on the `PresidioSentenceFaker` instance.
For example:

In [7]:
import random
from faker.providers import BaseProvider


class MarsIdProvider(BaseProvider):
    """Example custom provider that emits fake "Mars IDs" such as ``"25A"``.

    A Mars ID is an integer between 1 and 50 (inclusive) immediately followed by
    one uppercase letter from A to K.
    """

    def mars_id(self) -> str:
        """Return a random Mars ID: a number in [1, 50] followed by a letter in A-K.

        Values are drawn from the Faker generator's own random source
        (``self.generator.random``) rather than the global ``random`` module, so
        seeding Faker also makes the generated IDs reproducible.
        """
        rng = self.generator.random
        # Numeric part of the ID, between 1 and 50 inclusive
        number = rng.randint(1, 50)
        # Letter suffix, one of A-K
        letter = rng.choice("ABCDEFGHIJK")
        # Format is "<number><letter>" (e.g., "25A")
        return f"{number}{letter}"


# Register the custom provider class on the sentence faker.
sentence_faker.add_provider(MarsIdProvider)
# Now a new `mars_id` entity can be generated if a template has `mars_id` in it.

In [8]:
# Import the extra providers by name (instead of `import *`) so it is clear
# where each one comes from and nothing else leaks into the notebook namespace.
from presidio_evaluator.data_generator.faker_extensions.providers import (
    AddressProviderNew,
    AgeProvider,
    IpAddressProvider,
    NationalityProvider,
    OrganizationProvider,
    PhoneNumberProviderNew,
    ReligionProvider,
    UsDriverLicenseProvider,
)

IpAddressProvider  # Both IPv4 and IPv6 IP addresses
NationalityProvider  # Read countries + nationalities from file
OrganizationProvider  # Read organization names from file
UsDriverLicenseProvider  # Read US driver license numbers from file
AgeProvider  # Age values (unavailable in Faker)
AddressProviderNew  # Extend the default address formats
PhoneNumberProviderNew  # Extend the default phone number formats
ReligionProvider  # Read religions from file

presidio_evaluator.data_generator.faker_extensions.providers.ReligionProvider

`PresidioSentenceFaker.PROVIDER_ALIASES` can be extended to add additional provider aliases for when templates have
a different entity name than what the providers emit.

In [9]:
# Create entity aliases (e.g. if your provider supports "name" but templates contain "person").
provider_aliases = PresidioSentenceFaker.PROVIDER_ALIASES
provider_aliases

# To customize, call `PresidioSentenceFaker(locale="en_US",...,provider_aliases=provider_aliases)`

[('name', 'person'),
 ('credit_card_number', 'credit_card'),
 ('date_of_birth', 'birthday')]

Generate data

In [10]:
# Generate the full dataset; the sample count comes from the configuration cell.
fake_records = sentence_faker.generate_new_fake_sentences(
    num_samples=number_of_samples
)
# Pretty-print the first generated record as a quick sanity check.
first_record = fake_records[0]
pprint.pprint(first_record)

Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 8316.22it/s]

Full text: I'll meet you at 323 Postbox 78
 Apt. 637
 Slædepatruljen Sirius
 Greenlander after the concert.
Spans: [Span(type: STREET_ADDRESS, value: 323 Postbox 78
 Apt. 637
 Slædepatruljen Sirius
 Greenlander, char_span: [17: 77])]






#### Verify randomness of dataset

In [11]:
# Number of generated samples per template id, plus summary statistics over those counts.
count_per_template_id = Counter(sample.template_id for sample in fake_records)
records_per_template = list(count_per_template_id.values())

print(f"Total: {sum(records_per_template)}")
print(f"Avg # of records per template: {np.mean(records_per_template)}")
print(f"Median # of records per template: {np.median(records_per_template)}")
print(f"Std: {np.std(records_per_template)}")

Total: 1500
Avg # of records per template: 7.142857142857143
Median # of records per template: 7.0
Std: 2.7513756608669206


#### Which entities did we generate?

In [12]:
# Tally how many spans of each entity type appear across all generated records.
count_per_entity = Counter(
    span.entity_type for record in fake_records for span in record.spans
)

count_per_entity

Counter({'PERSON': 875,
         'STREET_ADDRESS': 647,
         'GPE': 462,
         'ORGANIZATION': 260,
         'CREDIT_CARD': 146,
         'PHONE_NUMBER': 101,
         'DATE_TIME': 96,
         'TITLE': 88,
         'AGE': 73,
         'NRP': 61,
         'EMAIL_ADDRESS': 47,
         'ZIP_CODE': 39,
         'DOMAIN_NAME': 28,
         'IBAN_CODE': 22,
         'US_SSN': 11,
         'IP_ADDRESS': 11,
         'US_DRIVER_LICENSE': 11})

In [13]:
# Print the first ten generated samples (full text and spans) for a visual check.
preview_records = fake_records[:10]
for sample in preview_records:
    print(sample)

Full text: I'll meet you at 323 Postbox 78
 Apt. 637
 Slædepatruljen Sirius
 Greenlander after the concert.
Spans: [Span(type: STREET_ADDRESS, value: 323 Postbox 78
 Apt. 637
 Slædepatruljen Sirius
 Greenlander, char_span: [17: 77])]

Full text: The Adomos SA Orchestra was founded in 2014. Since then, it has grown from a volunteer community orchestra to a fully professional orchestra serving Portugal
Spans: [Span(type: GPE, value: Portugal, char_span: [149: 157]), Span(type: DATE_TIME, value: 2014, char_span: [39: 43]), Span(type: ORGANIZATION, value: Adomos SA, char_span: [4: 13])]

Full text: It's like that since 4/26/1954
Spans: [Span(type: DATE_TIME, value: 4/26/1954, char_span: [21: 30])]

Full text: One of the most depressing songs on the list. He's injured from the waist down from New Zealand, but Rinoka just has to get laid. Don't go to town, Lisa!
Spans: [Span(type: PERSON, value: Lisa, char_span: [148: 152]), Span(type: PERSON, value: Rinoka, char_span: [101: 107]), Span(type

#### Save as json

In [14]:
# Write all generated samples to the JSON path defined in the configuration cell.
InputSample.to_json(
    output_file=output_file,
    dataset=fake_records,
)

In [15]:
# Display the path the JSON dataset was written to.
output_file

'../data/generated_size_1500_date_January_08_2025.json'

#### Create a CONLL like data frame

In [16]:
# Convert the samples into a token-level, CoNLL-like data frame and preview ten rows.
conll = InputSample.create_conll_dataset(dataset=fake_records)
conll.head(n=10)

  0%|          | 0/1500 [00:00<?, ?it/s]

loading model en_core_web_sm


100%|██████████| 1500/1500 [00:04<00:00, 320.23it/s]


Unnamed: 0,text,pos,tag,template_id,label,sentence
0,I,PRON,PRP,46,O,0
1,'ll,AUX,MD,46,O,0
2,meet,VERB,VB,46,O,0
3,you,PRON,PRP,46,O,0
4,at,ADP,IN,46,O,0
5,323,NUM,CD,46,B-STREET_ADDRESS,0
6,Postbox,PROPN,NNP,46,I-STREET_ADDRESS,0
7,78,NUM,CD,46,I-STREET_ADDRESS,0
8,\n,SPACE,_SP,46,I-STREET_ADDRESS,0
9,Apt,PROPN,NNP,46,I-STREET_ADDRESS,0


In [17]:
# Persist the CoNLL-like frame as a tab-separated file and report where it went.
conll.to_csv(path_or_buf=output_conll, sep="\t")
print(f"CoNLL2003 dataset structure output location: {output_conll}")

CoNLL2003 dataset structure output location: ../data/generated_size_1500_date_January_08_2025.tsv


### Next steps

- Evaluate Presidio using fake data: [Sample](4_Evaluate_Presidio_Analyzer.ipynb)
- Split to train/test/validation while ensuring sentences originating from the same template are all in the same subset: [Sample](3_Split_by_pattern_#.ipynb)
- Conduct a small exploratory data analysis on the generated data: [Sample](2_PII_EDA.ipynb)

#### Copyright notice:


Data generated for evaluation was created using Fake Name Generator.

Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/) 
are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC.