In [1]:
# Install Presidio via pip if it is not yet installed (uncomment the lines below).
# `%pip` installs into the environment of the running kernel.

# %pip install presidio-analyzer
# %pip install presidio-evaluator

In [1]:
import datetime
import pprint
from collections import Counter
from pathlib import Path
from typing import Dict, List

import pandas as pd
import numpy as np

from presidio_evaluator import InputSample
from presidio_evaluator.data_generator import PresidioSentenceFaker

# Generate fake PII data using the Presidio Sentence Faker

The Presidio Sentence Faker enables you to generate a synthetic dataset from sentence templates.
Example templates:

> I live at {{address}}

> You can email me at {{email}}. Thanks, {{first_name}}

> What's your last name? It's {{last_name}}

> Every time I see you falling I get down on my knees and pray

### Simple example
This uses the default generator to create 10 samples based on three templates

In [2]:
# Templates whose {{placeholders}} are filled in with generated fake values.
sentence_templates = [
    "My name is {{name}}",
    "Please send it to {{address}}",
    "I just moved to {{city}} from {{country}}",
]


# Build a sentence faker from the templates above and sample 10 sentences.
sentence_faker = PresidioSentenceFaker(
    "en_US",
    sentence_templates=sentence_templates,
    lower_case_ratio=0.05,
)
fake_sentence_results = sentence_faker.generate_new_fake_sentences(10)

# Show the masked template and the entity spans of the first sample.
print(fake_sentence_results[0].masked, fake_sentence_results[0].spans, sep="\n")

Using default entity providers
Using default entity mapping between the entities in the templates and the ones in the output dataset
Using default provider aliases


Sampling: 100%|██████████| 10/10 [00:00<00:00, 6924.72it/s]

I just moved to {{GPE}} from {{GPE}}
[Span(type: GPE, value: Austria, char_span: [27: 34]), Span(type: GPE, value: STEIN, char_span: [16: 21])]





## Generate a full dataset

In this example we generate a large dataset with multiple entity types and save it in JSON and CONLL03 formats.
This uses the default sentence templates included in this package.

In [14]:
# Settings for the full-dataset run.
number_of_samples = 1500
lower_case_ratio = 0.05
locale = "en"

# Date stamp embedded in the output file names (month name, day, year).
cur_time = f"{datetime.date.today():%B_%d_%Y}"

# The JSON and the tab-separated outputs share the same name stem.
output_file = f"../sample_data/generated_size_{number_of_samples}_date_{cur_time}.json"
output_conll = f"../sample_data/generated_size_{number_of_samples}_date_{cur_time}.tsv"

The `PresidioSentenceFaker` is based on the Faker library. It loads [FakeNameGenerator](https://www.fakenamegenerator.com/) data by default
to extend the set of fake values and creates a `SentenceFaker` 
which returns a fake person record (with multiple values) instead of one value,
allowing dependencies between values belonging to the same fake person
(e.g. name = Michael Smith with the email michael.smith@gmail.com).

`FakeNameGenerator.com_3000.csv` is included in this package and can be sourced from https://www.fakenamegenerator.com/order.php

In [4]:
# Reuse the `lower_case_ratio` setting from the configuration cell instead of
# repeating the literal, so the value only has to be changed in one place.
sentence_faker = PresidioSentenceFaker("en_US", lower_case_ratio=lower_case_ratio)

Using default entity providers
Using default entity mapping between the entities in the templates and the ones in the output dataset
Using default provider aliases


In [5]:
# Preview the first rows of the records held by the sentence faker.
# NOTE(review): `_sentence_faker` is a private attribute — prefer a public accessor if one exists.
pd.DataFrame(sentence_faker._sentence_faker.records).head()

Unnamed: 0,number,gender,nationality,prefix,first_name,middle_initial,last_name,street_name,city,state_abbr,...,company,domain_name,person,name,first_name_female,first_name_male,prefix_female,prefix_male,last_name_female,last_name_male
0,1,female,Czech,Mrs.,Marie,J,Hamanová,P.O. Box 255,Kangerlussuaq,QE,...,Simple Solutions,MarathonDancing.gl,Marie Hamanová,Marie Hamanová,Marie,,Mrs.,,Hamanová,
1,2,female,French,Ms.,Patricia,G,Desrosiers,Avenida Noruega 42,Vila Real,VR,...,Formula Gray,LostMillions.com.pt,Patricia Desrosiers,Patricia Desrosiers,Patricia,,Ms.,,Desrosiers,
2,3,female,American,Ms.,Debra,O,Neal,1659 Hoog St,Brakpan,GA,...,Dahlkemper's,MediumTube.co.za,Debra Neal,Debra Neal,Debra,,Ms.,,Neal,
3,4,male,French,Mr.,Peverell,C,Racine,183 Epimenidou Street,Limassol,LI,...,Quickbiz,ImproveLook.com.cy,Peverell Racine,Peverell Racine,,Peverell,,Mr.,,Racine
4,5,female,Slovenian,Mrs.,Iolanda,S,Tratnik,Karu põik 61,Pärnu,PR,...,Dubrow's Cafeteria,PostTan.com.ee,Iolanda Tratnik,Iolanda Tratnik,Iolanda,,Mrs.,,Tratnik,


`PresidioSentenceFaker` adds additional providers by default, which are not included in the Faker package.
These can be found in `presidio_evaluator.data_generator.faker_extensions.providers`

It is possible to create providers for additional entity types by extending Faker's `BaseProvider` class, 
and calling `add_provider` on the `PresidioSentenceFaker` instance.
For example:

In [6]:
import random
from faker.providers import BaseProvider


class MarsIdProvider(BaseProvider):
    """Example custom provider that generates a made-up `mars_id` value."""

    def mars_id(self):
        """Return a fake Mars ID: a number from 1 to 50 followed by a letter from A to K (e.g. "25A").

        Values are drawn through the provider's own helper methods rather than
        the global `random` module, so they follow the Faker generator's random
        state (and therefore any seed set on it).
        """
        # Numeric part, inclusive on both ends.
        number = self.random_int(min=1, max=50)
        # Single-letter suffix.
        letter = self.random_element(tuple("ABCDEFGHIJK"))
        return f"{number}{letter}"


# Register the custom provider on the sentence faker instance.
sentence_faker.add_provider(MarsIdProvider)
# Now a new `mars_id` entity can be generated if a template has `mars_id` in it.

In [7]:
# Import the extra providers explicitly (instead of `import *`) so it is clear
# where each name comes from and nothing else leaks into the namespace.
from presidio_evaluator.data_generator.faker_extensions.providers import (
    AddressProviderNew,
    AgeProvider,
    IpAddressProvider,
    NationalityProvider,
    OrganizationProvider,
    PhoneNumberProviderNew,
    ReligionProvider,
    UsDriverLicenseProvider,
)

IpAddressProvider  # Both IPv4 and IPv6 IP addresses
NationalityProvider  # Read countries + nationalities from file
OrganizationProvider  # Read organization names from file
UsDriverLicenseProvider  # Read US driver license numbers from file
AgeProvider  # Age values (unavailable on Faker)
AddressProviderNew  # Extend the default address formats
PhoneNumberProviderNew  # Extend the default phone number formats
ReligionProvider  # Read religions from file

presidio_evaluator.data_generator.faker_extensions.providers.ReligionProvider

`PresidioSentenceFaker.PROVIDER_ALIASES` can be extended to add additional provider aliases for when templates have
a different entity name than what the providers emit.

In [8]:
# Create entity aliases (e.g. if your provider supports "name" but templates contain "person").
provider_aliases = PresidioSentenceFaker.PROVIDER_ALIASES
provider_aliases

# To customize, call `PresidioSentenceFaker(locale="en_US",...,provider_aliases=provider_aliases)`

[('name', 'person'),
 ('credit_card_number', 'credit_card'),
 ('date_of_birth', 'birthday')]

Generate data

In [9]:
# Generate `number_of_samples` fake sentences and pretty-print the first one.
fake_records = sentence_faker.generate_new_fake_sentences(num_samples=number_of_samples)
pprint.pprint(fake_records[0])

Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 13848.62it/s]

Full text: My religion does not allow speaking to bots, they are evil and hacked by the Devil
Spans: []






#### Verify randomness of dataset

In [10]:
# Tally how many generated records came from each template.
count_per_template_id = Counter(sample.template_id for sample in fake_records)
records_per_template = list(count_per_template_id.values())

# Summary statistics of the per-template counts.
print(f"Total: {sum(records_per_template)}")
print(f"Avg # of records per template: {np.mean(records_per_template)}")
print(f"Median # of records per template: {np.median(records_per_template)}")
print(f"Std: {np.std(records_per_template)}")

Total: 1500
Avg # of records per template: 7.177033492822966
Median # of records per template: 7.0
Std: 2.6361942998265584


#### Which entities did we generate?

In [11]:
# Count the entity type of every span across all generated records.
count_per_entity = Counter(
    span.entity_type for record in fake_records for span in record.spans
)

count_per_entity

Counter({'PERSON': 900,
         'STREET_ADDRESS': 593,
         'GPE': 410,
         'ORGANIZATION': 306,
         'PHONE_NUMBER': 140,
         'CREDIT_CARD': 134,
         'DATE_TIME': 112,
         'AGE': 77,
         'NRP': 65,
         'TITLE': 62,
         'EMAIL_ADDRESS': 44,
         'ZIP_CODE': 31,
         'DOMAIN_NAME': 30,
         'IBAN_CODE': 29,
         'US_SSN': 10,
         'IP_ADDRESS': 8,
         'US_DRIVER_LICENSE': 5})

In [12]:
# Print the first ten generated records.
for record in fake_records[:10]:
    print(record)

Full text: My religion does not allow speaking to bots, they are evil and hacked by the Devil
Spans: []

Full text: Inject SELECT * FROM Users WHERE client_ip = ?%//!%20\|121.166.60.53|%20/
Spans: [Span(type: IP_ADDRESS, value: 121.166.60.53, char_span: [55: 68])]

Full text: My birthday is on the weekend. I'll turn 58.
Spans: [Span(type: AGE, value: 58, char_span: [41: 43])]

Full text: During 1997, M2i SA invested heavily in new microprocessor designs fostering the rapid growth of the computer industry.
Spans: [Span(type: ORGANIZATION, value: M2i SA, char_span: [13: 19]), Span(type: DATE_TIME, value: 1997, char_span: [7: 11])]

Full text: Can I withdraw cash using my card 6011208816836756 at aTM center ?
Spans: [Span(type: CREDIT_CARD, value: 6011208816836756, char_span: [34: 50])]

Full text: You can tell Lechosław was a huge Lechosław Kozłowski fan. Written when he was 38.
Spans: [Span(type: AGE, value: 38, char_span: [79: 81]), Span(type: PERSON, value: Lechosław Kozłowski, char_s

#### Save as json

In [15]:
# Write the generated samples to the JSON path defined in the configuration cell.
InputSample.to_json(dataset=fake_records, output_file=output_file)

In [15]:
# Display the path passed to `InputSample.to_json` above.
output_file

'../data/generated_size_1500_date_January_08_2025.json'

#### Create a CONLL like data frame

In [17]:
# Build a CoNLL-like data frame from the generated samples and preview the first ten rows.
conll = InputSample.create_conll_dataset(dataset=fake_records)
conll.head(10)

  0%|          | 0/1500 [00:00<?, ?it/s]

loading model en_core_web_sm


100%|██████████| 1500/1500 [00:05<00:00, 291.64it/s]


Unnamed: 0,text,pos,tag,template_id,label,sentence
0,My,PRON,PRP$,33,O,0
1,religion,NOUN,NN,33,O,0
2,does,AUX,VBZ,33,O,0
3,not,PART,RB,33,O,0
4,allow,VERB,VB,33,O,0
5,speaking,VERB,VBG,33,O,0
6,to,ADP,IN,33,O,0
7,bots,NOUN,NNS,33,O,0
8,",",PUNCT,",",33,O,0
9,they,PRON,PRP,33,O,0


In [18]:
# Save the data frame as a tab-separated file and report the output location.
conll.to_csv(output_conll, sep="\t")
print(f"CoNLL2003 dataset structure output location: {output_conll}")

CoNLL2003 dataset structure output location: ../sample_data/generated_size_1500_date_January_09_2026.tsv


### Next steps

- Evaluate Presidio using fake data: [Sample](4_Evaluate_Presidio_Analyzer.ipynb)
- Split to train/test/validation while ensuring sentences originating from the same template are all on the same subset: [Sample](3_Split_by_pattern_#.ipynb)
- Conduct a small exploratory data analysis on the generated data: [Sample](2_PII_EDA.ipynb)

#### Copyright notice:


Data generated for evaluation was created using Fake Name Generator.

Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/) 
are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC.