In [1]:
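# Install the Presidio packages with pip if they are not installed yet
# (uncomment the lines below; `%pip` installs into the kernel's environment)

#%pip install presidio-analyzer
#%pip install presidio-evaluator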

In [2]:
import datetime
import pprint
from collections import Counter
from pathlib import Path
from typing import Dict, List

import pandas as pd
import numpy as np

from presidio_evaluator import InputSample
from presidio_evaluator.data_generator import PresidioSentenceFaker



# Generate fake PII data using the Presidio Sentence Faker

The Presidio Sentence Faker enables you to generate a synthetic dataset from sentence templates.
Example templates:

> I live at {{address}}

> You can email me at {{email}}. Thanks, {{first_name}}

> What's your last name? It's {{last_name}}

> Every time I see you falling I get down on my knees and pray

### Simple example
This uses the default generator to create 10 samples based on three templates.

In [3]:
# Three sentence templates; every {{placeholder}} names an entity to be faked.
sentence_templates = [
    "My name is {{name}}",
    "Please send it to {{address}}",
    "I just moved to {{city}} from {{country}}",
]

# Build a faker that only draws from the templates defined above.
sentence_faker = PresidioSentenceFaker(
    'en_US',
    lower_case_ratio=0.05,
    sentence_templates=sentence_templates,
)
fake_sentence_results = sentence_faker.generate_new_fake_sentences(num_samples=10)

# Show the masked text and the entity spans of the first generated sample
first_sample = fake_sentence_results[0]
print(first_sample.masked)
print(first_sample.spans)

Using default entity providers
Using default entity mapping between the entities                   in the templates and the ones in the output dataset
Using default provider aliases


Sampling: 100%|██████████| 10/10 [00:00<00:00, 3407.23it/s]

I just moved to {{GPE}} from {{GPE}}
[Span(type: country, value: Italy, char_span: [33: 38]), Span(type: city, value: Gorgoglione, char_span: [16: 27])]





## Generate a full dataset

In this example we generate a large dataset with multiple entity types and save it in JSON and CoNLL-2003 formats.
This uses the default sentence templates included in this package.

In [8]:
# Dataset configuration: everything a reader may want to tune lives in this cell.
number_of_samples = 1500  # number of fake sentences to generate
# lower_case_ratio setting for PresidioSentenceFaker
# (presumably the share of samples that get lower-cased - TODO confirm)
lower_case_ratio = 0.05
# Matches the 'en_US' locale that the fakers in this notebook are created with.
locale = 'en_US'
# Date stamp embedded in the output file names, e.g. "January_06_2025"
cur_time = datetime.date.today().strftime("%B_%d_%Y")

# Output locations for the JSON dataset and the tab-separated CoNLL-style table
output_file = f"../data/generated_size_{number_of_samples}_date_{cur_time}.json"
output_conll = f"../data/generated_size_{number_of_samples}_date_{cur_time}.tsv"

The `PresidioSentenceFaker` loads [FakeNameGenerator](https://www.fakenamegenerator.com/) data by default
to extend the set of fake values and creates a `RecordsFaker` 
which returns a fake person record (with multiple values) instead of one value,
allowing dependencies between values belonging to the same fake person
(e.g. name = Michael Smith with the email michael.smith@gmail.com).

`FakeNameGenerator.com_3000.csv` is included in this package and can be sourced from https://www.fakenamegenerator.com/order.php

In [9]:
# Faker backed by the package's default sentence templates. The ratio comes from
# the configuration cell instead of repeating the literal value here.
sentence_faker = PresidioSentenceFaker('en_US', lower_case_ratio=lower_case_ratio)

Using default entity providers
Using default entity mapping between the entities                   in the templates and the ones in the output dataset
Using default provider aliases


In [10]:
# Peek at the first rows of the fake-person records that back the faker.
# NOTE(review): `_sentence_faker` is a private attribute and may change between versions.
records_df = pd.DataFrame(sentence_faker._sentence_faker.records)
records_df.head()

Unnamed: 0,number,gender,nationality,prefix,first_name,middle_initial,last_name,street_name,city,state_abbr,...,company,domain_name,person,name,first_name_female,first_name_male,prefix_female,prefix_male,last_name_female,last_name_male
0,1,female,Czech,Mrs.,Marie,J,Hamanová,P.O. Box 255,Kangerlussuaq,QE,...,Simple Solutions,MarathonDancing.gl,Marie Hamanová,Marie Hamanová,Marie,,Mrs.,,Hamanová,
1,2,female,French,Ms.,Patricia,G,Desrosiers,Avenida Noruega 42,Vila Real,VR,...,Formula Gray,LostMillions.com.pt,Patricia G. Desrosiers,Patricia G. Desrosiers,Patricia,,Ms.,,Desrosiers,
2,3,female,American,Ms.,Debra,O,Neal,1659 Hoog St,Brakpan,GA,...,Dahlkemper's,MediumTube.co.za,Debra Neal,Debra Neal,Debra,,Ms.,,Neal,
3,4,male,French,Mr.,Peverell,C,Racine,183 Epimenidou Street,Limassol,LI,...,Quickbiz,ImproveLook.com.cy,Peverell Racine,Peverell Racine,,Peverell,,Mr.,,Racine
4,5,female,Slovenian,Mrs.,Iolanda,S,Tratnik,Karu põik 61,Pärnu,PR,...,Dubrow's Cafeteria,PostTan.com.ee,Iolanda Tratnik,Iolanda Tratnik,Iolanda,,Mrs.,,Tratnik,


`PresidioSentenceFaker` adds additional providers by default, which are not included in the Faker package.
These can be found in `presidio_evaluator.data_generator.faker_extensions.providers`

It is possible to create providers for additional entity types by extending Faker's `BaseProvider` class, 
and calling `add_provider` on the `PresidioSentenceFaker` instance.
For example:

In [13]:
import random
from faker.providers import BaseProvider

class MarsIdProvider(BaseProvider):
    """Example custom provider that emits a made-up ``mars_id`` value."""

    def mars_id(self):
        """Return a number from 1 to 50 followed by a letter A-K (e.g. "25A").

        Values are drawn through the provider's own helpers (``random_int`` and
        ``random_element``), so they follow the Faker generator's random state
        instead of the global ``random`` module.
        """
        # Numeric part: an integer between 1 and 50, both ends inclusive
        number = self.random_int(min=1, max=50)
        # Letter part: a single character from A to K
        letter = self.random_element(tuple("ABCDEFGHIJK"))
        return f"{number}{letter}"

sentence_faker.add_provider(MarsIdProvider)
# Now a new `mars_id` entity can be generated if a template has `mars_id` in it.


In [14]:
# Import the extra providers explicitly rather than with a wildcard, so it is clear
# where each name comes from and nothing else leaks into the notebook namespace.
from presidio_evaluator.data_generator.faker_extensions.providers import (
    IpAddressProvider,  # Both IPv4 and IPv6 IP addresses
    NationalityProvider,  # Read countries + nationalities from file
    OrganizationProvider,  # Read organization names from file
    UsDriverLicenseProvider,  # Read US driver license numbers from file
    AgeProvider,  # Age values (unavailable on Faker)
    AddressProviderNew,  # Extend the default address formats
    PhoneNumberProviderNew,  # Extend the default phone number formats
    ReligionProvider,  # Read religions from file
)

# The last expression of the cell is what gets displayed as its output.
ReligionProvider

presidio_evaluator.data_generator.faker_extensions.providers.ReligionProvider

`PresidioSentenceFaker.PROVIDER_ALIASES` can be extended to add additional provider aliases for when templates have
a different entity name than what the providers emit.

In [15]:
# Create entity aliases (e.g. if your provider supports "name" but templates contain "person").
# Work on a copy so that extending the list does not modify the class-level default.
provider_aliases = list(PresidioSentenceFaker.PROVIDER_ALIASES)
provider_aliases

# To customize, call `PresidioSentenceFaker(locale="en_US",...,provider_aliases=provider_aliases)`

[('name', 'person'),
 ('credit_card_number', 'credit_card'),
 ('date_of_birth', 'birthday')]

Generate data

In [16]:
# Generate the full dataset (size set by `number_of_samples`) and inspect the first sample.
fake_records = sentence_faker.generate_new_fake_sentences(num_samples=number_of_samples)
first_record = fake_records[0]
pprint.pprint(first_record)

Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 7794.60it/s]

Full text: The bus station is on Via Pasquale Scura 127
Spans: [Span(type: street_name, value: Via Pasquale Scura 127, char_span: [22: 44])]






#### Verify randomness of dataset

In [17]:
# How many generated samples came from each sentence template?
count_per_template_id = Counter(sample.template_id for sample in fake_records)
records_per_template = list(count_per_template_id.values())

# Summary statistics over the per-template sample counts
print(f"Total: {sum(records_per_template)}")
print(f"Avg # of records per template: {np.mean(records_per_template)}")
print(f"Median # of records per template: {np.median(records_per_template)}")
print(f"Std: {np.std(records_per_template)}")

Total: 1500
Avg # of records per template: 7.142857142857143
Median # of records per template: 7.0
Std: 2.4394713378441786


#### Which entities did we generate?

In [18]:
# Tally how often each entity type occurs across all spans of all samples.
count_per_entity = Counter(
    span.type for record in fake_records for span in record.spans
)

count_per_entity

Counter({'PERSON': 895,
         'STREET_ADDRESS': 571,
         'GPE': 375,
         'ORGANIZATION': 277,
         'PHONE_NUMBER': 124,
         'CREDIT_CARD': 115,
         'DATE_TIME': 110,
         'AGE': 77,
         'TITLE': 71,
         'NRP': 67,
         'EMAIL_ADDRESS': 38,
         'DOMAIN_NAME': 31,
         'ZIP_CODE': 25,
         'IP_ADDRESS': 17,
         'US_SSN': 15,
         'IBAN_CODE': 12,
         'US_DRIVER_LICENSE': 4})

In [19]:
# Print the first ten generated samples (full text followed by its spans).
preview_records = fake_records[:10]
for sample in preview_records:
    print(sample)

Full text: The bus station is on Via Pasquale Scura 127
Spans: [Span(type: street_name, value: Via Pasquale Scura 127, char_span: [22: 44])]

Full text: Leigha Mackay\n\nLa Sagne\nSwitzerland
Spans: [Span(type: country, value: Switzerland, char_span: [27: 38]), Span(type: city, value: La Sagne, char_span: [17: 25]), Span(type: name, value: Leigha Mackay, char_span: [0: 13])]

Full text: Can someone call me on 06-82237745? I have some questions about opening an account.
Spans: [Span(type: phone_number, value: 06-82237745, char_span: [23: 34])]

Full text: Could you please send me the last billed amount for cc 4218196001337 on my e-mail TomaszJablonski@gustr.com?
Spans: [Span(type: email, value: TomaszJablonski@gustr.com, char_span: [82: 107]), Span(type: credit_card_number, value: 4218196001337, char_span: [55: 68])]

Full text: Csanád had given Csanád his address: 083 254 Damvergi Street, Nicosia
Spans: [Span(type: city, value: Nicosia, char_span: [62: 69]), Span(type: street_name, val

#### Save as json

In [20]:
# Write all generated samples to the JSON path configured in `output_file`.
InputSample.to_json(dataset=fake_records, output_file=output_file)

In [None]:
# Display the path the JSON dataset is written to.
output_file

#### Create a CoNLL-like data frame

In [23]:
# Convert the samples into a token-level table (one row per token with its label)
# and preview the first ten rows.
conll = InputSample.create_conll_dataset(dataset=fake_records)
conll.head(10)

100%|██████████| 1500/1500 [00:00<00:00, 47248.41it/s]


Unnamed: 0,text,pos,tag,template_id,label,sentence
0,The,DET,DT,163,O,0
1,bus,NOUN,NN,163,O,0
2,station,NOUN,NN,163,O,0
3,is,AUX,VBZ,163,O,0
4,on,ADP,IN,163,O,0
5,Via,PROPN,NNP,163,B-street_name,0
6,Pasquale,PROPN,NNP,163,I-street_name,0
7,Scura,PROPN,NNP,163,I-street_name,0
8,127,NUM,CD,163,I-street_name,0
9,Leigha,VERB,VB,189,B-name,1


In [24]:
# Save the token-level table as a tab-separated file and report where it was written.
conll.to_csv(output_conll, sep="\t")
print(f"CoNLL2003 dataset structure output location: {output_conll}")

CoNLL2003 dataset structure output location: ../data/generated_size_1500_date_January_06_2025.tsv


### Next steps

- Evaluate Presidio using fake data: [Sample](4_Evaluate_Presidio_Analyzer.ipynb)
- Split to train/test/validation while ensuring sentences originating from the same template are all in the same subset: [Sample](3_Split_by_pattern_#.ipynb)
- Conduct a small exploratory data analysis on the generated data: [Sample](2_PII_EDA.ipynb)

#### Copyright notice:


Data generated for evaluation was created using Fake Name Generator.

Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/) 
are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC.