Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PresidioSentenceFaker #50

Merged
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
56ab9a5
Map DOMAIN_NAME entity to URL
Robbie-Palmer Aug 2, 2022
f538bc5
Add PresidioFakeRecordGenerator class
Robbie-Palmer Aug 3, 2022
6e8eec3
Fix bug in PresidioAnalyzerWrapper where 'en' is always the chosen la…
Robbie-Palmer Aug 3, 2022
5bfca66
Update PresidioAnalyzerWrapper to use the provided language in the de…
Robbie-Palmer Aug 3, 2022
79f796a
Format span_to_tag.py
Robbie-Palmer Aug 3, 2022
146dc63
Merge branch 'microsoft:master' into fake-record-generator
Robbie-Palmer Aug 5, 2022
6863e19
Map DOMAIN_NAME entity to URL
Robbie-Palmer Aug 2, 2022
b5efd1e
Add PresidioFakeRecordGenerator class
Robbie-Palmer Aug 3, 2022
04a43fe
Fix bug in PresidioAnalyzerWrapper where 'en' is always the chosen la…
Robbie-Palmer Aug 3, 2022
7818b28
Update PresidioAnalyzerWrapper to use the provided language in the de…
Robbie-Palmer Aug 3, 2022
279ca75
Format span_to_tag.py
Robbie-Palmer Aug 3, 2022
ddf8e72
Fix python3.7 support for getting raw data dir path
Robbie-Palmer Dec 7, 2022
a385403
Strip whitespace from ends of template files in PresidioDataGenerator
Robbie-Palmer Dec 7, 2022
6aede14
Test PresidioFakeRecordGenerator
Robbie-Palmer Dec 7, 2022
e6c09fd
Fix mutable default argument problem in PresidioFakeRecordGenerator
Robbie-Palmer Dec 7, 2022
4afd7c0
Unit test PresidioFakeRecordGenerator
Robbie-Palmer Dec 7, 2022
76c24b7
Expose ReligionProvider from faker_extensions package
Robbie-Palmer Dec 7, 2022
fd7bc88
Format tests/__init__.py
Robbie-Palmer Dec 7, 2022
c5e22f3
Add missing religions.csv and us_driver_license_format.yaml to packag…
Robbie-Palmer Dec 7, 2022
c0ea8da
Fix UsDriverLicenseProvider to provide us_driver_license entity
Robbie-Palmer Dec 7, 2022
798c9e5
Simplify Generate_data notebook by using PresidioFakeRecordGenerator
Robbie-Palmer Dec 7, 2022
e2f1e34
Update Data Generator README to include PresidioFakeRecordGenerator u…
Robbie-Palmer Dec 7, 2022
6731399
Merge remote-tracking branch 'origin/fake-record-generator' into fake…
Robbie-Palmer Dec 7, 2022
b7fbec1
Merge branch 'master' into fake-record-generator
omri374 Dec 18, 2022
7fc5bfe
Fix grammar in 1_Generate_data.ipynb
Robbie-Palmer Dec 20, 2022
407718d
Make it possible to use PresidioFakeRecordGenerator without the defau…
Robbie-Palmer Jan 3, 2023
57d3279
Merge remote-tracking branch 'origin/fake-record-generator' into fake…
Robbie-Palmer Jan 3, 2023
ae56da6
Merge remote-tracking branch 'upstream/master' into fake-record-gener…
Robbie-Palmer Jan 3, 2023
ce59376
Add Optional type annotations to parameters
Robbie-Palmer Jan 17, 2023
8041123
Rename PresidioDataGenerator to SentenceFaker
Robbie-Palmer Jan 17, 2023
97049ee
Minimize the responsibilities of SentenceFaker
Robbie-Palmer Jan 17, 2023
63dfa38
Move SentenceFaker into `faker_extensions` package
Robbie-Palmer Jan 17, 2023
3c1cc48
Fix imports
Robbie-Palmer Jan 17, 2023
ba327e8
Rename presidio_data_generator.py to presidio_sentence_faker.py
Robbie-Palmer Jan 17, 2023
a30342a
Fix 1_Generate_data.ipynb
Robbie-Palmer Jan 17, 2023
9795d8c
Add support for providing your own base records for PresidioSentenceF…
Robbie-Palmer Jan 17, 2023
c36f722
Fix SentenceFaker docstring
Robbie-Palmer Jan 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions presidio_evaluator/data_generator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .presidio_data_generator import PresidioDataGenerator
from .presidio_data_generator import PresidioDataGenerator, PresidioFakeRecordGenerator
from .presidio_pseudonymize import PresidioPseudonymization


Expand All @@ -8,4 +8,4 @@ def read_synth_dataset():
)


__all__ = ["PresidioDataGenerator", "PresidioPseudonymization", "read_synth_dataset"]
__all__ = ["PresidioDataGenerator", "PresidioFakeRecordGenerator", "PresidioPseudonymization", "read_synth_dataset"]
15 changes: 6 additions & 9 deletions presidio_evaluator/data_generator/faker_extensions/providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,16 @@
from faker.providers import BaseProvider
from faker.providers.address.en_US import Provider as AddressProvider
from faker.providers.phone_number.en_US import Provider as PhoneNumberProvider
from presidio_evaluator.data_generator import raw_data

_raw_data_dir = Path(raw_data.__path__[0])
Robbie-Palmer marked this conversation as resolved.
Show resolved Hide resolved


class NationalityProvider(BaseProvider):
def __init__(self, generator, nationality_file: Union[str, Path] = None):
super().__init__(generator=generator)
if not nationality_file:
nationality_file = Path(
Path(__file__).parent.parent, "raw_data", "nationalities.csv"
).resolve()
nationality_file = (_raw_data_dir / "nationalities.csv").resolve()

self.nationality_file = nationality_file
self.nationalities = self.load_nationalities()
Expand Down Expand Up @@ -47,9 +48,7 @@ def __init__(
):
super().__init__(generator=generator)
if not organizations_file:
organizations_file = Path(
Path(__file__).parent.parent, "raw_data", "organizations.csv"
).resolve()
organizations_file = (_raw_data_dir / "organizations.csv").resolve()
self.organizations_file = organizations_file
self.organizations = self.load_organizations()

Expand All @@ -71,9 +70,7 @@ def __init__(
):
super().__init__(generator=generator)
if not us_driver_license_file:
us_driver_license_file = Path(
Path(__file__).parent.parent, "raw_data", "us_driver_licenses.csv"
).resolve()
us_driver_license_file = (_raw_data_dir / "us_driver_licenses.csv").resolve()
self.us_driver_license_file = us_driver_license_file
self.us_driver_licenses = self.load_us_driver_licenses()

Expand Down
183 changes: 124 additions & 59 deletions presidio_evaluator/data_generator/presidio_data_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
import pandas as pd
from faker import Faker
from faker.providers import BaseProvider
from faker.typing import SeedType
from pandas import DataFrame
from tqdm import tqdm

from presidio_evaluator.data_generator import raw_data
from presidio_evaluator.data_generator.faker_extensions import (
FakerSpansResult,
NationalityProvider,
Expand All @@ -28,10 +30,10 @@

class PresidioDataGenerator:
def __init__(
self,
custom_faker: Faker = None,
locale: Optional[List[str]] = None,
lower_case_ratio: float = 0.05,
self,
custom_faker: Faker = None,
locale: Optional[List[str]] = None,
lower_case_ratio: float = 0.05,
):
"""
Fake data generator.
Expand Down Expand Up @@ -82,7 +84,7 @@ def __init__(
self.lower_case_ratio = lower_case_ratio

def parse(
self, template: str, template_id: Optional[int] = None, add_spans: bool = True
self, template: str, template_id: Optional[int] = None, add_spans: bool = True
) -> Union[FakerSpansResult, str]:
"""
This function replaces known PII {{tokens}} in a template sentence
Expand Down Expand Up @@ -136,16 +138,16 @@ def make_lower_case(match_obj):
templates = [
(
re.sub(r"\[.*?\]", make_lower_case, template.strip())
.replace("[", "{" + "{")
.replace("]", "}" + "}")
.replace("[", "{" + "{")
.replace("]", "}" + "}")
)
for template in raw_templates
]

return templates

def generate_fake_data(
self, templates: List[str], n_samples: int
self, templates: List[str], n_samples: int
) -> Union[Generator[FakerSpansResult, None, None], Generator[str, None, None]]:
"""
Generates fake PII data whenever it encounters known faker entities in a template.
Expand Down Expand Up @@ -173,9 +175,9 @@ def _lower_pattern(pattern: Union[str, FakerSpansResult]):
span.value = str(span.value).lower()
return pattern

@staticmethod
def seed(seed_value=42):
def seed(self, seed_value=42):
Faker.seed(seed_value)
self.faker.seed_instance(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

Expand Down Expand Up @@ -209,16 +211,10 @@ def camel_to_snake(name):

def full_name(row):
if random.random() > 0.2:
return str(row.first_name) + " " + str(row.last_name)
return f'{row.first_name} {row.last_name}'
else:
space_after_initials = " " if random.random() > 0.5 else ". "
return (
str(row.first_name)
+ " "
+ str(row.middle_initial)
+ space_after_initials
+ str(row.last_name)
)
return f'{row.first_name} {row.middle_initial}{space_after_initials}{row.last_name}'

def name_gendered(row):
first_name_female, prefix_female, last_name_female = (
Expand Down Expand Up @@ -295,48 +291,117 @@ def name_gendered(row):
return fake_data


if __name__ == "__main__":
PresidioDataGenerator.seed(42)

template_file_path = Path(Path(__file__).parent, "raw_data", "templates.txt")

# Read FakeNameGenerator data
fake_data_df = pd.read_csv(
Path(Path(__file__).parent, "raw_data", "FakeNameGenerator.com_3000.csv")
)
# Convert column names to lowercase to match patterns
fake_data_df = PresidioDataGenerator.update_fake_name_generator_df(fake_data_df)

# Create a RecordsFaker (Faker object which prefers samples multiple objects from one record)
faker = RecordsFaker(records=fake_data_df, local="en_US")
faker.add_provider(IpAddressProvider)
faker.add_provider(NationalityProvider)
faker.add_provider(OrganizationProvider)
faker.add_provider(UsDriverLicenseProvider)
faker.add_provider(AgeProvider)
faker.add_provider(AddressProviderNew) # More address formats than Faker
faker.add_provider(PhoneNumberProviderNew) # More phone number formats than Faker

# Create Presidio Data Generator
data_generator = PresidioDataGenerator(custom_faker=faker, lower_case_ratio=0.05)
data_generator.add_provider_alias(provider_name="name", new_name="person")
data_generator.add_provider_alias(
provider_name="credit_card_number", new_name="credit_card"
)
data_generator.add_provider_alias(
provider_name="date_of_birth", new_name="birthday"
)

sentence_templates = PresidioDataGenerator.read_template_file(template_file_path)
fake_patterns = data_generator.generate_fake_data(
templates=sentence_templates, n_samples=10000
)

# save to json
output_file = Path(
Path(__file__).parent.parent.parent, "data", "presidio_data_generator_data.json"
)
class PresidioFakeRecordGenerator:
    """
    Fake record generator.

    Leverages PresidioDataGenerator and the existing templates and new providers in this library to give a
    high level interface for generating a list of fake records.

    :param locale: The faker locale to use e.g. 'en_US'
    :param lower_case_ratio: Percentage of names that should start with lower case
    :param additional_entity_providers: Custom entity providers beyond those existing in this library.
        Defaults to no additional providers. The passed list is never modified.
    :param additional_sentence_templates: Custom sentence templates beyond those existing in this library.
        Defaults to no additional templates.
    :param random_seed: A seed to make results reproducible between runs
    """

    # Maps the entity names produced by faker providers to Presidio entity types.
    faker_to_presidio_entity_type = dict(person="PERSON",
                                         ip_address="IP_ADDRESS",
                                         us_driver_license="US_DRIVER_LICENSE",
                                         organization="ORGANIZATION",
                                         name_female="PERSON",
                                         address="STREET_ADDRESS",
                                         country="GPE",
                                         state="GPE",
                                         credit_card_number="CREDIT_CARD",
                                         city="GPE",
                                         street_name="STREET_ADDRESS",
                                         building_number="STREET_ADDRESS",
                                         name="PERSON",
                                         iban="IBAN_CODE",
                                         last_name="PERSON",
                                         last_name_male="PERSON",
                                         last_name_female="PERSON",
                                         first_name="PERSON",
                                         first_name_male="PERSON",
                                         first_name_female="PERSON",
                                         phone_number="PHONE_NUMBER",
                                         url="DOMAIN_NAME",
                                         ssn="US_SSN",
                                         email="EMAIL_ADDRESS",
                                         date_time="DATE_TIME",
                                         date_of_birth="DATE_TIME",
                                         day_of_week="DATE_TIME",
                                         year="DATE_TIME",
                                         name_male="PERSON",
                                         prefix_male="TITLE",
                                         prefix_female="TITLE",
                                         prefix="TITLE",
                                         nationality="NRP",
                                         nation_woman="NRP",
                                         nation_man="NRP",
                                         nation_plural="NRP",
                                         first_name_nonbinary="PERSON",
                                         postcode="STREET_ADDRESS",
                                         secondary_address="STREET_ADDRESS",
                                         job="TITLE",
                                         zipcode="ZIP_CODE",
                                         state_abbr="GPE",
                                         age="AGE")

    def __init__(self,
                 locale: str,
                 lower_case_ratio: float,
                 additional_entity_providers: Optional[List[BaseProvider]] = None,
                 additional_sentence_templates: Optional[List[str]] = None,
                 random_seed: SeedType = None):
        raw_data_dir = Path(raw_data.__path__[0])

        presidio_templates_file_path = raw_data_dir / "templates.txt"
        self._sentence_templates = PresidioDataGenerator.read_template_file(presidio_templates_file_path)
        if additional_sentence_templates:
            self._sentence_templates.extend(additional_sentence_templates)

        presidio_additional_entity_providers = [IpAddressProvider,
                                                NationalityProvider,
                                                OrganizationProvider,
                                                UsDriverLicenseProvider,
                                                AgeProvider,
                                                AddressProviderNew,
                                                PhoneNumberProviderNew]
        # Build a fresh list instead of extending the argument in place: mutating it would
        # leak the bundled providers into the caller's list (and, with a mutable default,
        # into every later instance). Custom providers are still registered first, as before.
        entity_providers = list(additional_entity_providers or []) + presidio_additional_entity_providers

        fake_person_data_path = raw_data_dir / "FakeNameGenerator.com_3000.csv"
        fake_person_df = pd.read_csv(fake_person_data_path)
        fake_person_df = PresidioDataGenerator.update_fake_name_generator_df(fake_person_df)
        faker = RecordsFaker(records=fake_person_df, locale=locale)

        for entity_provider in entity_providers:
            faker.add_provider(entity_provider)

        self._data_generator = PresidioDataGenerator(custom_faker=faker, lower_case_ratio=lower_case_ratio)
        self._data_generator.seed(random_seed)
        provider_aliases = dict(name='person', credit_card_number='credit_card', date_of_birth='birthday')
        for provider, alias in provider_aliases.items():
            self._data_generator.add_provider_alias(provider_name=provider, new_name=alias)

        # Populated by generate_new_fake_records(); holds the most recently generated batch.
        self.fake_records = None

    def generate_new_fake_records(self, num_samples: int) -> List[FakerSpansResult]:
        """
        Generate a new batch of fake records and store it in `self.fake_records`.

        Span types and template placeholders are rewritten from faker entity names
        to Presidio entity types using `faker_to_presidio_entity_type`.

        :param num_samples: Number of samples requested from the underlying data generator
        :return: The generated records (the same list stored in `self.fake_records`)
        :raises KeyError: If a generated span has a type missing from `faker_to_presidio_entity_type`
        """
        self.fake_records = list(self._data_generator.generate_fake_data(templates=self._sentence_templates,
                                                                         n_samples=num_samples))
        # The placeholder strings do not depend on the sample, so build them once.
        placeholder_replacements = [("{{%s}}" % faker_type, "{{%s}}" % presidio_type)
                                    for faker_type, presidio_type in self.faker_to_presidio_entity_type.items()]
        # Map faker generated entity types to Presidio entity types
        for sample in self.fake_records:
            for span in sample.spans:
                span.type = self.faker_to_presidio_entity_type[span.type]
            for faker_placeholder, presidio_placeholder in placeholder_replacements:
                sample.template = sample.template.replace(faker_placeholder, presidio_placeholder)
        return self.fake_records


if __name__ == "__main__":
    # Generate a reproducible batch of fake records with the bundled templates and providers.
    record_generator = PresidioFakeRecordGenerator(
        locale="en_US",
        lower_case_ratio=0.05,
        random_seed=42,
    )
    generated_records = record_generator.generate_new_fake_records(num_samples=10000)

    # The output lives in the repository-level data directory, three levels above this file.
    output_file = Path(__file__).parent.parent.parent / "data/presidio_data_generator_data.json"

    # Dataclass instances are not JSON serializable, so convert each record to a plain dict first.
    serializable_records = []
    for record in generated_records:
        serializable_records.append(dataclasses.asdict(record))

    with open(str(output_file), "w+", encoding="utf-8") as json_file:
        json.dump(serializable_records, json_file, ensure_ascii=False, indent=2)
36 changes: 16 additions & 20 deletions presidio_evaluator/data_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,22 @@
FakerSpan,
)

SPACY_PRESIDIO_ENTITIES = {
"ORG": "ORGANIZATION",
"NORP": "NRP",
"GPE": "LOCATION",
"LOC": "LOCATION",
"FAC": "LOCATION",
"PERSON": "PERSON",
"LOCATION": "LOCATION",
"ORGANIZATION": "ORGANIZATION",
"DATE": "DATE_TIME",
"TIME": "DATE_TIME",
}
PRESIDIO_SPACY_ENTITIES = {
"PERSON": "PERSON",
"LOCATION": "LOC",
"GPE": "GPE",
"ORGANIZATION": "ORG",
"DATE_TIME": "DATE",
"NRP": "NORP",
}
SPACY_PRESIDIO_ENTITIES = dict(ORG="ORGANIZATION",
NORP="NRP",
GPE="LOCATION",
LOC="LOCATION",
FAC="LOCATION",
PERSON="PERSON",
LOCATION="LOCATION",
ORGANIZATION="ORGANIZATION",
DATE="DATE_TIME",
TIME="DATE_TIME")
PRESIDIO_SPACY_ENTITIES = dict(PERSON="PERSON",
LOCATION="LOC",
GPE="GPE",
ORGANIZATION="ORG",
DATE_TIME="DATE",
NRP="NORP")


class Span:
Expand Down