In [65]:
# Sample sentence to analyze and anonymize
text = "His name is Mr. Jones and his phone number is 212-555-5555"

In [66]:
from presidio_analyzer import AnalyzerEngine

# Build the analyzer engine; this loads the NLP module (spaCy model by default)
# and the other PII recognizers.
analyzer = AnalyzerEngine()

# Analyze the sample text, restricting detection to phone numbers only
analyzer_results = analyzer.analyze(
    text=text,
    entities=["PHONE_NUMBER"],
    language='en',
)

print(analyzer_results)

[type: PHONE_NUMBER, start: 46, end: 58, score: 0.85]


In [67]:
from presidio_analyzer import PatternRecognizer

# Create custom deny-list recognizers: each one flags the listed words
# as an entity of the given type.
titles_recognizer = PatternRecognizer(
    supported_entity="TITLE",
    deny_list=["Mr.", "Mrs.", "Miss"],
)

# Every entry must be comma-separated: Python silently concatenates adjacent
# string literals, so `"hers" "Hers"` would become the single entry "hersHers".
pronoun_recognizer = PatternRecognizer(
    supported_entity="PRONOUN",
    deny_list=["he", "He", "his", "His", "she", "She", "hers", "Hers"],
)

# Add the new custom recognizers to the analyzer's registry
analyzer.registry.add_recognizer(titles_recognizer)
analyzer.registry.add_recognizer(pronoun_recognizer)

# Call the analyzer to get results for the new recognizers only
analyzer_results = analyzer.analyze(
    text=text,
    entities=["TITLE", "PRONOUN"],
    language="en",
)
print(analyzer_results)


[type: PRONOUN, start: 0, end: 3, score: 1.0, type: TITLE, start: 12, end: 15, score: 1.0, type: PRONOUN, start: 26, end: 29, score: 1.0]


In [68]:
# No `entities` filter here, so results come from the default recognizers
# as well as the custom ones added to the analyzer's registry
analyzer_results = analyzer.analyze(
    text=text,
    language='en',
)

print(analyzer_results)

[type: PRONOUN, start: 0, end: 3, score: 1.0, type: TITLE, start: 12, end: 15, score: 1.0, type: PRONOUN, start: 26, end: 29, score: 1.0, type: PERSON, start: 16, end: 21, score: 0.85, type: PHONE_NUMBER, start: 46, end: 58, score: 0.85]


In [69]:
# Convert analyzer results to anonymizer input - a list of dicts,
# one per analyzer result and in the same order
converted_analyzer_results = [
    {
        "start": result.start,
        "end": result.end,
        "score": result.score,
        "entity_type": result.entity_type,
    }
    for result in analyzer_results
]

print(converted_analyzer_results)


[{'start': 0, 'end': 3, 'score': 1.0, 'entity_type': 'PRONOUN'}, {'start': 12, 'end': 15, 'score': 1.0, 'entity_type': 'TITLE'}, {'start': 26, 'end': 29, 'score': 1.0, 'entity_type': 'PRONOUN'}, {'start': 16, 'end': 21, 'score': 0.85, 'entity_type': 'PERSON'}, {'start': 46, 'end': 58, 'score': 0.85, 'entity_type': 'PHONE_NUMBER'}]


In [70]:
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import AnonymizerRequest

# Set up the anonymizer engine
anonymizer = AnonymizerEngine()

# Build the request payload: the text to anonymize, the per-entity
# transformations to apply, and the converted analyzer results.
request = {
    "text": text,
    "transformations": {
        # Applied to every entity type without its own entry below
        "DEFAULT": {"type": "replace", "new_value": "<ANONYMIZED>"},
        "PHONE_NUMBER": {
            "type": "mask",
            "masking_char": "*",
            # 12 is the length of the sample phone number "212-555-5555"
            "chars_to_mask": 12,
            "from_end": True,
        },
        "TITLE": {
            "type": "redact"
        },
    },
    "analyzer_results": converted_analyzer_results,
}

# Create an anonymizer request, reusing the engine created above for its
# built-in anonymizers instead of instantiating a second, throwaway engine
data = AnonymizerRequest(request, anonymizer.builtin_anonymizers)

# Anonymize the text
anonymized_results = anonymizer.anonymize(data)

print(anonymized_results)

<ANONYMIZED> name is  <ANONYMIZED> and <ANONYMIZED> phone number is ************
