In [None]:
from presidio_analyzer import AnalyzerEngine

# Build an analyzer with no arguments, i.e. no custom NLP engine or
# recognizers are passed in here.
# NOTE(review): this cell imports presidio_analyzer, but the cell that
# installs it appears further down the notebook; on a fresh environment the
# install cell has to be run first.
analyzer = AnalyzerEngine()

# List every entity type this analyzer instance reports as supported.
print("Supported Entities:", analyzer.get_supported_entities())




Supported Entities: ['IN_AADHAAR', 'IN_PAN', 'NRP', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'UK_NHS', 'MEDICAL_LICENSE', 'US_PASSPORT', 'IP_ADDRESS', 'IBAN_CODE', 'AU_ABN', 'PERSON', 'CRYPTO', 'DATE_TIME', 'US_SSN', 'URL', 'CREDIT_CARD', 'US_ITIN', 'US_DRIVER_LICENSE', 'LOCATION', 'IN_PASSPORT', 'AU_MEDICARE', 'US_BANK_NUMBER', 'SG_NRIC_FIN', 'UK_NINO', 'IN_VEHICLE_REGISTRATION', 'AU_TFN', 'IN_VOTER', 'AU_ACN']


In [None]:
# Install Presidio and necessary NLP libraries
!pip install presidio-analyzer presidio-anonymizer spacy
!python -m spacy download en_core_web_lg

Collecting presidio-analyzer
  Downloading presidio_analyzer-2.2.357-py3-none-any.whl.metadata (3.3 kB)
Collecting presidio-anonymizer
  Downloading presidio_anonymizer-2.2.357-py3-none-any.whl.metadata (8.2 kB)
Collecting phonenumbers<9.0.0,>=8.12 (from presidio-analyzer)
  Downloading phonenumbers-8.13.55-py2.py3-none-any.whl.metadata (11 kB)
Collecting tldextract (from presidio-analyzer)
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting azure-core (from presidio-anonymizer)
  Downloading azure_core-1.32.0-py3-none-any.whl.metadata (39 kB)
Collecting pycryptodome>=3.10.1 (from presidio-anonymizer)
  Downloading pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting requests-file>=1.4 (from tldextract->presidio-analyzer)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading presidio_analyzer-2.2.357-py3-none-any.whl (112 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [None]:
# # Import required modules
# from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern, RecognizerRegistry
# from presidio_analyzer.recognizer_registry import RecognizerRegistry
# # The PatternRecognizer class was moved in v2.0.0 of presidio-analyzer
# # https://github.com/microsoft/presidio/releases/tag/v2.0.0
# #from presidio_analyzer import PatternRecognizer  # This is no longer needed
# from presidio_analyzer.nlp_engine import SpacyNlpEngine

# from presidio_anonymizer import AnonymizerEngine

In [None]:
from presidio_analyzer import PatternRecognizer, Pattern, AnalyzerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine

# Build an English-only analyzer on top of spaCy's en_core_web_lg model.
nlp_engine = SpacyNlpEngine(models=[{"lang_code": "en", "model_name": "en_core_web_lg"}])
analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])

# Regex patterns for common numeric date formats. Every pattern range-checks
# the month (01-12) and the day (01-31), so look-alike digit groups such as
# "2023-99-99" are not reported as dates.
date_patterns = [
    Pattern(name="Date (MM/DD/YYYY)", regex=r"\b(0[1-9]|1[0-2])/(0[1-9]|[12][0-9]|3[01])/\d{4}\b", score=0.9),
    Pattern(name="Date (YYYY-MM-DD)", regex=r"\b\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])\b", score=0.9),
    Pattern(name="Date (DD-MM-YYYY)", regex=r"\b(0[1-9]|[12][0-9]|3[01])-(0[1-9]|1[0-2])-\d{4}\b", score=0.9),
]

# Permissive SSN shape (any ddd-dd-dddd group). This is the pattern the SSN
# recognizer below is built from.
ssn_pattern = Pattern(name="SSN Pattern", regex=r"\b\d{3}-\d{2}-\d{4}\b", score=0.85)

# Stricter SSN shape: rejects area numbers 000, 666 and 9xx, group 00 and
# serial 0000. It is delimited with \b instead of ^/$ so that it can match an
# SSN embedded in running text rather than only a line consisting solely of
# the number. It is not registered below; it is kept as a drop-in alternative
# to `ssn_pattern`.
us_ssn_pattern = Pattern(
    name="US_SSN_R Pattern",
    regex=r"\b(?!000|666|9\d\d)(\d{3})-(?!00)(\d{2})-(?!0000)(\d{4})\b",
    score=0.85
)

# Wrap the patterns in recognizers that emit the custom DATE and SSN entities.
date_recognizer = PatternRecognizer(supported_entity="DATE", patterns=date_patterns)
ssn_recognizer = PatternRecognizer(supported_entity="SSN", patterns=[ssn_pattern])

# Register the custom recognizers on this analyzer's registry.
analyzer.registry.add_recognizer(date_recognizer)
analyzer.registry.add_recognizer(ssn_recognizer)

# Prose variant of the sample, kept for testing detection inside sentences:
# test_text = """John Doe was born on 05/21/1985. His SSN is 123-45-6789.
# He signed a contract on 2023-10-12 and renewed it on 15-09-2022.
# His email is john.doe@example.com, and his phone number is +1-800-555-1234.
# """

# Compact sample: the same PII values separated only by spaces.
test_text = """John Doe 05/21/1985 123-45-6789 2023-10-12 15-09-2022 john.doe@example.com +1-800-555-1234.
"""

# Entity types to request: the two custom ones (SSN, DATE) followed by
# predefined entity names.
# Shorter alternative:
# entities_to_detect = ["PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD", "PERSON", "LOCATION", "SSN", "DATE"]
entities_to_detect = [
    'SSN', 'DATE', 'NRP', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'UK_NHS',
    'MEDICAL_LICENSE', 'US_PASSPORT', 'IP_ADDRESS', 'IBAN_CODE', 'AU_ABN',
    'PERSON', 'CRYPTO', 'DATE_TIME', 'US_SSN', 'URL', 'CREDIT_CARD',
    'US_ITIN', 'US_DRIVER_LICENSE', 'LOCATION', 'IN_PASSPORT', 'AU_MEDICARE',
    'US_BANK_NUMBER', 'SG_NRIC_FIN', 'UK_NINO', 'IN_VEHICLE_REGISTRATION',
    'AU_TFN', 'IN_VOTER', 'AU_ACN',
]
results = analyzer.analyze(text=test_text, entities=entities_to_detect, language="en")

# Print each hit with its entity type, confidence score and matched substring.
print("\nDetected Entities:")
for result in results:
    detected_text = test_text[result.start:result.end]
    print(f"Entity: {result.entity_type}, Confidence: {result.score:.2f}, Text: '{detected_text}'")





Detected Entities:
Entity: EMAIL_ADDRESS, Confidence: 1.00, Text: 'john.doe@example.com'
Entity: DATE, Confidence: 0.90, Text: '05/21/1985'
Entity: DATE, Confidence: 0.90, Text: '2023-10-12'
Entity: DATE, Confidence: 0.90, Text: '15-09-2022'
Entity: PERSON, Confidence: 0.85, Text: 'John Doe'
Entity: SSN, Confidence: 0.85, Text: '123-45-6789'
Entity: DATE_TIME, Confidence: 0.85, Text: '15-09-2022'
Entity: DATE_TIME, Confidence: 0.60, Text: '05/21/1985'
Entity: DATE_TIME, Confidence: 0.60, Text: '2023-10-12'
Entity: URL, Confidence: 0.50, Text: 'john.do'
Entity: URL, Confidence: 0.50, Text: 'example.com'
Entity: PHONE_NUMBER, Confidence: 0.40, Text: '+1-800-555-1234'


In [None]:
# Report every analyzer hit as: <entity type>: '<matched text>' (Confidence: <score>)
print("Detected PII Entities:")
for result in results:
    # Slice the matched span out of the analysed text.
    detected_text = test_text[result.start:result.end]
    print(f"{result.entity_type}: '{detected_text}' (Confidence: {result.score:.2f})")


Detected PII Entities:
EMAIL_ADDRESS: 'john.doe@example.com' (Confidence: 1.00)
DATE: '05/21/1985' (Confidence: 0.90)
DATE: '2023-10-12' (Confidence: 0.90)
DATE: '15-09-2022' (Confidence: 0.90)
PERSON: 'John Doe' (Confidence: 0.85)
SSN: '123-45-6789' (Confidence: 0.85)
DATE_TIME: '15-09-2022' (Confidence: 0.85)
DATE_TIME: '05/21/1985' (Confidence: 0.60)
DATE_TIME: '2023-10-12' (Confidence: 0.60)
URL: 'john.do' (Confidence: 0.50)
URL: 'example.com' (Confidence: 0.50)
PHONE_NUMBER: '+1-800-555-1234' (Confidence: 0.40)


In [None]:
# List the entity types the current `analyzer` reports as supported. By this
# point the custom DATE and SSN recognizers have been added to its registry,
# so both names are expected in the listing.
print("Supported Entities:", analyzer.get_supported_entities())


Supported Entities: ['IN_AADHAAR', 'IN_PAN', 'EMAIL_ADDRESS', 'NRP', 'PHONE_NUMBER', 'UK_NHS', 'MEDICAL_LICENSE', 'IP_ADDRESS', 'US_PASSPORT', 'IBAN_CODE', 'AU_ABN', 'PERSON', 'ID', 'CRYPTO', 'DATE_TIME', 'SSN', 'US_SSN', 'URL', 'EMAIL', 'CREDIT_CARD', 'US_ITIN', 'AGE', 'US_DRIVER_LICENSE', 'DATE', 'LOCATION', 'IN_PASSPORT', 'US_BANK_NUMBER', 'AU_MEDICARE', 'SG_NRIC_FIN', 'UK_NINO', 'IN_VEHICLE_REGISTRATION', 'AU_TFN', 'IN_VOTER', 'AU_ACN']


In [None]:
# Initialize Presidio Anonymizer.
# AnonymizerEngine is imported in this cell because the only earlier import
# of it visible in the notebook is commented out; without this line the cell
# raises NameError on a fresh kernel.
from presidio_anonymizer import AnonymizerEngine

anonymizer = AnonymizerEngine()


In [None]:
# Anonymize the sample text using the spans reported by the analyzer.
# No operators are passed, so the engine's default behaviour applies.
anonymized_text = anonymizer.anonymize(
    text=test_text,
    analyzer_results=results,
)


In [None]:
# Show the anonymized text under a heading (heading and text on separate lines).
print("\nAnonymized Text:", anonymized_text.text, sep="\n")


Anonymized Text:
<PERSON> was born on <DATE>. His SSN is <SSN>. 
He signed a contract on <DATE> and renewed it on <DATE>. 
His email is <EMAIL_ADDRESS>, and his phone number is <PHONE_NUMBER>.



In [None]:
# Sample text containing PII, one statement per line. The text starts and
# ends with a newline, hence the empty first and last items.
test_text = "\n".join(
    [
        "",
        "John Doe lives in New York and works at Google.",
        "His email is john.doe@example.com, and his phone number is +1-800-555-1234.",
        "His credit card number is 4111-1111-1111-1111.",
        "His social security number is 123-45-6789.",
        "Born on 21-12-2023",
        "",
    ]
)