In [None]:
from transformers import pipeline

# Load a zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Synthetic text with private information
text = """
In recent years, the rapid advancement of technology has reshaped the way businesses operate globally.
Companies like TechCorp have been at the forefront of this change, with notable individuals such as John Doe (phone number: (555) 123-4567)
leading the charge in innovation. While TechCorp focuses on software, other industries like healthcare are seeing similar changes.
For example, Jane Smith, residing at 123 Maple Street, Springfield, has been working on medical research breakthroughs.
In finance, modern banking methods have allowed customers to securely store information.
For instance, client Michael Johnson has a registered account with the number 9876-5432-1098-7654 and regularly uses this for transactions.
Companies are also using data analytics to predict trends, and addresses such as 567 Oak Avenue, Metropolis often appear in demographic studies.
Researchers often refer to real-world examples; a common example might include a credit card like 1234-5678-9012-3456 used for illustrative purposes.
Contacting the support team at support@techcorp.com has become easier, providing direct assistance for all users.
"""

# Split text into sentences or smaller parts for more granular classification
sentences = text.split(". ")

# Define labels to use in zero-shot classification
labels = ["sensitive information", "non-sensitive information"]

# Process each sentence and classify it
sensitive_info = []
for sentence in sentences:
    result = classifier(sentence, labels)
    if result["labels"][0] == "sensitive information" and result["scores"][0] > 0.5:
        sensitive_info.append(sentence)

# Print detected sensitive information
print("Detected Sensitive Information:")
for info in sensitive_info:
    print(info)


Detected Sensitive Information:

In recent years, the rapid advancement of technology has reshaped the way businesses operate globally
Companies like TechCorp have been at the forefront of this change, with notable individuals such as John Doe (phone number: (555) 123-4567) leading the charge in innovation
For example, Jane Smith, residing at 123 Maple Street, Springfield, has been working on medical research breakthroughs
In finance, modern banking methods have allowed customers to securely store information
For instance, client Michael Johnson has a registered account with the number 9876-5432-1098-7654 and regularly uses this for transactions
Companies are also using data analytics to predict trends, and addresses such as 567 Oak Avenue, Metropolis often appear in demographic studies


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re

# Load a pre-trained NER model from Hugging Face
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"  # This model recognizes entities like names and locations
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Initialize a pipeline for named entity recognition
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Sample text with private information
text = """
In recent years, the rapid advancement of technology has reshaped the way businesses operate globally.
Companies like TechCorp have been at the forefront of this change, with notable individuals such as John Doe (phone number: (555) 123-4567) leading the charge in innovation.
For example, Jane Smith, residing at 123 Maple Street, Springfield, has been working on medical research breakthroughs.
In finance, modern banking methods have allowed customers to securely store information.
For instance, client Michael Johnson has a registered account with the number 9876-5432-1098-7654 and regularly uses this for transactions.
Companies are also using data analytics to predict trends, and addresses such as 567 Oak Avenue, Metropolis often appear in demographic studies.
"""

# Initialize a dictionary to store extracted information
results = {
    "Names": [],
    "Phone Numbers": [],
    "Addresses": [],
    "Credit Card Numbers": [],
}

# Step 1: Use the NER model to extract names and locations
entities = ner_pipeline(text)
for entity in entities:
    if entity["entity_group"] == "PER":  # Person names
        results["Names"].append(entity["word"])
    elif entity["entity_group"] in ["LOC", "GPE"]:  # Locations as potential addresses
        results["Addresses"].append(entity["word"])

# Step 2: Use regex to extract phone numbers and credit card numbers
# Patterns for phone numbers and credit card numbers
phone_pattern = r"\(\d{3}\)\s\d{3}-\d{4}"
cc_pattern = r"\b(?:\d{4}[-.\s]?){3}\d{4}\b"

# Extract phone numbers and credit card numbers from the text
results["Phone Numbers"] = re.findall(phone_pattern, text)
results["Credit Card Numbers"] = re.findall(cc_pattern, text)

# Print the results in the specified format
print("Names:", ", ".join(results["Names"]))
print("Phone Numbers:", ", ".join(results["Phone Numbers"]))
print("Addresses:", ", ".join(results["Addresses"]))
print("Credit Card Numbers:", ", ".join(results["Credit Card Numbers"]))

tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Names: John Do, Jane Smith, Michael Johnson
Phone Numbers: (555) 123-4567
Addresses: Maple Street, Springfield, Oak Avenue, Metropolis
Credit Card Numbers: 9876-5432-1098-7654


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re

# Load a pre-trained NER model from Hugging Face
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"  # This model recognizes entities like names and locations
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)


# Initialize a pipeline for named entity recognition
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Synthetic text with various sensitive information for testing
text = """
**Corporate Records and Client Profiles**

Client Profile:
Jessica Alcott, a valued client, resides at 89 Pine Street, Hillsborough. Her primary phone number is (312) 555-0199, and her Social Security Number is 987-65-4320. She recently updated her driver's license to ID123456789. Jessica's personal email, jessica.alcott@example.com, is stored in our confidential client database along with her IP address, 192.168.45.12, for tracking her account activity.

Dr. Michael Green, a research scientist from Newville, has been working on a confidential health-related study. His office is located at 123 Elm Avenue, Newville, where he can be reached via office phone (212) 555-0482. His passport number is A98765432, and his private website for research documentation is www.michaelgreen-research.com. Dr. Green has also authorized us to store his biometric voice print and retina scan data for secure lab access.

Finance Department Notes:
Benjamin Parker, an executive client, uses his company-issued credit card, 4539-1482-5674-8952, for business expenses. His bank account number is 123456789012, and his routing number is 021000021. Ben also has a prepaid card with number 6789-1234-5678-9012, kept on file for emergencies. For identity verification, his DNA profile and a backup of his genetic data are stored in our secure database.

Healthcare Records:
Angela Bryant, an employee under our health plan, has recently been diagnosed with a confidential health condition. Her medical record number is MRN1029384756, and her insurance policy lists her health plan beneficiary number as HPN-98765. Angela lives at 202 Oakwood Lane, Rivertown. Emergency contact is her husband, Paul Bryant, reachable at (323) 555-8765. Angela's full-face photographic image and fingerprint scan are stored to comply with health provider requirements.

Legal and Regulatory Information:
Samuel T. Wright, an employee at TechCorp, has political opinions that he keeps confidential. His citizenship status and immigration documents, including passport number G45678901, are verified by the HR department. Samuel also holds union membership with Local 342. His business profile is associated with username "techSam1234," and he has authorized browsing history analysis on his work device to ensure compliance with company policy.

Digital Interaction and Device Data:
Our company website, www.techcorp-business.com, provides a public point of contact at (555) 213-4567. However, sensitive client data is stored on secure devices, such as mobile device identifier MDID-9876543210, and encrypted IP addresses for each user session. Company policies require that device serial numbers like SN-ABC1234567 be logged for each interaction with sensitive documents.

John Smith, a business associate, often communicates with TechCorp via email at john.smith@techcorp.com. His browsing history reveals visits to sensitive research portals, and he uses a biometric facial recognition system to access confidential company data. He also receives two-factor authentication codes on his registered device.

Employment Records:
Employee records show that Sarah Johnson, job title "Senior Analyst," has a salary of $90,000 per annum. Her employee ID is SJ001234567, and she holds a driver's license number DL456789012. Her health insurance coverage includes a full dental plan, and her genetic data is on file due to company policy. Sarah’s purchase history for work-related equipment is regularly reviewed for compliance.

System Logs and Monitoring:
Corporate device 10.0.0.5 was accessed using login credentials of user "admin" on March 3, 2023. This activity was logged, and browsing history was saved for review. The network administrator has verified the authenticity of the browser cookie and recorded it for audit purposes. All actions on this device are tracked, including any login attempts.

**Confidential Communications and Transaction Logs**
On April 12, 2023, client Margaret Lee conducted a high-value transaction with account number 234567890123 and routing number 987654321. The transaction involved the use of her prepaid gift card (card number 4321-8765-1234-5678). Margaret, a frequent user, has her IP address logged as 172.16.254.1. Her personal profile, created under pseudonym "userML254," reflects purchase history and previous transactions.

Confidential Medical Report:
Peter D’Souza, health record number 6789123456, is under treatment for a rare genetic disorder. His personal physician has authorized his medical data, including DNA sequencing, to be available for secure consultation. Peter's last known address is 456 Willow Drive, Lakewood, and his primary contact number is (415) 555-4567. His biometric fingerprint is stored for hospital identification.
"""

# Keywords indicating private or sensitive context
sensitivity_keywords = [
    "confidential", "personal", "private", "client", "account", "sensitive",
    "registered", "protected", "classified", "restricted", "secure", "health", "medical"
]

# Initialize a dictionary to store extracted information based on the sensitive categories
results = {
    "Names": [],
    "Addresses": [],
    "Dates": [],
    "Phone Numbers": [],
    "Fax Numbers": [],
    "Email Addresses": [],
    "Social Security Numbers": [],
    "Passport Numbers": [],
    "Account Numbers": [],
    "Credit Card Numbers": [],
    "URLs": [],
    "IP Addresses": [],
    "Device Identifiers": [],
    "License Plate Numbers": [],
    "Driver's License Numbers": [],
    "Biometric Identifiers": [],
    "Health Data": [],
    "Genetic Data": [],
    "Employment Data": [],
    "Browsing History": [],
}

# Step 1: Use the NER model to extract potential names, addresses, and dates
entities = ner_pipeline(text)
for entity in entities:
    if entity["entity_group"] == "PER":  # Person names
        results["Names"].append(entity["word"])
    elif entity["entity_group"] in ["LOC", "GPE"]:  # Locations as potential addresses
        results["Addresses"].append(entity["word"])
    elif entity["entity_group"] == "DATE":  # Dates
        results["Dates"].append(entity["word"])

# Step 2: Use regex patterns to capture structured sensitive data
patterns = {
    "Phone Numbers": r"\(\d{3}\)\s\d{3}-\d{4}",
    "Fax Numbers": r"Fax:\s*(\(\d{3}\)\s\d{3}-\d{4})",
    "Email Addresses": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    "Social Security Numbers": r"\b\d{3}-\d{2}-\d{4}\b",
    "Passport Numbers": r"\b[A-Z0-9]{7,9}\b",
    "Account Numbers": r"\b\d{9,12}\b",
    "Credit Card Numbers": r"\b(?:\d{4}[-.\s]?){3}\d{4}\b",
    "URLs": r"(https?://[^\s]+|www\.[^\s]+)",
    "IP Addresses": r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",
    "Device Identifiers": r"\b(?:[A-Za-z0-9]{15,20})\b",
    "License Plate Numbers": r"\b[A-Z0-9]{1,3}-[A-Z0-9]{1,3}-[0-9]{1,4}\b",
    "Driver's License Numbers": r"\b[A-Z0-9]{8,12}\b",
    "Biometric Identifiers": r"\b(fingerprint|retina|voiceprint|facial recognition|biometric)\b",
    "Health Data": r"\b(diagnosis|health condition|medical record|treatment|prescription)\b",
    "Genetic Data": r"\b(DNA|genetic|chromosome|genome)\b",
    "Employment Data": r"\b(employee|employer|job title|salary|employment)\b",
    "Browsing History": r"\b(visited|browser|history|cookies|search)\b",
}

# Apply each regex pattern to the text
for category, pattern in patterns.items():
    results[category].extend(re.findall(pattern, text))

# Step 3: Contextual Filtering based on sensitivity keywords
def is_sensitive(text, keywords):
    """Check if any keyword is found near the sensitive entity in text."""
    return any(keyword in text.lower() for keyword in keywords)

# Filter sensitive results based on context
sensitive_info = {category: [] for category in results}
for category, entities in results.items():
    for entity in entities:
        # Find the context around each entity in the text
        context_start = max(text.find(entity) - 50, 0)
        context_end = min(text.find(entity) + len(entity) + 50, len(text))
        context_text = text[context_start:context_end]

        # Check if the context contains any sensitivity keyword
        if is_sensitive(context_text, sensitivity_keywords):
            sensitive_info[category].append(entity)

# Print only contextually sensitive information
print("Detected Sensitive Information:")
for category, items in sensitive_info.items():
    if items:
        print(f"{category}: {', '.join(items)}")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Detected Sensitive Information:
Names: Jessica Alcott, Jessica, Michael Green, Green, Benjamin Parker, Ben, Angela Bryant, Angela, Angela
Addresses: Pine Street, Hillsborough, Newville, Newville
Phone Numbers: (555) 213-4567
Email Addresses: jessica.alcott@example.com, john.smith@techcorp.com
Passport Numbers: A98765432
Account Numbers: 123456789012, 234567890123, 6789123456
Credit Card Numbers: 4539-1482-5674-8952
URLs: www.michaelgreen-research.com.
IP Addresses: 192.168.45.12, 172.16.254.1
Driver's License Numbers: ID123456789, A98765432, 123456789012, ABC1234567, DL456789012, 234567890123, 6789123456
Biometric Identifiers: biometric, retina, fingerprint, biometric, facial recognition, biometric, fingerprint
Health Data: health condition, medical record, treatment
Genetic Data: genetic, genetic, genetic
Employment Data: employee, employee, employee


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sentence_transformers import SentenceTransformer
import re
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load a high-quality NER model from Hugging Face for entity recognition
model_name = "roberta-large"  # Use RoBERTa large for improved NER accuracy
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Load Sentence-BERT for vector-based similarity matching
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Synthetic text with various sensitive information for testing
text = """
**Corporate Records and Client Profiles**

Client Profile:
Jessica Alcott, a valued client, resides at 89 Pine Street, Hillsborough. Her primary phone number is (312) 555-0199, and her Social Security Number is 987-65-4320. She recently updated her driver's license to ID123456789. Jessica's personal email, jessica.alcott@example.com, is stored in our confidential client database along with her IP address, 192.168.45.12, for tracking her account activity.

Dr. Michael Green, a research scientist from Newville, has been working on a confidential health-related study. His office is located at 123 Elm Avenue, Newville, where he can be reached via office phone (212) 555-0482. His passport number is A98765432, and his private website for research documentation is www.michaelgreen-research.com. Dr. Green has also authorized us to store his biometric voice print and retina scan data for secure lab access.

Finance Department Notes:
Benjamin Parker, an executive client, uses his company-issued credit card, 4539-1482-5674-8952, for business expenses. His bank account number is 123456789012, and his routing number is 021000021. Ben also has a prepaid card with number 6789-1234-5678-9012, kept on file for emergencies. For identity verification, his DNA profile and a backup of his genetic data are stored in our secure database.

Healthcare Records:
Angela Bryant, an employee under our health plan, has recently been diagnosed with a confidential health condition. Her medical record number is MRN1029384756, and her insurance policy lists her health plan beneficiary number as HPN-98765. Angela lives at 202 Oakwood Lane, Rivertown. Emergency contact is her husband, Paul Bryant, reachable at (323) 555-8765. Angela's full-face photographic image and fingerprint scan are stored to comply with health provider requirements.

Legal and Regulatory Information:
Samuel T. Wright, an employee at TechCorp, has political opinions that he keeps confidential. His citizenship status and immigration documents, including passport number G45678901, are verified by the HR department. Samuel also holds union membership with Local 342. His business profile is associated with username "techSam1234," and he has authorized browsing history analysis on his work device to ensure compliance with company policy.

Digital Interaction and Device Data:
Our company website, www.techcorp-business.com, provides a public point of contact at (555) 213-4567. However, sensitive client data is stored on secure devices, such as mobile device identifier MDID-9876543210, and encrypted IP addresses for each user session. Company policies require that device serial numbers like SN-ABC1234567 be logged for each interaction with sensitive documents.

John Smith, a business associate, often communicates with TechCorp via email at john.smith@techcorp.com. His browsing history reveals visits to sensitive research portals, and he uses a biometric facial recognition system to access confidential company data. He also receives two-factor authentication codes on his registered device.

Employment Records:
Employee records show that Sarah Johnson, job title "Senior Analyst," has a salary of $90,000 per annum. Her employee ID is SJ001234567, and she holds a driver's license number DL456789012. Her health insurance coverage includes a full dental plan, and her genetic data is on file due to company policy. Sarah’s purchase history for work-related equipment is regularly reviewed for compliance.

System Logs and Monitoring:
Corporate device 10.0.0.5 was accessed using login credentials of user "admin" on March 3, 2023. This activity was logged, and browsing history was saved for review. The network administrator has verified the authenticity of the browser cookie and recorded it for audit purposes. All actions on this device are tracked, including any login attempts.

**Confidential Communications and Transaction Logs**
On April 12, 2023, client Margaret Lee conducted a high-value transaction with account number 234567890123 and routing number 987654321. The transaction involved the use of her prepaid gift card (card number 4321-8765-1234-5678). Margaret, a frequent user, has her IP address logged as 172.16.254.1. Her personal profile, created under pseudonym "userML254," reflects purchase history and previous transactions.

Confidential Medical Report:
Peter D’Souza, health record number 6789123456, is under treatment for a rare genetic disorder. His personal physician has authorized his medical data, including DNA sequencing, to be available for secure consultation. Peter's last known address is 456 Willow Drive, Lakewood, and his primary contact number is (415) 555-4567. His biometric fingerprint is stored for hospital identification.
"""

# Keywords indicating sensitive context
sensitivity_keywords = [
    "confidential", "personal", "private", "client", "account", "sensitive",
    "registered", "protected", "classified", "restricted", "secure", "health", "medical"
]

# Initialize a dictionary to store extracted information based on the sensitive categories
results = {
    "Names": [],
    "Addresses": [],
    "Dates": [],
    "Phone Numbers": [],
    "Fax Numbers": [],
    "Email Addresses": [],
    "Social Security Numbers": [],
    "Passport Numbers": [],
    "Account Numbers": [],
    "Credit Card Numbers": [],
    "URLs": [],
    "IP Addresses": [],
    "Device Identifiers": [],
    "License Plate Numbers": [],
    "Driver's License Numbers": [],
    "Biometric Identifiers": [],
    "Health Data": [],
    "Genetic Data": [],
    "Employment Data": [],
    "Browsing History": [],
}

# Step 1: Use the NER model to extract potential names, addresses, and dates
entities = ner_pipeline(text)
for entity in entities:
    if entity["entity_group"] == "PER":  # Person names
        results["Names"].append(entity["word"])
    elif entity["entity_group"] in ["LOC", "GPE"]:  # Locations as potential addresses
        results["Addresses"].append(entity["word"])
    elif entity["entity_group"] == "DATE":  # Dates
        results["Dates"].append(entity["word"])

# Step 2: Use regex patterns to capture structured sensitive data
patterns = {
    "Phone Numbers": r"\(\d{3}\)\s\d{3}-\d{4}",
    "Fax Numbers": r"Fax:\s*(\(\d{3}\)\s\d{3}-\d{4})",
    "Email Addresses": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    "Social Security Numbers": r"\b\d{3}-\d{2}-\d{4}\b",
    "Passport Numbers": r"\b[A-Z0-9]{7,9}\b",
    "Account Numbers": r"\b\d{9,12}\b",
    "Credit Card Numbers": r"\b(?:\d{4}[-.\s]?){3}\d{4}\b",
    "URLs": r"(https?://[^\s]+|www\.[^\s]+)",
    "IP Addresses": r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",
    "Device Identifiers": r"\b(?:[A-Za-z0-9]{15,20})\b",
    "License Plate Numbers": r"\b[A-Z0-9]{1,3}-[A-Z0-9]{1,3}-[0-9]{1,4}\b",
    "Driver's License Numbers": r"\b[A-Z0-9]{8,12}\b",
    "Biometric Identifiers": r"\b(fingerprint|retina|voiceprint|facial recognition|biometric)\b",
    "Health Data": r"\b(diagnosis|health condition|medical record|treatment|prescription)\b",
    "Genetic Data": r"\b(DNA|genetic|chromosome|genome)\b",
    "Employment Data": r"\b(employee|employer|job title|salary|employment)\b",
    "Browsing History": r"\b(visited|browser|history|cookies|search)\b",
}

# Apply each regex pattern to the text
for category, pattern in patterns.items():
    results[category].extend(re.findall(pattern, text))

# Step 3: Vector similarity search for contextual filtering
# Convert keywords into embeddings
sensitive_embeddings = sentence_model.encode(sensitivity_keywords)

# Function to check sensitivity context
def is_sensitive(entity, context):
    # Generate embedding for the context
    context_embedding = sentence_model.encode(context)
    # Check similarity with sensitive embeddings
    similarity_scores = cosine_similarity([context_embedding], sensitive_embeddings)
    # Return True if any similarity score exceeds threshold
    return np.any(similarity_scores > 0.7)

# Filter contextually sensitive information
sensitive_info = {category: [] for category in results}
for category, entities in results.items():
    for entity in entities:
        # Find the context around each entity in the text
        context_start = max(text.find(entity) - 50, 0)
        context_end = min(text.find(entity) + len(entity) + 50, len(text))
        context_text = text[context_start:context_end]

        # Check if the context contains any sensitivity keywords or is sensitive based on vector similarity
        if any(keyword in context_text.lower() for keyword in sensitivity_keywords) or is_sensitive(entity, context_text):
            sensitive_info[category].append(entity)

# Print only contextually sensitive information
print("Detected Sensitive Information:")
for category, items in sensitive_info.items():
    if items:
        print(f"{category}: {', '.join(items)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

IndexError: index out of range in self

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sentence_transformers import SentenceTransformer
import re
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load a high-quality NER model from Hugging Face for entity recognition
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"  # Use BERT-large-cased model for NER
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Load Sentence-BERT for vector-based similarity matching
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

text = """
**Corporate Records and Client Profiles**

Client Profile:
Jessica Alcott, a valued client, resides at 89 Pine Street, Hillsborough. Her primary phone number is (312) 555-0199, and her Social Security Number is 987-65-4320. She recently updated her driver's license to ID123456789. Jessica's personal email, jessica.alcott@example.com, is stored in our confidential client database along with her IP address, 192.168.45.12, for tracking her account activity.

Dr. Michael Green, a research scientist from Newville, has been working on a confidential health-related study. His office is located at 123 Elm Avenue, Newville, where he can be reached via office phone (212) 555-0482. His passport number is A98765432, and his private website for research documentation is www.michaelgreen-research.com. Dr. Green has also authorized us to store his biometric voice print and retina scan data for secure lab access.

Finance Department Notes:
Benjamin Parker, an executive client, uses his company-issued credit card, 4539-1482-5674-8952, for business expenses. His bank account number is 123456789012, and his routing number is 021000021. Ben also has a prepaid card with number 6789-1234-5678-9012, kept on file for emergencies. For identity verification, his DNA profile and a backup of his genetic data are stored in our secure database.

Healthcare Records:
Angela Bryant, an employee under our health plan, has recently been diagnosed with a confidential health condition. Her medical record number is MRN1029384756, and her insurance policy lists her health plan beneficiary number as HPN-98765. Angela lives at 202 Oakwood Lane, Rivertown. Emergency contact is her husband, Paul Bryant, reachable at (323) 555-8765. Angela's full-face photographic image and fingerprint scan are stored to comply with health provider requirements.

Legal and Regulatory Information:
Samuel T. Wright, an employee at TechCorp, has political opinions that he keeps confidential. His citizenship status and immigration documents, including passport number G45678901, are verified by the HR department. Samuel also holds union membership with Local 342. His business profile is associated with username "techSam1234," and he has authorized browsing history analysis on his work device to ensure compliance with company policy.

Digital Interaction and Device Data:
Our company website, www.techcorp-business.com, provides a public point of contact at (555) 213-4567. However, sensitive client data is stored on secure devices, such as mobile device identifier MDID-9876543210, and encrypted IP addresses for each user session. Company policies require that device serial numbers like SN-ABC1234567 be logged for each interaction with sensitive documents.

John Smith, a business associate, often communicates with TechCorp via email at john.smith@techcorp.com. His browsing history reveals visits to sensitive research portals, and he uses a biometric facial recognition system to access confidential company data. He also receives two-factor authentication codes on his registered device.

Employment Records:
Employee records show that Sarah Johnson, job title "Senior Analyst," has a salary of $90,000 per annum. Her employee ID is SJ001234567, and she holds a driver's license number DL456789012. Her health insurance coverage includes a full dental plan, and her genetic data is on file due to company policy. Sarah’s purchase history for work-related equipment is regularly reviewed for compliance.

System Logs and Monitoring:
Corporate device 10.0.0.5 was accessed using login credentials of user "admin" on March 3, 2023. This activity was logged, and browsing history was saved for review. The network administrator has verified the authenticity of the browser cookie and recorded it for audit purposes. All actions on this device are tracked, including any login attempts.

**Confidential Communications and Transaction Logs**
On April 12, 2023, client Margaret Lee conducted a high-value transaction with account number 234567890123 and routing number 987654321. The transaction involved the use of her prepaid gift card (card number 4321-8765-1234-5678). Margaret, a frequent user, has her IP address logged as 172.16.254.1. Her personal profile, created under pseudonym "userML254," reflects purchase history and previous transactions.

Confidential Medical Report:
Peter D’Souza, health record number 6789123456, is under treatment for a rare genetic disorder. His personal physician has authorized his medical data, including DNA sequencing, to be available for secure consultation. Peter's last known address is 456 Willow Drive, Lakewood, and his primary contact number is (415) 555-4567. His biometric fingerprint is stored for hospital identification.
"""

# Keywords indicating sensitive context
sensitivity_keywords = [
    "confidential", "personal", "private", "client", "account", "sensitive",
    "registered", "protected", "classified", "restricted", "secure", "health", "medical"
]

# Initialize a dictionary to store extracted information based on the sensitive categories
results = {
    "Names": [],
    "Addresses": [],
    "Dates": [],
    "Phone Numbers": [],
    "Fax Numbers": [],
    "Email Addresses": [],
    "Social Security Numbers": [],
    "Passport Numbers": [],
    "Account Numbers": [],
    "Credit Card Numbers": [],
    "URLs": [],
    "IP Addresses": [],
    "Device Identifiers": [],
    "License Plate Numbers": [],
    "Driver's License Numbers": [],
    "Biometric Identifiers": [],
    "Health Data": [],
    "Genetic Data": [],
    "Employment Data": [],
    "Browsing History": [],
}

# Split text into smaller chunks (e.g., paragraphs) for processing
def chunk_text(text, max_length=512):
    """Split text into chunks of max_length or less."""
    sentences = text.split("\n\n")
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_length:
            current_chunk += sentence + "\n\n"
        else:
            chunks.append(current_chunk)
            current_chunk = sentence + "\n\n"
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

# Step 1: Use the NER model to extract potential names, addresses, and dates
text_chunks = chunk_text(text)
for chunk in text_chunks:
    entities = ner_pipeline(chunk)
    for entity in entities:
        if entity["entity_group"] == "PER":  # Person names
            results["Names"].append(entity["word"])
        elif entity["entity_group"] in ["LOC", "GPE"]:  # Locations as potential addresses
            results["Addresses"].append(entity["word"])
        elif entity["entity_group"] == "DATE":  # Dates
            results["Dates"].append(entity["word"])

# Step 2: Use regex patterns to capture structured sensitive data
patterns = {
    "Phone Numbers": r"\(\d{3}\)\s\d{3}-\d{4}",
    "Fax Numbers": r"Fax:\s*(\(\d{3}\)\s\d{3}-\d{4})",
    "Email Addresses": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    "Social Security Numbers": r"\b\d{3}-\d{2}-\d{4}\b",
    "Passport Numbers": r"\b[A-Z0-9]{7,9}\b",
    "Account Numbers": r"\b\d{9,12}\b",
    "Credit Card Numbers": r"\b(?:\d{4}[-.\s]?){3}\d{4}\b",
    "URLs": r"(https?://[^\s]+|www\.[^\s]+)",
    "IP Addresses": r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",
    "Device Identifiers": r"\b(?:[A-Za-z0-9]{15,20})\b",
    "License Plate Numbers": r"\b[A-Z0-9]{1,3}-[A-Z0-9]{1,3}-[0-9]{1,4}\b",
    "Driver's License Numbers": r"\b[A-Z0-9]{8,12}\b",
    "Biometric Identifiers": r"\b(fingerprint|retina|voiceprint|facial recognition|biometric)\b",
    "Health Data": r"\b(diagnosis|health condition|medical record|treatment|prescription)\b",
    "Genetic Data": r"\b(DNA|genetic|chromosome|genome)\b",
    "Employment Data": r"\b(employee|employer|job title|salary|employment)\b",
    "Browsing History": r"\b(visited|browser|history|cookies|search)\b",
}

# Apply each regex pattern to the text
for category, pattern in patterns.items():
    results[category].extend(re.findall(pattern, text))

# Step 3: Vector similarity search for contextual filtering
sensitive_embeddings = sentence_model.encode(sensitivity_keywords)

# Function to check sensitivity context
def is_sensitive(entity, context):
    context_embedding = sentence_model.encode(context)
    similarity_scores = cosine_similarity([context_embedding], sensitive_embeddings)
    return np.any(similarity_scores > 0.7)

# Filter contextually sensitive information
sensitive_info = {category: [] for category in results}
for category, entities in results.items():
    for entity in entities:
        context_start = max(text.find(entity) - 50, 0)
        context_end = min(text.find(entity) + len(entity) + 50, len(text))
        context_text = text[context_start:context_end]
        if any(keyword in context_text.lower() for keyword in sensitivity_keywords) or is_sensitive(entity, context_text):
            sensitive_info[category].append(entity)

# Print only contextually sensitive information
print("Detected Sensitive Information:")
for category, items in sensitive_info.items():
    if items:
        print(f"{category}: {', '.join(items)}")


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Detected Sensitive Information:
Names: Jessica Alcott, Jessica, al, Michael Green, Green, Benjamin Parker, Ben, Angela Bryant, Angela, Angela, John Smith, Margaret Lee, Margaret, Peter D ’ Souza, Peter
Addresses: Pine Street, Hillsborough, Newville, Newville
Phone Numbers: (555) 213-4567
Email Addresses: jessica.alcott@example.com, john.smith@techcorp.com
Passport Numbers: A98765432
Account Numbers: 123456789012, 234567890123, 6789123456
Credit Card Numbers: 4539-1482-5674-8952
URLs: www.michaelgreen-research.com.
IP Addresses: 192.168.45.12, 172.16.254.1
Driver's License Numbers: ID123456789, A98765432, 123456789012, ABC1234567, DL456789012, 234567890123, 6789123456
Biometric Identifiers: biometric, retina, fingerprint, biometric, facial recognition, biometric, fingerprint
Health Data: health condition, medical record, treatment
Genetic Data: genetic, genetic, genetic
Employment Data: employee, employee, employee


In [None]:
!pip install gliner -U

Collecting gliner
  Downloading gliner-0.2.13-py3-none-any.whl.metadata (7.3 kB)
Collecting onnxruntime (from gliner)
  Downloading onnxruntime-1.20.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting coloredlogs (from onnxruntime->gliner)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->gliner)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading gliner-0.2.13-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxruntime-1.20.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from gliner import GLiNER

# Load the fine-tuned GLiNER model
model = GLiNER.from_pretrained("gretelai/gretel-gliner-bi-large-v1.0")

# Sample text containing PII/PHI entities
text = """
**Corporate Records and Client Profiles**

Client Profile:
Jessica Alcott, a valued client, resides at 89 Pine Street, Hillsborough. Her primary phone number is (312) 555-0199, and her Social Security Number is 987-65-4320. She recently updated her driver's license to ID123456789. Jessica's personal email, jessica.alcott@example.com, is stored in our confidential client database along with her IP address, 192.168.45.12, for tracking her account activity.

Dr. Michael Green, a research scientist from Newville, has been working on a confidential health-related study. His office is located at 123 Elm Avenue, Newville, where he can be reached via office phone (212) 555-0482. His passport number is A98765432, and his private website for research documentation is www.michaelgreen-research.com. Dr. Green has also authorized us to store his biometric voice print and retina scan data for secure lab access.

Finance Department Notes:
Benjamin Parker, an executive client, uses his company-issued credit card, 4539-1482-5674-8952, for business expenses. His bank account number is 123456789012, and his routing number is 021000021. Ben also has a prepaid card with number 6789-1234-5678-9012, kept on file for emergencies. For identity verification, his DNA profile and a backup of his genetic data are stored in our secure database.

Healthcare Records:
Angela Bryant, an employee under our health plan, has recently been diagnosed with a confidential health condition. Her medical record number is MRN1029384756, and her insurance policy lists her health plan beneficiary number as HPN-98765. Angela lives at 202 Oakwood Lane, Rivertown. Emergency contact is her husband, Paul Bryant, reachable at (323) 555-8765. Angela's full-face photographic image and fingerprint scan are stored to comply with health provider requirements.

Legal and Regulatory Information:
Samuel T. Wright, an employee at TechCorp, has political opinions that he keeps confidential. His citizenship status and immigration documents, including passport number G45678901, are verified by the HR department. Samuel also holds union membership with Local 342. His business profile is associated with username "techSam1234," and he has authorized browsing history analysis on his work device to ensure compliance with company policy.

Digital Interaction and Device Data:
Our company website, www.techcorp-business.com, provides a public point of contact at (555) 213-4567. However, sensitive client data is stored on secure devices, such as mobile device identifier MDID-9876543210, and encrypted IP addresses for each user session. Company policies require that device serial numbers like SN-ABC1234567 be logged for each interaction with sensitive documents.

John Smith, a business associate, often communicates with TechCorp via email at john.smith@techcorp.com. His browsing history reveals visits to sensitive research portals, and he uses a biometric facial recognition system to access confidential company data. He also receives two-factor authentication codes on his registered device.

Employment Records:
Employee records show that Sarah Johnson, job title "Senior Analyst," has a salary of $90,000 per annum. Her employee ID is SJ001234567, and she holds a driver's license number DL456789012. Her health insurance coverage includes a full dental plan, and her genetic data is on file due to company policy. Sarah’s purchase history for work-related equipment is regularly reviewed for compliance.

System Logs and Monitoring:
Corporate device 10.0.0.5 was accessed using login credentials of user "admin" on March 3, 2023. This activity was logged, and browsing history was saved for review. The network administrator has verified the authenticity of the browser cookie and recorded it for audit purposes. All actions on this device are tracked, including any login attempts.

**Confidential Communications and Transaction Logs**
On April 12, 2023, client Margaret Lee conducted a high-value transaction with account number 234567890123 and routing number 987654321. The transaction involved the use of her prepaid gift card (card number 4321-8765-1234-5678). Margaret, a frequent user, has her IP address logged as 172.16.254.1. Her personal profile, created under pseudonym "userML254," reflects purchase history and previous transactions.

Confidential Medical Report:
Peter D’Souza, health record number 6789123456, is under treatment for a rare genetic disorder. His personal physician has authorized his medical data, including DNA sequencing, to be available for secure consultation. Peter's last known address is 456 Willow Drive, Lakewood, and his primary contact number is (415) 555-4567. His biometric fingerprint is stored for hospital identification.
"""

# Define the labels of expected PII/PHI entities
labels = [
    "medical_record_number",
    "date_of_birth",
    "ssn",
    "date",
    "first_name",
    "email",
    "last_name",
    "customer_id",
    "employee_id",
    "name",
    "street_address",
    "phone_number",
    "ipv4",
    "credit_card_number",
    "license_plate",
    "address",
    "user_name",
    "device_identifier",
    "bank_routing_number",
    "date_time",
    "company_name",
    "unique_identifier",
    "biometric_identifier",
    "account_number",
    "city",
    "certificate_license_number",
    "time",
    "postcode",
    "vehicle_identifier",
    "coordinate",
    "country",
    "api_key",
    "ipv6",
    "password",
    "health_plan_beneficiary_number",
    "national_id",
    "tax_id",
    "url",
    "state",
    "swift_bic",
    "cvv",
    "pin"
]

# Predict entities with a confidence threshold of 0.7
entities = model.predict_entities(text, labels, threshold=0.7)

# Display the detected entities
for entity in entities:
    print(f"{entity['text']} => {entity['label']}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/5.88k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/970 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


89 Pine Street, Hillsborough => street_address
(312) 555-0199 => phone_number
987-65-4320 => ssn
jessica.alcott@example.com => email
(212) 555-0482 => phone_number
www.michaelgreen-research.com => url
Benjamin Parker => name
4539-1482-5674-8952 => credit_card_number
021000021 => bank_routing_number
MRN1029384756 => medical_record_number
202 Oakwood Lane, Rivertown => street_address
(323) 555-8765 => phone_number
Samuel T. Wright => name
G45678901 => medical_record_number


In [None]:
!pip install llama-index

Collecting llama-index
  Downloading llama_index-0.11.23-py3-none-any.whl.metadata (11 kB)
Collecting llama-index-agent-openai<0.4.0,>=0.3.4 (from llama-index)
  Downloading llama_index_agent_openai-0.3.4-py3-none-any.whl.metadata (728 bytes)
Collecting llama-index-cli<0.4.0,>=0.3.1 (from llama-index)
  Downloading llama_index_cli-0.3.1-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.12.0,>=0.11.23 (from llama-index)
  Downloading llama_index_core-0.11.23-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.3.0,>=0.2.4 (from llama-index)
  Downloading llama_index_embeddings_openai-0.2.5-py3-none-any.whl.metadata (686 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.3.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.5.0-py3-none-any.whl.metadata (3.8 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Downloading llama_index_legacy-0.9.48.post4-py3-none-any.whl.metadata (8.5 kB)
Collecti

In [None]:
!pip install presidio-analyzer presidio-anonymizer

Collecting presidio-analyzer
  Downloading presidio_analyzer-2.2.355-py3-none-any.whl.metadata (2.9 kB)
Collecting presidio-anonymizer
  Downloading presidio_anonymizer-2.2.355-py3-none-any.whl.metadata (8.2 kB)
Collecting phonenumbers<9.0.0,>=8.12 (from presidio-analyzer)
  Downloading phonenumbers-8.13.51-py2.py3-none-any.whl.metadata (10 kB)
Collecting tldextract (from presidio-analyzer)
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting azure-core (from presidio-anonymizer)
  Downloading azure_core-1.32.0-py3-none-any.whl.metadata (39 kB)
Collecting pycryptodome>=3.10.1 (from presidio-anonymizer)
  Downloading pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting requests-file>=1.4 (from tldextract->presidio-analyzer)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading presidio_analyzer-2.2.355-py3-none-any.whl (109 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [None]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

analyzer = AnalyzerEngine(supported_languages=["en"])
results = analyzer.analyze(text=text, language='en')
engine = AnonymizerEngine()
new_text = engine.anonymize(text=text, analyzer_results=results)

from llama_index.postprocessor.presidio import PresidioPIINodePostprocessor
from llama_index import ServiceContext
from llama_index.schema import TextNode

text = """
My name is Roey Ben Chaim and my credit card number is 4095-2609-9393-4932.
My email is robo@presidio.site and I live in Amsterdam.
Have you been to a Pálmi Einarsson concert before?
What is the limit for card 4158112277712? My IBAN is GB90YNTU67299444055881.
What's your last name? Bob, it's Bob.
My great great grandfather was called Yulan Peres,
and my great great grandmother was called Jennifer Holst
I can't browse to your site, keep getting address 179.177.214.91 blocked error
Just posted a photo https://www.FilmFranchise.dk/
"""

node = TextNode(text=text)

service_context = ServiceContext.from_defaults()
processor = PresidioPIINodePostprocessor(service_context=service_context)
processor = PresidioPIINodePostprocessor()

from llama_index.schema import NodeWithScore

new_nodes = processor.postprocess_nodes([NodeWithScore(node=node)])
print(new_nodes[0].node.get_text())



KeyboardInterrupt: 

In [None]:
pip install llama-index-postprocessor-presidio

Collecting llama-index-postprocessor-presidio
  Downloading llama_index_postprocessor_presidio-0.2.0-py3-none-any.whl.metadata (794 bytes)
Downloading llama_index_postprocessor_presidio-0.2.0-py3-none-any.whl (2.9 kB)
Installing collected packages: llama-index-postprocessor-presidio
Successfully installed llama-index-postprocessor-presidio-0.2.0


In [None]:
# load documents
text = """
**Corporate Records and Client Profiles**

Client Profile:
Jessica Alcott, a valued client, resides at 89 Pine Street, Hillsborough. Her primary phone number is (312) 555-0199, and her Social Security Number is 987-65-4320. She recently updated her driver's license to ID123456789. Jessica's personal email, jessica.alcott@example.com, is stored in our confidential client database along with her IP address, 192.168.45.12, for tracking her account activity.

Dr. Michael Green, a research scientist from Newville, has been working on a confidential health-related study. His office is located at 123 Elm Avenue, Newville, where he can be reached via office phone (212) 555-0482. His passport number is A98765432, and his private website for research documentation is www.michaelgreen-research.com. Dr. Green has also authorized us to store his biometric voice print and retina scan data for secure lab access.

Finance Department Notes:
Benjamin Parker, an executive client, uses his company-issued credit card, 4539-1482-5674-8952, for business expenses. His bank account number is 123456789012, and his routing number is 021000021. Ben also has a prepaid card with number 6789-1234-5678-9012, kept on file for emergencies. For identity verification, his DNA profile and a backup of his genetic data are stored in our secure database.

Healthcare Records:
Angela Bryant, an employee under our health plan, has recently been diagnosed with a confidential health condition. Her medical record number is MRN1029384756, and her insurance policy lists her health plan beneficiary number as HPN-98765. Angela lives at 202 Oakwood Lane, Rivertown. Emergency contact is her husband, Paul Bryant, reachable at (323) 555-8765. Angela's full-face photographic image and fingerprint scan are stored to comply with health provider requirements.

Legal and Regulatory Information:
Samuel T. Wright, an employee at TechCorp, has political opinions that he keeps confidential. His citizenship status and immigration documents, including passport number G45678901, are verified by the HR department. Samuel also holds union membership with Local 342. His business profile is associated with username "techSam1234," and he has authorized browsing history analysis on his work device to ensure compliance with company policy.

Digital Interaction and Device Data:
Our company website, www.techcorp-business.com, provides a public point of contact at (555) 213-4567. However, sensitive client data is stored on secure devices, such as mobile device identifier MDID-9876543210, and encrypted IP addresses for each user session. Company policies require that device serial numbers like SN-ABC1234567 be logged for each interaction with sensitive documents.

John Smith, a business associate, often communicates with TechCorp via email at john.smith@techcorp.com. His browsing history reveals visits to sensitive research portals, and he uses a biometric facial recognition system to access confidential company data. He also receives two-factor authentication codes on his registered device.

Employment Records:
Employee records show that Sarah Johnson, job title "Senior Analyst," has a salary of $90,000 per annum. Her employee ID is SJ001234567, and she holds a driver's license number DL456789012. Her health insurance coverage includes a full dental plan, and her genetic data is on file due to company policy. Sarah’s purchase history for work-related equipment is regularly reviewed for compliance.

System Logs and Monitoring:
Corporate device 10.0.0.5 was accessed using login credentials of user "admin" on March 3, 2023. This activity was logged, and browsing history was saved for review. The network administrator has verified the authenticity of the browser cookie and recorded it for audit purposes. All actions on this device are tracked, including any login attempts.

**Confidential Communications and Transaction Logs**
On April 12, 2023, client Margaret Lee conducted a high-value transaction with account number 234567890123 and routing number 987654321. The transaction involved the use of her prepaid gift card (card number 4321-8765-1234-5678). Margaret, a frequent user, has her IP address logged as 172.16.254.1. Her personal profile, created under pseudonym "userML254," reflects purchase history and previous transactions.

Confidential Medical Report:
Peter D’Souza, health record number 6789123456, is under treatment for a rare genetic disorder. His personal physician has authorized his medical data, including DNA sequencing, to be available for secure consultation. Peter's last known address is 456 Willow Drive, Lakewood, and his primary contact number is (415) 555-4567. His biometric fingerprint is stored for hospital identification.
"""

from llama_index.core.postprocessor import (
    PIINodePostprocessor,
    NERPIINodePostprocessor,
)
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.schema import TextNode


from llama_index.postprocessor.presidio import PresidioPIINodePostprocessor

presidio_node = TextNode(text=text)

processor = PresidioPIINodePostprocessor()

from llama_index.core.schema import NodeWithScore

presidio_new_nodes = processor.postprocess_nodes(
    [NodeWithScore(node=presidio_node)]
)

# view redacted text
presidio_new_nodes[0].node.get_text()

# get mapping in metadata
# NOTE: this is not sent to the LLM!
presidio_new_nodes[0].node.metadata["__pii_node_info__"]



{'<PHONE_NUMBER_1>': '(415) 555-4567',
 '<LOCATION_1>': 'Lakewood',
 '<PERSON_1>': 'Peter',
 '<IN_PAN_1>': 'sequencing',
 '<IN_PAN_2>': 'authorized',
 '<PHONE_NUMBER_2>': '6789123456',
 '<PERSON_2>': 'Peter D’Souza',
 '<IP_ADDRESS_1>': '172.16.254.1',
 '<PERSON_3>': 'Margaret',
 '<IN_PAN_3>': '4321-8765-',
 '<US_BANK_NUMBER_1>': '987654321',
 '<US_BANK_NUMBER_2>': '234567890123',
 '<IN_PAN_4>': 'high-value',
 '<PERSON_4>': 'Margaret Lee',
 '<DATE_TIME_1>': 'April 12, 2023',
 '<DATE_TIME_2>': 'March 3, 2023',
 '<IP_ADDRESS_2>': '10.0.0.5',
 '<IN_PAN_5>': 'Monitoring',
 '<IN_PAN_6>': 'compliance',
 '<PERSON_5>': 'Sarah',
 '<MEDICAL_LICENSE_1>': 'DL4567890',
 '<PERSON_6>': 'Sarah Johnson',
 '<IN_PAN_7>': 'Employment',
 '<IN_PAN_8>': 'registered',
 '<IN_PAN_9>': 'two-factor',
 '<EMAIL_ADDRESS_1>': 'john.smith@techcorp.com',
 '<PERSON_7>': 'John Smith',
 '<IN_VOTER_1>': 'ABC1234567',
 '<UK_NHS_1>': '9876543210',
 '<IN_PAN_10>': 'identifier',
 '<PHONE_NUMBER_3>': '(555) 213-4567',
 '<URL_1>'

In [None]:
presidio_new_nodes[0].node.get_text()

'\n**Corporate Records and Client Profiles**\n\nClient Profile:\n<PERSON_18>, a valued client, resides at 89 <LOCATION_7>, <LOCATION_6>. Her primary phone number is <PHONE_NUMBER_7>, and her Social Security Number is <US_ITIN_1>. She recently updated her driver\'s license to ID123456789. <PERSON_17>\'s personal email, <EMAIL_ADDRESS_2>, is stored in our confidential client database along with her IP address, <IP_ADDRESS_3>, for tracking her account activity.\n\nDr. <PERSON_16>, a research scientist from <LOCATION_4>, has been working on a confidential health-related study. His office is located at <LOCATION_5>, <LOCATION_4>, where he can be reached via office phone <PHONE_NUMBER_6>. His passport number is <IN_PASSPORT_1>, and his private website for research documentation is <URL_2>. Dr. <PERSON_15> has also <IN_PAN_2> us to store his biometric voice print and retina scan data for secure lab access.\n\nFinance <IN_PAN_18> Notes:\n<PERSON_14>, an executive client, uses his company-issue

In [None]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
import uuid

# Step 1: Initialize Presidio Analyzer and Anonymizer
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Step 2: Define the input text containing PII
text = """
**Corporate Records and Client Profiles**

Client Profile:
Jessica Alcott, a valued client, resides at 89 Pine Street, Hillsborough. Her primary phone number is (312) 555-0199, and her Social Security Number is 987-65-4320. She recently updated her driver's license to ID123456789. Jessica's personal email, jessica.alcott@example.com, is stored in our confidential client database along with her IP address, 192.168.45.12, for tracking her account activity.

Dr. Michael Green, a research scientist from Newville, has been working on a confidential health-related study. His office is located at 123 Elm Avenue, Newville, where he can be reached via office phone (212) 555-0482. His passport number is A98765432, and his private website for research documentation is www.michaelgreen-research.com. Dr. Green has also authorized us to store his biometric voice print and retina scan data for secure lab access.

Finance Department Notes:
Benjamin Parker, an executive client, uses his company-issued credit card, 4539-1482-5674-8952, for business expenses. His bank account number is 123456789012, and his routing number is 021000021. Ben also has a prepaid card with number 6789-1234-5678-9012, kept on file for emergencies. For identity verification, his DNA profile and a backup of his genetic data are stored in our secure database.

Healthcare Records:
Angela Bryant, an employee under our health plan, has recently been diagnosed with a confidential health condition. Her medical record number is MRN1029384756, and her insurance policy lists her health plan beneficiary number as HPN-98765. Angela lives at 202 Oakwood Lane, Rivertown. Emergency contact is her husband, Paul Bryant, reachable at (323) 555-8765. Angela's full-face photographic image and fingerprint scan are stored to comply with health provider requirements.

Legal and Regulatory Information:
Samuel T. Wright, an employee at TechCorp, has political opinions that he keeps confidential. His citizenship status and immigration documents, including passport number G45678901, are verified by the HR department. Samuel also holds union membership with Local 342. His business profile is associated with username "techSam1234," and he has authorized browsing history analysis on his work device to ensure compliance with company policy.

Digital Interaction and Device Data:
Our company website, www.techcorp-business.com, provides a public point of contact at (555) 213-4567. However, sensitive client data is stored on secure devices, such as mobile device identifier MDID-9876543210, and encrypted IP addresses for each user session. Company policies require that device serial numbers like SN-ABC1234567 be logged for each interaction with sensitive documents.

John Smith, a business associate, often communicates with TechCorp via email at john.smith@techcorp.com. His browsing history reveals visits to sensitive research portals, and he uses a biometric facial recognition system to access confidential company data. He also receives two-factor authentication codes on his registered device.

Employment Records:
Employee records show that Sarah Johnson, job title "Senior Analyst," has a salary of $90,000 per annum. Her employee ID is SJ001234567, and she holds a driver's license number DL456789012. Her health insurance coverage includes a full dental plan, and her genetic data is on file due to company policy. Sarah’s purchase history for work-related equipment is regularly reviewed for compliance.

System Logs and Monitoring:
Corporate device 10.0.0.5 was accessed using login credentials of user "admin" on March 3, 2023. This activity was logged, and browsing history was saved for review. The network administrator has verified the authenticity of the browser cookie and recorded it for audit purposes. All actions on this device are tracked, including any login attempts.

**Confidential Communications and Transaction Logs**
On April 12, 2023, client Margaret Lee conducted a high-value transaction with account number 234567890123 and routing number 987654321. The transaction involved the use of her prepaid gift card (card number 4321-8765-1234-5678). Margaret, a frequent user, has her IP address logged as 172.16.254.1. Her personal profile, created under pseudonym "userML254," reflects purchase history and previous transactions.

Confidential Medical Report:
Peter D’Souza, health record number 6789123456, is under treatment for a rare genetic disorder. His personal physician has authorized his medical data, including DNA sequencing, to be available for secure consultation. Peter's last known address is 456 Willow Drive, Lakewood, and his primary contact number is (415) 555-4567. His biometric fingerprint is stored for hospital identification.
"""


# Step 3: Analyze the text to detect PII
results = analyzer.analyze(
    text=text,
    entities=[],  # Detect all supported PII entities
    language="en"  # English language model
)

# Step 4: Generate a unique mapping for anonymization
pii_mapping = {}  # Dictionary to store anonymized placeholders and original PII

# Function to generate a unique anonymized placeholder
def generate_placeholder(entity_type):
    return f"[{entity_type}_{uuid.uuid4()}]"

# Step 5: Anonymize the text and store mappings for reversibility
anonymized_text = text
for result in results:
    original_value = text[result.start:result.end]
    placeholder = generate_placeholder(result.entity_type)
    pii_mapping[placeholder] = original_value  # Store mapping for retrieval
    anonymized_text = anonymized_text.replace(original_value, placeholder)

# Step 6: Display anonymized text
print("Anonymized Text:")
print(anonymized_text)

# Step 7: Function to revert anonymized text to its original form
def revert_anonymization(anonymized_text, pii_mapping):
    reverted_text = anonymized_text
    for placeholder, original_value in pii_mapping.items():
        reverted_text = reverted_text.replace(placeholder, original_value)
    return reverted_text

# Step 8: Revert the text to its original form
reverted_text = revert_anonymization(anonymized_text, pii_mapping)

# Step 9: Display reverted text
print("\nReverted Text:")
print(reverted_text)

# Step 10: Display mappings for verification
print("\nPII Mapping (Anonymized <-> Original):")
for placeholder, original_value in pii_mapping.items():
    print(f"{placeholder}: {original_value}")




[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.




Anonymized Text:

**Corporate Records and Client Profiles**

Client Profile:
[PERSON_01de8481-405a-4a16-9a83-5f6e45c6786b], a valued client, resides at 89 [LOCATION_a174229c-8afa-4256-9864-b93398c16c2c], [LOCATION_a8070844-ca05-4c37-a232-a32af9023e95]. Her primary phone number is [PHONE_NUMBER_12d40975-5658-46a4-bb90-234d56da8b84], and her Social Security Number is [US_ITIN_b3a57aee-924a-44b0-a9c4-14b5eeca3468]. She recently updated her driver's license to ID123456789. [PERSON_ed886c65-9cff-407f-b4fb-a41895ff877c]'s personal email, [EMAIL_ADDRESS_e5f780ca-5b26-40f8-8427-b97be717d7e7], is stored in our confidential client database along with her IP address, [IP_ADDRESS_3ba5fcba-9a2e-4a6b-90bb-905fd6e14c9c], for tracking her account activity.

Dr. [PERSON_6a44df24-6099-443b-920d-b36405cf5352], a research scientist from [LOCATION_8ce85198-07c0-4e60-9f0d-7d03f4d8d047], has been working on a confidential health-related study. His office is located at [LOCATION_db2a3cb4-733d-4dda-b7a4-4cf454

In [None]:
# Install required libraries:
# pip install presidio-analyzer presidio-anonymizer transformers

from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Step 1: Initialize Presidio Analyzer and Anonymizer
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Step 2: Initialize BERT NER Pipeline
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Step 3: Input text containing PII
text = """
Jessica Alcott resides at 89 Pine Street, Hillsborough. Her phone number is (312) 555-0199,
and her Social Security Number is 987-65-4320. Email: jessica.alcott@example.com
"""

# Step 4: Analyze text with Presidio
presidio_results = analyzer.analyze(
    text=text,
    entities=[],  # Detect all supported PII entities
    language="en"  # English language model
)

# Step 5: Use BERT NER for validation
bert_results = ner_pipeline(text)

# Step 6: Combine Presidio and BERT Results
validated_results = []
for presidio_result in presidio_results:
    entity_text = text[presidio_result.start:presidio_result.end]
    # Check if BERT also classifies the text as PII
    bert_match = any(
        bert_result["word"] in entity_text and bert_result["entity"].startswith("B-")
        for bert_result in bert_results
    )
    if bert_match:
        validated_results.append(presidio_result)

# Step 7: Display validated results
print("Validated Results:")
for result in validated_results:
    print(f"Entity: {text[result.start:result.end]}, Type: {result.entity_type}")

# Step 8: Prepare anonymization requests
anonymization_requests = [
    {
        "start": result.start,
        "end": result.end,
        "entity_type": result.entity_type,
        "anonymizer": "replace",
        "new_value": "[PII]"
    }
    for result in validated_results
]

# Step 9: Perform anonymization
anonymized_text = anonymizer.anonymize(
    text=text,
    analyzer_results=validated_results,
    anonymizers_config={"default": {"type": "replace", "new_value": "[PII]"}}
)

# Step 10: Print anonymized text
print("\nAnonymized Text:")
print(anonymized_text.text)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Validated Results:


TypeError: AnonymizerEngine.anonymize() got an unexpected keyword argument 'anonymizers_config'

In [None]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import uuid

# Initialize Presidio Analyzer and Anonymizer
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Initialize BERT NER Pipeline
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Input text
text = """
**Corporate Records and Client Profiles**

Client Profile:
Jessica Alcott, a valued client, resides at 89 Pine Street, Hillsborough. Her primary phone number is (312) 555-0199, and her Social Security Number is 987-65-4320. She recently updated her driver's license to ID123456789. Jessica's personal email, jessica.alcott@example.com, is stored in our confidential client database along with her IP address, 192.168.45.12, for tracking her account activity.

Dr. Michael Green, a research scientist from Newville, has been working on a confidential health-related study. His office is located at 123 Elm Avenue, Newville, where he can be reached via office phone (212) 555-0482. His passport number is A98765432, and his private website for research documentation is www.michaelgreen-research.com. Dr. Green has also authorized us to store his biometric voice print and retina scan data for secure lab access.

Finance Department Notes:
Benjamin Parker, an executive client, uses his company-issued credit card, 4539-1482-5674-8952, for business expenses. His bank account number is 123456789012, and his routing number is 021000021. Ben also has a prepaid card with number 6789-1234-5678-9012, kept on file for emergencies. For identity verification, his DNA profile and a backup of his genetic data are stored in our secure database.

Healthcare Records:
Angela Bryant, an employee under our health plan, has recently been diagnosed with a confidential health condition. Her medical record number is MRN1029384756, and her insurance policy lists her health plan beneficiary number as HPN-98765. Angela lives at 202 Oakwood Lane, Rivertown. Emergency contact is her husband, Paul Bryant, reachable at (323) 555-8765. Angela's full-face photographic image and fingerprint scan are stored to comply with health provider requirements.

Legal and Regulatory Information:
Samuel T. Wright, an employee at TechCorp, has political opinions that he keeps confidential. His citizenship status and immigration documents, including passport number G45678901, are verified by the HR department. Samuel also holds union membership with Local 342. His business profile is associated with username "techSam1234," and he has authorized browsing history analysis on his work device to ensure compliance with company policy.

Digital Interaction and Device Data:
Our company website, www.techcorp-business.com, provides a public point of contact at (555) 213-4567. However, sensitive client data is stored on secure devices, such as mobile device identifier MDID-9876543210, and encrypted IP addresses for each user session. Company policies require that device serial numbers like SN-ABC1234567 be logged for each interaction with sensitive documents.

John Smith, a business associate, often communicates with TechCorp via email at john.smith@techcorp.com. His browsing history reveals visits to sensitive research portals, and he uses a biometric facial recognition system to access confidential company data. He also receives two-factor authentication codes on his registered device.

Employment Records:
Employee records show that Sarah Johnson, job title "Senior Analyst," has a salary of $90,000 per annum. Her employee ID is SJ001234567, and she holds a driver's license number DL456789012. Her health insurance coverage includes a full dental plan, and her genetic data is on file due to company policy. Sarah’s purchase history for work-related equipment is regularly reviewed for compliance.

System Logs and Monitoring:
Corporate device 10.0.0.5 was accessed using login credentials of user "admin" on March 3, 2023. This activity was logged, and browsing history was saved for review. The network administrator has verified the authenticity of the browser cookie and recorded it for audit purposes. All actions on this device are tracked, including any login attempts.

**Confidential Communications and Transaction Logs**
On April 12, 2023, client Margaret Lee conducted a high-value transaction with account number 234567890123 and routing number 987654321. The transaction involved the use of her prepaid gift card (card number 4321-8765-1234-5678). Margaret, a frequent user, has her IP address logged as 172.16.254.1. Her personal profile, created under pseudonym "userML254," reflects purchase history and previous transactions.

Confidential Medical Report:
Peter D’Souza, health record number 6789123456, is under treatment for a rare genetic disorder. His personal physician has authorized his medical data, including DNA sequencing, to be available for secure consultation. Peter's last known address is 456 Willow Drive, Lakewood, and his primary contact number is (415) 555-4567. His biometric fingerprint is stored for hospital identification.
"""

# Presidio Detection
presidio_results = analyzer.analyze(
    text=text,
    entities=[],  # Detect all supported PII entities
    language="en"
)
print("Presidio Results:")
for result in presidio_results:
    print(f"Entity: {text[result.start:result.end]}, Type: {result.entity_type}, Confidence: {result.score}")

# BERT NER Detection
bert_results = ner_pipeline(text)
print("\nBERT NER Results:")
for bert_result in bert_results:
    print(bert_result)

# Map BERT entities to Presidio types
bert_to_presidio_mapping = {
    "PER": "PERSON",
    "LOC": "LOCATION",
    "ORG": "ORGANIZATION",
    "MISC": "MISCELLANEOUS"
}

# Validate Presidio Results with BERT
validated_results = []
for presidio_result in presidio_results:
    entity_text = text[presidio_result.start:presidio_result.end]

    # Check if BERT identifies the same text
    bert_match = any(
        (bert_result["word"] in entity_text or entity_text in bert_result["word"]) and
        bert_to_presidio_mapping.get(bert_result["entity"].split("-")[-1], "") == presidio_result.entity_type
        for bert_result in bert_results
    )

    if bert_match:
        validated_results.append(presidio_result)

if not validated_results:
    print("\nNo validated PII detected.")
else:
    print("\nValidated PII Entities:")
    for result in validated_results:
        print(f"Entity: {text[result.start:result.end]}, Type: {result.entity_type}")

# Anonymize Validated PII
pii_mapping = {}
anonymized_text = text
for result in validated_results:
    original_value = text[result.start:result.end]
    placeholder = f"[{result.entity_type}_{uuid.uuid4()}]"
    pii_mapping[placeholder] = original_value
    anonymized_text = anonymized_text.replace(original_value, placeholder)

print("\nAnonymized Text:")
print(anonymized_text)

# Revert Anonymized Text
reverted_text = anonymized_text
for placeholder, original_value in pii_mapping.items():
    reverted_text = reverted_text.replace(placeholder, original_value)

print("\nReverted Text:")
print(reverted_text)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Presidio Results:
Entity: jessica.alcott@example.com, Type: EMAIL_ADDRESS, Confidence: 1.0
Entity: 9876543210, Type: UK_NHS, Confidence: 1.0
Entity: john.smith@techcorp.com, Type: EMAIL_ADDRESS, Confidence: 1.0
Entity: DL4567890, Type: MEDICAL_LICENSE, Confidence: 1.0
Entity: 192.168.45.12, Type: IP_ADDRESS, Confidence: 0.95
Entity: 172.16.254.1, Type: IP_ADDRESS, Confidence: 0.95
Entity: Jessica Alcott, Type: PERSON, Confidence: 0.85
Entity: Pine Street, Type: LOCATION, Confidence: 0.85
Entity: Hillsborough, Type: LOCATION, Confidence: 0.85
Entity: Jessica, Type: PERSON, Confidence: 0.85
Entity: 192.168.45.12, Type: DATE_TIME, Confidence: 0.85
Entity: Michael Green, Type: PERSON, Confidence: 0.85
Entity: Newville, Type: LOCATION, Confidence: 0.85
Entity: 123 Elm Avenue, Type: LOCATION, Confidence: 0.85
Entity: Newville, Type: LOCATION, Confidence: 0.85
Entity: www.michaelgreen-research.com, Type: URL, Confidence: 0.85
Entity: Green, Type: PERSON, Confidence: 0.85
Entity: Benjamin Park

In [None]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import uuid
import re

# Initialize Presidio Analyzer and Anonymizer
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Initialize BERT NER Pipeline
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Input text containing PII
text = """
Jessica Alcott, a valued client, resides at 89 Pine Street, Hillsborough. Her primary phone number is (312) 555-0199, and her Social Security Number is 987-65-4320. She recently updated her driver's license to ID123456789. Jessica's personal email, jessica.alcott@example.com, is stored in our confidential client database along with her IP address, 192.168.45.12, for tracking her account activity.

Dr. Michael Green, a research scientist from Newville, has been working on a confidential health-related study. His office is located at 123 Elm Avenue, Newville, where he can be reached via office phone (212) 555-0482. His passport number is A98765432, and his private website for research documentation is www.michaelgreen-research.com.
"""

# Step 1: Split text into sentences
sentences = re.split(r'(?<=[.!?])\s+', text)

# Step 2: Use BERT to identify sentences with PII
pii_sentences = []
for sentence in sentences:
    bert_results = ner_pipeline(sentence)
    print(f"\nBERT Results for Sentence: {sentence}")
    for bert_result in bert_results:
        print(bert_result)

    # If BERT detects any entity, flag the sentence as containing PII
    if any(entity["entity"].startswith("B-") for entity in bert_results):
        pii_sentences.append(sentence)

print("\nSentences with PII detected by BERT:")
for sent in pii_sentences:
    print(f"- {sent}")

# Step 3: Anonymize the detected sentences with Presidio
pii_mapping = {}
anonymized_sentences = {}

for sentence in pii_sentences:
    presidio_results = analyzer.analyze(
        text=sentence,
        entities=[],  # Detect all supported PII entities
        language="en"
    )
    anonymized_sentence = sentence
    for result in presidio_results:
        original_value = sentence[result.start:result.end]
        placeholder = f"[{result.entity_type}_{uuid.uuid4()}]"
        pii_mapping[placeholder] = original_value
        anonymized_sentence = anonymized_sentence.replace(original_value, placeholder)
    anonymized_sentences[sentence] = anonymized_sentence

# Step 4: Reassemble the text with anonymized sentences
anonymized_text = text
for original_sentence, anonymized_sentence in anonymized_sentences.items():
    anonymized_text = anonymized_text.replace(original_sentence, anonymized_sentence)

# Step 5: Display Anonymized Text
print("\nAnonymized Text:")
print(anonymized_text)

# Step 6: Revert Anonymized Text to Original
def revert_anonymization(anonymized_text, pii_mapping):
    reverted_text = anonymized_text
    for placeholder, original_value in pii_mapping.items():
        reverted_text = reverted_text.replace(placeholder, original_value)
    return reverted_text

reverted_text = revert_anonymization(anonymized_text, pii_mapping)

# Step 7: Display Reverted Text
print("\nReverted Text:")
print(reverted_text)

# Step 8: Display PII Mapping for Verification
print("\nPII Mapping (Anonymized <-> Original):")
for placeholder, original_value in pii_mapping.items():
    print(f"{placeholder}: {original_value}")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



BERT Results for Sentence: 
Jessica Alcott, a valued client, resides at 89 Pine Street, Hillsborough.
{'entity': 'I-PER', 'score': 0.99930143, 'index': 1, 'word': 'Jessica', 'start': 1, 'end': 8}
{'entity': 'I-PER', 'score': 0.9995511, 'index': 2, 'word': 'Al', 'start': 9, 'end': 11}
{'entity': 'I-PER', 'score': 0.9986241, 'index': 3, 'word': '##cott', 'start': 11, 'end': 15}
{'entity': 'I-LOC', 'score': 0.80686444, 'index': 12, 'word': 'Pine', 'start': 48, 'end': 52}
{'entity': 'I-LOC', 'score': 0.8432262, 'index': 13, 'word': 'Street', 'start': 53, 'end': 59}
{'entity': 'I-LOC', 'score': 0.9527151, 'index': 15, 'word': 'Hills', 'start': 61, 'end': 66}
{'entity': 'I-LOC', 'score': 0.92371565, 'index': 16, 'word': '##borough', 'start': 66, 'end': 73}

BERT Results for Sentence: Her primary phone number is (312) 555-0199, and her Social Security Number is 987-65-4320.
{'entity': 'I-MISC', 'score': 0.9160111, 'index': 18, 'word': 'Social', 'start': 52, 'end': 58}
{'entity': 'I-MISC', 's

In [None]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re
import uuid

# Initialize Presidio Analyzer and Anonymizer
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Initialize BERT NER Pipeline
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Input text containing PII
text = """
Jessica Alcott, a valued client, resides at 89 Pine Street, Hillsborough. Her primary phone number is (312) 555-0199, and her Social Security Number is 987-65-4320. She recently updated her driver's license to ID123456789. Jessica's personal email, jessica.alcott@example.com, is stored in our confidential client database along with her IP address, 192.168.45.12, for tracking her account activity.

Dr. Michael Green, a research scientist from Newville, has been working on a confidential health-related study. His office is located at 123 Elm Avenue, Newville, where he can be reached via office phone (212) 555-0482. His passport number is A98765432, and his private website for research documentation is www.michaelgreen-research.com.
"""

# Step 1: Split text into sentences
sentences = re.split(r'(?<=[.!?])\s+', text)

# Step 2: Use BERT to identify sentences with PII
pii_sentences = []
for sentence in sentences:
    bert_results = ner_pipeline(sentence)
    print(f"\nBERT Results for Sentence: {sentence}")
    for bert_result in bert_results:
        print(bert_result)

    # If BERT detects any entity, flag the sentence as containing PII
    if any(entity["entity"].startswith("B-") for entity in bert_results):
        pii_sentences.append(sentence)

print("\nSentences with PII detected by BERT:")
for sent in pii_sentences:
    print(f"- {sent}")

# Step 3: Anonymize the detected sentences with Presidio
pii_mapping = {}
anonymized_sentences = {}

for sentence in pii_sentences:
    presidio_results = analyzer.analyze(
        text=sentence,
        entities=[],  # Detect all supported PII entities
        language="en"
    )

    # Debug: Print Presidio results
    print(f"\nPresidio Results for Sentence: {sentence}")
    for result in presidio_results:
        print(f"Entity: {sentence[result.start:result.end]}, Type: {result.entity_type}, Confidence: {result.score}")

    # Perform anonymization
    anonymized_sentence = sentence
    for result in presidio_results:
        original_value = sentence[result.start:result.end]
        placeholder = f"[{result.entity_type}_{uuid.uuid4()}]"
        pii_mapping[placeholder] = original_value
        anonymized_sentence = anonymized_sentence.replace(original_value, placeholder)
    anonymized_sentences[sentence] = anonymized_sentence

# Step 4: Reassemble the text with anonymized sentences
anonymized_text = text
for original_sentence, anonymized_sentence in anonymized_sentences.items():
    anonymized_text = anonymized_text.replace(original_sentence, anonymized_sentence)

# Step 5: Display Anonymized Text
print("\nAnonymized Text:")
print(anonymized_text)

# Step 6: Revert Anonymized Text to Original
def revert_anonymization(anonymized_text, pii_mapping):
    reverted_text = anonymized_text
    for placeholder, original_value in pii_mapping.items():
        reverted_text = reverted_text.replace(placeholder, original_value)
    return reverted_text

reverted_text = revert_anonymization(anonymized_text, pii_mapping)

# Step 7: Display Reverted Text
print("\nReverted Text:")
print(reverted_text)

# Step 8: Display PII Mapping for Verification
print("\nPII Mapping (Anonymized <-> Original):")
for placeholder, original_value in pii_mapping.items():
    print(f"{placeholder}: {original_value}")


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



BERT Results for Sentence: 
Jessica Alcott, a valued client, resides at 89 Pine Street, Hillsborough.
{'entity': 'I-PER', 'score': 0.99930143, 'index': 1, 'word': 'Jessica', 'start': 1, 'end': 8}
{'entity': 'I-PER', 'score': 0.9995511, 'index': 2, 'word': 'Al', 'start': 9, 'end': 11}
{'entity': 'I-PER', 'score': 0.9986241, 'index': 3, 'word': '##cott', 'start': 11, 'end': 15}
{'entity': 'I-LOC', 'score': 0.80686444, 'index': 12, 'word': 'Pine', 'start': 48, 'end': 52}
{'entity': 'I-LOC', 'score': 0.8432262, 'index': 13, 'word': 'Street', 'start': 53, 'end': 59}
{'entity': 'I-LOC', 'score': 0.9527151, 'index': 15, 'word': 'Hills', 'start': 61, 'end': 66}
{'entity': 'I-LOC', 'score': 0.92371565, 'index': 16, 'word': '##borough', 'start': 66, 'end': 73}

BERT Results for Sentence: Her primary phone number is (312) 555-0199, and her Social Security Number is 987-65-4320.
{'entity': 'I-MISC', 'score': 0.9160111, 'index': 18, 'word': 'Social', 'start': 52, 'end': 58}
{'entity': 'I-MISC', 's