<a href="https://colab.research.google.com/github/midhunjmes/presidio_final/blob/main/final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install presidio_analyzer
!pip install faker
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
pip install flair



In [4]:
import gzip
import time
import pandas as pd
import json
import re
import os
from collections import defaultdict
import random
from faker import Faker
from presidio_analyzer import AnalyzerEngine
from flair.data import Sentence
from flair.models import SequenceTagger
# Flair sequence tagger; analyze_column uses its 'ner' spans to count ORG hits.
tagger=SequenceTagger.load("flair/ner-english")
# Presidio analyzer; analyze_column calls analyzer.analyze() on sampled cell values.
analyzer = AnalyzerEngine()
# Faker instance; not referenced by any other code visible in this notebook.
fake=Faker()
# The gzipped JSON is iterated below as a list of {category: iterable-of-values} dicts.
with gzip.open("faker_data.json.gz", "rt", encoding="utf-8") as f:
    fake_data_list = json.load(f)
# Pool of candidate fake values per category (get_fake_value pops from these sets).
# If a category key appears in more than one list entry, the later entry replaces
# the earlier one rather than being merged with it.
fake_data={}
for data in fake_data_list:
    for key,value in data.items():
        fake_data[key]=set(value)

# Fake-data category -> entity label. mask_dataframe/unmask_dataframe pick the
# category whose label equals the entity detected for a column.
entity_mapping={
    'names':'PERSON',
    'emails':'EMAIL_ADDRESS',
    'phone':'PHONE_NUMBER',
    'location':'LOCATION',
    'credit':'CREDIT_CARD',
    'url':'URL',
    'country':'COUNTRY',
    'company':"ORG"
    # 'id':'ID',
}

# forward_mapping[category][original] -> fake value
# reverse_mapping[category][fake value] -> original
mapping_file="mapping.json"
forward_mapping=defaultdict(dict)
reverse_mapping=defaultdict(dict)

# Reload the tables written by save_mapping(), if the file exists, so that
# get_fake_value returns the already-assigned fake for a known original.
if os.path.exists(mapping_file):
    with open(mapping_file, "r") as f:
        mapping_data = json.load(f)
        forward_mapping.update(mapping_data.get("forward_mapping", {}))
        reverse_mapping.update(mapping_data.get("reverse_mapping", {}))
def time_it(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function keeps its own ``__name__``/``__doc__`` (via
    ``functools.wraps``) so tracebacks, ``help()`` and nested decorators see
    the real function rather than ``wrapper``.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all arguments to ``func``, prints the elapsed
        time after a successful call, and returns ``func``'s result. If
        ``func`` raises, the exception propagates and nothing is printed.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high resolution; time.time() can jump
        # when the system clock is adjusted.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print(f'\n⏳ Execution time {func.__name__}: {elapsed:.6f} seconds')
        return result
    return wrapper

@time_it
def analyze_column(df, sample_size=25):
    """Guess the predominant entity type of each DataFrame column.

    Columns are classified in two passes:

    1. By name: a column whose lower-cased name contains ``"id"`` is labelled
       ``"ID"``; one containing ``"country"`` is labelled ``"COUNTRY"``
       (``"COUNTRY"`` wins when both match).
    2. By content, for the remaining columns: up to ``sample_size`` unique
       non-null values are passed to the Presidio ``analyzer`` and the Flair
       ``tagger``. Every Presidio result and every Flair span tagged ``ORG``
       counts as one vote, and the label with the most votes is assigned.

    Columns that receive no votes are left out of the result.

    Args:
        df: DataFrame whose columns should be classified.
        sample_size: Maximum number of unique values inspected per column.

    Returns:
        dict mapping column name -> entity label.
    """
    entity_columns = {}

    # Pass 1: name-based rules. str() keeps this working for non-string
    # column labels (e.g. integer headers).
    # NOTE(review): this is a plain substring test, so names such as "width"
    # or "valid" are also labelled "ID" -- confirm that is intended.
    for col in df.columns:
        lowered = str(col).lower()
        if "id" in lowered:
            entity_columns[col] = "ID"
        if "country" in lowered:
            entity_columns[col] = "COUNTRY"

    # Pass 2: content-based detection.
    for col in df.columns:
        if col in entity_columns:
            # Already decided by name. The NLP result would be discarded, so
            # skip the expensive analyzer and tagger calls.
            continue

        sample_values = df[col].dropna().astype(str).unique()[:sample_size]
        entity_counts = {}

        for value in sample_values:
            for result in analyzer.analyze(text=value, language="en"):
                entity_counts[result.entity_type] = entity_counts.get(result.entity_type, 0) + 1

            sentence = Sentence(value)
            tagger.predict(sentence)
            for span in sentence.get_spans('ner'):
                if span.tag == "ORG":
                    entity_counts['ORG'] = entity_counts.get('ORG', 0) + 1

        if entity_counts:
            entity_columns[col] = max(entity_counts, key=entity_counts.get)

    return entity_columns

def get_fake_value(category, original_value):
    """Return the fake stand-in for ``original_value`` within ``category``.

    A value that was mapped before gets the same fake value again. Otherwise
    a new fake value is chosen, recorded in both ``forward_mapping`` and
    ``reverse_mapping``, and returned.

    The chosen fake value is always unique within the category. Without that
    guarantee two originals could share one fake value, the later one would
    overwrite the ``reverse_mapping`` entry, and unmasking would restore the
    wrong original. Collisions can otherwise occur when

    * ``fake_data`` was reloaded in full while mappings from an earlier run
      were loaded from disk, so the pool still holds values already handed
      out, or
    * the pool is exhausted and the "doubled" fallback picks the same base
      value twice.

    Args:
        category: Key into ``fake_data`` / the mapping tables (e.g. ``"names"``).
        original_value: The real value to be replaced.

    Returns:
        The fake value assigned to ``original_value``.
    """
    assigned = forward_mapping[category]   # original -> fake
    if original_value in assigned:
        return assigned[original_value]

    taken = reverse_mapping[category]      # fake -> original
    fake_value = None

    # Preferred source: the pre-generated pool. Discard entries that are
    # already in use as a fake value for some other original.
    pool = fake_data.get(category)
    while pool:
        candidate = pool.pop()
        if candidate not in taken:
            fake_value = candidate
            break

    if fake_value is None:
        if assigned:
            # Pool exhausted: derive a new value by doubling an existing fake.
            base = random.choice(list(assigned.values()))
            fake_value = f"{base} {base}"
        else:
            # Nothing to derive from: last-resort placeholder.
            fake_value = f"UNKNOWN_{random.randint(1000, 9999)}"

        # Make the derived value unique by appending a numeric suffix.
        stem, attempt = fake_value, 2
        while fake_value in taken:
            fake_value = f"{stem}_{attempt}"
            attempt += 1

    # Record both directions so the value can be restored later.
    assigned[original_value] = fake_value
    taken[fake_value] = original_value

    return fake_value
@time_it
def mask_dataframe(df):
    """Swap real values for fake ones in every column with a known category.

    Iterates the module-level ``entity_columns`` ({column: entity label}). For
    each column, the first ``entity_mapping`` key whose label equals the
    column's entity is used as the fake-data category; columns without such a
    key are left untouched. Cells are converted to ``str`` first, and empty
    strings are passed through unchanged. ``df`` is modified in place and
    also returned.
    """
    for column_name, entity_label in entity_columns.items():
        # Find the first category mapped to this entity label.
        for category, mapped_label in entity_mapping.items():
            if mapped_label == entity_label:
                break
        else:
            continue  # no category for this entity

        def _mask_cell(cell, _category=category):
            if cell:
                return get_fake_value(_category, str(cell))
            return str(cell)

        as_text = df[column_name].astype(str)
        df[column_name] = as_text.apply(_mask_cell)
    return df
def restore_original_value(category, fake_value):
    """Look up the original behind ``fake_value``; return it unchanged if unknown."""
    lookup = reverse_mapping[category]
    if fake_value in lookup:
        return lookup[fake_value]
    return fake_value
@time_it
def unmask_dataframe(df):
    """Reverse ``mask_dataframe``: put the original values back.

    Uses the module-level ``entity_columns`` to decide which columns to
    process, and the first ``entity_mapping`` key whose label matches the
    column's entity as the category. Cells are converted to ``str``; empty
    strings are passed through, and every other cell goes through
    ``restore_original_value``. ``df`` is modified in place and also returned.
    """
    for column_name, entity_label in entity_columns.items():
        # Find the first category mapped to this entity label.
        for category, mapped_label in entity_mapping.items():
            if mapped_label == entity_label:
                break
        else:
            continue  # nothing was masked for this entity type

        def _restore_cell(cell, _category=category):
            if cell:
                return restore_original_value(_category, str(cell))
            return str(cell)

        as_text = df[column_name].astype(str)
        df[column_name] = as_text.apply(_restore_cell)

    return df
def compare_files(original_file, restored_file):
    """Load both files and report whether their DataFrames are equal.

    The reader is chosen from the extension of ``original_file`` only
    (``.xlsx`` -> ``pd.read_excel``, anything else -> ``pd.read_csv``) and is
    used for both files. Prints the verdict and returns it.
    """
    _, extension = os.path.splitext(original_file)
    reader = pd.read_excel if extension.lower() == ".xlsx" else pd.read_csv

    original_df = reader(original_file)
    restored_df = reader(restored_file)
    is_identical = original_df.equals(restored_df)

    verdict = '✅ Yes' if is_identical else '❌ No'
    print(f"📊 Are files identical? {verdict}")
    if is_identical:
        return is_identical

    print("⚠️ The restored file does not match the original. There may be an issue with the mapping.")
    return is_identical
def de_anonymize_paragraph(text, mapping=None):
    """Replace every known fake value in ``text`` with its original.

    All replacements happen in a single left-to-right pass, and longer fake
    values are tried before shorter ones. This avoids two problems of
    sequential ``str.replace`` calls:

    * a short fake value that is a substring of a longer one (e.g.
      ``"Rodriguezmouth"`` inside ``"New Rodriguezmouth"``) can no longer
      clobber part of the longer match, and
    * text that was just restored is never scanned again, so an original
      value that happens to equal some other fake value is left alone.

    Empty or non-string fake values are ignored. If the same fake value
    occurs in several categories, the first category wins.

    Args:
        text: Paragraph possibly containing fake values.
        mapping: Optional ``{category: {fake_value: original_value}}``.
            Defaults to the module-level ``reverse_mapping``.

    Returns:
        ``text`` with fake values replaced; returned unchanged when it is
        empty or contains no known fake value.
    """
    if mapping is None:
        mapping = reverse_mapping
    if not text:
        return text

    # Collect only the fake values that actually occur in the text. The
    # mapping may be very large, so this keeps the regex below tiny.
    present = {}
    for fake_to_original in mapping.values():
        for fake_value, original_value in fake_to_original.items():
            if isinstance(fake_value, str) and fake_value and fake_value in text:
                present.setdefault(fake_value, str(original_value))

    if not present:
        return text

    # Longest first, so the alternation prefers the most specific match.
    ordered = sorted(present, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(fake_value) for fake_value in ordered))
    return pattern.sub(lambda match: present[match.group(0)], text)
def save_mapping():
    """Persist ``forward_mapping`` and ``reverse_mapping`` to ``mapping_file`` as indented JSON."""
    payload = dict(
        forward_mapping=forward_mapping,
        reverse_mapping=reverse_mapping,
    )
    with open(mapping_file, "w") as handle:
        json.dump(payload, handle, indent=4)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


2025-04-01 10:38:23,344 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>




In [6]:
if __name__=="__main__":
    # End-to-end round trip: detect entity columns, mask, save, unmask, compare.
    input_file="smaller_100k_companies.csv"
    file_ext=os.path.splitext(input_file)[-1].lower()
    is_excel = file_ext == ".xlsx"

    # Read every column as text so values are masked exactly as written.
    if is_excel:
        df = pd.read_excel(input_file, dtype=str)
    else:
        df = pd.read_csv(input_file, dtype=str, low_memory=False)

    # Must stay a module-level name: mask_dataframe and unmask_dataframe read
    # `entity_columns` as a global.
    entity_columns=analyze_column(df)
    print(entity_columns)

    anonymized_df=mask_dataframe(df)
    output_file="anonymized.xlsx" if is_excel else "anonymized.csv"
    if is_excel:
        anonymized_df.to_excel(output_file,index=False)
    else:
        anonymized_df.to_csv(output_file,index=False)
    print(f"✅ Anonymized data saved as {output_file}")

    save_mapping()

    # Re-read the masked file as text too. With dtype inference, fake values
    # that look numeric (phone numbers, card numbers, leading zeros) would be
    # coerced to int/float and no longer match the keys in reverse_mapping.
    if is_excel:
        masked_df = pd.read_excel(output_file, dtype=str)
    else:
        masked_df = pd.read_csv(output_file, dtype=str, low_memory=False)
    restored_df=unmask_dataframe(masked_df)

    restored_file="restored.xlsx" if is_excel else "restored.csv"
    if is_excel:
        restored_df.to_excel(restored_file,index=False)
    else:
        restored_df.to_csv(restored_file,index=False)
    print(f"✅ Restored data saved as {restored_file}")
    compare_files(input_file, restored_file)


⏳ Execution time analyze_column: 5.461540 seconds
{'id': 'ID', 'country': 'COUNTRY', 'name': 'ORG', 'domain': 'URL', 'size range': 'DATE_TIME', 'locality': 'LOCATION', 'linkedin url': 'URL', 'current employee estimate': 'US_DRIVER_LICENSE', 'total employee estimate': 'US_DRIVER_LICENSE'}

⏳ Execution time mask_dataframe: 0.267453 seconds
✅ Anonymized data saved as anonymized.csv

⏳ Execution time unmask_dataframe: 0.206680 seconds
✅ Restored data saved as restored.csv
📊 Are files identical? ✅ Yes


In [11]:
# Sample paragraph containing anonymized (fake) values; it is passed to
# de_anonymize_paragraph() in the following cell.
text='''In the bustling city of Port Gabriela, Massachusetts, a tech giant has emerged as a leader in the information technology and services sector. Founded in 1911, the company has grown to a workforce of over 10,001 employees and now operates in North Daniel, Alabama Province District, Micronesia. Their official website, https://www.gonzalez.com/.org.com.in, has become a hub for industry insights.

Meanwhile, in Norrisfort, Ohio, another major player, established in 1968, continues to innovate. This company, headquartered in New Rodriguezmouth, Delaware, Lao People's Democratic Republic, maintains a strong online presence through http://chen-graham.info/ and https://www.molina.com/.co.in.

Across the country, Upper North Larry, Rhode Island, has been home to pioneers since 1989. One such organization, deeply rooted in the IT sector, employs over 10,001+ professionals and has a strategic base in Greater Andersonland, Oklahoma, Guatemala. Their online platform, http://www.gutierrez.com/.net.info.net, serves thousands of users daily.

However, history runs even deeper in Dawsonhaven, South Dakota, where one of the oldest military establishments, founded in 1800, stands strong. With headquarters in North Nicholetown, Kansas, Micronesia, this institution leverages technology through http://www.jackson-carson.biz/ and http://davis.biz/.info.co.info.'''

In [13]:
# Restore the original values in the sample paragraph; as the cell's last
# expression, the returned string is displayed as the cell output.
de_anonymize_paragraph(text)

'In the bustling city of ibm, a tech giant has emerged as a leader in the information technology and services sector. Founded in 1911, the company has grown to a workforce of over 10,001 employees and now operates in new york, new york, united states, united states. Their official website, ibm.com, has become a hub for industry insights.\n\nMeanwhile, in tata consultancy services, another major player, established in 1968, continues to innovate. This company, headquartered in New bidvest logistics, india, maintains a strong online presence through tcs.com and linkedin.com/company/tata-consultancy-services.\n\nAcross the country, accenture, has been home to pioneers since 1989. One such organization, deeply rooted in the IT sector, employs over 10,001+ professionals and has a strategic base in dublin, dublin, ireland, ireland. Their online platform, accenture.com, serves thousands of users daily.\n\nHowever, history runs even deeper in us army, where one of the oldest military establish