<a href="https://colab.research.google.com/github/midhun-james/validation_module/blob/main/json_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install presidio_analyzer


Collecting presidio_analyzer
  Downloading presidio_analyzer-2.2.358-py3-none-any.whl.metadata (3.2 kB)
Collecting phonenumbers<9.0.0,>=8.12 (from presidio_analyzer)
  Downloading phonenumbers-8.13.55-py2.py3-none-any.whl.metadata (11 kB)
Collecting tldextract (from presidio_analyzer)
  Downloading tldextract-5.2.0-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract->presidio_analyzer)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading presidio_analyzer-2.2.358-py3-none-any.whl (114 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading phonenumbers-8.13.55-py2.py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tldextract-5.2.0-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[

In [24]:
import gzip
import json
from presidio_analyzer import AnalyzerEngine
from collections import defaultdict

# Load your fake data (Assuming it's stored as a dictionary)
with gzip.open("faker_data_v2.json.gz", "rt",encoding="utf-8") as f:
    fake_data_list = json.load(f)
fake_data = {}
for d in fake_data_list:
    fake_data.update(d)

# Entity mapping from detected entity types to fake data keys
entity_mapping = {
    'PERSON': 'names',
    'EMAIL_ADDRESS': 'emails',
    'PHONE_NUMBER': 'phone',
    'LOCATION': 'location',
    'CREDIT_CARD': 'credit',
    'URL': 'url',
    'COUNTRY': 'country',
    'ORG': 'company',
    'ID': 'id',
}

# Initialize Presidio Analyzer
analyzer = AnalyzerEngine()

# Dictionary to store mappings for consistency
mappings = defaultdict(dict)

def get_fake_value(entity, original_value):
    """Retrieve a fake value while ensuring consistency."""
    mapped_entity = entity_mapping.get(entity)  # Map detected entity to fake data key
    if not mapped_entity or mapped_entity not in fake_data:
        return "[REDACTED]"  # Default if no mapping exists

    if original_value in mappings[mapped_entity]:
        return mappings[mapped_entity][original_value]  # Return existing mapping

    if fake_data[mapped_entity]:  # Check if fake data is available
        fake_value = fake_data[mapped_entity].pop(0)  # Get a fake value
        fake_data[mapped_entity].append(fake_value)  # Recycle value for future use
        mappings[mapped_entity][original_value] = fake_value  # Store mapping
        return fake_value

    return "[REDACTED]"  # Fallback

def anonymize_json(data):
    """Recursively anonymize sensitive data in JSON."""
    if isinstance(data, dict):
        return {key: anonymize_json(value) for key, value in data.items()}

    elif isinstance(data, list):
        return [anonymize_json(item) for item in data]

    elif isinstance(data, str):
        results = analyzer.analyze(text=data, entities=list(entity_mapping.keys()), language="en")

        if not results:
            return data

        # Sort results in reverse to avoid messing up indexes
        results = sorted(results, key=lambda x: x.start, reverse=True)

        for result in results:
            entity_type = result.entity_type
            sensitive_text = data[result.start:result.end]
            fake_value = get_fake_value(entity_type, sensitive_text)

            # Replace exact span safely
            data = data[:result.start] + fake_value + data[result.end:]

        return data

    else:
        return data





In [29]:
input_data = {
    "name": "Alice Johnson",
    "email": "alice.johnson@example.com",
    "phone": "+14155552671",
    "location": "San Francisco",
    "company": "Microsoft",
    "credit_card": "4111 1111 1111 1111",
    "url": "https://www.linkedin.com/in/alicejohnson",
    "country": "United States",
    "id": "AB1234567",
    "profile": {
        "bio": "Alice Johnson is a software engineer living in San Francisco. Contact her at alice.johnson@example.com or call +14155552671.",
        "social": ["https://twitter.com/alicejohnson", "https://facebook.com/alicejohnson"]
    }
}



In [30]:
anonymize_json(input_data)



{'name': 'Mrs. Theresa Williams',
 'email': 'chavez-solomonael@example.com',
 'phone': '001-377-755-7593x2399',
 'location': 'Port Brett, Delaware Region',
 'company': 'Microsoft',
 'credit_card': '6528835898351118',
 'url': 'roman-rogers',
 'country': 'Jefferyburgh, South Dakota',
 'id': 'AB1234567',
 'profile': {'bio': 'Mrs. Theresa Williams is a software engineer living in Port Brett, Delaware Region. Contact her at jacksonley@example.comll 001-377-755-7593x2399.',
  'social': ['duran', 'harrell-hawkins']}}