In [30]:
pip install spacy-layout

Collecting spacy-layout
  Downloading spacy_layout-0.0.12-py2.py3-none-any.whl.metadata (14 kB)
Collecting docling>=2.5.2 (from spacy-layout)
  Downloading docling-2.36.1-py3-none-any.whl.metadata (10 kB)
Collecting docling-core<3.0.0,>=2.29.0 (from docling-core[chunking]<3.0.0,>=2.29.0->docling>=2.5.2->spacy-layout)
  Downloading docling_core-2.36.0-py3-none-any.whl.metadata (6.5 kB)
Collecting docling-ibm-models<4.0.0,>=3.4.4 (from docling>=2.5.2->spacy-layout)
  Downloading docling_ibm_models-3.4.4-py3-none-any.whl.metadata (6.4 kB)
Collecting docling-parse<5.0.0,>=4.0.0 (from docling>=2.5.2->spacy-layout)
  Downloading docling_parse-4.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling>=2.5.2->spacy-layout)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2<5.0.0,>=4.30.0 (from docling>=2.5.2->spacy-layout)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.

In [2]:
pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m91.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.1


In [6]:
import spacy
import fitz  # PyMuPDF
import re
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Span

def setup_nlp():
    """Build the spaCy pipeline used for anonymization.

    Loads ``en_core_web_sm`` and adds a ``custom_entity_matcher`` component
    right after the statistical ``ner`` step. That component merges ``PHONE``,
    ``EMAIL`` and ``ADDRESS`` spans with the entities predicted by the model.

    Returns:
        The configured ``Language`` pipeline.
    """
    nlp = spacy.load("en_core_web_sm")

    # Phone numbers and e-mail addresses are searched in the raw document text.
    # A token-level REGEX pattern only ever sees one token at a time, so a
    # number written as "(555) 123-4567" (several tokens) could never match.
    text_patterns = [
        # Optional "+1" prefix, optional parentheses around the area code, and
        # digit guards so the match cannot start or end inside a longer number.
        ("PHONE", re.compile(
            r"(?<!\d)(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)"
        )),
        # The TLD class is [A-Za-z]; "[A-Z|a-z]" would also accept a literal "|".
        ("EMAIL", re.compile(
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
        )),
    ]

    # Street addresses: <number> <one or more words> <street-type suffix>.
    # The suffix regex is anchored because the matcher applies REGEX with
    # search semantics; unanchored, "st" would match "costs" and "way" would
    # match "always". A single "+" already covers multi-word street names.
    address_matcher = Matcher(nlp.vocab)
    street_suffix = (
        r"(?i)^(?:street|st\.?|road|rd\.?|avenue|ave\.?|boulevard|blvd\.?"
        r"|drive|dr\.?|lane|ln\.?|way|court|ct\.?)$"
    )
    address_matcher.add("ADDRESS", [[
        {"LIKE_NUM": True},
        {"IS_ALPHA": True, "OP": "+"},
        {"TEXT": {"REGEX": street_suffix}},
    ]])

    @Language.component("custom_entity_matcher")
    def custom_entity_matcher(doc):
        """Add PHONE / EMAIL / ADDRESS spans to ``doc.ents``."""
        custom_spans = []

        # Character-level matches, snapped outwards to token boundaries so a
        # partially covered token is redacted whole rather than skipped.
        for label, pattern in text_patterns:
            for found in pattern.finditer(doc.text):
                span = doc.char_span(
                    found.start(), found.end(),
                    label=label, alignment_mode="expand",
                )
                if span is not None:
                    custom_spans.append(span)

        # Token-level address matches.
        for _match_id, start, end in address_matcher(doc):
            custom_spans.append(Span(doc, start, end, label="ADDRESS"))

        # doc.ents rejects overlapping spans, so every overlap (not only full
        # containment) has to be resolved. filter_spans keeps the longest span
        # and, on ties, the one listed first; custom spans are listed first so
        # their specific labels win over a generic NER label on the same span.
        doc.ents = spacy.util.filter_spans(custom_spans + list(doc.ents))
        return doc

    nlp.add_pipe("custom_entity_matcher", after="ner")
    return nlp

def get_entity_placeholder(label):
    """Return the bracketed token that stands in for an entity of type ``label``.

    Known labels map to a friendly placeholder; any other label is wrapped in
    square brackets unchanged.
    """
    known = dict([
        ("PERSON", "[NAME]"),
        ("ORG", "[ORGANIZATION]"),
        ("PHONE", "[PHONE]"),
        ("ADDRESS", "[ADDRESS]"),
        ("EMAIL", "[EMAIL]"),
        # Geopolitical entities and plain locations share one placeholder.
        ("GPE", "[LOCATION]"),
        ("LOC", "[LOCATION]"),
    ])
    if label in known:
        return known[label]
    return f"[{label}]"

def anonymize_text(text, nlp):
    """Replace every entity ``nlp`` finds in ``text`` with a placeholder.

    Args:
        text: The string to anonymize.
        nlp: A callable that returns an object exposing ``ents``; each entity
            provides ``text``, ``label_``, ``start_char`` and ``end_char``.

    Returns:
        A ``(anonymized_text, replacements)`` tuple. ``replacements`` holds one
        dict per substituted entity, ordered from the last entity in the text
        to the first.
    """
    parsed = nlp(text)
    redacted = text
    log = []

    # Walk right-to-left so offsets of earlier entities stay valid after
    # each splice.
    for entity in sorted(parsed.ents, key=lambda e: e.start_char, reverse=True):
        # Entities shorter than two characters are treated as noise.
        if len(entity.text.strip()) >= 2:
            token = get_entity_placeholder(entity.label_)
            begin, finish = entity.start_char, entity.end_char

            log.append({
                "original": entity.text,
                "placeholder": token,
                "start": begin,
                "end": finish,
                "label": entity.label_,
            })

            redacted = "".join((redacted[:begin], token, redacted[finish:]))

    return redacted, log

def anonymize_pdf(input_pdf_path, output_pdf_path, nlp=None):
    """Write an anonymized copy of a PDF.

    Every text span is run through ``anonymize_text``. Spans in which entities
    were found are redacted (their text is removed from the page, not merely
    painted over) and the placeholder text is written in their place.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the anonymized PDF is saved to.
        nlp: Optional pipeline to reuse across calls; when omitted a new one
            is built with ``setup_nlp()``.

    Note:
        The progress log printed here includes the original entity text.
    """
    if nlp is None:
        nlp = setup_nlp()

    pdf_document = fitz.open(input_pdf_path)
    try:
        print(f"Processing PDF with {len(pdf_document)} pages...")

        for page in pdf_document:
            # (insertion point, replacement text, font size, font name) for
            # every span redacted on this page.
            pending = []

            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:  # Skip image blocks
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        original_text = span["text"]

                        # Skip if text is too short or only whitespace
                        if len(original_text.strip()) < 2:
                            continue

                        anonymized_text, replacements = anonymize_text(original_text, nlp)
                        if not replacements:
                            continue

                        bbox = span["bbox"]
                        # A drawn rectangle would only hide the text visually;
                        # a redaction annotation deletes it once applied.
                        page.add_redact_annot(fitz.Rect(bbox), fill=(1, 1, 1))

                        # Prefer the span's baseline origin; the bbox bottom
                        # lies below the baseline and would shift the text down.
                        origin = span.get("origin", (bbox[0], bbox[3]))
                        pending.append((origin, anonymized_text, span["size"], span["font"]))

                        print(f"  Replaced {len(replacements)} entities in text span")
                        for repl in replacements:
                            print(f"    {repl['original']} -> {repl['placeholder']}")

            if not pending:
                continue

            # Remove the original text first; anything inserted before this
            # call inside a redaction rectangle would be removed as well.
            page.apply_redactions()

            for origin, replacement_text, font_size, font_name in pending:
                try:
                    # Try to use the original font
                    page.insert_text(
                        origin,
                        replacement_text,
                        fontsize=font_size,
                        fontname=font_name,
                        color=(0, 0, 0),
                    )
                except Exception:
                    # Fallback to default font if original font fails
                    page.insert_text(
                        origin,
                        replacement_text,
                        fontsize=font_size,
                        color=(0, 0, 0),
                    )

        # Garbage-collect unreferenced objects so content replaced by the
        # redactions is not carried over into the output file.
        pdf_document.save(output_pdf_path, garbage=4, deflate=True)
    finally:
        # Release the file handle even when processing or saving fails.
        pdf_document.close()

    print(f"Anonymized PDF saved to: {output_pdf_path}")

def main(input_file="jh.pdf", output_file="jh_anonymized_report.pdf"):
    """Run the PDF anonymizer and report the outcome on stdout.

    Args:
        input_file: Path of the PDF to anonymize.
        output_file: Path the anonymized PDF is written to.

    Errors are reported with ``print`` instead of being re-raised.
    """
    # PyMuPDF reports a missing input with its own ``FileNotFoundError`` class
    # (a RuntimeError subclass), which the builtin of the same name does not
    # catch. The getattr fallback covers versions without that class.
    missing_file_errors = (
        FileNotFoundError,
        getattr(fitz, "FileNotFoundError", FileNotFoundError),
    )

    try:
        anonymize_pdf(input_file, output_file)
        print("PDF anonymization completed successfully!")
    except missing_file_errors:
        print(f"Error: Input file '{input_file}' not found.")
    except Exception as e:
        print(f"Error during PDF processing: {e}")

# Entry point: call main() when this code runs as the top-level module.
if __name__ == "__main__":
    main()

Processing PDF with 2 pages...
  Replaced 1 entities in text span
    ANNUAL -> [DATE]
  Replaced 1 entities in text span
    FEBRUARY 2030 -> [DATE]
  Replaced 1 entities in text span
    LICERIA & CO -> [ORG]
  Replaced 1 entities in text span
    Transforming Challenges -> [NAME]
  Replaced 1 entities in text span
    2030 -> [DATE]
  Replaced 1 entities in text span
    Morgan Maxwell -> [ORG]
  Replaced 1 entities in text span
    Margarita Perez -> [NAME]
Anonymized PDF saved to: jh_anonymized_report.pdf
PDF anonymization completed successfully!
