In [16]:
import sys
import os
from faker import Faker

# Make the sibling "../presidio" directory (resolved against the current working
# directory) importable. NOTE(review): the project-local modules imported below are
# presumably located there — confirm; this only works when the kernel's cwd is the
# notebook's own directory.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../presidio")))

from models_config import build_analyzer
from presidio_anonymizer import AnonymizerEngine
from log_analysis import analyze_save, results_to_json
from clinical_filter import ClinicalDataFilter
from context_anonymizer import ContextAwareAnonymizer

# group_entities may not be reachable through the path added above; if the first
# import fails, add the current working directory to sys.path and retry once.
try:
    from group_entities import group_names
except ImportError:
    sys.path.append(os.getcwd())
    from group_entities import group_names

%reload_ext autoreload
%autoreload 2

In [17]:
# Load the sample clinical note that the rest of the notebook analyzes and anonymizes.
file_path = "data/sample.txt"
# Use a context manager so the file handle is closed deterministically, and read as
# UTF-8 explicitly so the result does not depend on the platform default encoding
# (the anonymized output is also written as UTF-8 at the end of the notebook).
with open(file_path, encoding="utf-8") as source_file:
    text = source_file.read()
print(text)

UNIVERSITY MEDICAL CTR
Department:  Neurology / Internal Med
Document Type:  ED NOTE + CONSULT + DISCHARGE
Generated: 2024.03.22  07:14 AM
Printed by: J.Nguyen (unit clerk)   ext 4021

----------------------------------------------------------------------
PATIENT INFORMATION
----------------------------------------------------------------------
Name:   Jenniferr   K.  Lee
AKA:    Jenny Lee / J. Lee
DOB:    5-6-94
Age:    29 y/o
Sex:    F
Preferred Language: English
Race/ethnicity (self-reported):  Korean-American

MRN:    00077219
Acct#:  A-77821
Encounter ID: 44-92-771

Home Address:
  455   San Mateo Ave
  Apt#  3B
  Redwood   City,  CA  94063

CURRENT ADDRESS (per patient, moved recently):
  4127 W. Elm st   Apt  #3B
  Springfeld, IL  62704

Phone:  312-555-7712
Alt/cell: (312) 555  77  13
Email:  jlee94 @ yahoo.com
Emergency contact:
  Marry  Smi th (spouse)  217.555.0198
  relationship: wife
Secondary contact:
  Anne McKinly (sister)  773.555.8891

Primary care provider (outside):

In [18]:
# build_analyzer() comes from the local models_config module; only the first element
# of whatever it returns is kept here.
# NOTE(review): the variable name suggests index 0 is the Stanford-model-backed
# analyzer — confirm against models_config.
stanford = build_analyzer()[0]

Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 124936.71it/s]
Device set to use cpu


In [19]:
# Run the analyzer over the full note text (language "en") and keep the unfiltered
# detections; later cells filter and anonymize based on this list.
raw_results = stanford.analyze(text=text, language="en")
# Report how many detections came back before any filtering.
print(f"Raw results found: {len(raw_results)}")

Raw results found: 76


In [20]:
# Pass the raw detections through the project's ClinicalDataFilter together with the
# source text. NOTE(review): judging by the message below, the filter drops false
# positives and entities that should be preserved — confirm in clinical_filter.
filtered_results = ClinicalDataFilter.filter_results(text, raw_results)
print(f"Filtered results count: {len(filtered_results)}")
# The difference between the two list lengths is the number of detections removed.
print(f"Removed {len(raw_results) - len(filtered_results)} false positives/preserved entities")

Filtered results count: 66
Removed 10 false positives/preserved entities


In [21]:
# Collect the exact surface text of every detection labelled PERSON by slicing the
# note with each result's start/end offsets.
person_entities = [
    text[hit.start:hit.end]
    for hit in filtered_results
    if hit.entity_type == "PERSON"
]

# Group the extracted name strings so that variants can share one replacement.
# NOTE(review): score_cutoff=55 is presumably a fuzzy-match similarity threshold —
# confirm in group_entities.
groups = group_names(person_entities, score_cutoff=55)
print(f"Found {len(groups)} unique person groups from {len(person_entities)} occurrences.")

# Build the anonymizer with the name groupings and run it over the filtered detections.
anonymizer = ContextAwareAnonymizer(name_groupings=groups)
anonymized_result = anonymizer.anonymize(
    analyzer_results=filtered_results,
    text=text,
)

print("\n--- Anonymized Text (Context + Grouping Aware) ---\n")
print(anonymized_result)

Found 8 unique person groups from 14 occurrences.

--- Anonymized Text (Context + Grouping Aware) ---

Port Anthony
Department:  Neurology / Internal Med
Document Type:  ED NOTE + CONSULT + DISCHARGE
Generated: 03-03-0104:14 AM
Printed by: J.Nge (unit Rowan)   ext 4021

----------------------------------------------------------------------
PATIENT INFORMATION
----------------------------------------------------------------------
Name:   Asher
AKA:    Asher / Asher
DOB:    09-12-94
Age:    29 y/o
Sex:    F
Preferred Language: English
Race/ethnicity (self-reported):  Korean-American

MRN:    06260594
Acct#:  A-77821
Encounter ID: 44-92-771

Home Address:
  Chrisburgh
  Apt#  3B
  West Anita  45023

CURRENT ADDRESS (per patient, moved recently):
  New David   Apt  #3B
  New Kerry  861.477.5089

Phone:  670.569.1381
Alt/cell: 467.733.8454
Email:  jlee94 @ Lara PLC
Emergency contact:
  Baker th (spouse)  481.471.6666
  relationship: wife
Secondary contact:
  Rowan (sister)  985.679.1268

Pr

In [22]:
# List the original -> synthetic name pairs recorded by the anonymizer's context.
print("Generated Synthetic Identities\n")
for key, identity in anonymizer.context.mappings.items():
    # Only mappings carrying a 'fake_name' are reported. Other mapping kinds
    # (e.g. those with 'date_shift_days' or 'fake_city') are intentionally skipped.
    if 'fake_name' not in identity:
        continue
    print(f"PERSON: {identity['original']:<20} -> {identity['fake_name']}")

Generated Synthetic Identities

PERSON: S. Patel             -> Tatum
PERSON: J. Nguyen PA         -> Jamie
PERSON: Rebecca Wong MD      -> Parker
PERSON: Anne McKinly         -> Rowan
PERSON: Rosa Ramirez         -> Baker
PERSON: Dr.  S.  Patel       -> Spencer
PERSON: Marry  Smi           -> Baker
PERSON: J. Lee               -> Asher
PERSON: clerk                -> Rowan


In [None]:
# Persist the anonymized note to disk as UTF-8.
output_file = "anonymized_sample.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(anonymized_result)
# Confirm the write. The recorded cell output shows this message, but the statement
# that produced it was missing from the cell, so a fresh run printed nothing.
print(f"Saved anonymized text to {output_file}")

Saved anonymized text to anonymized_sample.txt
