In [None]:
import json
from collections import defaultdict, Counter
from pathlib import Path

from presidio_evaluator import InputSample
from presidio_evaluator.evaluation import ModelError, Evaluator
from presidio_evaluator.dataset_formatters import I2B22014Formatter
from presidio_evaluator.models import PresidioAnalyzerWrapper

from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider

# Evaluate Presidio on the I2B2-2014 de-identification dataset

#### Prerequisites:
1. Get access to the data
2. Copy the data to the `data/i2b2/2014` folder at the root of the repo. You should have three folders:
    - `testing-PHI-Gold-fixed`
    - `training-PHI-Gold-Set1`
    - `training-PHI-Gold-Set2`
3. Run the following cell (with `CREATE_DATASET` set to `True` on the first run) to create a list of `InputSample` objects for each folder and save them to JSON:

In [None]:
CREATE_DATASET = False  # Change to True on the first run

# Raw I2B2-2014 data is assumed to live under data/i2b2/2014 at the repo root.
I2B2_DATA_DIR = Path("../data/i2b2/2014")

# Folder names of the two training splits and the test split.
I2B2_SPLITS = (
    "training-PHI-Gold-Set1",
    "training-PHI-Gold-Set2",
    "testing-PHI-Gold-fixed",
)

# (input folder, output JSON file) for every split; each JSON file is written
# next to its source folder as "<split>.json".
I2B2_SPLIT_PATHS = [
    (I2B2_DATA_DIR / split_name, I2B2_DATA_DIR / f"{split_name}.json")
    for split_name in I2B2_SPLITS
]

if CREATE_DATASET:
    for input_path, output_path in I2B2_SPLIT_PATHS:
        I2B22014Formatter.dataset_to_json(input_path, output_path)


In [None]:
def read_json_dataset(filepath=None, length=None, max_text_length=5120):
    """Load a JSON dataset file and convert its rows to ``InputSample`` objects.

    :param filepath: Path of a UTF-8 JSON file containing a sequence of
        serialized samples (each row is passed to ``InputSample.from_json``).
    :param length: If truthy, only the first ``length`` rows of the file are
        converted. Truncation happens *before* the text-length filter, so
        fewer than ``length`` samples may be returned.
    :param max_text_length: Samples whose ``full_text`` has this many
        characters or more are dropped. Pass ``None`` to keep every sample.
        The default of 5120 is the cut-off that was previously hard-coded.
    :return: List of ``InputSample`` objects that passed the length filter.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        rows = json.load(f)

    if length:
        rows = rows[:length]

    samples = (InputSample.from_json(row) for row in rows)
    if max_text_length is None:
        return list(samples)

    return [sample for sample in samples if len(sample.full_text) < max_text_length]

In [None]:
# Load the first training split from the JSON file produced by the
# dataset-creation cell.
dataset = read_json_dataset(
    filepath="../data/i2b2/2014/training-PHI-Gold-Set1.json"
)

Entity types in this dataset and their frequencies:

In [None]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    return [item for sublist in l for item in sublist]


# Count how many annotated spans each entity type has across the dataset.
count_per_entity = Counter(
    span.entity_type
    for span in flatten(input_sample.spans for input_sample in dataset)
)
count_per_entity

Translate the I2B2-2014 entity types to Presidio's entity types (where an equivalent is available)

In [None]:
# Mapping from I2B2-2014 entity types to the entity names used for evaluation.
# Types marked "Not supported in Presidio" keep an I2B2-style name; the
# commented-out types have no mapping at all.
i2b2_presidio_dict = {
    "PATIENT": "PERSON",
    "DOCTOR": "PERSON",
    "AGE":"AGE", # Not supported in Presidio
    "BIOID": "BIOID", # Not supported in Presidio
    "COUNTRY": "LOCATION",
    "CITY":"LOCATION",
    "DATE": "DATE_TIME",
    "DEVICE": "DEVICE", # Not supported in Presidio
    "EMAIL": "EMAIL_ADDRESS",
    # Kept consistent with PHONE: Presidio's phone entity is PHONE_NUMBER
    # (there is no US_PHONE_NUMBER entity to match against).
    "FAX": "PHONE_NUMBER",
    "HEALTHPLAN": "HEALTHPLAN", # Not supported in Presidio
    "HOSPITAL": "ORGANIZATION",
    # "IDNUM": "IDNUM", # Not supported in Presidio
    "LOCATION-OTHER": "LOCATION",
    # "MEDICALRECORD": "MEDICAL_RECORD", # Not supported in Presidio
    "ORGANIZATION": "ORGANIZATION",
    "PHONE": "PHONE_NUMBER",
    "PROFESSION": "PROFESSION", # Not supported in Presidio
    "STATE": "LOCATION",
    "STREET": "LOCATION",
    # NOTE(review): newer Presidio releases appear to expose URL rather than
    # DOMAIN_NAME - confirm against the installed presidio-analyzer version.
    "URL": "DOMAIN_NAME",
    # "USERNAME": "USERNAME", # Not supported in Presidio
    "ZIP": "ZIP", # Not supported in Presidio
    "O": "O",
}

Examine the distinct values observed for each entity type

In [None]:
# Collect the distinct entity values seen for every entity type.
values_per_entity = defaultdict(set)
for span in (s for sample in dataset for s in sample.spans):
    values_per_entity[span.entity_type].add(span.entity_value)

# Inspect a single entity type (an unseen key yields an empty set).
values_per_entity["ORGANIZATION"]

In [None]:
# Rename the dataset's entity types using the i2b2 -> Presidio mapping.
# NOTE(review): allow_missing_mappings=True presumably lets types without a
# mapping (e.g. the commented-out ones) through instead of failing - confirm.
new_dataset = Evaluator.align_entity_types(
    input_samples=dataset,
    entities_mapping=i2b2_presidio_dict,
    allow_missing_mappings=True,
)

Re-calculate frequency per entity_type

In [None]:
# Entity-type frequencies of the aligned dataset.
count_per_entity_new = Counter(
    span.entity_type
    for input_sample in new_dataset
    for span in input_sample.spans
)
count_per_entity_new

In [None]:
# Presidio analyzer with its default configuration
analyzer = AnalyzerEngine()

# Wrap the analyzer for the evaluator, keeping only the entity types that
# occur in the aligned dataset
presidio = PresidioAnalyzerWrapper(
    analyzer_engine=analyzer,
    entities_to_keep=list(count_per_entity_new),
)

# Evaluate the wrapped model on every sample of the aligned dataset
evaluator = Evaluator(model=presidio)
evaluated = evaluator.evaluate_all(new_dataset)

In [None]:
# Score the evaluated samples; the result exposes .print() and .model_errors,
# both of which are used in the cells that follow.
evaluation_result = evaluator.calculate_score(evaluated)

In [None]:
# Print the evaluation result.
evaluation_result.print()

Analyze wrong predictions

In [None]:
# Model errors collected by the evaluation; input to the ModelError helpers below.
errors = evaluation_result.model_errors

False positives analysis

In [None]:
# Most common false-positive tokens (n=5), going by the helper's name.
ModelError.most_common_fp_tokens(errors, n=5)

In [None]:
# False positives for the DATE_TIME entity (a dataframe, per the helper's name).
ModelError.get_fps_dataframe(errors, entity="DATE_TIME")

False negatives analysis

In [None]:
# Most common false-negative tokens (n=5), going by the helper's name.
ModelError.most_common_fn_tokens(errors, n=5)

In [None]:
# False negatives for the DATE_TIME entity (a dataframe, per the helper's name).
ModelError.get_fns_dataframe(errors, entity="DATE_TIME")