# Evaluate Presidio Analyzer using the Presidio Evaluator framework

In [None]:
# install presidio via pip if not yet installed

#!pip install presidio-evaluator
#!pip install "presidio-analyzer[transformers]"

In [None]:
from pathlib import Path
from copy import deepcopy
from pprint import pprint
from collections import Counter
from typing import List

import warnings
warnings.filterwarnings('ignore')

from presidio_evaluator import InputSample
from presidio_evaluator.evaluation import Evaluator, ModelError
from presidio_evaluator.models import PresidioAnalyzerWrapper
from presidio_evaluator.experiment_tracking import get_experiment_tracker

import pandas as pd

# Lift pandas' display limits (None = unlimited) for columns, rows and cell width
for display_option in ("display.max_columns", "display.max_rows", "display.max_colwidth"):
    pd.set_option(display_option, None)

%reload_ext autoreload
%autoreload 2
%matplotlib inline

Select data for evaluation:

In [None]:
# The dataset file is looked up in the "data" folder next to the current working directory
dataset_name = "synth_dataset_v2.json"
dataset_path = Path.cwd().parent / "data" / dataset_name
dataset = InputSample.read_dataset_json(dataset_path)

# Keep only the first 300 samples
dataset = dataset[:300]

print(len(dataset))

In [None]:
def get_entity_counts(dataset: List["InputSample"]) -> Counter:
    """Count how often each tag occurs across all samples.

    :param dataset: Samples to inspect. Each sample must expose a ``tags``
        attribute that is either an iterable of tag labels or ``None``.
    :return: ``Counter`` mapping every tag label to its number of occurrences.
        Samples whose ``tags`` is ``None`` or empty contribute nothing.
    """
    return Counter(
        tag
        for sample in dataset
        # Guard against tags=None, which would raise TypeError when iterated
        for tag in (sample.tags if sample.tags is not None else ())
    )


In [None]:
# Tag frequencies, most frequent first
print("Count per entity:")
entity_counts = get_entity_counts(dataset)
pprint(entity_counts.most_common())

# Show the second sample as an illustration
print("\nExample sentence:")
print(dataset[1])

# Token-count range over all samples
print("\nMin and max number of tokens in dataset:")
token_counts = [len(sample.tokens) for sample in dataset]
print(f"Min: {min(token_counts)}, Max: {max(token_counts)}")

# Character-length range of the full texts
print("\nMin and max sentence length in dataset:")
text_lengths = [len(sample.full_text) for sample in dataset]
print(f"Min: {min(text_lengths)}, Max: {max(text_lengths)}")

### Define the AnalyzerEngine object 
In this case, we use a Hugging Face transformers model: `obi/deid_roberta_i2b2`.

In [None]:
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import TransformersNlpEngine, NerModelConfiguration


# This cell builds a transformers-based NLP engine;
# edit it to customize the Presidio Analyzer instance being evaluated.

# Which spaCy and transformers models to load for English
model_config = [
    dict(
        lang_code="en",
        model_name=dict(
            spacy="en_core_web_sm",  # small spaCy model, used for lemmas, tokens etc.
            transformers="obi/deid_roberta_i2b2",
        ),
    )
]

# Translation table: label emitted by the transformers model -> Presidio entity name
model_to_presidio_entity_mapping = {
    "PER": "PERSON",
    "PERSON": "PERSON",
    "LOC": "LOCATION",
    "LOCATION": "LOCATION",
    "GPE": "LOCATION",
    "ORG": "ORGANIZATION",
    "ORGANIZATION": "ORGANIZATION",
    "NORP": "NRP",
    "AGE": "AGE",
    "ID": "ID",
    "EMAIL": "EMAIL",
    "PATIENT": "PERSON",
    "STAFF": "PERSON",
    "HOSP": "ORGANIZATION",
    "PATORG": "ORGANIZATION",
    "DATE": "DATE_TIME",
    "TIME": "DATE_TIME",
    "PHONE": "PHONE_NUMBER",
    "HCW": "PERSON",
    "HOSPITAL": "ORGANIZATION",
    "FACILITY": "LOCATION",
}

# NER settings: drop the "O" label and translate the remaining labels
# through the mapping defined above
ner_model_configuration = NerModelConfiguration(
    labels_to_ignore=["O"],
    model_to_presidio_entity_mapping=model_to_presidio_entity_mapping,
)

# NLP engine built from the model selection and the NER settings
nlp_engine = TransformersNlpEngine(
    models=model_config,
    ner_model_configuration=ner_model_configuration,
)

# Create the analyzer on top of this NLP engine
# (this also loads the NLP module and the other PII recognizers)
analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine)

### Run evaluation

#### Define experiment

In [None]:
# Open an experiment tracker and wrap the analyzer so the evaluator can call it
experiment = get_experiment_tracker()
model = PresidioAnalyzerWrapper(analyzer_engine)
evaluator = Evaluator(model=model)

# Rename the dataset's entity types using the wrapper's entity map.
# A deep copy of the samples is handed to the alignment step.
presidio_entities_map = PresidioAnalyzerWrapper.presidio_entities_map
dataset = Evaluator.align_entity_types(
    deepcopy(dataset),
    entities_mapping=presidio_entities_map,
)

print("Count per entity after alignment:")
aligned_entity_counts = get_entity_counts(dataset)
pprint(aligned_entity_counts.most_common())

# Record model and dataset parameters with the experiment
params = dict(dataset_name=dataset_name, model_name=model.name)
params.update(model.to_log())
experiment.log_parameters(params)
experiment.log_dataset_hash(dataset)

#### Run experiment

In [None]:
# Evaluate every sample, then aggregate the per-sample results into scores
evaluation_results = evaluator.evaluate_all(dataset)
results = evaluator.calculate_score(evaluation_results)

# Send metrics and the confusion matrix to the experiment tracker
metrics = results.to_log()
experiment.log_metrics(metrics)
entities, confmatrix = results.to_confusion_matrix()
experiment.log_confusion_matrix(matrix=confmatrix, labels=entities)

# Prepare the plotter; figures are drawn in a later cell
plotter = evaluator.Plotter(
    model=model,
    results=results,
    output_folder=".",
    model_name=model.name,
    beta=2,
)

# Close the experiment
experiment.end()

In [None]:
# Draw the score charts using the plotter configured above (beta=2)
plotter.plot_scores()

### Results analysis

In [None]:
# Quick sanity check: run the wrapped model on one hand-written sentence
sent = "I am taiwanese but I live in Cambodia."
# sent = input("Enter sentence: ")
sanity_sample = InputSample(full_text=sent)
model.predict(sanity_sample)

In [None]:
# Errors attached to the scoring results; every analysis cell below reads from this
errors = results.model_errors

#### False positives

1. Most common false positive tokens:

In [None]:
# Rank the tokens behind false positives (no `n` given, so the library default applies)
ModelError.most_common_fp_tokens(errors)

In [None]:
# False positives for the LOCATION entity, trimmed to the columns needed to read them
fp_columns = ["full_text", "token", "annotation", "prediction"]
fps_df = ModelError.get_fps_dataframe(errors, entity=["LOCATION"])
fps_df[fp_columns]

2. Most common false negative (FN) tokens:

In [None]:
# Rank the tokens behind false negatives; n=50 presumably caps how many are listed — confirm in the library docs
ModelError.most_common_fn_tokens(errors, n=50)

More FN analysis

In [None]:
# False negatives for the IP_ADDRESS entity; the result is displayed in the next cell
fns_df = ModelError.get_fns_dataframe(errors, entity=["IP_ADDRESS"])

In [None]:
# Show only the columns needed to read each false negative in context
fns_df[["full_text", "token", "annotation", "prediction"]]

In [None]:
print("All errors:\n")
# Plain loop instead of a list comprehension: as the cell's last expression, the
# comprehension also made Jupyter display a useless list of `None` values.
for error in errors:
    print(error, "\n")