Evaluate Presidio Analyzer using the Presidio Evaluator framework

In [None]:
# Move to the parent directory and list its contents; the cells below read the
# dataset through a relative `data/...` path — presumably relative to that directory.
%cd ..
!ls

In [None]:
from pathlib import Path
from copy import deepcopy
from pprint import pprint
from collections import Counter

from presidio_evaluator import InputSample
from presidio_evaluator.evaluation import Evaluator, ModelError
from presidio_evaluator.models import PresidioAnalyzerWrapper
from presidio_evaluator.experiment_tracking import get_experiment_tracker

import pandas as pd

# Lift pandas' display limits (columns, rows, column width) so the
# dataframes printed in this notebook are not truncated.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

Select data for evaluation:

In [None]:
# Alternative dataset: uncomment these three lines (and comment out the
# assignment below) to evaluate on it instead.
# dataset_name = "synth_dataset_v2.json"
# dataset = InputSample.read_dataset_json(f"data/{dataset_name}")
# print(len(dataset))

# Dataset file to evaluate on, read from the `data/` directory.
dataset_name = "generated_size_15_date_August_06_2022.json"
dataset_path = f"data/{dataset_name}"
dataset = InputSample.read_dataset_json(dataset_path)

# Report how many samples were loaded.
num_samples = len(dataset)
print(num_samples)

In [None]:
# Count how often each tag value occurs across every sample's `tags`.
entity_counter = Counter(
    tag for sample in dataset for tag in sample.tags
)

In [None]:
# Tag frequencies, most common first.
print("Count per entity:")
pprint(entity_counter.most_common())

# Show one sample (the second one) as an illustration.
print("\nExample sentence:")
print(dataset[1])

# Token-count range: collect the per-sample counts once, then reuse them.
print("\nMin and max number of tokens in dataset:")
token_counts = [len(sample.tokens) for sample in dataset]
print(f"Min: {min(token_counts)}, Max: {max(token_counts)}")

# Character-length range of the raw text, computed the same way.
print("\nMin and max sentence length in dataset:")
text_lengths = [len(sample.full_text) for sample in dataset]
print(f"Min: {min(text_lengths)}, Max: {max(text_lengths)}")

In [None]:
# Display the first sample of the dataset as the cell output.
dataset[0]

Run evaluation:

In [None]:
print("Evaluating Presidio Analyzer")

# Open the experiment tracker first; everything logged below goes to it.
experiment = get_experiment_tracker()
try:
    model_name = "Presidio Analyzer"
    model = PresidioAnalyzerWrapper()

    evaluator = Evaluator(model=model)
    # Translate the dataset's entity types with the wrapper's mapping.
    # A deep copy is passed in, and `dataset` is then rebound to the
    # aligned result, so the cells below see the aligned samples.
    dataset = Evaluator.align_entity_types(
        deepcopy(dataset),
        entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map,
    )

    # Run the model over every sample, then aggregate into scores.
    evaluation_results = evaluator.evaluate_all(dataset)
    results = evaluator.calculate_score(evaluation_results)

    # update params tracking
    params = {"dataset_name": dataset_name, "model_name": model_name}
    params.update(model.to_log())
    experiment.log_parameters(params)
    experiment.log_dataset_hash(dataset)
    experiment.log_metrics(results.to_log())
    entities, confmatrix = results.to_confusion_matrix()
    experiment.log_confusion_matrix(matrix=confmatrix, labels=entities)

    print("Confusion matrix:")
    print(pd.DataFrame(confmatrix, columns=entities, index=entities))

    print("Precision and recall")
    print(results)
finally:
    # end experiment -- also when a step above raised, so a failed run
    # does not leave the tracked experiment open.
    experiment.end()

### Results analysis

In [None]:
# Try the model on a single hand-written sentence; the prediction is the cell output.
sent = "Electronically Signed by Matthew Chung."
# Uncomment to type the sentence interactively instead:
# sent = input("Enter sentence: ")
model.predict(InputSample(full_text=sent))

In [None]:
# Keep the model errors from the scored results for the analysis cells below.
errors = results.model_errors

#### False positives

1. Most common false positive tokens:

In [None]:
# Most common false-positive tokens across all collected errors.
ModelError.most_common_fp_tokens(errors)

In [None]:
# fps_df = ModelError.get_fps_dataframe(errors, entity=["LOCATION"])
# fps_df[["full_text", "token", "prediction"]].head()

2. Most common false negative tokens (`PERSON` entities only):

In [None]:
# Most common false-negative tokens, restricted to the PERSON entity.
# NOTE(review): n=50 presumably caps how many tokens are reported — confirm against ModelError.
ModelError.most_common_fn_tokens(errors, n=50, entity=["PERSON"])

More false negative (FN) analysis:

In [None]:
# Build a dataframe of the false negatives for the PERSON entity.
fns_df = ModelError.get_fns_dataframe(errors, entity=["PERSON"])

In [None]:
# Show the first rows, limited to the text, the token, its annotation and the prediction.
fns_df[["full_text", "token", "annotation", "prediction"]].head()

In [None]:
print("All errors:\n")
# Plain loop rather than a list comprehension: the comprehension was used
# only for its side effect and produced a list of None values that the
# notebook echoed as the cell's output.
for error in errors:
    print(error, "\n")