Evaluate Presidio Analyzer using the Presidio Evaluator framework

In [1]:
# Step up one directory, presumably so the relative "data/" path used when loading
# the dataset resolves (the listing below shows a `data` directory at this level).
# NOTE(review): not idempotent - running this cell again moves up one more level.
%cd ..
# List the new working directory as a quick check of where the notebook is running.
!ls

/Users/mattc/Desktop/presidio/presidio-research-fork
CODE_OF_CONDUCT.md              experiment_20220807-073416.json
LICENSE                         notebooks
NOTICE                          presidio_evaluator
README.md                       pytest.ini
SECURITY.md                     requirements.txt
VERSION                         requirements_all.txt
azure-pipelines.yml             setup.cfg
data                            setup.py
docs                            tests
experiment_20220807-073124.json venv


In [2]:
from pathlib import Path
from copy import deepcopy
from pprint import pprint
from collections import Counter

from presidio_evaluator import InputSample
from presidio_evaluator.evaluation import Evaluator, ModelError
from presidio_evaluator.models import PresidioAnalyzerWrapper
from presidio_evaluator.experiment_tracking import get_experiment_tracker

import pandas as pd

# Lift pandas' display limits so wide tables and long text cells are shown in full.
for display_option in ("display.max_columns", "display.max_rows", "display.max_colwidth"):
    pd.set_option(display_option, None)

# (Re)load IPython's autoreload extension; mode 2 picks up edits to imported
# modules live, without restarting the kernel.
%reload_ext autoreload
%autoreload 2

stanza and spacy_stanza are not installed
Flair is not installed by default
Flair is not installed


Select data for evaluation:

In [5]:
# File name of the dataset to evaluate; it is resolved against the local "data/" folder.
dataset_name = "synth_dataset_v2.json"
dataset = InputSample.read_dataset_json(f"data/{dataset_name}")
# Number of samples that were read.
print(len(dataset))

tokenizing input:   0%|          | 0/1500 [00:00<?, ?it/s]

loading model en_core_web_sm


tokenizing input: 100%|██████████| 1500/1500 [00:07<00:00, 192.01it/s]


1500


In [6]:
# Tally how often each tag occurs across every sample in the dataset.
entity_counter = Counter(tag for sample in dataset for tag in sample.tags)

In [7]:
# Tag frequencies, most common first.
print("Count per entity:")
pprint(entity_counter.most_common())

# One sample, to show what an InputSample looks like when printed.
print("\nExample sentence:")
print(dataset[1])

# Token counts per sample, computed once and reused for both extremes.
print("\nMin and max number of tokens in dataset:")
token_counts = [len(sample.tokens) for sample in dataset]
print(f"Min: {min(token_counts)}, Max: {max(token_counts)}")

# Character lengths of the full text per sample, handled the same way.
print("\nMin and max sentence length in dataset:")
text_lengths = [len(sample.full_text) for sample in dataset]
print(f"Min: {min(text_lengths)}, Max: {max(text_lengths)}")

Count per entity:
[('O', 19626),
 ('STREET_ADDRESS', 3071),
 ('PERSON', 1369),
 ('GPE', 521),
 ('ORGANIZATION', 504),
 ('PHONE_NUMBER', 350),
 ('DATE_TIME', 219),
 ('TITLE', 142),
 ('CREDIT_CARD', 136),
 ('US_SSN', 80),
 ('AGE', 74),
 ('NRP', 55),
 ('ZIP_CODE', 50),
 ('EMAIL_ADDRESS', 49),
 ('DOMAIN_NAME', 37),
 ('IP_ADDRESS', 22),
 ('IBAN_CODE', 21),
 ('US_DRIVER_LICENSE', 9)]

Example sentence:
Full text: What are my options?
Spans: []
Tokens: What are my options?
Tags: ['O', 'O', 'O', 'O', 'O']


Min and max number of tokens in dataset:
Min: 3, Max: 78

Min and max sentence length in dataset:
Min: 9, Max: 407


Run evaluation:

In [8]:
print("Evaluating Presidio Analyzer")

# Tracker used below to record this run's parameters, dataset hash, metrics and
# confusion matrix.
experiment = get_experiment_tracker()
model_name = "Presidio Analyzer"
model = PresidioAnalyzerWrapper()

evaluator = Evaluator(model=model)

# Pass a deep copy of the loaded samples through the wrapper's entity mapping and
# keep the result under its own name. Rebinding `dataset` here made the cell
# non-idempotent (a re-run would push already-aligned samples through the mapping
# again) and silently changed what the dataset statistics cells report on re-run.
dataset_aligned = Evaluator.align_entity_types(
    deepcopy(dataset), entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map
)

evaluation_results = evaluator.evaluate_all(dataset_aligned)
results = evaluator.calculate_score(evaluation_results)

# Log everything needed to identify and compare this run.
params = {"dataset_name": dataset_name, "model_name": model_name}
params.update(model.to_log())
experiment.log_parameters(params)
experiment.log_dataset_hash(dataset_aligned)
experiment.log_metrics(results.to_log())
entities, confmatrix = results.to_confusion_matrix()
experiment.log_confusion_matrix(matrix=confmatrix, labels=entities)

# The same label list is used for rows and columns of the confusion matrix.
print("Confusion matrix:")
print(pd.DataFrame(confmatrix, columns=entities, index=entities))

print("Precision and recall")
print(results)

# Close the experiment once everything has been logged.
experiment.end()

Evaluating Presidio Analyzer
Entities supported by this Presidio Analyzer instance:
NRP, LOCATION, AU_ACN, URL, MEDICAL_LICENSE, CREDIT_CARD, PERSON, DATE_TIME, AU_ABN, IBAN_CODE, US_SSN, PHONE_NUMBER, AU_MEDICARE, SG_NRIC_FIN, AU_TFN, US_DRIVER_LICENSE, US_BANK_NUMBER, US_PASSPORT, CRYPTO, US_ITIN, UK_NHS, EMAIL_ADDRESS, IP_ADDRESS


Evaluating <class 'presidio_evaluator.models.presidio_analyzer_wrapper.PresidioAnalyzerWrapper'>: 100%|██████████| 1500/1500 [00:09<00:00, 157.28it/s]

Confusion matrix:
                   CREDIT_CARD  DATE_TIME  EMAIL_ADDRESS  IBAN_CODE  \
CREDIT_CARD                105          1              0          0   
DATE_TIME                    0        199              0          0   
EMAIL_ADDRESS                0          0             49          0   
IBAN_CODE                    0          0              0         20   
IP_ADDRESS                   0          0              0          0   
LOCATION                     0          0              0          0   
NRP                          0          0              0          0   
O                            0        595              0          0   
PERSON                       0          0              0          0   
PHONE_NUMBER                 0          2              0          0   
US_DRIVER_LICENSE            0          0              0          0   
US_SSN                       0          0              0          0   

                   IP_ADDRESS  LOCATION  NRP      O  PERSO




### Results analysis

In [9]:
# Quick sanity check: run the model on one hand-written sentence.
sent = "I am taiwanese but I live in Cambodia."
# To try your own text interactively, use instead: sent = input("Enter sentence: ")
single_sample = InputSample(full_text=sent)
model.predict(single_sample)

['O', 'O', 'NRP', 'O', 'O', 'O', 'O', 'LOCATION', 'O']

In [10]:
# Errors recorded on the evaluation result; the false positive / false negative
# helpers in the following cells all take this collection as input.
errors = results.model_errors

#### False positives

1. Most common false positive tokens:

In [11]:
# Report the most frequent false positive tokens, with an example sentence for each.
ModelError.most_common_fp_tokens(errors)

Most common false positive tokens:
[('the', 61), ('year', 56), ('morning', 34), ('old', 32), ('years', 27), ('Apt', 23), ('a', 22), ('last', 22), ('this', 21), ('months', 20)]
Example sentence with each FP token:
My birthday is on the weekend. I'll turn 23.
This 79 year old female complaining of stomach pain.
Mr. Leiva flew to LEPPEN on Tuesday morning.
This 79 year old female complaining of stomach pain.
He just turned 69 years old
The address of Persint is 6750 Koskikatu 25 Apt. 864
Artilleros
, CO
 Uruguay 64677
follow up with patricia desrosiers in a couple of months.
The letter arrived at 6400 12 Timms Drive Suite 215 NUNGATTA Australia last night.
My card 4131034282458809939 is expiring this month. Please let me know process to it's extend validity.
follow up with patricia desrosiers in a couple of months.


In [12]:
# False positives for the LOCATION entity, collected into a DataFrame.
fps_df = ModelError.get_fps_dataframe(errors, entity=["LOCATION"])
# Display only the sentence, the offending token and what the model predicted for it.
fps_df.loc[:, ["full_text", "token", "prediction"]]

Unnamed: 0,full_text,token,prediction
0,"The address of Persint is 6750 Koskikatu 25 Apt. 864\nArtilleros\n, CO\n Uruguay 64677",CO,LOCATION
1,"The address of Persint is 6750 Koskikatu 25 Apt. 864\nArtilleros\n, CO\n Uruguay 64677",\n,LOCATION
2,"The address of Persint is 6750 Koskikatu 25 Apt. 864\nArtilleros\n, CO\n Uruguay 64677",Uruguay,LOCATION
3,"The Exversion Orchestra was founded in 1977. Since then, it has grown from a volunteer community orchestra to a fully professional orchestra serving Southern Tunisia",Southern,LOCATION
4,Billing address: Sara Schwarz\n 28245 Puruntie 82 Apt. 595\n LAPPEENRANTA\n SK\n 53650,Apt,LOCATION
5,"card number 347415977307943 is lost, can you please send a new one to 14 Crown Street Kishiev Squares\n Suite 321\n LONDON\n United Kingdom 75419? I am in Sutri for a business trip",LONDON,LOCATION
6,"card number 347415977307943 is lost, can you please send a new one to 14 Crown Street Kishiev Squares\n Suite 321\n LONDON\n United Kingdom 75419? I am in Sutri for a business trip",United,LOCATION
7,"card number 347415977307943 is lost, can you please send a new one to 14 Crown Street Kishiev Squares\n Suite 321\n LONDON\n United Kingdom 75419? I am in Sutri for a business trip",Kingdom,LOCATION
8,"The Davis, Reynolds and Williamson Orchestra was founded in 1977. Since then, it has grown from a volunteer community orchestra to a fully professional orchestra serving Southern Italy",Southern,LOCATION
9,"bot: where would you like this to be sent to? user: 0605 πεντέλης 210 apt. 999\ndelmas, mp 35739",delmas,LOCATION


2. Most common false negative tokens (here for the `PERSON` entity):

In [None]:
# Most frequent false negative tokens for the PERSON entity.
# n=50 is presumably the number of tokens to list - confirm against ModelError.
ModelError.most_common_fn_tokens(errors, n=50, entity=["PERSON"])

More false negative (FN) analysis, here for the `PHONE_NUMBER` entity:

In [None]:
# Collect the false negatives for the PHONE_NUMBER entity for closer inspection.
fns_df = ModelError.get_fns_dataframe(errors, entity=["PHONE_NUMBER"])

In [None]:
# Show each missed token next to its sentence, its annotation and the model's prediction.
fns_df.loc[:, ["full_text", "token", "annotation", "prediction"]]

In [None]:
print("All errors:\n")
# A plain loop instead of a list comprehension: the comprehension was used only for
# its side effect and, as the cell's last expression, also echoed a list of `None`
# values as the cell output.
for error in errors:
    print(error, "\n")