In [1]:
from presidio_evaluator import InputSample
from presidio_evaluator.evaluation import ModelError, Evaluator
from presidio_evaluator.models import PresidioAnalyzerWrapper
from presidio_analyzer import AnalyzerEngine
from collections import Counter

import pandas as pd

%load_ext autoreload
%autoreload 2

pd.options.display.max_columns = None
pd.options.display.width=None

# Evaluate Presidio Analyzer
This notebook runs the PresidioAnalyzerEvaluator class on top of synthetic data.

One can perform the following changes:
1. Replace the synthetic data creation with real data or with other type of synthetic data
2. Adapt the Presidio `AnalyzerEngine` to a specific engine with a different set of recognizers or configured to be used on different languages




#### A. Read dataset for evaluation

In [2]:
input_samples = InputSample.read_dataset_json("../data/synth_dataset.json")
print("Read {} samples".format(len(input_samples)))
input_samples[0]

tokenizing input:   0%|                                                                       | 0/3000 [00:00<?, ?it/s]

loading model en_core_web_sm


tokenizing input: 100%|███████████████████████████████████████████████████████████| 3000/3000 [00:25<00:00, 118.48it/s]

Read 3000 samples





Full text: I want to update my primary and secondary address to the same: 19 Ingelbrecht Knudssøns gate 222
 Suite 598
 OSLO
 Bangladesh
Spans: [Type: ADDRESS, value: 19 Ingelbrecht Knudssøns gate 222
 Suite 598
 OSLO
 Bangladesh, start: 63, end: 125]
Tokens: I want to update my primary and secondary address to the same: 19 Ingelbrecht Knudssøns gate 222
 Suite 598
 OSLO
 Bangladesh
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'ADDRESS', 'ADDRESS', 'ADDRESS', 'ADDRESS', 'ADDRESS', 'ADDRESS', 'ADDRESS', 'ADDRESS', 'ADDRESS', 'ADDRESS', 'ADDRESS', 'ADDRESS']

#### B. Descriptive statistics

In [3]:
flatten = lambda l: [item for sublist in l for item in sublist]

count_per_entity = Counter([span.entity_type for span in flatten([input_sample.spans for input_sample in input_samples])])
count_per_entity

Counter({'ADDRESS': 1512,
         'LOCATION': 817,
         'PHONE_NUMBER': 264,
         'PERSON': 1800,
         'CREDIT_CARD': 313,
         'IBAN_CODE': 41,
         'US_SSN': 44,
         'ORGANIZATION': 448,
         'DOMAIN_NAME': 40,
         'EMAIL_ADDRESS': 71,
         'PREFIX': 43,
         'DATE_TIME': 112,
         'TITLE': 23,
         'IP_ADDRESS': 24,
         'US_DRIVER_LICENSE': 13})

#### C. Remove entities not supported by Presidio

In [4]:
entities_to_ignore = {"ADDRESS", "TITLE", "PREFIX"}
entities_to_keep = set(count_per_entity.keys()) - entities_to_ignore
entities_to_keep

{'CREDIT_CARD',
 'DATE_TIME',
 'DOMAIN_NAME',
 'EMAIL_ADDRESS',
 'IBAN_CODE',
 'IP_ADDRESS',
 'LOCATION',
 'ORGANIZATION',
 'PERSON',
 'PHONE_NUMBER',
 'US_DRIVER_LICENSE',
 'US_SSN'}

#### D. Run the presidio-evaluator framework with Presidio's API as the 'model' at test

In [5]:
presidio = PresidioAnalyzerWrapper(entities_to_keep=list(entities_to_keep))
evaluator = Evaluator(model=presidio)
evaluted_samples = evaluator.evaluate_all(input_samples)

Entities supported by this Presidio Analyzer instance:
MEDICAL_LICENSE, DOMAIN_NAME, UK_NHS, AU_ACN, CRYPTO, CREDIT_CARD, AU_ABN, US_ITIN, LOCATION, NRP, US_DRIVER_LICENSE, PHONE_NUMBER, PERSON, AU_TFN, EMAIL_ADDRESS, US_SSN, IP_ADDRESS, US_PASSPORT, US_BANK_NUMBER, SG_NRIC_FIN, AU_MEDICARE, IBAN_CODE, DATE_TIME
Entity ORGANIZATION is not supported by this instance of Presidio Analyzer Engine
Added ORGANIZATION as a supported entity from spaCy/Stanza


Evaluating <class 'presidio_evaluator.evaluation.evaluator.Evaluator'>: 100%|██████| 3000/3000 [00:31<00:00, 95.34it/s]


#### D. Extract statistics
- Presicion, recall and F measure are calculated based on a PII/Not PII binary classification per token.
- Specific entity recall and precision are calculated on the specific PII entity level.

In [6]:
evaluation_result = evaluator.calculate_score(evaluted_samples)

In [7]:
evaluation_result.print()

                        Entity                     Precision                        Recall             Number of samples
                   CREDIT_CARD                       100.00%                       100.00%                          2728
                     DATE_TIME                        14.72%                        89.14%                            40
                   DOMAIN_NAME                       100.00%                        82.50%                            41
                 EMAIL_ADDRESS                       100.00%                       100.00%                           313
                     IBAN_CODE                       100.00%                        90.24%                          1114
                    IP_ADDRESS                        91.18%                        83.78%                            71
                      LOCATION                        53.84%                        35.91%                           220
                  ORGANIZATION  

#### G. Analyze wrong predictions

In [8]:
errors = evaluation_result.model_errors

In [9]:
ModelError.most_common_fp_tokens(errors,n=5)

Most common false positive tokens:
[('\n', 202), ('the', 110), ('\n ', 96), ('last', 68), ('year', 48)]
Example sentence with each FP token:
how do i change my address to unit 9456 box 8731
dpo ap 71610 for post mail?
Muslija began writing as a teenager, publishing her first story, "The Dimensions of a Shadow", in 1950 while studying English and journalism at the University of El Tanque.
As promised, here's Božica's address:

99 Sahankatu 77
Ortovero
, SV
 Nigeria 21148
At my suggestion, one morning over breakfast, she agreed, and on the last Sunday before Labor Day we returned to Los Angeles by helicopter.
Ewan spent a year at BBC as the assistant to Aaron Panina, and the following year at Sanders-Gill in Seguin, which later became Weather Decision Technologies in 1965.


In [10]:
fps_df = ModelError.get_fps_dataframe(errors,entity='DATE_TIME')
if fps_df is not None:
    fps_df[['full_text','token','prediction']]
fps_df

Unnamed: 0,error_type,annotation,prediction,token,full_text,0
0,FP,O,DATE_TIME,8731,how do i change my address to unit 9456 box 87...,
1,FP,O,DATE_TIME,\n,how do i change my address to unit 9456 box 87...,
2,FP,O,DATE_TIME,dpo,how do i change my address to unit 9456 box 87...,
3,FP,O,DATE_TIME,ap,how do i change my address to unit 9456 box 87...,
4,FP,O,DATE_TIME,71610,how do i change my address to unit 9456 box 87...,
...,...,...,...,...,...,...
1224,FP,O,DATE_TIME,this,My card 5115922521155230 is expiring this mont...,
1225,FP,O,DATE_TIME,month,My card 5115922521155230 is expiring this mont...,
1226,FP,O,DATE_TIME,33649,"As promised, here's Zlata's address:\n\n29 Rue...",
1227,FP,O,DATE_TIME,2,Follow up with Edward Baranova in 2 months.,


In [11]:
fns_df = ModelError.get_fns_dataframe(errors,entity='PHONE_NUMBER')
fns_df

Unnamed: 0,error_type,annotation,prediction,token,full_text,0
0,Wrong entity,PHONE_NUMBER,DATE_TIME,0910,Terry Cardoso PhD\n\n65 Bodbysund 61\n Suite 5...,
1,Wrong entity,PHONE_NUMBER,DATE_TIME,-,Terry Cardoso PhD\n\n65 Bodbysund 61\n Suite 5...,
2,Wrong entity,PHONE_NUMBER,DATE_TIME,5877671,Terry Cardoso PhD\n\n65 Bodbysund 61\n Suite 5...,
3,Wrong entity,PHONE_NUMBER,DATE_TIME,-,Terry Cardoso PhD\n\n65 Bodbysund 61\n Suite 5...,
4,Wrong entity,PHONE_NUMBER,DATE_TIME,4466x8827,Terry Cardoso PhD\n\n65 Bodbysund 61\n Suite 5...,
...,...,...,...,...,...,...
532,FN,PHONE_NUMBER,O,81,Kelly Björgvinsdóttir\nAdaptive\n63 Via Verban...,
533,FN,PHONE_NUMBER,O,21,Laura Gorski\nMinistry Of Agriculture\n07 57 a...,
534,FN,PHONE_NUMBER,O,232,Laura Gorski\nMinistry Of Agriculture\n07 57 a...,
535,FN,PHONE_NUMBER,O,945,Laura Gorski\nMinistry Of Agriculture\n07 57 a...,
