Evaluate Flair models on person names, organizations and locations using the Presidio Evaluator framework.

Data = `generated_test_November 12 2019`

In [None]:
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.evaluation import ModelError, Evaluator
%reload_ext autoreload
%autoreload 2

Select data for evaluation:

In [None]:
# Read the synthetic evaluation samples from disk and print how many were loaded.
synth_samples = read_synth_dataset("../../data/synth_dataset.txt")
print(len(synth_samples))


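Map the dataset's entity types to the tags evaluated here (`PER`, `LOC`, `ORG`); all other entity types are mapped to `O`: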

In [None]:
# Translation table from the dataset's entity types to the tags scored in
# this notebook. Only PER, LOC and ORG are evaluated; every other entity
# type collapses to "O" (outside), i.e. it is treated as a non-entity.
presidio_entities_map = {
    "PERSON": "PER",
    "EMAIL_ADDRESS": "O",
    "CREDIT_CARD": "O",
    "FIRST_NAME": "PER",
    "PHONE_NUMBER": "O",
    "BIRTHDAY": "O",
    "DATE_TIME": "O",
    "DOMAIN": "O",
    # Location-like types all share the LOC tag.
    "CITY": "LOC",
    "ADDRESS": "LOC",
    "NATIONALITY": "LOC",
    "LOCATION": "LOC",
    "IBAN": "O",
    "URL": "O",
    "US_SSN": "O",
    "IP_ADDRESS": "O",
    "ORGANIZATION": "ORG",
    "TITLE": "O",  # skipping evaluation of titles
    "O": "O",
}

# Apply the table to the samples; the aligned result replaces `synth_samples`.
synth_samples = Evaluator.align_entity_types(
    synth_samples, presidio_entities_map
)

In [None]:
from collections import Counter

# Frequency of each tag over all tokens of all samples. Counter accepts an
# iterable directly, so the flattened tag stream is fed to it instead of
# incrementing the counts by hand in a nested loop.
entity_counter = Counter(
    tag for sample in synth_samples for tag in sample.tags
)

In [None]:
# Show the per-tag token counts as the cell output.
entity_counter

In [None]:
# Length, in tokens, of the longest sample in the dataset.
max(len(sample.tokens) for sample in synth_samples)

Select models for evaluation:

In [None]:
# Flair model identifiers available to this notebook.
flair_ner = "ner"
flair_ner_fast = "ner-fast"
flair_ontonotes = "ner-ontonotes-fast"  # defined, but not part of `models`

# The models that will actually be evaluated.
models = [
    flair_ner,
    flair_ner_fast,
]

In [None]:
from presidio_evaluator.models import FlairModel

# Run each selected Flair model over the aligned samples and print its
# confusion matrix and precision/recall figures.
for model in models:
    print("-----------------------------------")
    print("Evaluating model {}".format(model))

    # Wrap the Flair model and hand it to the Evaluator.
    flair_model = FlairModel(model_path=model)
    evaluator = Evaluator(model=flair_model)

    # Evaluate all samples, then aggregate the results into scores.
    evaluation_results = evaluator.evaluate_all(synth_samples)
    scores = evaluator.calculate_score(evaluation_results)

    print("Confusion matrix:")
    print(scores.results)

    print("Precision and recall")
    scores.print()

    # Rebound on every iteration: once the loop finishes, `scores` and
    # `errors` refer to the last entry of `models` only.
    errors = scores.model_errors


Custom evaluation

#### False positives

1. Most common false positive tokens:

In [None]:
# `scores` is whatever the evaluation loop assigned last, so these are the
# errors of the final model in `models` (as far as this notebook shows).
errors = scores.model_errors

# Cell output: the most common false positive tokens among those errors.
ModelError.most_common_fp_tokens(errors)

In [None]:
# The samples were re-tagged through presidio_entities_map (PERSON -> PER)
# and the false negative token cell filters on 'PER', so the mapped tag is
# used here as well; 'PERSON' no longer occurs in the aligned samples.
# NOTE(review): assumes the model errors carry the mapped tags — confirm.
fps_df = ModelError.get_fps_dataframe(errors, entity=['PER'])

# Cell output: sentence, offending token and the model's prediction.
fps_df[['full_text', 'token', 'prediction']]

2. Most common false negative tokens:

In [None]:
# Cell output: most common false negative tokens for the PER tag (n=50).
ModelError.most_common_fn_tokens(errors, n=50, entity=["PER"])

More false negative (FN) analysis:

In [None]:
# The samples were re-tagged through presidio_entities_map (PERSON -> PER)
# and the false negative token cell filters on 'PER', so the mapped tag is
# used here as well; 'PERSON' no longer occurs in the aligned samples.
# NOTE(review): assumes the model errors carry the mapped tags — confirm.
fns_df = ModelError.get_fns_dataframe(errors, entity=['PER'])

In [None]:
# Cell output: sentence, missed token, gold annotation and model prediction.
fns_df[["full_text", "token", "annotation", "prediction"]]