Evaluate spaCy models for person names, organizations and locations using the Presidio Evaluator framework

Data = `generated_test_November 12 2019`

In [None]:
import spacy

from presidio_evaluator import ModelEvaluator
from presidio_evaluator.data_generator import read_synth_dataset
%reload_ext autoreload
%autoreload 2



Select data for evaluation:

In [None]:
# Load the synthetic samples once and expose them under both names used in
# this notebook; the printed number is the sample count.
DATASET = synth_samples = read_synth_dataset("../../data/synth_dataset.txt")
print(len(DATASET))

In [None]:
from collections import Counter

# Tally how many annotated spans of each entity type appear across the dataset.
entity_counter = Counter(
    span.entity_type
    for sample in DATASET
    for span in sample.spans
)

In [None]:
# Show the number of spans per entity type.
entity_counter

In [None]:
# Inspect one sample (index 1) as a sanity check on the loaded data.
DATASET[1]

In [None]:
# Token count of the longest sample in the dataset.
max(len(sample.tokens) for sample in DATASET)

Select models for evaluation:

In [None]:
# Models to evaluate. Each entry is passed to `spacy.load`, so it is either an
# installed package name or a path to a model directory on disk.
# NOTE: the two paths below are absolute and specific to the original author's
# machine; point them at your local copies before running this notebook.
en_core_web_lg = r"en_core_web_lg"
spacy_new_ontonotes28 = r"C:\Users\ommendel\OneDrive - Microsoft\Projects\presidio\Presidio-internal\presidio-evaluator\models\spacy_new_ontonotes28"
spacy_ft_100 = r"C:\Users\ommendel\OneDrive - Microsoft\Projects\presidio\Presidio-internal\presidio-evaluator\models\spacy_ft_100\model-final"

# Evaluation order matters: the results of the LAST model in this list are the
# ones left in scope for the analysis cells further down.
models = [en_core_web_lg, spacy_new_ontonotes28, spacy_ft_100]

Run evaluation on all models:

In [None]:
from presidio_evaluator.spacy_evaluator import SpacyEvaluator

# Entity labels handed to every evaluator as `entities_to_keep`.
ENTITIES_TO_KEEP = ('PERSON', 'GPE', 'ORG')
SEPARATOR = "-----------------------------------"

# Evaluate each model in turn against the full dataset and print its scores.
# `nlp`, `scores` and `errors` are overwritten on every iteration, so once the
# loop finishes they refer to the last model in `models`.
for model in models:
    print(SEPARATOR)
    print("Evaluating model {}".format(model))

    nlp = spacy.load(model)
    spacy_evaluator = SpacyEvaluator(
        model=nlp,
        entities_to_keep=list(ENTITIES_TO_KEEP),
    )
    evaluation_results = spacy_evaluator.evaluate_all(DATASET)
    scores = spacy_evaluator.calculate_score(evaluation_results)

    print("Confusion matrix:")
    print(scores.results)

    print("Precision and recall")
    scores.print()

    # Kept for the error-analysis cells that follow.
    errors = scores.model_errors

Custom evaluation

In [None]:
# Load the model stored at `spacy_ft_100` so it can be tried on custom sentences.
nlp = spacy.load(spacy_ft_100)


### Results analysis

In [None]:
# Run the loaded model on one hand-written sentence and list the entities found.
# To try sentences interactively, replace the literal below with
# input("Enter sentence: ").
sent = 'David is talking loudly'
doc = nlp(sent)
# One line per detected entity: its label followed by the matched text.
for ent in doc.ents:
    print("Entity = {} value = {}".format(ent.label_,ent.text))

#### False positives

1. Most common false-positive tokens:

In [None]:
# Pass the complete `errors` list; no pre-filtering by error type is done here.
ModelEvaluator.most_common_fp_tokens(errors)

In [None]:
# Build a dataframe of false positives for the requested entity and show the
# sentence, the offending token and what the model predicted for it.
# NOTE(review): the evaluator in this notebook was configured with 'GPE', not
# 'LOCATION' — confirm this filter matches the labels present in `errors`.
fps_df = ModelEvaluator.get_fps_dataframe(errors,entity=['LOCATION'])
fps_df[['full_text','token','prediction']]

2. False negative examples

In [None]:
# Re-read the errors from `scores` (the scores left over from the evaluation
# loop) so this cell can be run independently of the false-positive cells.
errors = scores.model_errors
# Request the false-negative tokens for PERSON, with n=50.
ModelEvaluator.most_common_fn_tokens(errors,n=50, entity=['PERSON'])

More false-negative (FN) analysis

In [None]:
# Build a dataframe of false negatives restricted to the GPE entity.
fns_df = ModelEvaluator.get_fns_dataframe(errors,entity=['GPE'])

In [None]:
# Show each missed token next to its sentence, gold annotation and prediction.
fns_df[['full_text','token','annotation','prediction']]

In [None]:
# Print every recorded model error, each followed by a blank line.
# A plain loop is used instead of a list comprehension so the cell does not
# build (and the notebook does not echo) a throwaway list of None values.
for error in errors:
    print(error, "\n")