Evaluate CRF models for person names, orgs and locations using the Presidio Evaluator framework

Data = `generated_test_November 12 2019`

In [None]:
from tqdm import tqdm_notebook as tqdm
import logging
from presidio_evaluator import InputSample
from presidio_evaluator.data_generator import read_synth_dataset
import spacy
import pandas as pd
import pickle

# Widen the console output so wide DataFrames are not wrapped.
pd.set_option('display.width', 10000)
# Show full cell contents (e.g. whole sentences) instead of truncating them.
# None means "no limit"; the legacy value -1 was deprecated in pandas 1.0
# and is rejected by later pandas releases.
pd.set_option('display.max_colwidth', None)


%reload_ext autoreload
%autoreload 2



Select data for evaluation:

In [None]:
# Read the synthetic dataset from the given path and report how many samples it holds.
synth_samples = read_synth_dataset("../../data/synth_dataset.txt")
print(len(synth_samples))


# DATASET is the name the remaining cells use for the samples under evaluation.
DATASET = synth_samples

In [None]:
from collections import Counter

# Tally how often each tag occurs across all samples in the dataset.
entity_counter = Counter(
    tag for sample in DATASET for tag in sample.tags
)

In [None]:
# Display the per-tag counts collected in the previous cell.
entity_counter

In [None]:
# Peek at a single sample (index 1) to inspect what a dataset entry looks like.
DATASET[1]

In [None]:
# Length, in tokens, of the longest sample in the dataset
max(len(sample.tokens) for sample in DATASET)


Select models for evaluation:

In [None]:
# Path to the pickled CRF model to evaluate
crf_vanilla = "../../model-outputs/crf.pickle"
    
# Every path in this list is evaluated by the loop in the evaluation cell
models = [crf_vanilla]

Run evaluation on all models:

In [None]:
from presidio_evaluator.crf_evaluator import CRFEvaluator

# Evaluate each model path in turn. `scores` is rebound on every iteration,
# so after the loop it refers to the last model in `models`.
for model in models:
    print("-----------------------------------")
    print("Evaluating model {}".format(model))
    # Build an evaluator from the pickled model path and run it over the dataset
    crf_evaluator = CRFEvaluator(model_pickle_path=model)
    evaluation_results = crf_evaluator.evaluate_all(DATASET)
    scores = crf_evaluator.calculate_score(evaluation_results)
    
    print("Confusion matrix:")
    print(scores.results)

    print("Precision and recall")
    scores.print()

#### Custom evaluation of the model

In [None]:
# Try out the model
def sent_to_features(model_path, sent):
    """
    Run a pickled CRF model on one raw sentence and return its prediction.

    :param model_path: Path to the pickled CRF model file
    :param sent: Sentence text to tokenize and pass to the model
    :return: The result of ``CRFEvaluator.crf_predict`` for the sentence
    """
    with open(model_path, 'rb') as model_file:
        crf_model = pickle.load(model_file)

    # Tokenize the sentence with a blank English spaCy pipeline
    nlp = spacy.blank('en')
    doc = nlp(sent)

    # InputSample requires tags and metadata; these values are placeholders
    # that are not used for the prediction itself.
    placeholder_tags = ['O'] * len(doc)
    placeholder_metadata = {'Template#': 1, 'Gender': '1', 'Country': '2'}
    sample = InputSample(
        full_text=sent,
        masked="",
        spans=None,
        tokens=doc,
        tags=placeholder_tags,
        metadata=placeholder_metadata,
        create_tags_from_span=False,
    )

    return CRFEvaluator.crf_predict(sample, crf_model)

In [None]:
# Example sentence to run through the vanilla CRF model
SENTENCE = "Michael is American"

sent_to_features(model_path=crf_vanilla, sent=SENTENCE)

#### False positives

1. Most common false positive tokens:

In [None]:
# `scores` comes from the evaluation loop, so these are the errors of the
# last model that was evaluated there.
errors = scores.model_errors

from presidio_evaluator import ModelEvaluator
# Presumably ranks the tokens most often predicted as false positives -- confirm against ModelEvaluator
ModelEvaluator.most_common_fp_tokens(errors)


2. Review false positives for entity 'PERSON':

In [None]:
# Build a DataFrame of errors restricted to entity 'PERSON' (presumably false positives only -- confirm),
# then show only the text, token and prediction columns.
fps_df = ModelEvaluator.get_fps_dataframe(errors,entity='PERSON')
fps_df[['full_text','token','prediction']]

#### False negative examples

In [None]:
# Presumably the 50 most frequent false-negative tokens for 'PERSON' -- confirm the meaning of n
ModelEvaluator.most_common_fn_tokens(errors,n=50, entity='PERSON')

More false negative (FN) analysis:

In [None]:
# Build a DataFrame of errors restricted to entity 'PERSON' (presumably false negatives only -- confirm)
fns_df = ModelEvaluator.get_fns_dataframe(errors,entity='PERSON')

In [None]:
# Show only the text, token, annotation and prediction columns
fns_df[['full_text','token','annotation','prediction']]