Evaluate Conditional Random Field models using the Presidio Evaluator framework

In [None]:
from pathlib import Path
from copy import deepcopy
from pprint import pprint
from collections import Counter

from presidio_evaluator import InputSample
from presidio_evaluator.evaluation import Evaluator, ModelError
from presidio_evaluator.experiment_tracking import get_experiment_tracker
from presidio_evaluator.models import CRFModel

import pandas as pd

# Lift pandas' display limits (columns, rows, column width) by setting each to None.
for _display_option in (
    "display.max_columns",
    "display.max_rows",
    "display.max_colwidth",
):
    pd.set_option(_display_option, None)

%reload_ext autoreload
%autoreload 2

In [None]:
#!pip install sklearn_crfsuite

Select data for evaluation:

In [None]:
# Date on which the train/test split notebook was run; it is part of the file name.
DATA_DATE = "Dec-27-2023"
dataset_name = f"../../data/test_{DATA_DATE}.json"

# Load the test split and report how many samples it contains.
dataset = InputSample.read_dataset_json(dataset_name)
print(len(dataset))

In [None]:
# Count how many times each tag value appears across every sample in the dataset.
entity_counter = Counter(
    tag for sample in dataset for tag in sample.tags
)

In [None]:
# Tag frequencies, most frequent first.
print("Count per entity:")
pprint(entity_counter.most_common())

# Show one sample (the second in the dataset) as an illustration.
print("\nExample sentence:")
print(dataset[1])

# Range of per-sample token counts.
print("\nMin and max number of tokens in dataset:")
token_counts = [len(sample.tokens) for sample in dataset]
print(f"Min: {min(token_counts)}, Max: {max(token_counts)}")

# Range of per-sample text lengths, in characters.
print("\nMin and max sentence length in dataset:")
text_lengths = [len(sample.full_text) for sample in dataset]
print(f"Min: {min(text_lengths)}, Max: {max(text_lengths)}")

Select models for evaluation:

In [None]:
# Path to the pickled CRF model; a trained model is assumed to already exist there.
crf_vanilla = "../../models/crf.pickle"

# Every path listed here is evaluated in turn by the evaluation loop.
models = [
    crf_vanilla,
]

Run evaluation on all models:

In [None]:
# Evaluate every model listed in `models`. Each model gets its own experiment
# tracker, to which the run parameters, dataset hash, metrics and confusion
# matrix are logged. After the loop, `model` and `results` stay bound to the
# last model evaluated; the analysis cells further down rely on that.
for model_path in models:
    print("-----------------------------------")
    print(f"Evaluating model {model_path}")
    experiment = get_experiment_tracker()

    # Everything after the tracker is created runs inside try/finally so the
    # experiment is always closed, even if loading, evaluating or logging fails.
    try:
        model = CRFModel(model_pickle_path=model_path)
        evaluator = Evaluator(model=model)
        # Evaluate on a deep copy -- presumably so the shared `dataset` objects
        # are left untouched between models (TODO confirm evaluate_all mutates).
        evaluation_results = evaluator.evaluate_all(deepcopy(dataset))
        results = evaluator.calculate_score(evaluation_results)

        # update params tracking
        params = {"dataset_name": dataset_name, "model_name": model_path}
        params.update(model.to_log())
        experiment.log_parameters(params)
        experiment.log_dataset_hash(dataset)
        experiment.log_metrics(results.to_log())
        entities, confmatrix = results.to_confusion_matrix()
        experiment.log_confusion_matrix(matrix=confmatrix, labels=entities)

        print("Confusion matrix:")
        print(pd.DataFrame(confmatrix, columns=entities, index=entities))

        print("Precision and recall")
        print(results)
    finally:
        # end experiment
        experiment.end()

### Results analysis

In [None]:
# Spot-check the model left over from the evaluation loop on a hand-written sentence.
# To type your own text instead, use: sent = input("Enter sentence: ")
sent = "I am taiwanese but I live in Cambodia."
adhoc_sample = InputSample(full_text=sent)
model.predict(adhoc_sample)

### Error Analysis

In [None]:
# Model errors stored on `results`, i.e. the scores of the last model
# evaluated in the loop above. All error-analysis cells below read from this.
errors = results.model_errors

#### False positives

1. Most common false positive tokens:

In [None]:
# Summarise the most common false-positive tokens among `errors`
# (all entities, library-default number of results).
ModelError.most_common_fp_tokens(errors)

In [None]:
# False positives restricted to the PERSON entity, shown with only the
# columns needed to read each error in context.
person_fp_columns = ["full_text", "token", "prediction"]
fps_df = ModelError.get_fps_dataframe(errors, entity=["PERSON"])
fps_df[person_fp_columns]

2. Most common false negative tokens:

In [None]:
# Most common false-negative tokens, limited to the ORGANIZATION entity
# and capped at n=50 results.
ModelError.most_common_fn_tokens(errors, n=50, entity=["ORGANIZATION"])

More FN analysis

In [None]:
# Collect the false negatives for the GPE entity into a dataframe for inspection.
fns_df = ModelError.get_fns_dataframe(errors, entity=["GPE"])

In [None]:
# Display only the text, the token, its annotation and the model's prediction.
fns_df[["full_text", "token", "annotation", "prediction"]]

In [None]:
# Dump every recorded model error, each followed by a blank line.
print("All errors:\n")
# A plain loop is used on purpose: a list comprehension run only for its
# print() side effect builds a throwaway list of None values, which the
# notebook would then echo as the cell's output.
for error in errors:
    print(error, "\n")