# Evaluate Presidio Analyzer at the span level using the Presidio Evaluator framework

In this notebook, we will go through the following steps:

1. Load the evaluation dataset into the InputSample format
2. Run inference and compute metrics at the span level using the SpanEvaluator class
3. Analyze the performance at the span level

In [None]:
from pathlib import Path
from copy import deepcopy

from presidio_evaluator import InputSample
from presidio_evaluator.evaluation import Evaluator, SpanEvaluator, SpanOutput
from presidio_evaluator.models import PresidioAnalyzerWrapper
from presidio_evaluator.experiment_tracking import get_experiment_tracker

import pandas as pd

# Remove pandas' display limits (columns, rows, cell width) so the result
# tables further down are rendered in full instead of being truncated.
pd.set_option(
    "display.max_columns", None,
    "display.max_rows", None,
    "display.max_colwidth", None,
)

# (Re)load IPython's autoreload extension; mode 2 re-imports modules before
# each cell is executed, so edits to imported source files are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

## 1. Load Evaluation Dataset into InputSample format

In [None]:
# File to evaluate on; it is looked up in the "data" folder that sits next to
# the current working directory's parent.
# Alternative file noted by the original author: "sample_data.json".
dataset_name = "synth_dataset_v2.json"

dataset_path = Path.cwd().parent / "data" / dataset_name
dataset = InputSample.read_dataset_json(dataset_path)

# Quick sanity check: number of samples that were read.
print(len(dataset))

## 2. Run inference and compute metrics at the span level using the SpanEvaluator class

In [None]:
# Align the dataset's entity types using the mapping exposed by
# PresidioAnalyzerWrapper. A deep copy is handed to the aligner rather than
# the loaded object itself.
# NOTE: `dataset` is rebound to the aligned result, so re-running this cell
# feeds the already-aligned data back into align_entity_types.
entities_mapping = PresidioAnalyzerWrapper.presidio_entities_map
dataset_copy = deepcopy(dataset)
dataset = Evaluator.align_entity_types(dataset_copy, entities_mapping=entities_mapping)

In [None]:
# Label for the model under evaluation.
model_name = "Presidio Analyzer"
print("Evaluating Presidio Analyzer")

# Wrapper around Presidio Analyzer, constructed with no arguments.
model = PresidioAnalyzerWrapper()

In [None]:
# Build the span-level evaluator around the model.
# Alternative call left by the original author (presumably limits the
# evaluation to the listed entity types - confirm against SpanEvaluator):
#   SpanEvaluator(model=model, entities_to_keep=["PERSON", "EMAIL_ADDRESS", "LOCATION", "DATE_TIME", "TITLE"])
evaluator_span = SpanEvaluator(model=model)

# Run the span-level evaluation over the dataset; the result object is what
# the "Results analysis" section below inspects.
evaluation_span = evaluator_span.evaluate_span(dataset)

## 3. Results analysis

In [None]:
# visualize_metric() yields two objects, unpacked here as the span-output
# table and the metrics table (DataFrames, judging by the names).
metric_tables = evaluation_span.visualize_metric()
df_span_output, df_metrics = metric_tables

In [None]:
# Display the first object returned by evaluation_span.visualize_metric().
df_span_output

In [None]:
# Display the second object returned by evaluation_span.visualize_metric().
df_metrics

In [None]:
# Tabulate every span output recorded on the evaluation result.
all_span_outputs = evaluation_span.span_outputs
output_df = SpanOutput.get_span_output_df(all_span_outputs)
output_df

#### Visualize all correct cases

In [None]:
# Select the span outputs of type "correct" and show them as a table.
correct = SpanOutput.get_spans_output_by_type(
    outputs=evaluation_span.span_outputs,
    output_type="correct",
)
correct_df = SpanOutput.get_span_output_df(correct)
correct_df

#### Visualize all partial cases

In [None]:
# Select the span outputs of type "partial" and show them as a table.
partial = SpanOutput.get_spans_output_by_type(
    outputs=evaluation_span.span_outputs,
    output_type="partial",
)
partial_df = SpanOutput.get_span_output_df(partial)
partial_df

#### Visualize all incorrect cases

In [None]:
# Select the span outputs of type "incorrect" and show them as a table.
incorrect = SpanOutput.get_spans_output_by_type(
    outputs=evaluation_span.span_outputs,
    output_type="incorrect",
)
incorrect_df = SpanOutput.get_span_output_df(incorrect)
incorrect_df

#### Visualize all spurious cases

In [None]:
# Select the span outputs of type "spurious" and show them as a table.
spurious = SpanOutput.get_spans_output_by_type(
    outputs=evaluation_span.span_outputs,
    output_type="spurious",
)
spurious_df = SpanOutput.get_span_output_df(spurious)
spurious_df

#### Visualize all missed cases (output type "miss")

In [None]:
# Select the span outputs of type "miss" and show them as a table.
miss = SpanOutput.get_spans_output_by_type(
    outputs=evaluation_span.span_outputs,
    output_type="miss",
)
miss_df = SpanOutput.get_span_output_df(miss)
miss_df