# Score mentions from tags

The goal is to score a named entity recognition solution given the tags output by a sequence tagging model.
The two main tagging schemes used nowadays are:
- `BIO:   O O B-LOC I-LOC I-LOC O B-LOC O`
- `BIOUL: O O B-LOC I-LOC L-LOC O U-LOC O`

The function we code in this notebook returns the precision, recall and f1 scores by matching the predicted and gold sequences.
Several overlap policies are available to decide whether a predicted mention matches a gold mention:
- `exact`: the predicted and gold mentions must match exactly
- `partial_strict`: the predicted and gold mentions must share at least one token
- `partial`: the predicted and gold mentions must touch
- any policy expressed as a custom formula, for example `begin_pred < end_gold and begin_gold < end_pred`, which reproduces `partial_strict`

In [28]:
import numpy as np
import torch
import pandas as pd
from scipy.sparse import coo_matrix
from itertools import zip_longest

from nlstruct.layers.crf import BIODecoder, BIOULDecoder
from nlstruct.core.batcher import factorize
from nlstruct.core.scoring import compute_metrics, merge_pred_and_gold

def score_mentions_from_tags(y_true, y_pred):
    """Score predicted mentions against gold mentions, both given as tag sequences.

    The tagging scheme (BIO or BIOUL) is detected from the tags found in
    `y_true`. Both inputs are then decoded into spans and compared with
    `merge_pred_and_gold` / `compute_metrics`.

    Parameters
    ----------
    y_true: sequence of sequences of str
        Gold tags, one inner sequence per document,
        e.g. ``[["O", "B-PER", "I-PER"], ...]``.
    y_pred: sequence of sequences of str
        Predicted tags, laid out like `y_true`.

    Returns
    -------
    dict
        Union of the dicts returned by `compute_metrics`, with keys prefixed by
        ``exact/`` or ``relaxed/`` followed by ``full/`` (span and label must
        agree), ``span/`` (label ignored) or ``<label>/`` (one entry per gold
        label).

    Raises
    ------
    Exception
        If the tags of `y_true` use heads outside of the BIO and BIOUL schemes.
    """
    # Detect the tag scheme from the distinct gold tags.
    # NOTE(review): labels that only occur in `y_pred` are not part of the
    # reference vocabulary built below -- confirm how `factorize` treats them.
    tags = factorize(y_true, freeze_reference=False)[2]
    heads, label_names = set(), set()
    for tag in tags:
        # Split on the first "-" only, so labels containing a hyphen stay whole;
        # "O" has no label part and contributes only its head.
        head, _, label = tag.partition("-")
        heads.add(head)
        if label:
            label_names.add(label)
    # Sorted so that the label -> index mapping is deterministic across runs
    labels = np.asarray(sorted(label_names), dtype=str)

    if heads <= set("BIO"):
        tags = ["O"] + [tag for label in labels for tag in (f"B-{label}", f"I-{label}")]
        decoder = BIODecoder  # let's reuse the LinearCRF tag decoder
    elif heads <= set("BIOUL"):
        tags = ["O"] + [tag for label in labels for tag in (f"B-{label}", f"I-{label}", f"L-{label}", f"U-{label}")]
        decoder = BIOULDecoder  # let's reuse the LinearCRF tag decoder
    else:
        raise Exception("Unrecognized tags {}. Allowed schemes are BIO and BIOUL".format(tuple(heads)))

    def to_dense(sequences):
        # Variable length tag id sequences -> matrix n_sequence * max_tokens_per_seq.
        # Missing cells are filled with 0, which is the id of the "O" tag.
        triples = np.asarray([(row, col, val) for row, vals in enumerate(sequences) for col, val in enumerate(vals)])
        return coo_matrix((triples[:, 2], (triples[:, 0], triples[:, 1]))).toarray()

    def spans_to_frame(spans):
        # One row per decoded span, with its label index mapped back to the label name
        return pd.DataFrame({
            "begin": spans["span_begin"],
            "end": spans["span_end"],
            "doc_id": spans["span_doc_id"],
            "label": labels[spans["span_label"]],
        })

    def evaluate(pred_df, gold_df, span_policy, on, prefix):
        # `on` is copied so every call receives its own list, as if written inline
        merged = merge_pred_and_gold(pred_df, gold_df, span_policy=span_policy, on=list(on))
        return compute_metrics(merged, prefix=prefix).to_dict()

    # Transform all the string tags to numbers, according to the detected `tags` (the order matters !)
    true_ids = factorize(y_true, reference_values=tags)[0]
    pred_ids = factorize(y_pred, reference_values=tags)[0]

    # Use the BIO(UL)Decoder tags_to_spans method to extract begin/end indices, labels and doc_id of each span
    true_spans = decoder.tags_to_spans(torch.as_tensor(to_dense(true_ids)))
    pred_spans = decoder.tags_to_spans(torch.as_tensor(to_dense(pred_ids)))

    # Predictions come from `y_pred` and gold mentions from `y_true`: swapping
    # them would silently exchange precision and recall.
    pred = spans_to_frame(pred_spans)
    gold = spans_to_frame(true_spans)

    with_label = ["doc_id", ("begin", "end"), "label"]
    span_only = ["doc_id", ("begin", "end")]

    metrics = {}
    # True positive only when exact match (both boundaries coincide) and same label
    metrics.update(evaluate(pred, gold, 'exact', with_label, 'exact/full/'))
    # True positive only when partial strict overlap and same label
    metrics.update(evaluate(pred, gold, 'partial_strict', with_label, 'relaxed/full/'))
    # True positive only when exact match, the label may be different
    metrics.update(evaluate(pred, gold, 'exact', span_only, 'exact/span/'))
    # True positive only when partial strict overlap, the label may be different
    metrics.update(evaluate(pred, gold, 'partial_strict', span_only, 'relaxed/span/'))

    # Compute per-label scores, for every label that has at least one gold mention
    for label in gold['label'].drop_duplicates():
        # Boolean masks instead of `query` strings: no quoting issue whatever the label text
        subset_pred = pred[pred["label"] == label]
        subset_gold = gold[gold["label"] == label]
        metrics.update(evaluate(subset_pred, subset_gold, 'exact', span_only, f'exact/{label}/'))
        metrics.update(evaluate(subset_pred, subset_gold, 'partial_strict', span_only, f'relaxed/{label}/'))
    return metrics

In [29]:
# Toy example with two sequences: the predicted MISC mention starts one token
# earlier than the gold one, while the PER mention is predicted exactly.
y_true = [
    "O O O B-MISC I-MISC I-MISC O".split(),
    "B-PER I-PER O".split(),
]
y_pred = [
    "O O B-MISC I-MISC I-MISC I-MISC O".split(),
    "B-PER I-PER O".split(),
]

In [30]:
# Score the example: the result is a flat dict whose keys are "/"-separated
# metric names.
metrics = score_mentions_from_tags(y_true=y_true, y_pred=y_pred)

In [31]:
# Display the metrics as a table: each "a/b/c" key becomes a 3-level index
# entry, the last level is pivoted into columns, and the count columns are
# cast to integers.
pd.Series(
    {tuple(name.split("/")): score for name, score in metrics.items()}
).unstack(level=2).loc[
    :, ["f1", "precision", "recall", "pred_count", "gold_count", "tp"]
].astype(dict.fromkeys(("pred_count", "gold_count", "tp"), int))

Unnamed: 0,Unnamed: 1,f1,precision,recall,pred_count,gold_count,tp
exact,MISC,0.0,0.0,0.0,1,1,0
exact,PER,1.0,1.0,1.0,1,1,1
exact,full,0.5,0.5,0.5,2,2,1
exact,span,0.5,0.5,0.5,2,2,1
relaxed,MISC,1.0,1.0,1.0,1,1,1
relaxed,PER,1.0,1.0,1.0,1,1,1
relaxed,full,1.0,1.0,1.0,2,2,2
relaxed,span,1.0,1.0,1.0,2,2,2
