# SpanMarker cross-validation evaluation

Loads the per-fold models trained in `training.ipynb` and evaluates each of them on its held-out fold (cross-validation). Note that `KFold` is used without shuffling (as in the training notebook), so the fold assignment is deterministic.

In [None]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

import collections

import numpy as np
import pandas as pd
from customized_spanmarker_training import NoTrainPreprocTrainer
from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
from sklearn.model_selection import KFold
from span_marker import SpanMarkerModel
from torch.optim import AdamW
from transformers import TrainingArguments, get_scheduler

Loading the GermEval 2014 training split remotely and the five local datasets from parquet files.

In [3]:
# GermEval 2014 training split, keeping only the `tokens` and `ner_tags` columns.
germeval = load_dataset("gwlms/germeval2014")["train"].select_columns(
    ["tokens", "ner_tags"]
)

# The five local corpora, each read from a parquet file given by a relative path.
krp_19jhd, krp_20jhd, rrb_19jhd, rrb_20jhd, gszh = map(
    Dataset.from_parquet,
    (
        "krp_19jhd.parquet",
        "krp_20jhd.parquet",
        "rrb_19jhd.parquet",
        "rrb_20jhd.parquet",
        "gszh.parquet",
    ),
)

Mapping the local datasets onto the feature schema of the original germeval2014 dataset, so that their NER tags use the same label indices.

In [4]:
# Re-encode every local corpus with the GermEval 2014 feature schema. `map` is
# called without a function, so only the `features` argument is applied.
krp_19jhd, krp_20jhd, rrb_19jhd, rrb_20jhd, gszh = (
    corpus.map(features=germeval.features)
    for corpus in (krp_19jhd, krp_20jhd, rrb_19jhd, rrb_20jhd, gszh)
)

Evaluation.

In [5]:
# Unshuffled KFold is deterministic, so every run of this cell produces the
# same index splits.
kf = KFold(n_splits=5)

# NOTE(review): the three hyperparameters below and `metrics_overall_dict` are
# never read in this cell -- presumably leftovers from training.ipynb. They are
# kept so that the notebook namespace stays unchanged.
gradient_accumulation_steps = 2
train_batch_size = 4
n_epochs = 10

metrics_overall_dict = collections.defaultdict(list)
# Evaluation-set name -> list of per-entity metric DataFrames, one per fold.
metrics_per_ent_dict = collections.defaultdict(list)

# Entity types (table columns) and metric names (table rows), in display order.
ENTITY_TYPES = ["PER", "LOC", "ORG", "PERderiv", "LOCderiv", "ORGderiv"]
METRIC_ROWS = ["f1", "precision", "recall", "number"]


def _per_entity_metrics_frame(metrics):
    """Build the per-entity-type metrics table from an evaluation result.

    Args:
        metrics: Mapping of metric name to value. Only entries whose name ends
            with one of ``ENTITY_TYPES`` are used; their values are presumably
            mappings with the keys in ``METRIC_ROWS`` -- TODO confirm against
            the trainer's ``evaluate`` output.

    Returns:
        A DataFrame with ``METRIC_ROWS`` as index and ``ENTITY_TYPES`` as
        columns. Entity types absent from ``metrics`` get a column of 0.0.
    """
    per_entity = {
        key: value
        for key, value in metrics.items()
        if key.endswith(tuple(ENTITY_TYPES))
    }
    frame = pd.DataFrame.from_dict(per_entity).reindex(METRIC_ROWS)
    # Metric names look like "<prefix>_<ENTITY>"; keep only the entity part.
    frame = frame.rename(columns=lambda name: name.split("_")[-1])

    # NOTE(review): absent entity types are zero-filled, scores included. The
    # aggregation cell averages these frames across folds, so a fold without
    # any instance of a type pulls that type's mean score towards 0 -- confirm
    # that this is intended.
    for ent_type in ENTITY_TYPES:
        if ent_type not in frame.columns:
            frame[ent_type] = 0.0

    return frame[ENTITY_TYPES]


# Evaluation-set name -> full dataset; insertion order fixes the reporting order.
eval_sources = {
    "eval_krp_19jhd": krp_19jhd,
    "eval_krp_20jhd": krp_20jhd,
    "eval_rrb_19jhd": rrb_19jhd,
    "eval_rrb_20jhd": rrb_20jhd,
    "eval_gszh": gszh,
}

# One KFold split generator per dataset, advanced in lockstep: fold i combines
# the i-th held-out part of every dataset.
fold_splits = zip(
    *(kf.split(np.zeros(source.num_rows)) for source in eval_sources.values())
)

for i, splits in enumerate(fold_splits):
    print(f"Fold {i}:")

    # Selecting the test subfolds (the training indices are not needed here)
    eval_parts = {
        name: source.select(eval_idx)
        for (name, source), (_train_idx, eval_idx) in zip(
            eval_sources.items(), splits
        )
    }

    # "eval_all" is the concatenation of all test subfolds and is reported first
    fold_dataset = DatasetDict(
        {"eval_all": concatenate_datasets(list(eval_parts.values())), **eval_parts}
    )

    # Load previously saved SpanMarker fold model
    model = SpanMarkerModel.from_pretrained(
        f"models/span-marker-ktzh-stazh-cv/fold_{i}"
    )

    # Dummy definitions for optimizer, l_r_scheduler, trainer (we don't train the
    # model here and only use the evaluate method)
    optimizer = AdamW(model.parameters(), lr=1e-05)

    l_r_scheduler = get_scheduler(
        "polynomial",
        optimizer,
        num_warmup_steps=1,
        num_training_steps=1,
        scheduler_specific_kwargs=dict(lr_end=5e-07, power=3),
    )

    trainer = NoTrainPreprocTrainer(
        model=model,
        args=TrainingArguments(output_dir="."),
        train_dataset=None,
        eval_dataset=None,
        optimizers=(optimizer, l_r_scheduler),
    )

    #
    # Evaluation
    #

    for pred_ds, eval_ds in fold_dataset.items():
        metrics = trainer.evaluate(eval_ds)
        metrics_per_ent_df = _per_entity_metrics_frame(metrics)

        # Store dataframe for aggregation
        metrics_per_ent_dict[pred_ds].append(metrics_per_ent_df)

        print(f"[[ Evaluation {pred_ds} ]]")
        display(metrics_per_ent_df.round(2))

    # The optimizer (and through it the scheduler) references the model's
    # parameters, so all four names must go before the fold's model can be
    # released ahead of loading the next one.
    del trainer, model, optimizer, l_r_scheduler

Fold 0:


  _torch_pytree._register_pytree_node(


[[ Evaluation eval_all ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.97,0.95,0.93,0.0,0.86,0.73
precision,0.95,0.94,0.92,0.0,0.81,0.69
recall,0.98,0.95,0.94,0.0,0.92,0.77
number,442.0,780.0,1201.0,0.0,38.0,47.0


[[ Evaluation eval_krp_19jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.97,0.95,0.95,0.0,0.0,0.75
precision,0.97,0.97,0.95,0.0,0.0,0.75
recall,0.97,0.93,0.94,0.0,0.0,0.75
number,35.0,30.0,89.0,0.0,1.0,4.0


Tokenizing the evaluation dataset:   0%|          | 0/475 [00:00<?, ? examples/s]

This SpanMarker model won't be able to predict 13.922261% of all annotated entities in the evaluation dataset. This is caused by the SpanMarkerModel maximum model input length of 256 tokens.
A total of 197 (13.922261%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/475 [00:00<?, ? examples/s]

[[ Evaluation eval_krp_20jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.98,0.96,0.93,0.0,0.89,0.76
precision,0.98,0.97,0.91,0.0,0.84,0.8
recall,0.99,0.95,0.95,0.0,0.94,0.73
number,287.0,414.0,478.0,0.0,17.0,22.0


[[ Evaluation eval_rrb_19jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.93,0.92,0.95,0.0,0.89,0.8
precision,0.94,0.89,0.96,0.0,1.0,1.0
recall,0.91,0.96,0.94,0.0,0.8,0.67
number,34.0,68.0,173.0,0.0,5.0,6.0


[[ Evaluation eval_rrb_20jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.95,0.97,0.92,0.0,1.0,0.86
precision,0.91,0.94,0.92,0.0,1.0,0.75
recall,1.0,1.0,0.91,0.0,1.0,1.0
number,52.0,117.0,196.0,0.0,4.0,3.0


[[ Evaluation eval_gszh ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.92,0.9,0.91,0.0,0.81,0.62
precision,0.85,0.88,0.89,0.0,0.69,0.5
recall,1.0,0.93,0.92,0.0,1.0,0.83
number,34.0,151.0,265.0,0.0,11.0,12.0


Fold 1:


[[ Evaluation eval_all ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.97,0.97,0.94,1.0,0.81,0.73
precision,0.97,0.97,0.92,1.0,0.8,0.72
recall,0.97,0.97,0.96,1.0,0.83,0.74
number,501.0,663.0,1022.0,2.0,29.0,42.0


[[ Evaluation eval_krp_19jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.93,0.93,0.95,0.0,1.0,0.0
precision,0.94,0.9,0.95,0.0,1.0,0.0
recall,0.92,0.96,0.95,0.0,1.0,0.0
number,53.0,28.0,57.0,0.0,1.0,0.0


Tokenizing the evaluation dataset:   0%|          | 0/474 [00:00<?, ? examples/s]

This SpanMarker model won't be able to predict 1.706161% of all annotated entities in the evaluation dataset. This is caused by the SpanMarkerModel maximum model input length of 256 tokens.
A total of 18 (1.706161%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/474 [00:00<?, ? examples/s]

[[ Evaluation eval_krp_20jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.99,0.98,0.95,0.0,0.88,0.76
precision,0.98,0.98,0.93,0.0,0.85,0.74
recall,0.99,0.98,0.97,0.0,0.92,0.78
number,279.0,335.0,393.0,0.0,12.0,18.0


[[ Evaluation eval_rrb_19jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.93,0.96,0.9,1.0,0.62,0.5
precision,0.97,0.94,0.89,1.0,0.8,0.5
recall,0.89,0.98,0.92,1.0,0.5,0.5
number,65.0,96.0,145.0,2.0,8.0,2.0


[[ Evaluation eval_rrb_20jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,1.0,0.96,0.95,0.0,1.0,0.91
precision,1.0,0.97,0.93,0.0,1.0,1.0
recall,1.0,0.95,0.98,0.0,1.0,0.83
number,55.0,130.0,188.0,0.0,3.0,6.0


[[ Evaluation eval_gszh ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.96,0.98,0.93,0.0,0.77,0.67
precision,0.94,0.99,0.92,0.0,0.62,0.65
recall,0.98,0.97,0.94,0.0,1.0,0.69
number,49.0,74.0,239.0,0.0,5.0,16.0


Fold 2:


[[ Evaluation eval_all ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.96,0.97,0.94,0.0,0.85,0.75
precision,0.97,0.96,0.93,0.0,0.89,0.73
recall,0.94,0.98,0.94,0.0,0.81,0.77
number,493.0,685.0,1211.0,0.0,42.0,57.0


[[ Evaluation eval_krp_19jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.97,0.99,0.94,0.0,0.67,0.5
precision,0.99,0.97,0.96,0.0,0.5,0.5
recall,0.95,1.0,0.91,0.0,1.0,0.5
number,81.0,38.0,58.0,0.0,1.0,4.0


Tokenizing the evaluation dataset:   0%|          | 0/474 [00:00<?, ? examples/s]

This SpanMarker model won't be able to predict 4.166667% of all annotated entities in the evaluation dataset. This is caused by the SpanMarkerModel maximum model input length of 256 tokens.
A total of 50 (4.166667%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/474 [00:00<?, ? examples/s]

[[ Evaluation eval_krp_20jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.98,0.97,0.93,0.0,0.9,0.75
precision,0.98,0.98,0.92,0.0,0.93,0.72
recall,0.97,0.97,0.95,0.0,0.87,0.79
number,264.0,311.0,531.0,0.0,15.0,29.0


[[ Evaluation eval_rrb_19jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.91,0.97,0.95,0.0,0.55,0.0
precision,0.92,0.95,0.94,0.0,0.75,0.0
recall,0.91,1.0,0.96,0.0,0.43,0.0
number,53.0,76.0,202.0,0.0,7.0,1.0


[[ Evaluation eval_rrb_20jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.94,0.97,0.94,0.0,0.92,0.8
precision,0.95,0.97,0.94,0.0,1.0,1.0
recall,0.93,0.97,0.95,0.0,0.86,0.67
number,57.0,155.0,169.0,0.0,7.0,3.0


[[ Evaluation eval_gszh ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.85,0.93,0.93,0.0,0.92,0.81
precision,0.91,0.88,0.92,0.0,0.92,0.77
recall,0.79,0.98,0.94,0.0,0.92,0.85
number,38.0,105.0,251.0,0.0,12.0,20.0


Fold 3:


[[ Evaluation eval_all ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.97,0.94,0.93,0.67,0.88,0.7
precision,0.96,0.93,0.92,1.0,0.92,0.67
recall,0.99,0.95,0.95,0.5,0.83,0.74
number,480.0,675.0,1046.0,2.0,42.0,50.0


[[ Evaluation eval_krp_19jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.98,0.91,0.96,0.0,0.5,0.4
precision,0.98,0.83,0.98,0.0,1.0,0.25
recall,0.98,1.0,0.94,0.0,0.33,1.0
number,61.0,35.0,68.0,0.0,3.0,1.0


Tokenizing the evaluation dataset:   0%|          | 0/474 [00:00<?, ? examples/s]

This SpanMarker model won't be able to predict 2.228164% of all annotated entities in the evaluation dataset. This is caused by the SpanMarkerModel maximum model input length of 256 tokens.
A total of 25 (2.228164%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/474 [00:00<?, ? examples/s]

[[ Evaluation eval_krp_20jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.99,0.97,0.92,0.0,0.95,0.81
precision,0.99,0.98,0.9,0.0,0.9,0.78
recall,1.0,0.97,0.95,0.0,1.0,0.84
number,276.0,340.0,438.0,0.0,18.0,25.0


[[ Evaluation eval_rrb_19jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.95,0.91,0.92,0.67,0.71,0.5
precision,0.91,0.94,0.91,1.0,0.83,0.5
recall,0.98,0.89,0.93,0.5,0.62,0.5
number,44.0,71.0,137.0,2.0,8.0,6.0


[[ Evaluation eval_rrb_20jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.92,0.95,0.96,0.0,1.0,1.0
precision,0.9,0.93,0.97,0.0,1.0,1.0
recall,0.94,0.96,0.96,0.0,1.0,1.0
number,68.0,127.0,168.0,0.0,2.0,2.0


[[ Evaluation eval_gszh ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.95,0.88,0.92,0.0,0.9,0.62
precision,0.91,0.85,0.89,0.0,1.0,0.62
recall,1.0,0.92,0.94,0.0,0.82,0.62
number,31.0,102.0,235.0,0.0,11.0,16.0


Fold 4:


[[ Evaluation eval_all ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.96,0.96,0.94,0.0,0.87,0.8
precision,0.97,0.96,0.93,0.0,0.86,0.81
recall,0.96,0.97,0.95,0.0,0.88,0.78
number,520.0,721.0,1061.0,2.0,41.0,45.0


[[ Evaluation eval_krp_19jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.96,0.98,0.94,0.0,0.67,0.67
precision,0.96,1.0,0.96,0.0,0.5,1.0
recall,0.96,0.95,0.93,0.0,1.0,0.5
number,49.0,22.0,69.0,0.0,1.0,4.0


Tokenizing the evaluation dataset:   0%|          | 0/474 [00:00<?, ? examples/s]

This SpanMarker model won't be able to predict 10.711553% of all annotated entities in the evaluation dataset. This is caused by the SpanMarkerModel maximum model input length of 256 tokens.
A total of 140 (10.711553%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/474 [00:00<?, ? examples/s]

[[ Evaluation eval_krp_20jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.98,0.97,0.94,0.0,0.88,0.77
precision,0.98,0.97,0.94,0.0,0.88,0.86
recall,0.98,0.97,0.94,0.0,0.88,0.71
number,314.0,390.0,429.0,0.0,17.0,17.0


[[ Evaluation eval_rrb_19jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.92,0.94,0.94,0.0,0.55,0.73
precision,0.93,0.96,0.94,0.0,0.6,0.57
recall,0.9,0.91,0.94,0.0,0.5,1.0
number,60.0,81.0,162.0,2.0,6.0,4.0


[[ Evaluation eval_rrb_20jhd ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.98,0.98,0.97,0.0,1.0,1.0
precision,0.98,0.98,0.96,0.0,1.0,1.0
recall,0.98,0.98,0.98,0.0,1.0,1.0
number,53.0,129.0,159.0,0.0,4.0,10.0


[[ Evaluation eval_gszh ]]


Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.88,0.94,0.92,0.0,0.96,0.7
precision,0.9,0.91,0.89,0.0,0.93,0.7
recall,0.86,0.97,0.94,0.0,1.0,0.7
number,44.0,99.0,242.0,0.0,13.0,10.0


Generating aggregated metrics across folds (mean and median).

In [6]:
# (short dataset name, mean metrics table) pairs, consumed by the cells below.
mean_dfs = []

# Row order used for every aggregated table.
row_order = ["f1", "precision", "recall", "number"]

for pred_ds in (
    "eval_all",
    "eval_krp_19jhd",
    "eval_krp_20jhd",
    "eval_rrb_19jhd",
    "eval_rrb_20jhd",
    "eval_gszh",
):
    print(
        "-------------------",
        f"Aggregated metrics ({pred_ds})",
        "-------------------",
        sep="\n",
    )

    # Stack the per-fold tables, then group rows by metric name so that each
    # aggregate is taken across folds.
    stacked = pd.concat(metrics_per_ent_dict[pred_ds])
    metrics_per_ent_conc = stacked.groupby(stacked.index)

    print("Per-ent metrics (mean)", end="")
    mean_df = metrics_per_ent_conc.mean().round(2).reindex(row_order)
    display(mean_df)

    # Keep the mean table under the dataset name without its "eval_" prefix,
    # with shortened score labels.
    short_name = pred_ds.replace("eval_", "")
    relabelled = mean_df.rename({"f1": "F1", "precision": "P", "recall": "R"})
    mean_dfs.append((short_name, relabelled))

    print("Per-ent metrics (median)", end="")
    median_df = metrics_per_ent_conc.median().round(2).reindex(row_order)
    display(median_df)

-------------------
Aggregated metrics (eval_all)
-------------------
Per-ent metrics (mean)

Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.97,0.96,0.93,0.33,0.85,0.74
precision,0.97,0.95,0.92,0.4,0.86,0.73
recall,0.97,0.96,0.95,0.3,0.85,0.76
number,487.2,704.8,1108.2,1.2,38.4,48.2


Per-ent metrics (median)

Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.97,0.96,0.94,0.0,0.86,0.73
precision,0.97,0.96,0.92,0.0,0.86,0.72
recall,0.97,0.97,0.95,0.0,0.83,0.77
number,493.0,685.0,1061.0,2.0,41.0,47.0


-------------------
Aggregated metrics (eval_krp_19jhd)
-------------------
Per-ent metrics (mean)

Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.96,0.95,0.95,0.0,0.57,0.46
precision,0.97,0.93,0.96,0.0,0.6,0.5
recall,0.96,0.97,0.93,0.0,0.67,0.55
number,55.8,30.6,68.2,0.0,1.4,2.6


Per-ent metrics (median)

Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.97,0.95,0.95,0.0,0.67,0.5
precision,0.97,0.97,0.96,0.0,0.5,0.5
recall,0.96,0.96,0.94,0.0,1.0,0.5
number,53.0,30.0,68.0,0.0,1.0,4.0


-------------------
Aggregated metrics (eval_krp_20jhd)
-------------------
Per-ent metrics (mean)

Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.98,0.97,0.94,0.0,0.9,0.77
precision,0.98,0.97,0.92,0.0,0.88,0.78
recall,0.99,0.97,0.95,0.0,0.92,0.77
number,284.0,358.0,453.8,0.0,15.8,22.2


Per-ent metrics (median)

Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.98,0.97,0.93,0.0,0.89,0.76
precision,0.98,0.98,0.92,0.0,0.88,0.78
recall,0.99,0.97,0.95,0.0,0.92,0.78
number,279.0,340.0,438.0,0.0,17.0,22.0


-------------------
Aggregated metrics (eval_rrb_19jhd)
-------------------
Per-ent metrics (mean)

Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.93,0.94,0.93,0.33,0.66,0.51
precision,0.94,0.94,0.93,0.4,0.8,0.51
recall,0.92,0.95,0.94,0.3,0.57,0.53
number,51.2,78.4,163.8,1.2,6.8,3.8


Per-ent metrics (median)

Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.93,0.94,0.94,0.0,0.62,0.5
precision,0.93,0.94,0.94,0.0,0.8,0.5
recall,0.91,0.96,0.94,0.0,0.5,0.5
number,53.0,76.0,162.0,2.0,7.0,4.0


-------------------
Aggregated metrics (eval_rrb_20jhd)
-------------------
Per-ent metrics (mean)

Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.96,0.96,0.95,0.0,0.98,0.91
precision,0.95,0.96,0.94,0.0,1.0,0.95
recall,0.97,0.97,0.96,0.0,0.97,0.9
number,57.0,131.6,176.0,0.0,4.0,4.8


Per-ent metrics (median)

Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.95,0.97,0.95,0.0,1.0,0.91
precision,0.95,0.97,0.94,0.0,1.0,1.0
recall,0.98,0.97,0.96,0.0,1.0,1.0
number,55.0,129.0,169.0,0.0,4.0,3.0


-------------------
Aggregated metrics (eval_gszh)
-------------------
Per-ent metrics (mean)

Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.91,0.93,0.92,0.0,0.87,0.69
precision,0.9,0.9,0.9,0.0,0.83,0.65
recall,0.93,0.96,0.94,0.0,0.95,0.74
number,39.2,106.2,246.4,0.0,10.4,14.8


Per-ent metrics (median)

Unnamed: 0,PER,LOC,ORG,PERderiv,LOCderiv,ORGderiv
f1,0.92,0.93,0.92,0.0,0.9,0.67
precision,0.91,0.88,0.89,0.0,0.92,0.65
recall,0.98,0.97,0.94,0.0,1.0,0.7
number,38.0,102.0,242.0,0.0,11.0,16.0


Some dense pandas code to get a heatmap table of all aggregated metrics (mean).

In [7]:
# Stack the F1/P/R rows of every dataset under its name, then transpose so that
# entity types become rows and (dataset, score) pairs become columns.
score_table = pd.concat(
    [frame.drop("number") for _name, frame in mean_dfs],
    keys=[name for name, _frame in mean_dfs],
).T

# Fixed 0..1 colour scale, so that colours are comparable across all columns.
(
    score_table.style.format("{:.2f}")
    .background_gradient(cmap="RdYlGn", vmin=0.0, vmax=1.0)
    .set_properties(**{"font-size": "12px"})
)

Unnamed: 0_level_0,all,all,all,krp_19jhd,krp_19jhd,krp_19jhd,krp_20jhd,krp_20jhd,krp_20jhd,rrb_19jhd,rrb_19jhd,rrb_19jhd,rrb_20jhd,rrb_20jhd,rrb_20jhd,gszh,gszh,gszh
Unnamed: 0_level_1,F1,P,R,F1,P,R,F1,P,R,F1,P,R,F1,P,R,F1,P,R
PER,0.97,0.97,0.97,0.96,0.97,0.96,0.98,0.98,0.99,0.93,0.94,0.92,0.96,0.95,0.97,0.91,0.9,0.93
LOC,0.96,0.95,0.96,0.95,0.93,0.97,0.97,0.97,0.97,0.94,0.94,0.95,0.96,0.96,0.97,0.93,0.9,0.96
ORG,0.93,0.92,0.95,0.95,0.96,0.93,0.94,0.92,0.95,0.93,0.93,0.94,0.95,0.94,0.96,0.92,0.9,0.94
PERderiv,0.33,0.4,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.33,0.4,0.3,0.0,0.0,0.0,0.0,0.0,0.0
LOCderiv,0.85,0.86,0.85,0.57,0.6,0.67,0.9,0.88,0.92,0.66,0.8,0.57,0.98,1.0,0.97,0.87,0.83,0.95
ORGderiv,0.74,0.73,0.76,0.46,0.5,0.55,0.77,0.78,0.77,0.51,0.51,0.53,0.91,0.95,0.9,0.69,0.65,0.74


The same for the mean number of named entities of each type per evaluation fold, for each dataset.

In [8]:
# Keep only the "number" row of every dataset, stack the rows under the dataset
# names and transpose; dropping the inner column level leaves one column per
# dataset.
entity_freq_df = pd.concat(
    [frame.drop(["F1", "P", "R"]) for _name, frame in mean_dfs],
    keys=[name for name, _frame in mean_dfs],
).T.droplevel(1, axis="columns")

# Colour scale spans the smallest to the largest value in the whole table.
lowest_count = entity_freq_df.min().min()
highest_count = entity_freq_df.max().max()

(
    entity_freq_df.style.format("{:.2f}")
    .background_gradient(cmap="RdYlGn", vmin=lowest_count, vmax=highest_count)
    .set_properties(**{"font-size": "12px"})
)

Unnamed: 0,all,krp_19jhd,krp_20jhd,rrb_19jhd,rrb_20jhd,gszh
PER,487.2,55.8,284.0,51.2,57.0,39.2
LOC,704.8,30.6,358.0,78.4,131.6,106.2
ORG,1108.2,68.2,453.8,163.8,176.0,246.4
PERderiv,1.2,0.0,0.0,1.2,0.0,0.0
LOCderiv,38.4,1.4,15.8,6.8,4.0,10.4
ORGderiv,48.2,2.6,22.2,3.8,4.8,14.8
