# SpanMarker cross-validation baseline evaluation

Loads the baseline model and evaluates it on every fold of a cross-validation split that is identical to the one specified in `training.py`, so that its performance can be compared. Note that `KFold` is used without shuffling (as in the training notebook), so the folds are the same on every run.

In [None]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

import collections

import numpy as np
import pandas as pd
from customized_spanmarker_training import NoTrainPreprocTrainer
from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
from sklearn.model_selection import KFold
from span_marker import SpanMarkerModel
from torch.optim import AdamW
from transformers import TrainingArguments, get_scheduler

Loading the datasets: germeval2014 is loaded remotely, the other datasets are read from parquet files.

In [None]:
# GermEval 2014 training split, reduced to the two columns used in this notebook.
germeval = load_dataset("gwlms/germeval2014")["train"].select_columns(
    ["tokens", "ner_tags"]
)

# The remaining datasets are read from parquet files (paths are relative).
krp_19jhd, krp_20jhd, rrb_19jhd, rrb_20jhd, gszh = (
    Dataset.from_parquet(parquet_path)
    for parquet_path in (
        "krp_19jhd.parquet",
        "krp_20jhd.parquet",
        "rrb_19jhd.parquet",
        "rrb_20jhd.parquet",
        "gszh.parquet",
    )
)

Mapping the features of the parquet datasets onto the germeval2014 features, so that they use the original germeval2014 label indices.

In [None]:
# Re-declare every parquet dataset with the features of the GermEval 2014
# dataset. No mapping function is passed; only `features` is supplied.
krp_19jhd, krp_20jhd, rrb_19jhd, rrb_20jhd, gszh = (
    dataset.map(features=germeval.features)
    for dataset in (krp_19jhd, krp_20jhd, rrb_19jhd, rrb_20jhd, gszh)
)

Evaluation.

In [None]:
# Cross-validation set-up. KFold is used without shuffling.
kf = KFold(n_splits=5)
# The next three settings are not referenced by the evaluation in this cell.
gradient_accumulation_steps = 2
train_batch_size = 4
n_epochs = 10

# Entity types that are reported; this is also the column order of every
# per-entity metrics table. Kept as a tuple so it can be passed to
# `str.endswith` directly.
ENTITY_TYPES = ("PER", "LOC", "ORG", "PERderiv", "LOCderiv", "ORGderiv")
# Rows (metric names) of every per-entity metrics table, in display order.
METRIC_ROWS = ["f1", "precision", "recall", "number"]

# `metrics_overall_dict` is not filled in this cell.
metrics_overall_dict = collections.defaultdict(list)
# Maps "eval_<name>" to the list of per-fold metrics tables for that dataset.
metrics_per_ent_dict = collections.defaultdict(list)

# Datasets taking part in the cross-validation, keyed by the suffix used for
# the "eval_<name>" keys of each fold. The order defines both the order of
# concatenation for "eval_all" and the order of evaluation.
corpora = {
    "krp_19jhd": krp_19jhd,
    "krp_20jhd": krp_20jhd,
    "rrb_19jhd": rrb_19jhd,
    "rrb_20jhd": rrb_20jhd,
    "gszh": gszh,
}


def _per_entity_metrics_frame(metrics):
    """Build the per-entity metrics table from an evaluation result.

    Args:
        metrics: Mapping of metric name to value, as returned by
            `trainer.evaluate`. Only entries whose name ends with one of
            `ENTITY_TYPES` are used; their values are assumed to be mappings
            keyed by the names in `METRIC_ROWS` (TODO confirm against the
            trainer's output).

    Returns:
        A DataFrame with one row per entry of `METRIC_ROWS` and one column
        per entry of `ENTITY_TYPES`, in that order. Entity types that are
        absent from `metrics` get a column filled with 0.0.
    """
    per_entity = {
        name: value for name, value in metrics.items() if name.endswith(ENTITY_TYPES)
    }
    frame = pd.DataFrame.from_dict(per_entity).reindex(METRIC_ROWS)
    # Metric names carry a prefix; keep only the part after the last "_".
    frame = frame.rename(columns=lambda name: name.split("_")[-1])

    # Add missing metrics
    for entity_type in ENTITY_TYPES:
        if entity_type not in frame.columns:
            frame[entity_type] = 0.0

    # Reorder columns (a list is required here; a tuple would be read as a
    # single key)
    return frame[list(ENTITY_TYPES)]


# One KFold split generator per dataset, advanced in lockstep so that fold i
# combines the i-th test subfold of every dataset.
fold_splits = zip(*(kf.split(np.zeros(ds.num_rows)) for ds in corpora.values()))

for i, splits in enumerate(fold_splits):
    print(f"Fold {i}:")

    # Selecting the test subfolds (the training indices are not needed)
    eval_subfolds = {
        name: ds.select(eval_idx)
        for (name, ds), (_, eval_idx) in zip(corpora.items(), splits)
    }

    # "eval_all" is the concatenation of all test subfolds; the individual
    # subfolds are kept under "eval_<name>".
    fold_dataset = DatasetDict(
        {
            "eval_all": concatenate_datasets(list(eval_subfolds.values())),
            **{f"eval_{name}": subfold for name, subfold in eval_subfolds.items()},
        }
    )

    # Load baseline SpanMarker model
    model = SpanMarkerModel.from_pretrained(
        "stefan-it/span-marker-gelectra-large-germeval14"
    )

    # Dummy definitions for optimizer, l_r_scheduler, trainer (we don't train
    # the model here and only use the evaluate method)
    optimizer = AdamW(model.parameters(), lr=1e-05)

    l_r_scheduler = get_scheduler(
        "polynomial",
        optimizer,
        num_warmup_steps=1,
        num_training_steps=1,
        scheduler_specific_kwargs=dict(lr_end=5e-07, power=3),
    )

    trainer = NoTrainPreprocTrainer(
        model=model,
        args=TrainingArguments(output_dir="."),
        train_dataset=None,
        eval_dataset=None,
        optimizers=(optimizer, l_r_scheduler),
    )

    #
    # Evaluation
    #

    for pred_ds, eval_dataset in fold_dataset.items():
        metrics = trainer.evaluate(eval_dataset)
        metrics_per_ent_df = _per_entity_metrics_frame(metrics)

        # Store dataframe for aggregation
        metrics_per_ent_dict[pred_ds].append(metrics_per_ent_df)

        print(f"[[ Evaluation {pred_ds} ]]")
        display(metrics_per_ent_df.round(2))

    # Drop the references to the trainer and the model before the next fold
    # creates new ones.
    del trainer
    del model

Generating aggregated metrics across folds (mean and median).

In [None]:
# Aggregate the per-fold metrics tables of every evaluation set into one mean
# and one median table. The mean tables are collected in `mean_dfs` as
# (dataset name without the "eval_" prefix, table with shortened row labels).
metric_rows = ["f1", "precision", "recall", "number"]
short_row_labels = {"f1": "F1", "precision": "P", "recall": "R"}
eval_set_names = (
    "eval_all",
    "eval_krp_19jhd",
    "eval_krp_20jhd",
    "eval_rrb_19jhd",
    "eval_rrb_20jhd",
    "eval_gszh",
)

mean_dfs = []

for pred_ds in eval_set_names:
    print(
        f"""-------------------
Aggregated metrics ({pred_ds})
-------------------"""
    )

    # Stack the tables of all folds, then group the rows that share a metric
    # name so each metric is aggregated across folds.
    stacked_folds = pd.concat(metrics_per_ent_dict[pred_ds])
    rows_by_metric = stacked_folds.groupby(stacked_folds.index)

    print("Per-ent metrics (mean)", end="")
    mean_df = rows_by_metric.mean().round(2).reindex(metric_rows)
    display(mean_df)
    mean_dfs.append((pred_ds.replace("eval_", ""), mean_df.rename(short_row_labels)))

    print("Per-ent metrics (median)", end="")
    median_df = rows_by_metric.median().round(2).reindex(metric_rows)
    display(median_df)

Some dense pandas code to get a heatmap table of all aggregated metrics (mean).

In [None]:
# One column group per dataset (F1 / P / R), one row per entity type. The
# "number" row is dropped because it is a count rather than a score.
dataset_labels = [label for label, _ in mean_dfs]
score_tables = [table.drop("number") for _, table in mean_dfs]
mean_scores = pd.concat(score_tables, keys=dataset_labels).T

# The styled table is the last expression so that the notebook renders it.
(
    mean_scores.style.format("{:.2f}")
    .background_gradient(cmap="RdYlGn", vmin=0.0, vmax=1.0)
    .set_properties(**{"font-size": "12px"})
)

The same for the mean frequency of named entity types per fold and dataset.

In [None]:
# Keep only the "number" row of every mean table (by dropping the score rows)
# and lay the result out with one column per dataset and one row per entity
# type.
frequency_tables = [table.drop(["F1", "P", "R"]) for _, table in mean_dfs]
frequency_labels = [label for label, _ in mean_dfs]
stacked_frequencies = pd.concat(frequency_tables, keys=frequency_labels)
entity_freq_df = stacked_frequencies.T.droplevel(1, axis="columns")

# Colour scale spans the smallest to the largest value in the whole table.
lowest_freq = entity_freq_df.min().min()
highest_freq = entity_freq_df.max().max()

# The styled table is the last expression so that the notebook renders it.
(
    entity_freq_df.style.format("{:.2f}")
    .background_gradient(cmap="RdYlGn", vmin=lowest_freq, vmax=highest_freq)
    .set_properties(**{"font-size": "12px"})
)