# SpanMarker cross-validation training
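This notebook performs 5-fold cross-validation for a SpanMarker model. The parquet files (`krp_19jhd.parquet`, `krp_20jhd.parquet`, `rrb_19jhd.parquet`, `rrb_20jhd.parquet`, `gszh.parquet`) must be stored in the same directory as this notebook. An internet connection is required to download the germeval2014 dataset, whose feature definitions are needed for the feature mapping.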

The evaluation is performed separately in `cv_model_evaluation.ipynb`.

In [None]:
import gc
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)


import numpy as np
from customized_spanmarker_training import NoTrainPreprocTrainer, preprocess_dataset
from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
from sklearn.model_selection import KFold
from span_marker import SpanMarkerModel
from span_marker.label_normalizer import AutoLabelNormalizer
from torch.optim import AdamW
from transformers import TrainingArguments, get_scheduler

Loading datasets remotely and from parquet.

In [5]:
# Reference corpus: only its "tokens" / "ner_tags" columns are kept, since its
# feature definitions are reused for the local corpora further down.
germeval = load_dataset("gwlms/germeval2014")["train"].select_columns(
    ["tokens", "ner_tags"]
)

# Local corpora, read from parquet files in the working directory.
krp_19jhd, krp_20jhd, rrb_19jhd, rrb_20jhd, gszh = map(
    Dataset.from_parquet,
    (
        "krp_19jhd.parquet",
        "krp_20jhd.parquet",
        "rrb_19jhd.parquet",
        "rrb_20jhd.parquet",
        "gszh.parquet",
    ),
)

Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/datasets_modules/datasets/gwlms--germeval2014/66c688527595f251c65f33876dfb2dee0ae258c4f5df3a6e908f3511559bf4c3 (last modified on Fri Jan  5 13:59:07 2024) since it couldn't be found locally at gwlms/germeval2014, or remotely on the Hugging Face Hub.


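Casting the features of the parquet datasets to the feature definitions of the original germeval2014 dataset, so that all datasets share the same label indices.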

In [None]:
# Re-encode every local corpus with the germeval2014 feature definitions.
# The tuple of source datasets is built before any name is rebound, so each
# corpus is mapped exactly once from its current value.
target_features = germeval.features
krp_19jhd, krp_20jhd, rrb_19jhd, rrb_20jhd, gszh = (
    corpus.map(features=target_features)
    for corpus in (krp_19jhd, krp_20jhd, rrb_19jhd, rrb_20jhd, gszh)
)

Training.

In [8]:
# Overarching training parameters. The learning rate and warm-up ratio are
# defined once here because they feed both TrainingArguments and the manually
# built optimizer / scheduler below; keeping a single definition prevents the
# two from drifting apart.
gradient_accumulation_steps = 2
train_batch_size = 4
n_epochs = 10
learning_rate = 1e-05
warmup_ratio = 0.05

# Loop-invariant settings
encoder_id = "stefan-it/span-marker-gelectra-large-germeval14"
output_root = "models/span-marker-ktzh-stazh-cv"

# Corpora taking part in the cross-validation. The insertion order determines
# the order in which the sub-folds are concatenated.
corpora = {
    "krp_19jhd": krp_19jhd,
    "krp_20jhd": krp_20jhd,
    "rrb_19jhd": rrb_19jhd,
    "rrb_20jhd": rrb_20jhd,
    "gszh": gszh,
}

# Cross-validation: every corpus is split on its own with the same unshuffled
# 5-fold KFold, and the i-th split of each corpus is combined into fold i.
# Each fold therefore holds roughly one fifth of every corpus for evaluation.
kf = KFold(n_splits=5)
per_corpus_splits = [
    kf.split(np.zeros(corpus.num_rows)) for corpus in corpora.values()
]

for i, fold_indices in enumerate(zip(*per_corpus_splits)):
    print(f"Fold {i}:")

    # Selecting the training and test sub-folds of every corpus
    train_parts = {}
    eval_parts = {}
    for (name, corpus), (train_idx, eval_idx) in zip(corpora.items(), fold_indices):
        train_parts[name] = corpus.select(train_idx)
        eval_parts[name] = corpus.select(eval_idx)

    # Concatenating training sub-folds (shuffled with a fixed seed)
    train_fold = concatenate_datasets(list(train_parts.values())).shuffle(seed=42)

    # Concatenating test sub-folds
    eval_fold = concatenate_datasets(list(eval_parts.values()))

    # Creating fold dataset: combined splits plus one evaluation split per corpus
    fold_dataset = DatasetDict(
        {
            "train": train_fold,
            "eval": eval_fold,
            **{f"eval_{name}": part for name, part in eval_parts.items()},
        }
    )

    # Model instantiation: a fresh copy of the pretrained model for every fold
    model = SpanMarkerModel.from_pretrained(
        # Required arguments
        encoder_id,
        # Optional arguments
        model_max_length=256,
        entity_max_length=8,
    )

    # Training arguments
    # NOTE(review): an optimizer and scheduler are passed to the trainer
    # explicitly below, so learning_rate / warmup_ratio here are presumably
    # informational only -- confirm against the Trainer implementation.
    args = TrainingArguments(
        output_dir=f"{output_root}/tmp",
        learning_rate=learning_rate,
        gradient_accumulation_steps=gradient_accumulation_steps,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=4,
        num_train_epochs=n_epochs,
        evaluation_strategy="steps",
        save_strategy="no",
        eval_steps=1500,
        push_to_hub=False,
        logging_steps=50,
        warmup_ratio=warmup_ratio,
    )

    # Preprocessing the training dataset up front: its final length is needed
    # to size the learning rate schedule.
    train_dataset = preprocess_dataset(
        model,
        fold_dataset["train"],
        AutoLabelNormalizer.from_config(model.config),
        model.tokenizer,
        dataset_name="train",
        is_evaluate=False,
    )

    # Setting up learning rate scheduler
    # NOTE(review): this estimates the number of optimizer steps from the
    # dataset length; it may differ slightly from the step count the trainer
    # derives itself -- confirm if the exact end of the schedule matters.
    num_training_steps = int(
        len(train_dataset) / gradient_accumulation_steps / train_batch_size * n_epochs
    )

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    l_r_scheduler = get_scheduler(
        "polynomial",
        optimizer,
        num_warmup_steps=int(warmup_ratio * num_training_steps),
        num_training_steps=num_training_steps,
        scheduler_specific_kwargs=dict(lr_end=5e-07, power=3),
    )

    # Instantiating trainer
    trainer = NoTrainPreprocTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=fold_dataset["eval"],
        optimizers=(optimizer, l_r_scheduler),
    )

    # Training and saving separate model per fold
    trainer.train()
    trainer.save_model(f"{output_root}/fold_{i}")

    # Release this fold's model before the next one is loaded. The optimizer
    # (and the scheduler wrapping it) reference the model parameters as well,
    # so they have to be dropped too; gc.collect() clears any remaining
    # reference cycles.
    del trainer, model, optimizer, l_r_scheduler
    gc.collect()

Fold 0:


  _torch_pytree._register_pytree_node(


Step,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1500,0.003,0.00473,0.907415,0.941786,0.924281,0.989128
3000,0.0023,0.004246,0.92728,0.94059,0.933888,0.991111
4500,0.0012,0.004632,0.925623,0.947767,0.936564,0.991401
6000,0.0013,0.004906,0.923821,0.952552,0.937966,0.991668
7500,0.0002,0.005432,0.930051,0.948963,0.939412,0.991802
9000,0.0008,0.005429,0.923997,0.954944,0.939216,0.991802
10500,0.0004,0.005449,0.926299,0.952153,0.939048,0.991935
12000,0.0003,0.005776,0.926659,0.952153,0.939233,0.991935


Label normalizing the evaluation dataset:   0%|          | 0/1632 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Tokenizing the evaluation dataset:   0%|          | 0/1632 [00:00<?, ? examples/s]

This SpanMarker model won't be able to predict 7.828004% of all annotated entities in the evaluation dataset. This is caused by the SpanMarkerModel maximum entity length of 8 words and the maximum model input length of 256 tokens.
These are the frequencies of the missed entities due to maximum entity length out of 2721 total entities:
- 4 missed entities with 9 words (0.147005%)
- 1 missed entities with 10 words (0.036751%)
Additionally, a total of 208 (7.644248%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/1632 [00:00<?, ? examples/s]

Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.


Tokenizing the evaluation dataset:   0%|          | 0/1632 [00:00<?, ? examples/s]

This SpanMarker model won't be able to predict 7.828004% of all annotated entities in the evaluation dataset. This is caused by the SpanMarkerModel maximum entity length of 8 words and the maximum model input length of 256 tokens.
These are the frequencies of the missed entities due to maximum entity length out of 2721 total entities:
- 4 missed entities with 9 words (0.147005%)
- 1 missed entities with 10 words (0.036751%)
Additionally, a total of 208 (7.644248%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/1632 [00:00<?, ? examples/s]

Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't 

Fold 1:


Label normalizing the train dataset:   0%|          | 0/6515 [00:00<?, ? examples/s]

Tokenizing the train dataset:   0%|          | 0/6515 [00:00<?, ? examples/s]

This SpanMarker model will ignore 4.319035% of all annotated entities in the train dataset. This is caused by the SpanMarkerModel maximum entity length of 8 words and the maximum model input length of 256 tokens.
These are the frequencies of the missed entities due to maximum entity length out of 10118 total entities:
- 4 missed entities with 9 words (0.039534%)
- 1 missed entities with 10 words (0.009883%)
- 1 missed entities with 13 words (0.009883%)
Additionally, a total of 431 (4.259735%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/6515 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1500,0.0035,0.004377,0.919292,0.942895,0.930944,0.990336
3000,0.0019,0.003788,0.934085,0.953519,0.943702,0.992221
4500,0.0009,0.004141,0.9375,0.956175,0.946746,0.992555
6000,0.0011,0.00422,0.938971,0.953519,0.946189,0.992603
7500,0.0004,0.00459,0.937121,0.956618,0.946769,0.992794
9000,0.0004,0.00475,0.941228,0.957061,0.949078,0.993176
10500,0.0003,0.004806,0.941815,0.960159,0.950899,0.993128
12000,0.0004,0.004751,0.942048,0.957061,0.949495,0.993128


Label normalizing the evaluation dataset:   0%|          | 0/1629 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Tokenizing the evaluation dataset:   0%|          | 0/1629 [00:00<?, ? examples/s]

This SpanMarker model won't be able to predict 1.267483% of all annotated entities in the evaluation dataset. This is caused by the SpanMarkerModel maximum entity length of 8 words and the maximum model input length of 256 tokens.
These are the frequencies of the missed entities due to maximum entity length out of 2288 total entities:
- 1 missed entities with 9 words (0.043706%)
Additionally, a total of 28 (1.223776%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/1629 [00:00<?, ? examples/s]

Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't 

Fold 2:


Label normalizing the train dataset:   0%|          | 0/6516 [00:00<?, ? examples/s]

Tokenizing the train dataset:   0%|          | 0/6516 [00:00<?, ? examples/s]

This SpanMarker model will ignore 4.157372% of all annotated entities in the train dataset. This is caused by the SpanMarkerModel maximum entity length of 8 words and the maximum model input length of 256 tokens.
These are the frequencies of the missed entities due to maximum entity length out of 9862 total entities:
- 5 missed entities with 9 words (0.050700%)
- 1 missed entities with 10 words (0.010140%)
- 1 missed entities with 13 words (0.010140%)
Additionally, a total of 403 (4.086392%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/6516 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1500,0.003,0.003916,0.912301,0.944936,0.928332,0.989736
3000,0.0013,0.003928,0.926916,0.948151,0.937413,0.991526
4500,0.001,0.00422,0.929329,0.951367,0.940218,0.991616
6000,0.001,0.004234,0.941295,0.940916,0.941106,0.991979
7500,0.0007,0.00443,0.936356,0.946141,0.941224,0.99182
9000,0.0015,0.004704,0.93225,0.94574,0.938947,0.991412
10500,0.0004,0.004717,0.932411,0.948151,0.940215,0.991865
12000,0.0012,0.004804,0.937873,0.946543,0.942188,0.992069


Label normalizing the evaluation dataset:   0%|          | 0/1628 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Tokenizing the evaluation dataset:   0%|          | 0/1628 [00:00<?, ? examples/s]

This SpanMarker model won't be able to predict 2.201258% of all annotated entities in the evaluation dataset. This is caused by the SpanMarkerModel maximum model input length of 256 tokens.
A total of 56 (2.201258%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/1628 [00:00<?, ? examples/s]

Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't 

Fold 3:


Label normalizing the train dataset:   0%|          | 0/6516 [00:00<?, ? examples/s]

Tokenizing the train dataset:   0%|          | 0/6516 [00:00<?, ? examples/s]

This SpanMarker model will ignore 4.343945% of all annotated entities in the train dataset. This is caused by the SpanMarkerModel maximum entity length of 8 words and the maximum model input length of 256 tokens.
These are the frequencies of the missed entities due to maximum entity length out of 10083 total entities:
- 5 missed entities with 9 words (0.049588%)
- 1 missed entities with 10 words (0.009918%)
Additionally, a total of 432 (4.284439%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/6516 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1500,0.0038,0.004088,0.898425,0.944227,0.920756,0.989814
3000,0.0013,0.004027,0.913498,0.952505,0.932594,0.991203
4500,0.0017,0.003975,0.921186,0.947277,0.934049,0.991527
6000,0.0008,0.00426,0.922204,0.955556,0.938583,0.992129
7500,0.001,0.004304,0.92204,0.953377,0.937446,0.992106
9000,0.0007,0.004758,0.92032,0.951198,0.935505,0.991851
10500,0.0004,0.004728,0.920775,0.95207,0.936161,0.99199
12000,0.0003,0.00487,0.921486,0.951198,0.936106,0.991898


Label normalizing the evaluation dataset:   0%|          | 0/1628 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Tokenizing the evaluation dataset:   0%|          | 0/1628 [00:00<?, ? examples/s]

This SpanMarker model won't be able to predict 1.205338% of all annotated entities in the evaluation dataset. This is caused by the SpanMarkerModel maximum entity length of 8 words and the maximum model input length of 256 tokens.
These are the frequencies of the missed entities due to maximum entity length out of 2323 total entities:
- 1 missed entities with 13 words (0.043048%)
Additionally, a total of 27 (1.162290%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/1628 [00:00<?, ? examples/s]

Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't 

Fold 4:


Label normalizing the train dataset:   0%|          | 0/6517 [00:00<?, ? examples/s]

Tokenizing the train dataset:   0%|          | 0/6517 [00:00<?, ? examples/s]

This SpanMarker model will ignore 3.300932% of all annotated entities in the train dataset. This is caused by the SpanMarkerModel maximum entity length of 8 words and the maximum model input length of 256 tokens.
These are the frequencies of the missed entities due to maximum entity length out of 9876 total entities:
- 5 missed entities with 9 words (0.050628%)
- 1 missed entities with 10 words (0.010126%)
- 1 missed entities with 13 words (0.010126%)
Additionally, a total of 319 (3.230053%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/6517 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1500,0.0025,0.005242,0.906837,0.948954,0.927418,0.990738
3000,0.001,0.004627,0.929947,0.949791,0.939764,0.991942
4500,0.0012,0.004889,0.933498,0.951464,0.942395,0.992191
6000,0.0016,0.005022,0.929301,0.951464,0.940252,0.992214
7500,0.0005,0.005156,0.946128,0.940586,0.943349,0.992509
9000,0.0006,0.005134,0.941152,0.950209,0.945659,0.992986
10500,0.0008,0.00525,0.938246,0.953556,0.945839,0.993031
12000,0.0003,0.005411,0.945394,0.948954,0.947171,0.993167


Label normalizing the evaluation dataset:   0%|          | 0/1627 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Tokenizing the evaluation dataset:   0%|          | 0/1627 [00:00<?, ? examples/s]

This SpanMarker model won't be able to predict 5.533597% of all annotated entities in the evaluation dataset. This is caused by the SpanMarkerModel maximum model input length of 256 tokens.
A total of 140 (5.533597%) entities were missed due to the maximum input length.


Spreading data between multiple samples:   0%|          | 0/1627 [00:00<?, ? examples/s]

Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /Users/adrianvanderlek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Fri Jan  5 15:34:57 2024) since it couldn't 