# SpanMarker full model training

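This notebook trains a SpanMarker model on the full provided (composite) dataset. The parquet files (`krp_19jhd.parquet`, `krp_20jhd.parquet`, `rrb_19jhd.parquet`, `rrb_20jhd.parquet`, `gszh.parquet`) must be stored in the same directory as this notebook. An internet connection is required to download the germeval2014 dataset, whose feature definitions are needed for the feature mapping, and the pretrained base model (unless both are already cached locally).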

In [None]:
import warnings

# Silence FutureWarning messages. The filter is installed before the imports
# below, presumably so that warnings raised at import time are hidden too.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Project-local training helpers, followed by the third-party libraries.
from customized_spanmarker_training import NoTrainPreprocTrainer, preprocess_dataset
from datasets import Dataset, concatenate_datasets, load_dataset
from span_marker import SpanMarkerModel, SpanMarkerModelCardData
from span_marker.label_normalizer import AutoLabelNormalizer
from torch.optim import AdamW
from transformers import TrainingArguments, get_scheduler, set_seed

# Fix the random seed for reproducibility; the same value (42) is reused for
# the dataset shuffle in the training cell.
set_seed(42)

Load the germeval2014 training split from the Hugging Face Hub and the five project corpora from the local parquet files.

In [3]:
# Reference corpus: only the training split is used, reduced to the token and
# NER-tag columns.
germeval = load_dataset("gwlms/germeval2014")["train"].select_columns(
    ["tokens", "ner_tags"]
)

# Project corpora, one parquet file each, read from the working directory.
krp_19jhd, krp_20jhd, rrb_19jhd, rrb_20jhd, gszh = (
    Dataset.from_parquet(parquet_file)
    for parquet_file in (
        "krp_19jhd.parquet",
        "krp_20jhd.parquet",
        "rrb_19jhd.parquet",
        "rrb_20jhd.parquet",
        "gszh.parquet",
    )
)

Cast each corpus to the germeval2014 feature schema so that its `ner_tags` use the same label indices as germeval2014.

In [4]:
# Re-encode every project corpus with germeval's feature definitions. No
# mapping function is passed, so only the feature schema is applied.
krp_19jhd, krp_20jhd, rrb_19jhd, rrb_20jhd, gszh = (
    corpus.map(features=germeval.features)
    for corpus in (krp_19jhd, krp_20jhd, rrb_19jhd, rrb_20jhd, gszh)
)

Training.

In [5]:
# Overarching training parameters.
# Every value is defined exactly once and reused below, so TrainingArguments
# and the hand-built optimizer / scheduler cannot drift apart.
gradient_accumulation_steps = 2
train_batch_size = 4
n_epochs = 10
learning_rate = 1e-05
warmup_ratio = 0.05

# Base checkpoint to fine-tune and the directory that receives all artefacts
encoder_id = "stefan-it/span-marker-gelectra-large-germeval14"
model_dir = "models/span-marker-ktzh-stazh"

# The datasets are concatenated into a single dataset that is shuffled
# (fixed seed so the sample order is reproducible)
ds = concatenate_datasets([krp_19jhd, krp_20jhd, rrb_19jhd, rrb_20jhd, gszh]).shuffle(
    seed=42
)

# Model instantiation: start from the pretrained checkpoint and use the label
# names carried by the dataset's `ner_tags` feature.
model = SpanMarkerModel.from_pretrained(
    encoder_id,
    labels=ds.features["ner_tags"].feature.names,
    model_max_length=256,
    entity_max_length=8,
    model_card_data=SpanMarkerModelCardData(
        language=["de"],
        license="mit",
        encoder_name="SpanMarker for GermEval 2014 NER",
        encoder_id=encoder_id,
        model_name="SpanMarker KtZH",
        # NOTE(review): the recorded cell output shows SpanMarker rejecting a
        # model ID without an organization/user prefix and resetting it to
        # None. Add the prefix if the model card should carry an ID.
        model_id="span-marker-ktzh-stazh",
    ),
)

# Training arguments.
# NOTE(review): an explicit optimizer/scheduler pair is handed to the trainer
# below, so `learning_rate` and `warmup_ratio` here are presumably only
# informational - confirm against NoTrainPreprocTrainer.
args = TrainingArguments(
    output_dir=f"{model_dir}/output",
    learning_rate=learning_rate,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=4,
    num_train_epochs=n_epochs,
    evaluation_strategy="no",
    save_strategy="epoch",
    push_to_hub=False,
    logging_steps=500,
    warmup_ratio=warmup_ratio,
)

# Preprocessing dataset up front; the trainer class name suggests it skips its
# own training-set preprocessing (TODO confirm in customized_spanmarker_training).
ds_preproc = preprocess_dataset(
    model,
    ds,
    AutoLabelNormalizer.from_config(model.config),
    model.tokenizer,
    dataset_name="train",
    is_evaluate=False,
)


# Setting up learning rate scheduler.
# Estimated number of optimizer updates:
#   samples / accumulation steps / batch size * epochs
# NOTE(review): the per-device batch size is used as the global batch size, so
# the estimate only holds on a single device; the trainer's own step count may
# also differ slightly because of rounding.
num_training_steps = int(
    len(ds_preproc) / gradient_accumulation_steps / train_batch_size * n_epochs
)
optimizer = AdamW(model.parameters(), lr=learning_rate)
l_r_scheduler = get_scheduler(
    "polynomial",
    optimizer,
    num_warmup_steps=int(warmup_ratio * num_training_steps),
    num_training_steps=num_training_steps,
    scheduler_specific_kwargs=dict(lr_end=5e-07, power=3),
)

# Instantiating trainer with the preprocessed data and the custom
# optimizer / scheduler pair
trainer = NoTrainPreprocTrainer(
    model=model,
    args=args,
    train_dataset=ds_preproc,
    optimizers=(optimizer, l_r_scheduler),
)

# Training and saving model
trainer.train()
trainer.save_model(f"{model_dir}/checkpoint-final")

The provided 'span-marker-stazh' model ID should include the organization or user, such as "tomaarsen/span-marker-mbert-base-multinerd". Setting `model_id` to None.
  _torch_pytree._register_pytree_node(


Step,Training Loss
500,0.0139
1000,0.005
1500,0.0044
2000,0.003
2500,0.0029
3000,0.0026
3500,0.0023
4000,0.0018
4500,0.0019
5000,0.0017
