##### Fine-tuning d'Alembert on our dataset (NER task)

In [None]:
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForTokenClassification

### Load our medieval French dataset from local disk (path is relative to the notebook's working directory)
dataset = load_from_disk("./data/ck_ner_dataset_hg")

Breakdown of the dataset

In [None]:
# Bare expression: the notebook renders the dataset's repr as this cell's output
dataset

Tokenization

In [None]:
### Get all labels and convert them to the ids required by the model.
### Labels are collected from every split, not only "train": a tag that occurs
### solely in another split would otherwise be missing from label_to_id and
### raise a KeyError when that split is tokenized.
label_list = sorted({
    tag
    for split in dataset.values()
    for tags in split["ner_tags"]
    for tag in tags
})
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = dict(enumerate(label_list))

In [None]:
from transformers import AutoTokenizer

# Hub identifier of the pretrained checkpoint; reused below when loading the model
model_name = "pjox/dalembert"
# NOTE(review): add_prefix_space=True is presumably needed because the tokenizer is
# later called on pre-split words (is_split_into_words=True) — confirm for this tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [None]:
def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and attach sub-token-aligned label ids.

    The first sub-token of each word receives that word's label id (looked up
    in ``label_to_id``). Positions without a word (``word_id is None``) and
    the remaining sub-tokens of a word receive ``-100``, the same sentinel that
    ``compute_metrics`` filters out.

    Args:
        example: a single dataset row providing ``"tokens"`` and ``"ner_tags"``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one value
        per position of ``word_ids()``.
    """
    encoding = tokenizer(
        example["tokens"],
        is_split_into_words=True,  # the words were already split by spaCy
        truncation=True,
        padding="max_length",
    )
    word_ids = encoding.word_ids()

    # Pair every position with the word id of the position before it
    # (None for the very first position) to detect where a new word starts.
    preceding_ids = [None] + word_ids[:-1]
    encoding["labels"] = [
        -100
        if current is None or current == preceding
        else label_to_id[example["ner_tags"][current]]
        for current, preceding in zip(word_ids, preceding_ids)
    ]
    return encoding

Tokenize and map train, test, and validation

In [None]:
# batched=False: tokenize_and_align_labels processes a single example per call
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

Training

In [None]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint for token classification, sized to our label set;
# the id<->label mappings are passed along so they are stored with the model config.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels = len(label_list),
    id2label = id_to_label,
    label2id = label_to_id
)

In [None]:
from transformers import TrainingArguments

### Training configuration (the original run was done on an RTX 4080 SUPER)
training_args = TrainingArguments(
    output_dir = "./models/dalembert-ner",
    eval_strategy = "epoch",  # evaluate once per epoch
    save_strategy = "epoch",  # checkpoint once per epoch, same cadence as evaluation
    learning_rate = 3e-5,  # slightly higher learning rate, chosen for faster convergence
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    gradient_accumulation_steps = 1,
    num_train_epochs = 3,
    weight_decay = 0.01,
    logging_dir = "./logs",
    logging_steps = 10,
    save_total_limit = 2,
    load_best_model_at_end = True,
    metric_for_best_model = "f1",  # "f1" is one of the keys returned by compute_metrics
    fp16 = True,  # NOTE(review): mixed precision presumably requires a CUDA GPU — confirm before running elsewhere
    dataloader_num_workers = 4,
    report_to = "none"  # no external experiment tracker
)

Define evaluation

In [None]:
import evaluate
import numpy as np
# Load the "seqeval" metric through the `evaluate` library; compute_metrics uses it below
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Score the predictions of an evaluation pass with the seqeval ``metric``.

    Args:
        p: a pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2 (presumably per-label scores of shape
            (batch, sequence, num_labels) — confirm against the Trainer);
            ``labels`` holds label ids, with ``-100`` marking ignored positions.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"`` taken from the metric's ``overall_*`` results.
    """
    raw_predictions, gold_ids = p
    predicted_ids = np.argmax(raw_predictions, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Drop every position whose gold label is the -100 sentinel, then
        # translate the surviving ids back to their label strings.
        kept_pairs = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([id_to_label[predicted] for predicted, _ in kept_pairs])
        true_labels.append([id_to_label[gold] for _, gold in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
from transformers import Trainer

# Wire the model, training configuration, tokenized splits and metric function together.
# Training uses the "train" split; per-epoch evaluation uses "validation".
# NOTE(review): newer transformers releases deprecate `tokenizer=` in favor of
# `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["validation"],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

# Run fine-tuning; as the last expression, its return value is shown as the cell output
trainer.train()

Save the fine-tuned model and tokenizer to disk (optional)

In [None]:
# trainer.save_model("./models/dalembert-ner-finetuned_ep3")
# tokenizer.save_pretrained("./models/dalembert-ner-finetuned_tokenizer")