In [1]:
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForTokenClassification

### Load our medieval French NER dataset from a local directory
### (load_from_disk reads a dataset previously written with save_to_disk).
dataset = load_from_disk("./data/ck_ner_dataset_hg")

Breakdown of the dataset

In [2]:
# Show the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 114293
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 12700
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 14110
    })
})

Tokenization

In [3]:
### Collect every NER label and map it to the integer id required by the model.
### Labels are gathered from *all* splits (not only train) so that a tag which
### occurs only in validation/test cannot raise a KeyError during label alignment.
### Only the "ner_tags" column is read, which avoids materialising the tokens too.
label_list = sorted(
    {
        tag
        for split in dataset.values()
        for tags in split["ner_tags"]
        for tag in tags
    }
)
# Sorted order makes the label -> id assignment deterministic across runs.
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = dict(enumerate(label_list))

In [4]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer and the token-classification model.
model_name = "pjox/dalembert"
# add_prefix_space=True because the inputs are passed already split into words
# (is_split_into_words=True in tokenize_and_align_labels).
# NOTE(review): RoBERTa-style byte-level BPE fast tokenizers are expected to need
# this flag for pre-tokenized input — confirm for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [5]:
def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and align its word-level tags to sub-tokens.

    Args:
        example: a single dataset row with "tokens" (list of words) and
            "ner_tags" (one tag per word, looked up in ``label_to_id``).

    Returns:
        The tokenizer output with an added "labels" entry holding one value per
        sub-token: the tag id on the first sub-token of each word, and -100
        everywhere else (positions without a word id, and continuation
        sub-tokens of a word).
    """
    encoding = tokenizer(
        example["tokens"],
        is_split_into_words=True,  # words were already segmented upstream (spaCy)
        truncation=True,
        padding="max_length",
    )

    aligned = []
    last_seen = None
    for current in encoding.word_ids():
        # Only the first sub-token of a real word carries that word's label.
        starts_new_word = current is not None and current != last_seen
        if starts_new_word:
            aligned.append(label_to_id[example["ner_tags"][current]])
        else:
            aligned.append(-100)
        last_seen = current

    encoding["labels"] = aligned
    return encoding

Tokenize and align labels for the train, validation, and test splits

In [6]:
# Apply tokenization + label alignment to every split, one example at a time
# (batched=False because tokenize_and_align_labels handles a single example).
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

Training

In [7]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint with a token-classification head sized to our
# label set, and pass both id <-> label mappings along with it.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at pjox/dalembert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

### Fine-tuning hyper-parameters (original run: a single RTX 4080 SUPER)
training_args = TrainingArguments(
    # --- checkpointing / evaluation -------------------------------------
    # Evaluate and save once per epoch, keep at most two checkpoints and
    # reload the one with the best "f1" (as returned by compute_metrics).
    output_dir="./models/dalembert-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # --- optimisation ---------------------------------------------------
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    num_train_epochs=5,
    weight_decay=0.01,
    fp16=True,  # mixed-precision training
    # --- logging / data loading -----------------------------------------
    logging_dir="./logs",
    logging_steps=10,
    dataloader_num_workers=4,
    report_to="none",  # do not report to any experiment tracker
)

Define evaluation

In [9]:
import evaluate
import numpy as np
# Load the seqeval metric; compute_metrics feeds it lists of label strings and
# reads its "overall_*" scores.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall precision / recall / F1 / accuracy.

    Args:
        p: a ``(predictions, labels)`` pair. ``predictions`` is reduced with an
            argmax over axis 2 (presumably (batch, seq_len, num_labels) scores —
            TODO confirm); ``labels`` holds label ids, with -100 marking
            positions to ignore.

    Returns:
        dict with keys "precision", "recall", "f1" and "accuracy", taken from
        the metric's "overall_*" results.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions that carry a real label (-100 = ignored position).
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([id_to_label[pred] for pred, _ in kept])
        true_labels.append([id_to_label[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [10]:
from transformers import Trainer

# Wire together the model, training arguments, tokenized splits and metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Launch fine-tuning; evaluation on the validation split follows training_args.
trainer.train()

  0%|          | 0/7144 [00:00<?, ?it/s]

{'loss': 2.1226, 'grad_norm': 10.646590232849121, 'learning_rate': 2.9974804031354985e-05, 'epoch': 0.0}
{'loss': 0.2763, 'grad_norm': 0.9334936738014221, 'learning_rate': 2.9932810750279955e-05, 'epoch': 0.0}
{'loss': 0.2243, 'grad_norm': 0.9919390678405762, 'learning_rate': 2.9890817469204928e-05, 'epoch': 0.0}
{'loss': 0.1228, 'grad_norm': 1.310013771057129, 'learning_rate': 2.9848824188129897e-05, 'epoch': 0.01}
{'loss': 0.0936, 'grad_norm': 0.5646450519561768, 'learning_rate': 2.9806830907054874e-05, 'epoch': 0.01}
{'loss': 0.1154, 'grad_norm': 0.43260371685028076, 'learning_rate': 2.9764837625979843e-05, 'epoch': 0.01}
{'loss': 0.0542, 'grad_norm': 0.6235882639884949, 'learning_rate': 2.9722844344904816e-05, 'epoch': 0.01}
{'loss': 0.0732, 'grad_norm': 0.5747964382171631, 'learning_rate': 2.968085106382979e-05, 'epoch': 0.01}
{'loss': 0.0686, 'grad_norm': 0.7399935126304626, 'learning_rate': 2.9638857782754762e-05, 'epoch': 0.01}
{'loss': 0.0695, 'grad_norm': 0.5156792402267456, 

  0%|          | 0/794 [00:00<?, ?it/s]

{'eval_loss': 0.011544337496161461, 'eval_precision': 0.8988987816307404, 'eval_recall': 0.9271387143547608, 'eval_f1': 0.9128003806804663, 'eval_accuracy': 0.9963943290718026, 'eval_runtime': 522.8954, 'eval_samples_per_second': 24.288, 'eval_steps_per_second': 1.518, 'epoch': 1.0}
{'train_runtime': 1931.6643, 'train_samples_per_second': 59.168, 'train_steps_per_second': 3.698, 'train_loss': 0.022459978886440912, 'epoch': 1.0}


TrainOutput(global_step=7144, training_loss=0.022459978886440912, metrics={'train_runtime': 1931.6643, 'train_samples_per_second': 59.168, 'train_steps_per_second': 3.698, 'total_flos': 2.986844024951501e+16, 'train_loss': 0.022459978886440912, 'epoch': 1.0})

In [None]:
# Persist the fine-tuned model, then the tokenizer in its own directory.
# NOTE(review): the directory name says "_ep5" but the recorded training output
# above reports 'epoch': 1.0 — confirm which run produced the saved weights.
trainer.save_model("./models/dalembert-ner-finetuned_ep5")
tokenizer.save_pretrained("./models/dalembert-ner-finetuned_tokenizer")

('./models/dalembert-ner-finetuned_tokenizer\\tokenizer_config.json',
 './models/dalembert-ner-finetuned_tokenizer\\special_tokens_map.json',
 './models/dalembert-ner-finetuned_tokenizer\\vocab.json',
 './models/dalembert-ner-finetuned_tokenizer\\merges.txt',
 './models/dalembert-ner-finetuned_tokenizer\\added_tokens.json',
 './models/dalembert-ner-finetuned_tokenizer\\tokenizer.json')

In [12]:
import torch
# Release cached GPU memory before the final evaluation.
torch.cuda.empty_cache()

# Evaluate on the held-out test split with a smaller eval batch size.
# Note: this mutates the trainer's arguments in place, so any later call to
# trainer.evaluate() in this kernel also uses a batch size of 4.
trainer.args.per_device_eval_batch_size = 4
results = trainer.evaluate(tokenized_dataset["test"])
print(results)

  0%|          | 0/3528 [00:00<?, ?it/s]

{'eval_loss': 0.012649307027459145, 'eval_precision': 0.8875523012552301, 'eval_recall': 0.9204816663050553, 'eval_f1': 0.9037171157737779, 'eval_accuracy': 0.9961633534249842, 'eval_runtime': 1427.0406, 'eval_samples_per_second': 9.888, 'eval_steps_per_second': 2.472, 'epoch': 1.0}
