In [1]:
%load_ext autoreload
%autoreload 2
import utils as bu  

Import required packages

In [3]:
from pathlib import Path
import random
from transformers.utils import logging
import torch
import os
from functools import partial
from transformers import (AutoTokenizer, DataCollatorForTokenClassification, 
                          AutoModelForTokenClassification, TrainingArguments, 
                          Trainer, EarlyStoppingCallback, set_seed) 
from utils import (dataset_generator, get_tokenize_and_align_labels_fn, 
                       compute_metrics, build_token_cls, hp_space, compute_objective)
from argparse import Namespace
import ipywidgets as widgets
from IPython.display import display
import evaluate
import numpy as np 
from datetime import datetime

Check if running on GPU or CPU

In [4]:

if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    print(f"Number of available GPUs: {gpu_count}")

    for i in range(gpu_count):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {gpu_name}")
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Using:", device)

Number of available GPUs: 1
GPU 0: NVIDIA RTX 500 Ada Generation Laptop GPU
Using: cuda


Define paths to folders 

In [5]:
PROJECT_ROOT = Path.cwd()
if not (PROJECT_ROOT / "data").exists():
    PROJECT_ROOT = PROJECT_ROOT.parent
TRAIN_CSV_FILE = PROJECT_ROOT / "data/csv_format/ner_sentences_train.csv"
TEST_CSV_FILE  = PROJECT_ROOT / "data/csv_format/ner_sentences_test.csv"


Load datasets from .csv file

In [6]:
# Load datasets from CSV
train_dataset = dataset_generator(TRAIN_CSV_FILE)
test_dataset = dataset_generator(TEST_CSV_FILE)

# Remove columns that are not needed for training
cols_to_remove = train_dataset["train"].column_names  # e.g., ['words','tags',...]

label_list = train_dataset["train"].features[f"tags"].feature.names
print("Labels: ", label_list)
label2id = {label: id for id, label in enumerate(label_list)}
id2label = {id: label for label, id in label2id.items()}

Labels:  ['O', 'B-morphologie', 'I-morphologie', 'B-topographie', 'I-topographie', 'B-differenciation', 'I-differenciation', 'B-stade', 'I-stade']


Define tokenizer and model (and arguments??)

* 'model_name': huggingface model name. Has been tested with: camembert-base, almanach/camembert-bio-base, flaubert/flaubert_base_cased and bert-base-multilingual-cased.

In [17]:
# model options
model_options = [
    "almanach/camembert-bio-base",
    "camembert-base",
    "flaubert/flaubert_base_cased",
    # "Dr-BERT/DrBERT-7GB",
    "bert-base-multilingual-cased", 
    "cservan/french-albert-base-cased"
]

# model default hyperparameters
model_args_dict = {
    "model_name":"almanach/camembert-bio-base",
    "num_train_epochs": 5,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps":2, 
    "learning_rate": 5e-5,
    "remove_unused_columns": True,
    "seed": 42,
    "logging_dir": "logs",
    "logging_steps": 100,           # log every N steps
    "report_to": ["tensorboard"],  # log to TensorBoard
    "load_best_model_at_end": True,
    "save_total_limit": 2,  # keep only the last 2 checkpoints
    "metric_for_best_model": "f1",
    "greater_is_better": True,  # for f1, higher is better
    "eval_strategy": "epoch",            # evaluate each epoch
    "save_strategy": "epoch",          # checkpoint each epoch
}

model_args = Namespace(**model_args_dict)

model_name_widget = widgets.Dropdown(
    options=model_options,
    value=model_args.model_name,
    description='Model:',
)
num_epochs_widget = widgets.IntSlider(value=model_args.num_train_epochs, min=1, max=16, description='Epochs:')
train_batch_size_widget = widgets.IntSlider(value=model_args.per_device_train_batch_size, min=1, max=32, description='Batch Size:')
lr_widget = widgets.FloatLogSlider(value=model_args.learning_rate, base=10, min=-6, max=-2, step=0.1, description='LR:', readout_format='.1e')

display(model_name_widget, num_epochs_widget, train_batch_size_widget, lr_widget)

# Sync function to update Namespace
def update_args(*_):
    model_args.model_name = model_name_widget.value
    model_args.num_train_epochs = num_epochs_widget.value
    model_args.per_device_train_batch_size = train_batch_size_widget.value
    model_args.learning_rate = lr_widget.value

for w in [model_name_widget, num_epochs_widget, train_batch_size_widget, lr_widget]:
    w.observe(update_args, names='value')

update_args()  # sync initially
print(model_args)



Dropdown(description='Model:', options=('almanach/camembert-bio-base', 'camembert-base', 'flaubert/flaubert_ba…

IntSlider(value=5, description='Epochs:', max=16, min=1)

IntSlider(value=4, description='Batch Size:', max=32, min=1)

FloatLogSlider(value=5e-05, description='LR:', max=-2.0, min=-6.0, readout_format='.1e')

Namespace(model_name='almanach/camembert-bio-base', num_train_epochs=5, per_device_train_batch_size=4, per_device_eval_batch_size=4, gradient_accumulation_steps=2, learning_rate=5e-05, remove_unused_columns=True, seed=42, logging_dir='logs', logging_steps=100, report_to=['tensorboard'], load_best_model_at_end=True, save_total_limit=2, metric_for_best_model='f1', greater_is_better=True, eval_strategy='epoch', save_strategy='epoch')


Load tokenizer and model

In [18]:
logging.set_verbosity_info()

tokenizer = AutoTokenizer.from_pretrained(model_args.model_name, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(model_args.model_name, num_labels=9, id2label=id2label, label2id=label2id)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) 

loading file sentencepiece.bpe.model from cache at /home/mv376/.cache/huggingface/hub/models--almanach--camembert-bio-base/snapshots/7a70e4b76571c8173b9416ba6d9e7ceba1c46ec3/sentencepiece.bpe.model
loading file tokenizer.json from cache at /home/mv376/.cache/huggingface/hub/models--almanach--camembert-bio-base/snapshots/7a70e4b76571c8173b9416ba6d9e7ceba1c46ec3/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/mv376/.cache/huggingface/hub/models--almanach--camembert-bio-base/snapshots/7a70e4b76571c8173b9416ba6d9e7ceba1c46ec3/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/mv376/.cache/huggingface/hub/models--almanach--camembert-bio-base/snapshots/7a70e4b76571c8173b9416ba6d9e7ceba1c46ec3/tokenizer_config.json
loading file chat_template.jinja from cache at None
loading configuration file config.json from cache at /home/mv376/.cache/huggingface/hub/models--almanach--camembert-bio-base/sna

Tokenize datasets

In [19]:

tokenize_fn = get_tokenize_and_align_labels_fn(
    tokenizer=tokenizer,
    label2id=None,          # skip mapping because labels are already integers
    label_all_tokens=False
)

tokenized_train_dataset = train_dataset.map(tokenize_fn, batched=True, remove_columns=cols_to_remove) 
tokenized_test_dataset = test_dataset.map(tokenize_fn, batched=True, remove_columns=cols_to_remove) 
split_train_dataset = tokenized_train_dataset["train"].train_test_split(test_size=0.1) # Split train dataset into train and validation sets 

Define output and model location

In [20]:
MODEL_NAME_MAP = {
    "almanach/camembert-bio-base": "camembert-bio",
    "camembert-base": "camembert-base",
    "flaubert/flaubert_base_cased": "flaubert-base",
    "bert-base-multilingual-cased": "bert-base-multilingual",
    "cservan/french-albert-base-cased": "fr-albert"
}
dir_model_name = MODEL_NAME_MAP[model_args.model_name]

OUTPUT_DIR_HPO = PROJECT_ROOT / "runs/hpo" / f"{dir_model_name}_{datetime.now().strftime('%Y-%m-%d_%H-%M')}"
OUTPUT_DIR_FINAL = PROJECT_ROOT / "runs/final_best" / f"{dir_model_name}_{datetime.now().strftime('%Y-%m-%d_%H-%M')}"
MODEL_SAVE_PATH = PROJECT_ROOT / "models" / dir_model_name

OUTPUT_DIR_HPO.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR_FINAL.mkdir(parents=True, exist_ok=True)
MODEL_SAVE_PATH.mkdir(parents=True, exist_ok=True)

Train model

In [21]:

model_init = partial(
    build_token_cls,
    model_args.model_name,
    len(label_list),
    id2label,
    label2id
)

# training_args = TrainingArguments(
#     output_dir = OUTPUT_DIR,
#     num_train_epochs=model_args.num_train_epochs,
#     per_device_train_batch_size=model_args.per_device_train_batch_size,
#     per_device_eval_batch_size=model_args.per_device_eval_batch_size,
#     learning_rate=model_args.learning_rate,
#     remove_unused_columns=model_args.remove_unused_columns,
#     seed=model_args.seed,
#     logging_dir=model_args.logging_dir,
#     logging_steps=model_args.logging_steps,           
#     report_to=model_args.report_to,           
#     load_best_model_at_end=model_args.load_best_model_at_end,
#     save_total_limit=model_args.save_total_limit,      
#     metric_for_best_model=model_args.metric_for_best_model,
#     greater_is_better=model_args.greater_is_better, 
#     eval_strategy=model_args.eval_strategy,             
#     save_strategy=model_args.save_strategy,             
# )

# trainer = Trainer(
#     model_init=model_init,
#     args=training_args,
#     train_dataset=split_train_dataset["train"], # train dataset
#     eval_dataset=split_train_dataset["test"], # validation dataset
#     processing_class=tokenizer,
#     data_collator=data_collator, 
#     compute_metrics=partial(compute_metrics, label_list=label_list),
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
# )


Search for best hyperparameters

In [None]:
hpo_args = TrainingArguments(
    output_dir=OUTPUT_DIR_HPO, 
    eval_strategy=model_args.eval_strategy,
    save_strategy="no",
    report_to=model_args.report_to, # or maybe not? 
    metric_for_best_model=model_args.metric_for_best_model,
    greater_is_better=model_args.greater_is_better,
    seed=42, data_seed=42,   # fix seed during HPO; vary seeds later
)

hpo_trainer = Trainer(
    args=hpo_args,
    model_init=model_init,
    train_dataset=split_train_dataset["train"],
    eval_dataset=split_train_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=partial(compute_metrics, label_list=label_list)  # must return "f1", "precision", "recall", etc.
)

best_run = hpo_trainer.hyperparameter_search(
    direction="maximize",
    hp_space=hp_space,
    compute_objective=partial(compute_objective, label_list=label_list),
    n_trials=20,
    backend="optuna"
)
print("Best hyperparameters:", best_run.hyperparameters)


PyTorch: setting up devices
average_tokens_across_devices is True but world size is 1. Setting it to False automatically.
  hpo_trainer = Trainer(
loading configuration file config.json from cache at /home/mv376/.cache/huggingface/hub/models--almanach--camembert-bio-base/snapshots/7a70e4b76571c8173b9416ba6d9e7ceba1c46ec3/config.json
Model config CamembertConfig {
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-morphologie",
    "2": "I-morphologie",
    "3": "B-topographie",
    "4": "I-topographie",
    "5": "B-differenciation",
    "6": "I-differenciation",
    "7": "B-stade",
    "8": "I-stade"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-differenciation": 5,
    "B-morphologie": 1,
    "B-stade": 7,
    "B-topog

Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy,F1/differenciation,Precision/differenciation,Recall/differenciation,Support/differenciation,F1/morphologie,Precision/morphologie,Recall/morphologie,Support/morphologie,F1/stade,Precision/stade,Recall/stade,Support/stade,F1/topographie,Precision/topographie,Recall/topographie,Support/topographie
1,0.2069,0.106852,0.710506,0.685672,0.737207,0.978366,0.0,0.0,0.0,51,0.791176,0.774472,0.808617,998,0.537931,0.527027,0.549296,71,0.654878,0.607466,0.710317,756



***** Running Evaluation *****
  Num examples = 2167
  Batch size = 8
[W 2025-08-19 00:23:01,451] Trial 0 failed with parameters: {'learning_rate': 1.5278879739898444e-05, 'num_train_epochs': 5, 'weight_decay': 0.06183063984787183, 'warmup_ratio': 0.19962740954251568} because of the following error: KeyError('eval_f1_topographie').
Traceback (most recent call last):
  File "/home/mv376/miniconda3/envs/camembert/lib/python3.10/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "/home/mv376/miniconda3/envs/camembert/lib/python3.10/site-packages/transformers/integrations/integration_utils.py", line 274, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/home/mv376/miniconda3/envs/camembert/lib/python3.10/site-packages/transformers/trainer.py", line 2238, in train
    return inner_training_loop(
  File "/home/mv376/miniconda3/envs/camembert/lib/python3.10/site-packages/transformers/trainer.py", l

KeyError: 'eval_f1_topographie'

Five random initializations

In [None]:
seeds = [11, 22, 33, 44, 55] # should i make this random?

results = []
for s in seeds:
    best_args = TrainingArguments(
        output_dir=os.fspath(OUTPUT_DIR_FINAL / f"s{s}"),
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model=model_args.metric_for_best_model,
        greater_is_better=True,
        report_to=model_args.report_to,
        seed=s,                   # <- governs init/dropout/etc.
        data_seed=s,              # <- governs shuffling/samplers
        remove_unused_columns=model_args.remove_unused_columns,
        per_device_train_batch_size=best_run.hyperparameters.get("per_device_train_batch_size", 8),
        per_device_eval_batch_size=max(16, best_run.hyperparameters.get("per_device_train_batch_size", 8)*4),
        gradient_accumulation_steps=best_run.hyperparameters.get("gradient_accumulation_steps", 2),
        learning_rate=best_run.hyperparameters.get("learning_rate", 5e-5),
        num_train_epochs=best_run.hyperparameters.get("num_train_epochs", 5),
        weight_decay=best_run.hyperparameters.get("weight_decay", 0.01),
        warmup_ratio=best_run.hyperparameters.get("warmup_ratio", 0.1),
        save_total_limit=model_args.save_total_limit,
        logging_steps=model_args.logging_steps,
        logging_dir=model_args.logging_dir,
        load_best_model_at_end=model_args.load_best_model_at_end,
        report_to=model_args.report_to,
        run_name=f"{dir_model_name}_s{s}",  # TB run label
    )

    final_trainer = Trainer(
        args=best_args,
        model_init=model_init,   # ensures model is (re)initialized *after* seeding
        train_dataset=split_train_dataset["train"],
        eval_dataset=split_train_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,   # returns "f1", etc.
    )

    final_trainer.train()
    metrics = final_trainer.evaluate()
    metrics["seed"] = s
    results.append(metrics)


Train model with best hyperparameters

In [None]:
# best_args = TrainingArguments(
#     output_dir=OUTPUT_DIR_FINAL,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
#     metric_for_best_model="f1",
#     greater_is_better=True,
#     report_to=["tensorboard"],
#     seed=42, data_seed=42,
#     # merge best hyperparams (use .get in case some keys weren't tuned)
#     learning_rate=best_run.hyperparameters.get("learning_rate", 5e-5),
#     num_train_epochs=best_run.hyperparameters.get("num_train_epochs", 5),
#     per_device_train_batch_size=best_run.hyperparameters.get("per_device_train_batch_size", 8),
#     weight_decay=best_run.hyperparameters.get("weight_decay", 0.01),
#     warmup_ratio=best_run.hyperparameters.get("warmup_ratio", 0.1),
# )



Get metrics

In [None]:
metrics = final_trainer.evaluate(eval_dataset=tokenized_test_dataset["train"])
print(metrics)

Generating train split: 5418 examples [00:00, 124322.93 examples/s]
Map: 100%|██████████| 5418/5418 [00:00<00:00, 7053.39 examples/s]
Map: 100%|██████████| 5418/5418 [00:00<00:00, 14696.88 examples/s]
Casting the dataset: 100%|██████████| 5418/5418 [00:00<00:00, 842686.95 examples/s]
Map: 100%|██████████| 5418/5418 [00:00<00:00, 13747.16 examples/s]


{'eval_loss': 0.04378490895032883, 'eval_precision': 0.7585371227142542, 'eval_recall': 0.879887554306159, 'eval_f1': 0.8147184098438239, 'eval_accuracy': 0.9849361291877561, 'eval_runtime': 33.7551, 'eval_samples_per_second': 160.509, 'eval_steps_per_second': 20.086, 'epoch': 4.0}


Save final model

In [None]:
final_trainer.save_model(MODEL_SAVE_PATH)

Saving model checkpoint to /home/mv376/projects/CamemBERT-bio/models/almanach/camembert-bio-base
Configuration saved in /home/mv376/projects/CamemBERT-bio/models/almanach/camembert-bio-base/config.json
Model weights saved in /home/mv376/projects/CamemBERT-bio/models/almanach/camembert-bio-base/model.safetensors
tokenizer config file saved in /home/mv376/projects/CamemBERT-bio/models/almanach/camembert-bio-base/tokenizer_config.json
Special tokens file saved in /home/mv376/projects/CamemBERT-bio/models/almanach/camembert-bio-base/special_tokens_map.json
