In [1]:
TRAIN_CSV_FILE = "data/csv_format/ner_sentences_train.csv"
TEST_CSV_FILE =  "data/csv_format/ner_sentences_test.csv"
MODEL_SAVE_PATH = "models/model1_0"

In [2]:
from transformers import AutoTokenizer 

tokenizer = AutoTokenizer.from_pretrained("camembert-base")
# tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-bio-base")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from src.brat_utils import dataset_generator

dataset = dataset_generator(TRAIN_CSV_FILE)
example = dataset["train"][2]
tokenized_input = tokenizer(example["words"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

Generating train split: 21670 examples [00:00, 132664.69 examples/s]
Map: 100%|██████████| 21670/21670 [00:03<00:00, 6783.85 examples/s]
Map: 100%|██████████| 21670/21670 [00:01<00:00, 16531.20 examples/s]
Casting the dataset: 100%|██████████| 21670/21670 [00:00<00:00, 1181102.58 examples/s]


['<s>',
 '▁Di',
 'agno',
 'st',
 'iqué',
 '▁en',
 '▁2010',
 '▁avec',
 '▁un',
 '▁l',
 'ymph',
 'ome',
 '▁diffus',
 '▁à',
 '▁cellules',
 '▁B',
 '▁avec',
 '▁atteinte',
 '▁rétro',
 'péri',
 'ton',
 'é',
 'ale',
 '▁',
 ',',
 '▁traité',
 '▁par',
 '▁Q',
 'T',
 '▁selon',
 '▁R',
 '▁-',
 '▁CH',
 'OP',
 '14',
 '▁X',
 '▁6',
 '▁cycles',
 '▁+',
 '▁2',
 '▁doses',
 '▁de',
 '▁rit',
 'ux',
 'im',
 'ab',
 '▁sans',
 '▁aucun',
 '▁signe',
 '▁de',
 '▁maladie',
 '▁',
 'hémat',
 'ologique',
 '▁à',
 '▁ce',
 '▁jour',
 '▁',
 '.',
 '</s>']

In [4]:
label_list = dataset["train"].features[f"tags"].feature.names
label_list

['O',
 'B-morphologie',
 'I-morphologie',
 'B-topographie',
 'I-topographie',
 'B-differenciation',
 'I-differenciation',
 'B-stade',
 'I-stade']

In [5]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples[f"tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i) # Map tokens to their respective word
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids: # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx: # Only label the first token of a given word. 
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [6]:
example = dataset["train"][2]
print("Sample example:", example)

Sample example: {'words': ['Diagnostiqué', 'en', '2010', 'avec', 'un', 'lymphome', 'diffus', 'à', 'cellules', 'B', 'avec', 'atteinte', 'rétropéritonéale', ',', 'traité', 'par', 'QT', 'selon', 'R', '-', 'CHOP14', 'X', '6', 'cycles', '+', '2', 'doses', 'de', 'rituximab', 'sans', 'aucun', 'signe', 'de', 'maladie', 'hématologique', 'à', 'ce', 'jour', '.'], 'tags': [0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [7]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
tokenized_dataset["train"].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_dataset["train"].format['type']

Map: 100%|██████████| 21670/21670 [00:01<00:00, 12216.77 examples/s]


'torch'

In [8]:
from transformers import DataCollatorForTokenClassification
import evaluate

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
seqeval = evaluate.load("seqeval")


In [9]:
import numpy as np

labels = [label_list[i] for i in example[f"tags"]]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [10]:
id2label = {
    0: "O",
    1: "B-morphologie",
    2: "I-morphologie",
    3: "B-topographie",
    4: "I-topographie",
    5: "B-differenciation",
    6: "I-differenciation",
    7: "B-stade",
    8: "I-stade",
}

label2id = {
    "O": 0,
    "B-morphologie": 1,
    "I-morphologie": 2,
    "B-topographie": 3,
    "I-topographie": 4,
    "B-differenciation": 5,
    "I-differenciation": 6,
    "B-stade": 7,
    "I-stade": 8,
}

In [11]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback

model = AutoModelForTokenClassification.from_pretrained(
    "almanach/camembert-bio-base", num_labels=9, id2label=id2label, label2id=label2id
)

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at almanach/camembert-bio-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Validation set by splitting
split_train_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1)

In [13]:
training_args = TrainingArguments(
    output_dir = "small_batch",
    remove_unused_columns=False,
    logging_dir="logs",
    logging_steps=100,           # log every N steps
    report_to="none",           # suppress wandb/tensorboard
    num_train_epochs=5,
    save_total_limit=1,      # keep only the latest checkpoint
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_strategy="epoch",             # checkpoint each epoch
    eval_strategy="epoch",             # evaluate each epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_train_dataset["train"],
    eval_dataset=split_train_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator, 
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train() 


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0446,0.054363,0.763903,0.849831,0.804579,0.983046
2,0.0346,0.04897,0.77315,0.887514,0.826394,0.984578
3,0.0273,0.058638,0.757533,0.904949,0.824705,0.983026
4,0.0146,0.059189,0.809723,0.871204,0.839339,0.985375


TrainOutput(global_step=9752, training_loss=0.05306035815031437, metrics={'train_runtime': 1787.8864, 'train_samples_per_second': 54.542, 'train_steps_per_second': 6.818, 'total_flos': 3640364165547414.0, 'train_loss': 0.05306035815031437, 'epoch': 4.0})

In [14]:
test_dataset = dataset_generator(TEST_CSV_FILE)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_dataset["train"].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test_dataset["train"].format['type']
metrics = trainer.evaluate(eval_dataset=tokenized_test_dataset["train"])
print(metrics)

Generating train split: 5418 examples [00:00, 124322.93 examples/s]
Map: 100%|██████████| 5418/5418 [00:00<00:00, 7053.39 examples/s]
Map: 100%|██████████| 5418/5418 [00:00<00:00, 14696.88 examples/s]
Casting the dataset: 100%|██████████| 5418/5418 [00:00<00:00, 842686.95 examples/s]
Map: 100%|██████████| 5418/5418 [00:00<00:00, 13747.16 examples/s]


{'eval_loss': 0.04378490895032883, 'eval_precision': 0.7585371227142542, 'eval_recall': 0.879887554306159, 'eval_f1': 0.8147184098438239, 'eval_accuracy': 0.9849361291877561, 'eval_runtime': 33.7551, 'eval_samples_per_second': 160.509, 'eval_steps_per_second': 20.086, 'epoch': 4.0}


In [15]:
trainer.save_model(MODEL_SAVE_PATH)