# CamemBERT fine-tuning

Because of dependency conflicts, the model is fine-tuned in this notebook; it is then loaded and evaluated in [deepl_ner.ipynb](./deepl_ner.ipynb).


In [20]:
!pip install --upgrade transformers tf-keras focal-loss

Collecting focal-loss
  Downloading focal_loss-0.0.7-py3-none-any.whl.metadata (5.1 kB)
Downloading focal_loss-0.0.7-py3-none-any.whl (19 kB)
Installing collected packages: focal-loss
Successfully installed focal-loss-0.0.7

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [21]:
import os

# Opt in to the legacy Keras implementation (the `tf-keras` package installed
# above). Set ahead of the TensorFlow import below so the flag is already in
# place when TensorFlow loads.
os.environ.update({"TF_USE_LEGACY_KERAS": "1"})

In [22]:
import tensorflow as tf

In [23]:
from app.travel_resolver.libs.nlp import data_processing as dp

# Main training examples. `vocab` and `unique_labels` are taken from this file
# only; `unique_labels` later sizes the classification head.
sentences, labels, vocab, unique_labels = dp.from_bio_file_to_examples(
    "./data/bio/fr.bio/10k_train_small_samples.bio"
)

# To avoid overfitting the model on sentences that don't have any labels
# lambda_sentences, lambda_labels, _, __ = dp.from_bio_file_to_examples(
#     "./data/bio/fr.bio/1k_train_unlabeled_samples.bio"
# )

# Extra examples from the "large samples" file. Its last two return values are
# not needed; they get explicit throwaway names instead of `_` / `__`, because
# IPython uses those two names for its output history and stops updating them
# once they are assigned.
large_sentences, large_labels, _large_vocab, _large_unique_labels = (
    dp.from_bio_file_to_examples("./data/bio/fr.bio/1k_train_large_samples.bio")
)

# Build new combined sequences (no in-place mutation of the loaded data).
sentences = sentences + large_sentences
labels = labels + large_labels

In [24]:
import app.travel_resolver.libs.nlp.data_processing as dp

# Run the project's preprocessing over the raw sentences and their labels.
# The next cells index the results as [sentence][token] and compare label
# values with 0, so each sentence is presumably returned as a list of tokens
# with one numeric label id per token — TODO confirm against
# `process_sentences_and_labels` (effect of `return_tokens` / `stemming`).
processed_sentences, processed_labels = dp.process_sentences_and_labels(
    sentences, labels, return_tokens=True, stemming=False
)

In [25]:
# Title-case, in place, every token whose label id is positive (anything other
# than label 0); tokens labelled 0 are left untouched.
for sent_idx, sent_tokens in enumerate(processed_sentences):
    for tok_idx in range(len(sent_tokens)):
        if processed_labels[sent_idx][tok_idx] > 0:
            sent_tokens[tok_idx] = sent_tokens[tok_idx].title()

In [7]:
# Fixed sequence length used throughout the notebook: the label sequences are
# padded to this many positions and the tokenizer pads / truncates its output
# to the same length.
MAX_LEN = 100

In [26]:
# Pad every label sequence to MAX_LEN positions, filling at the end.
# `truncating="post"` makes over-long sentences lose their *last* labels, the
# same end the tokenizer cuts with `truncation=True`. The Keras default
# ("pre") would drop the *first* labels instead and shift every remaining
# label against its token.
padded_labels = tf.keras.preprocessing.sequence.pad_sequences(
    processed_labels, maxlen=MAX_LEN, padding="post", truncating="post"
)

In [27]:
from transformers import TFAutoModelForTokenClassification, CamembertTokenizer
import numpy as np

# Tokenizer for the pretrained "camembert-base" checkpoint — the same
# checkpoint name the model is loaded from further down.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

In [28]:
# Tokenize the already-split sentences (`is_split_into_words=True`); every
# example is padded or truncated to exactly MAX_LEN positions.
# NOTE(review): the labels are padded separately, one label per input word.
# If the tokenizer splits a word into several pieces or adds special tokens,
# label positions will not line up with token positions — confirm alignment.
tokenized_sentences = tokenizer(
    processed_sentences,
    is_split_into_words=True,
    truncation=True,
    padding="max_length",
    max_length=MAX_LEN,
)

In [33]:
from sklearn.model_selection import train_test_split

# Seed for the shuffle performed by the split. Fixing it makes the train /
# held-out partition identical on every run, so results can be reproduced.
SPLIT_SEED = 42

# Hold out 20% of the examples. The three collections are split in one call so
# that input ids, attention masks and labels of an example stay together.
(
    train_input_ids,
    test_input_ids,
    train_attention_masks,
    test_attention_masks,
    train_labels,
    test_labels,
) = train_test_split(
    tokenized_sentences["input_ids"],
    tokenized_sentences["attention_mask"],
    padded_labels,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [39]:
def _to_dataset(input_ids, attention_masks, label_ids):
    """Wrap one split as a `tf.data.Dataset` of `(features, labels)` pairs.

    `features` is a dict with the keys `input_ids` and `attention_mask`.
    The inputs are sliced along their first axis, so one dataset element
    corresponds to one example.
    """
    features = {"input_ids": input_ids, "attention_mask": attention_masks}
    return tf.data.Dataset.from_tensor_slices((features, label_ids))


train_dataset = _to_dataset(train_input_ids, train_attention_masks, train_labels)

test_dataset = _to_dataset(test_input_ids, test_attention_masks, test_labels)

In [40]:
def entity_accuracy(y_true, y_pred):
    """
    Calculate the accuracy over entity positions only.

    Only positions whose true label is greater than 0 are scored, which means
    that correct `O` tags and padding are not taken into account.

    Parameters:
    y_true (tensor): True labels.
    y_pred (tensor): Predicted logits, with the classes on the last axis.

    Returns:
    accuracy (tensor): Share of entity positions predicted correctly;
        0.0 when the batch contains no entity position at all.
    """

    y_true = tf.cast(y_true, tf.float32)
    # We ignore the padding and the O tag
    mask = tf.cast(y_true > 0, tf.float32)

    # Most likely class per position, cast so it can be compared with y_true
    y_pred_class = tf.cast(tf.math.argmax(y_pred, axis=-1), tf.float32)

    # 1.0 where the prediction is right AND the position is an entity
    matches_true_pred = tf.cast(tf.equal(y_true, y_pred_class), tf.float32) * mask

    # A batch without any entity would give 0 / 0 = NaN; divide_no_nan
    # returns 0 in that case so the metric stays finite.
    masked_acc = tf.math.divide_no_nan(
        tf.reduce_sum(matches_true_pred), tf.reduce_sum(mask)
    )

    return masked_acc

In [14]:
# Per-class multipliers, keyed by label id: label 0 is down-weighted and
# labels 1 and 2 are up-weighted.
class_weights = {0: 0.1, 1: 20.0, 2: 20.0}


def weighted_loss(y_true, y_pred):
    """
    Sparse categorical cross-entropy scaled by the weight of the true class.

    Parameters:
    y_true (tensor): True label ids, used as indices into `class_weights`.
    y_pred (tensor): Predicted logits.

    Returns:
    loss (tensor): Cross-entropy of every position multiplied by the weight
        of that position's true class.
    """
    # Weight table ordered by label id: entry k holds class_weights[k]
    weight_table = tf.constant(
        [class_weights[label_id] for label_id in range(len(class_weights))],
        dtype=tf.float32,
    )

    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
        y_true, y_pred, from_logits=True
    )

    # Look up, for every position, the weight of its true class
    position_weights = tf.gather(weight_table, tf.cast(y_true, tf.int32))

    return cross_entropy * position_weights

In [61]:
from focal_loss import SparseCategoricalFocalLoss

# Load the pretrained "camembert-base" weights with a token-classification
# head that has one output per entry of `unique_labels`.
camembert = TFAutoModelForTokenClassification.from_pretrained(
    "camembert-base", num_labels=len(unique_labels)
)

# Focal loss computed from logits, with label 0 down-weighted relative to
# labels 1 and 2.
# NOTE(review): `class_weight` lists exactly three weights; this presumably
# has to match len(unique_labels) — confirm.
loss_func = SparseCategoricalFocalLoss(
    gamma=2, class_weight=[0.1, 2, 2], from_logits=True
)

# Track plain accuracy plus `entity_accuracy`, which only scores positions
# whose true label is greater than 0.
camembert.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(5e-5),
    loss=loss_func,
    metrics=["accuracy", entity_accuracy],
)

All PyTorch model weights were used when initializing TFCamembertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFCamembertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
BATCH_SIZE = 32


def _batch_once(dataset, batch_size=BATCH_SIZE):
    """Batch `dataset` unless it has already been batched.

    The datasets are built from a 2-D padded label matrix, so an unbatched
    element carries a rank-1 label vector and batching adds a leading axis.
    Checking that rank keeps this cell safe to re-run: a second run returns
    the dataset unchanged instead of producing batches of batches.
    """
    label_rank = dataset.element_spec[1].shape.rank
    if label_rank is not None and label_rank >= 2:
        return dataset
    return dataset.batch(batch_size)


train_dataset = _batch_once(train_dataset)
test_dataset = _batch_once(test_dataset)

In [62]:
# Early stopping on the validation loss. `patience=0` leaves no grace epoch,
# and `restore_best_weights=True` asks Keras to go back to the best weights
# seen when training is stopped.
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=0, restore_best_weights=True
)

# Fine-tune for at most 4 epochs, using the held-out split as validation data.
camembert.fit(
    train_dataset, validation_data=test_dataset, epochs=4, callbacks=[callback]
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tf_keras.src.callbacks.History at 0x2dab031a0>

In [None]:
from focal_loss import SparseCategoricalFocalLoss

# Sanity check of the focal loss on a tiny hand-written example. The
# predictions here are probabilities, so `from_logits` is left at its default.
# The demo object gets its own name so that `loss_func`, the loss the model
# was compiled with, is not overwritten by this cell.
demo_loss = SparseCategoricalFocalLoss(gamma=1)
y_true = [0, 1, 2]
y_pred = [[0.8, 0.1, 0.1], [0.2, 0.7, 0.1], [0.2, 0.2, 0.6]]
demo_loss(y_true, y_pred)

<tf.Tensor: shape=(), dtype=float32, numpy=0.1186538115143776>

In [63]:
# Persist the fine-tuned model so it can be loaded from another notebook.
# NOTE(review): only the model is saved here, not the tokenizer — the loading
# side presumably re-creates the "camembert-base" tokenizer itself; confirm.
camembert.save_pretrained("./models/camembert")

In [None]:
# camembert.push_to_hub("CamemBERT-NER-Travel")

tf_model.h5: 100%|██████████| 440M/440M [00:20<00:00, 21.8MB/s] 
