# CamemBERT fine-tuning

Because of dependency conflicts, the model is fine-tuned in this notebook; it is then loaded and evaluated in [deepl_ner.ipynb](./deepl_ner.ipynb).


In [None]:
!pip install --upgrade transformers tf-keras numpy sentencepiece


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os

# Has to be exported before TensorFlow is imported (done in the next cell) so
# that `tf.keras` is backed by the legacy implementation from `tf-keras`.
os.environ.update({"TF_USE_LEGACY_KERAS": "1"})

In [3]:
import tensorflow as tf

In [4]:
from app.travel_resolver.libs.nlp import data_processing as dp

# Main training corpus. `vocab` and `unique_labels` are taken from this file only.
# NOTE(review): `unique_labels` later sizes the classification head; confirm the
# two extra files below do not contain a label that is missing from it.
sentences, labels, vocab, unique_labels = dp.from_bio_file_to_examples(
    "./data/bio/fr.bio/10k_train_small_samples.bio"
)

# Extra corpora: only their sentences and labels are used. The two remaining
# return values are bound to explicit throwaway names instead of `_` / `__`,
# because IPython uses those names for its output history and stops updating
# them once user code assigns to them.
lambda_sentences, lambda_labels, _lambda_vocab, _lambda_unique_labels = (
    dp.from_bio_file_to_examples("./data/bio/fr.bio/1k_train_unlabeled_samples.bio")
)

large_sentences, large_labels, _large_vocab, _large_unique_labels = (
    dp.from_bio_file_to_examples("./data/bio/fr.bio/1k_train_large_samples.bio")
)

# Merge the three corpora into a single training set.
sentences = sentences + lambda_sentences + large_sentences
labels = labels + lambda_labels + large_labels

[nltk_data] Downloading package punkt_tab to /Users/az-r-
[nltk_data]     ow/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
import app.travel_resolver.libs.nlp.data_processing as dp

# Run the project's pre-processing on the merged corpus with token output
# requested and stemming turned off. The results are indexed per sentence and
# per token by the following cells.
preprocessing_options = {"return_tokens": True, "stemming": False}
processed_sentences, processed_labels = dp.process_sentences_and_labels(
    sentences, labels, **preprocessing_options
)

In [6]:
# Title-case, in place, every token whose label id is greater than 0; tokens
# labelled 0 are left untouched.
# NOTE(review): presumably 0 is the "outside" tag — confirm against unique_labels.
for sentence_idx, sentence_tokens in enumerate(processed_sentences):
    for token_idx, token in enumerate(sentence_tokens):
        if processed_labels[sentence_idx][token_idx] > 0:
            sentence_tokens[token_idx] = token.title()

In [7]:
"""
  This variable controls the maximum sequence length: label sequences
  and tokenized sentences are both padded or truncated to this length.
"""

# Used as `maxlen` when padding the labels and as `max_length` when tokenizing.
MAX_LEN: int = 100

In [8]:
# Bring every label sequence to exactly MAX_LEN entries (shorter ones are
# padded at the end).
# `truncating="post"` is passed explicitly: pad_sequences drops values from the
# *front* of over-long sequences by default, whereas the tokenizer cell cuts
# over-long sentences at the end. Without it, labels and tokens of any sentence
# longer than MAX_LEN would be trimmed from opposite ends.
padded_labels = tf.keras.preprocessing.sequence.pad_sequences(
    processed_labels, maxlen=MAX_LEN, padding="post", truncating="post"
)

In [None]:
import numpy as np
from transformers import CamembertTokenizer, TFAutoModelForTokenClassification

# Tokenizer that belongs to the pre-trained checkpoint named below.
PRETRAINED_CHECKPOINT = "camembert-base"
tokenizer = CamembertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

In [23]:
# The sentences are already lists of tokens, hence is_split_into_words=True.
# Every encoded sequence is truncated or padded to exactly MAX_LEN ids.
# NOTE(review): `padded_labels` carries one label per entry of
# `processed_sentences[i]`, while the tokenizer may split an entry into several
# sub-word pieces and add special tokens. Position k of `input_ids` is then not
# guaranteed to match label k — confirm the alignment (and how the evaluation
# notebook decodes predictions) before relying on the reported accuracy.
tokenizer_options = {
    "is_split_into_words": True,
    "truncation": True,
    "padding": "max_length",
    "max_length": MAX_LEN,
}
tokenized_sentences = tokenizer(processed_sentences, **tokenizer_options)

In [24]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the train/test partition is the same on every run.
SPLIT_SEED = 42

# Hold out 20% of the examples. The three sequences are split in a single call
# so that input ids, attention masks and labels stay aligned.
(
    train_input_ids,
    test_input_ids,
    train_attention_masks,
    test_attention_masks,
    train_labels,
    test_labels,
) = train_test_split(
    tokenized_sentences["input_ids"],
    tokenized_sentences["attention_mask"],
    padded_labels,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [26]:
def build_token_dataset(input_ids, attention_masks, token_labels):
    """Wrap one data split into a `tf.data.Dataset` of `(features, labels)` pairs.

    `features` is a dict with the keys "input_ids" and "attention_mask".
    """
    model_inputs = {"input_ids": input_ids, "attention_mask": attention_masks}
    return tf.data.Dataset.from_tensor_slices((model_inputs, token_labels))


train_dataset = build_token_dataset(
    train_input_ids, train_attention_masks, train_labels
)
test_dataset = build_token_dataset(
    test_input_ids, test_attention_masks, test_labels
)

In [32]:
# Token-classification model loaded from the "camembert-base" checkpoint, with
# one output class per entry of `unique_labels`.
label_count = len(unique_labels)
camembert = TFAutoModelForTokenClassification.from_pretrained(
    "camembert-base", num_labels=label_count
)

# The loss is configured with from_logits=True, i.e. it expects un-normalised
# scores from the model.
# NOTE(review): the datasets yield plain (features, labels) pairs without sample
# weights, so the padded label positions are scored like real tokens in both
# the loss and the "accuracy" metric — confirm this is intended.
adam = tf.keras.optimizers.Adam(5e-5)
token_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
camembert.compile(optimizer=adam, loss=token_loss, metrics=["accuracy"])

All PyTorch model weights were used when initializing TFCamembertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFCamembertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Group both splits into mini-batches of 64 examples.
# NOTE: the dataset names are rebound here, so run this cell only once per
# kernel session — re-running it batches the already batched datasets again.
BATCH_SIZE = 64
train_dataset, test_dataset = (
    train_dataset.batch(BATCH_SIZE),
    test_dataset.batch(BATCH_SIZE),
)

In [34]:
# Stop as soon as `val_loss` fails to improve (patience=0) and restore the
# weights of the best epoch.
# NOTE(review): the held-out split is used as validation data here, so it also
# drives early stopping and weight selection; metrics later computed on that
# same split are not from unseen data — confirm this is acceptable.
callback = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True, patience=0, monitor="val_loss"
)

training_options = {
    "validation_data": test_dataset,
    "epochs": 3,
    "callbacks": [callback],
}
camembert.fit(train_dataset, **training_options)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x295a015b0>

In [None]:
# Persist the fine-tuned model to a local directory so it can be loaded from
# the evaluation notebook.
MODEL_OUTPUT_DIR = "./camembert"
camembert.save_pretrained(MODEL_OUTPUT_DIR)