In [1]:
import numpy as np
import transformers
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer, CamembertForTokenClassification, DataCollatorForTokenClassification
from datasets import load_dataset, load_metric, load_dataset, ClassLabel, DownloadConfig

import os
from pathlib import Path

from tokenizer import Tokenizer
from trajet_dataset import IOBTRAJETDataset

  from .autonotebook import tqdm as notebook_tqdm
  metric = load_metric("seqeval", trust_remote_code=True)


In [2]:
# !python -m spacy download fr_core_news_sm

In [3]:
# Load the seqeval metric once; compute_metrics and the final test-set scoring
# cell both call metric.compute on this object.
# NOTE(review): load_metric is deprecated in recent `datasets` releases in favour of the
# separate `evaluate` package — confirm against the installed version.
metric = load_metric("seqeval", trust_remote_code=True)

In [4]:
def compute_metrics(p, label_list):
    """Compute overall seqeval scores for one evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. The predictions are reduced with
            an argmax over axis 2 to obtain one class index per position.
        label_list: Sequence mapping a class index to its label name.

    Returns:
        dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        taken from the ``overall_*`` entries returned by the module-level
        ``metric``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Positions whose gold label is -100 are ignored (special tokens).
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [5]:
# Training hyperparameters and the name used for the saved model directory.
model_n_version = "trajet_v2"  # sub-directory name under "models" when the model is saved
max_epochs = 6  # passed to TrainingArguments as num_train_epochs
learning_rate = 2e-5
batch_size = 32  # used for both the per-device train and eval batch sizes

In [6]:
# Checkpoint names for the pretrained model weights and for its tokenizer.
pretrained_model_checkpoint = "camembert-base"
pretrained_tokenizer_checkpoint = "camembert-base"

In [7]:
# Uncomment the line below if you don't have the data yet
# !python generate_data.py

In [8]:
# Project dataset wrapper; later cells read its .labels, .id2label and .label2id
# attributes and the underlying split container .dataset.
dataset = IOBTRAJETDataset()




In [9]:
# Project tokenizer wrapper built for the chosen checkpoint; later cells use its
# .tokenizer attribute and its tokenize_and_align_labels method.
preprocessor = Tokenizer.init_vf(pretrained_tokenizer_checkpoint=pretrained_tokenizer_checkpoint)

In [10]:
# Load the pretrained checkpoint with a token-classification head that has one
# output per entry in dataset.labels.
model = CamembertForTokenClassification.from_pretrained(
    pretrained_model_checkpoint,
    num_labels=len(dataset.labels),
)
    

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Attach the dataset's label mappings to the model config and print each one
# for a quick visual check.
model.config.id2label = dataset.id2label

print(model.config.id2label)
model.config.label2id = dataset.label2id
print(model.config.label2id)

{0: 'B-DEP', 1: 'B-DEST', 2: 'I-DEP', 3: 'I-DEST', 4: 'O'}
{'B-DEP': 0, 'B-DEST': 1, 'I-DEP': 2, 'I-DEST': 3, 'O': 4}


In [12]:
# Apply the preprocessor's tokenize_and_align_labels to the dataset in batches,
# reusing cached results when they are available.
tokenized_datasets = dataset.dataset.map(
    preprocessor.tokenize_and_align_labels,
    batched=True,
    load_from_cache_file=True,
)

In [13]:
# Training configuration: evaluation runs once per epoch, and with 4
# gradient-accumulation steps each optimizer update covers batch_size * 4
# examples per device.
args = TrainingArguments(
    output_dir="test-ner",
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=max_epochs,
    weight_decay=0.01,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
)

In [14]:
# Collator that assembles tokenized examples into batches for token classification.
# NOTE(review): padding=False turns off padding in the collator, so this presumably relies on
# tokenize_and_align_labels already producing equal-length sequences — confirm.
data_collator = DataCollatorForTokenClassification(tokenizer=preprocessor.tokenizer, padding=False)

In [15]:
# Number of examples in the training split.
len(tokenized_datasets["train"])

59593

In [16]:
# Number of examples in the validation split.
len(tokenized_datasets["validation"])

29796

In [17]:
# Assemble the Trainer from the model, the training arguments, the train and
# validation splits, the collator and the metric callback.
# NOTE(review): Trainer normally places the model on args.device itself, so the
# explicit .to('cpu') is likely redundant — confirm before relying on it.
trainer = Trainer(
    model=model.to('cpu'),
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=preprocessor.tokenizer,
    # Bind the label list so the callback matches the single-argument form Trainer calls.
    compute_metrics=lambda p: compute_metrics(p=p, label_list=dataset.labels),
)

In [19]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()

  0%|          | 2/2790 [1:10:39<1641:37:23, 2119.74s/it]


In [None]:
# Evaluate on the eval_dataset given to the Trainer (the "validation" split).
trainer.evaluate()


In [None]:
# Run the model on the "test" split; predict() returns three values, the third
# of which is discarded here.
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
# Keep the index of the highest score along axis 2 for each position.
predictions = np.argmax(predictions, axis=2)

In [None]:
# Map class indices back to label names, skipping every position whose gold
# label is -100 (ignored index).
true_predictions = [
    [
        dataset.labels[pred_id]
        for pred_id, gold_id in zip(pred_row, gold_row)
        if gold_id != -100
    ]
    for pred_row, gold_row in zip(predictions, labels)
]

true_labels = [
    [
        dataset.labels[gold_id]
        for _pred_id, gold_id in zip(pred_row, gold_row)
        if gold_id != -100
    ]
    for pred_row, gold_row in zip(predictions, labels)
]


In [None]:
# Preview only the first few decoded sequences; displaying the whole list would
# dump every test-set prediction into the notebook output.
true_predictions[:5]

In [None]:
# Score the decoded test-set predictions against the gold labels and print the
# result between two separator lines of 200 asterisks.
results = metric.compute(predictions=true_predictions, references=true_labels)
print("*" * 200, results, "*" * 200, sep="\n")


In [None]:
import os
import json
import numpy as np

# Output location for the metrics file: <results_directory>/results.json
results_directory = "results"
results_file_path = os.path.join(results_directory, "results.json")

# Create the directory when missing; an already existing one is left as is
os.makedirs(results_directory, exist_ok=True)

# Fallback encoder for NumPy values that the json module cannot serialize natively
def custom_encoder(obj):
    """Convert a NumPy value into a JSON-serializable Python equivalent.

    Intended as the ``default`` hook of ``json.dump``/``json.dumps``, which
    call it only for objects they cannot encode themselves.

    Args:
        obj: The object the json module failed to serialize.

    Returns:
        ``int`` for any NumPy integer scalar, ``float`` for any NumPy floating
        scalar, ``bool`` for ``np.bool_`` and a (nested) list for an ndarray.

    Raises:
        TypeError: If ``obj`` is none of the supported NumPy types.
    """
    # np.integer / np.floating are the abstract bases of every sized scalar
    # type, so this covers int64 and float32 as well as int32; which width
    # NumPy picks by default depends on the platform.
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError("Object not serializable")

# Write the metric results as JSON; values the json module cannot encode by
# itself are routed through custom_encoder.
with open(results_file_path, mode='w') as results_file:
    json.dump(obj=results, fp=results_file, default=custom_encoder)

print(f"Results saved to: {results_file_path}")


In [None]:

# Save the fine-tuned model under models/<model_n_version> (e.g. models/trajet_v2).
# os.path.join replaces the manual "/" concatenation, matching how the results
# path is built; expanduser was dropped because "models" contains no "~" to expand.
out_dir = os.path.join("models", model_n_version)
trainer.save_model(out_dir)