In [16]:
import numpy as np
import transformers
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer, CamembertForTokenClassification, DataCollatorForTokenClassification
from datasets import load_dataset, load_metric, load_dataset, ClassLabel, DownloadConfig

import os
from pathlib import Path

from tokenizer import Tokenizer
from trajet_dataset import IOBTRAJETDataset

In [17]:
# Load the `seqeval` metric object; it is used further down to score the
# decoded test-set predictions via `metric.compute(...)`.
metric = load_metric("seqeval", trust_remote_code=True)

In [18]:
# def compute_metrics(p, label_list):
#     predictions, labels = p
#     predictions = np.argmax(predictions, axis=2)

#     # Remove ignored index (special tokens)
#     true_predictions = [
#         [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
#     true_labels = [
#         [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]

#     results = metric.compute(predictions=true_predictions, references=true_labels)
#     return {
#         "precision": results["overall_precision"],
#         "recall": results["overall_recall"],
#         "f1": results["overall_f1"],
#         "accuracy": results["overall_accuracy"],
#     }

In [19]:
from sklearn.metrics import multilabel_confusion_matrix, classification_report

def compute_metrics(p, ignore_index=-100):
    """Compute macro-averaged token-level precision, recall and F1.

    Args:
        p: Evaluation payload. Either an object exposing ``predictions``
            (per-token class scores, classes on the last axis) and
            ``label_ids`` (integer labels), or a ``dict`` with the same keys
            plus an optional ``"loss"`` entry.
        ignore_index: Label id marking positions that must not be scored.
            Defaults to ``-100``, the same sentinel that is filtered out when
            the test predictions are decoded later in this notebook.

    Returns:
        dict with float ``precision``, ``recall`` and ``f1``, each averaged
        over the classes that occur in the scored labels or predictions.
        The three values are ``0.0`` when no position is left to score and
        ``nan`` when `p` carries no predictions/labels at all. A ``loss``
        entry is present only when `p` supplied one.
    """
    loss = None
    if isinstance(p, dict):
        raw_loss = p.get("loss")
        if raw_loss is not None:
            # Accept tensors / NumPy scalars (via .item()) as well as plain numbers.
            loss = raw_loss.item() if hasattr(raw_loss, "item") else float(raw_loss)
        scores, label_ids = p.get("predictions"), p.get("label_ids")
    else:
        scores, label_ids = p.predictions, p.label_ids

    if scores is None or label_ids is None:
        # Loss-only payload: there is nothing to score.
        precision = recall = f1 = float("nan")
    else:
        pred_ids = np.argmax(np.asarray(scores), axis=-1).reshape(-1)
        gold_ids = np.asarray(label_ids).reshape(-1)

        # Drop ignored positions so they are not counted as a class of their own.
        scored = gold_ids != ignore_index
        pred_ids, gold_ids = pred_ids[scored], gold_ids[scored]

        classes = np.union1d(gold_ids, pred_ids)
        if classes.size == 0:
            precision = recall = f1 = 0.0
        else:
            correct = pred_ids == gold_ids
            true_positives = np.array(
                [np.count_nonzero(correct & (gold_ids == c)) for c in classes], dtype=float
            )
            predicted_counts = np.array(
                [np.count_nonzero(pred_ids == c) for c in classes], dtype=float
            )
            actual_counts = np.array(
                [np.count_nonzero(gold_ids == c) for c in classes], dtype=float
            )

            # Counts are integers, so clamping the denominator to 1 only guards 0/0.
            per_class_precision = true_positives / np.maximum(predicted_counts, 1)
            per_class_recall = true_positives / np.maximum(actual_counts, 1)

            # precision + recall lies in [0, 2]; clamping it to 1 would shrink F1
            # whenever the sum is below 1, so only the exact-zero case is guarded.
            pr_sum = per_class_precision + per_class_recall
            per_class_f1 = np.divide(
                2 * per_class_precision * per_class_recall,
                pr_sum,
                out=np.zeros_like(pr_sum),
                where=pr_sum > 0,
            )

            precision = float(np.mean(per_class_precision))
            recall = float(np.mean(per_class_recall))
            f1 = float(np.mean(per_class_f1))

    metrics = {
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
    # Only report a loss that was actually supplied. A `None` placeholder would
    # be logged as a metric and, as far as the Trainer's key prefixing goes,
    # would presumably replace the `eval_loss` it computes itself (TODO confirm).
    if loss is not None:
        metrics['loss'] = loss
    return metrics

In [20]:
# Run configuration shared by the cells below.
# Name of the sub-directory (under "models") the trained model is saved to.
model_n_version = "trajet_v2"
# Passed to TrainingArguments as `num_train_epochs`.
max_epochs = 6
# Passed to TrainingArguments as `learning_rate`.
learning_rate = 2e-5
# Used for both the per-device train and eval batch sizes.
batch_size = 16

In [21]:
# Checkpoint the token-classification model is initialised from.
pretrained_model_checkpoint = "camembert-base"
# Checkpoint handed to the project's Tokenizer wrapper.
pretrained_tokenizer_checkpoint = "camembert-base"

In [31]:
# !python generate_data.py

Generating data...

2024-02-09 14:02:09,745 - INFO - SpaCy model loaded successfully.
2024-02-09 14:02:09,745 - INFO - Generating sentences...
2024-02-09 14:02:48,960 - INFO - Processing sentences...
2024-02-09 14:20:27,409 - INFO - Saving data to datas/train_iob.csv...
2024-02-09 14:20:45,436 - INFO - Saving data to datas/dev_iob.csv...
2024-02-09 14:20:58,386 - INFO - Saving data to datas/test_iob.csv...



Skipping span (does not align to tokens): 0 3 DEP Lor
Skipping span (does not align to tokens): 45 54 DEST Plourac'h
Skipping span (does not align to tokens): 47 57 DEST Kermoroc'h
Skipping span (does not align to tokens): 48 58 DEST Kermoroc'h
Skipping span (does not align to tokens): 36 50 DEST Guilligomarc'h
Skipping span (does not align to tokens): 86 96 DEST Kermoroc'h
Skipping span (does not align to tokens): 55 64 DEST Ploulec'h
Skipping span (does not align to tokens): 48 57 DEST Ploulec'h


In [32]:
# Project dataset wrapper. Later cells read its `labels`, `id2label`,
# `label2id` and `dataset` attributes (the latter holds the splits that get tokenized).
dataset = IOBTRAJETDataset()


In [33]:
# Project tokenizer wrapper built from the pretrained tokenizer checkpoint.
# Later cells use its `tokenize_and_align_labels` method and its `tokenizer` attribute.
preprocessor = Tokenizer.init_vf(pretrained_tokenizer_checkpoint=pretrained_tokenizer_checkpoint)

In [34]:
# Token-classification model initialised from the pretrained checkpoint,
# with one output class per label exposed by the dataset.
model = CamembertForTokenClassification.from_pretrained(
    pretrained_model_checkpoint,
    num_labels=len(dataset.labels),
)
    

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Attach both label mappings from the dataset to the model config ...
model.config.id2label = dataset.id2label
model.config.label2id = dataset.label2id

# ... then show them: id -> label first, label -> id second.
print(model.config.id2label)
print(model.config.label2id)

{0: 'B-DEP', 1: 'B-DEST', 2: 'I-DEP', 3: 'I-DEST', 4: 'O'}
{'B-DEP': 0, 'B-DEST': 1, 'I-DEP': 2, 'I-DEST': 3, 'O': 4}


In [36]:
# Apply the preprocessor's `tokenize_and_align_labels` to the dataset in
# batched mode, recomputing instead of reusing a cached result.
tokenized_datasets = dataset.dataset.map(
    preprocessor.tokenize_and_align_labels,
    batched=True,
    load_from_cache_file=False,
)

Map:  76%|███████▋  | 91000/119185 [00:51<00:15, 1803.96 examples/s]

In [28]:
# Training configuration: outputs go to "test-ner", evaluation runs once per
# epoch, and the hyper-parameters come from the configuration cell.
args = TrainingArguments(
    output_dir="test-ner",
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=max_epochs,
    weight_decay=0.01,
)

In [29]:
# Collator that assembles tokenized examples into batches for the Trainer.
# NOTE(review): with padding=False the collator is asked not to pad, so the
# examples presumably already share one length after
# `preprocessor.tokenize_and_align_labels` — confirm against that method.
data_collator = DataCollatorForTokenClassification(tokenizer=preprocessor.tokenizer, padding=False)

In [30]:
# Wire the model, training arguments, tokenized splits, collator and metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=preprocessor.tokenizer,
    compute_metrics=lambda eval_pred: compute_metrics(p=eval_pred),
)

In [17]:
# Fine-tune on the training split; with evaluation_strategy="epoch" the
# validation split is evaluated at the end of every epoch.
trainer.train()

  0%|          | 0/5592 [00:00<?, ?it/s]You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          | 4/5592 [14:41<341:27:22, 219.98s/it]

In [None]:
# Evaluate on the `eval_dataset` given to the Trainer (the validation split).
trainer.evaluate()


In [None]:
# Run inference on the test split, then keep for every token the index of
# the highest-scoring class.
raw_scores, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(raw_scores, axis=2)

In [None]:
# Pair each predicted id with its gold id, dropping positions whose gold
# label is the -100 ignore marker.
kept_pairs = [
    [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]

# Translate the surviving ids into label names, one list per sequence.
true_predictions = [
    [dataset.labels[pred_id] for pred_id, _ in row]
    for row in kept_pairs
]
true_labels = [
    [dataset.labels[gold_id] for _, gold_id in row]
    for row in kept_pairs
]


In [None]:
# Peek at the first few decoded sequences instead of dumping the predictions
# for the whole test split into the cell output.
true_predictions[:5]

In [None]:
# Score the decoded test predictions against the gold labels and print the
# result between two separator lines.
results = metric.compute(predictions=true_predictions, references=true_labels)
print("*" * 200, results, "*" * 200, sep="\n")


In [None]:
import os
import json
import numpy as np

# Output location: results/results.json
results_directory = "results"
results_file_path = os.path.join(results_directory, "results.json")

# Create the directory if needed (no error when it already exists)
os.makedirs(results_directory, exist_ok=True)

# Define a custom encoder function to handle NumPy values in the results
def custom_encoder(obj):
    """Convert NumPy values that ``json`` cannot serialize into builtin types.

    Meant to be passed as the ``default=`` hook of ``json.dump`` / ``json.dumps``.

    Args:
        obj: The object the JSON encoder could not serialize on its own.

    Returns:
        ``int`` for NumPy integers (any width), ``float`` for NumPy floats,
        ``bool`` for NumPy booleans, and a (nested) ``list`` for arrays.

    Raises:
        TypeError: If `obj` is none of the supported NumPy types.
    """
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        # Covers np.int32 as before, plus np.int64 and the other integer widths.
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError("Object not serializable")

# Write the metric results to the JSON file. Values the json module cannot
# serialize by itself are routed through `custom_encoder` (the `default=` hook).
with open(results_file_path, 'w') as results_file:
    json.dump(results, results_file, default=custom_encoder)

# Report where the file was written.
print(f"Results saved to: {results_file_path}")


In [None]:

# Save the trained model under models/<model_n_version> (currently models/trajet_v2).
# The path is built with os.path.join, like the results path above; the previous
# expanduser("models") call was a no-op because the string has no leading "~".
out_dir = os.path.join("models", model_n_version)
trainer.save_model(out_dir)