In [1]:
import json
import os
from pathlib import Path

import numpy as np
import transformers
from datasets import ClassLabel, DownloadConfig, load_dataset, load_metric
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    CamembertForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

from tokenizer import Tokenizer
from trajet_dataset import IOBTRAJETDataset

ModuleNotFoundError: No module named 'transformers'

In [2]:
# seqeval metric object; read by compute_metrics and by the test-set scoring cell.
# NOTE(review): load_metric is deprecated in recent `datasets` releases — confirm
# the installed version still provides it.
metric = load_metric("seqeval", trust_remote_code=True)

In [3]:
def compute_metrics(p, label_list):
    """Score token-level predictions with the module-level ``metric``.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``np.argmax(..., axis=2)``; ``labels`` holds label ids, where
            ``-100`` marks positions that are left out of the score.
        label_list: Sequence indexed by label id, giving the tag string.

    Returns:
        dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        copied from the ``overall_*`` entries returned by ``metric.compute``.
    """
    raw_scores, gold_ids = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    decoded_predictions = []
    decoded_references = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions whose gold label is not the ignore index.
        kept_pairs = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        decoded_predictions.append([label_list[predicted_id] for predicted_id, _ in kept_pairs])
        decoded_references.append([label_list[gold_id] for _, gold_id in kept_pairs])

    scores = metric.compute(predictions=decoded_predictions, references=decoded_references)

    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

In [4]:
# Run configuration: the values a reader is most likely to tune.
model_n_version = "trajet_v1"  # sub-directory name used when the model is saved
# Parent directory for saved models. The save cell expands "~" and appends
# model_n_version. NOTE(review): placeholder location — adjust to your setup.
model_root_dir = "./models"
max_epochs = 6        # passed to TrainingArguments as num_train_epochs
learning_rate = 2e-5
batch_size = 16       # used for both the train and the eval per-device batch size

In [5]:
# Model weights and tokenizer are both taken from the same checkpoint name.
pretrained_model_checkpoint = pretrained_tokenizer_checkpoint = "camembert-base"

In [6]:
# Disabled step: uncomment the next line to run generate_data.py from the notebook.
# !python generate_data.py

In [7]:
# Project dataset wrapper; later cells read its .labels, .id2label, .label2id
# and .dataset attributes.
dataset = IOBTRAJETDataset()


In [8]:
# Project tokenizer wrapper built from the tokenizer checkpoint; later cells use
# its .tokenize_and_align_labels method and its .tokenizer attribute.
preprocessor = Tokenizer.init_vf(pretrained_tokenizer_checkpoint=pretrained_tokenizer_checkpoint)

In [9]:
# Load the pretrained checkpoint with a token-classification head that has one
# output per dataset label.
model = CamembertForTokenClassification.from_pretrained(
    pretrained_model_checkpoint,
    num_labels=len(dataset.labels),
)
    

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Copy the dataset's label vocabulary (both directions) onto the model config,
# then show the two mappings.
model.config.id2label = dataset.id2label
model.config.label2id = dataset.label2id

print(model.config.id2label)
print(model.config.label2id)

{0: 'B-DEP', 1: 'B-DEST', 2: 'I-DEP', 3: 'I-DEST', 4: 'O'}
{'B-DEP': 0, 'B-DEST': 1, 'I-DEP': 2, 'I-DEST': 3, 'O': 4}


In [11]:
# Tokenize every split in batches; load_from_cache_file=False forces the map
# to be recomputed instead of reusing a cached result.
tokenized_datasets = dataset.dataset.map(
    preprocessor.tokenize_and_align_labels,
    batched=True,
    load_from_cache_file=False,
)

Map:   0%|          | 0/30720 [00:00<?, ? examples/s]Map: 100%|██████████| 30720/30720 [00:19<00:00, 1582.82 examples/s]
Map: 100%|██████████| 15360/15360 [00:06<00:00, 2223.11 examples/s]
Map: 100%|██████████| 15360/15360 [00:06<00:00, 2382.86 examples/s]


In [12]:
# Training hyper-parameters; evaluation runs once per epoch.
args = TrainingArguments(
    output_dir="test-ner",
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=max_epochs,
    weight_decay=0.01,
)

In [13]:
# Data collator for token classification, built from the preprocessor's
# tokenizer; handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(preprocessor.tokenizer)

In [14]:
# Wire the model, training arguments, train/validation splits, collator and
# metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=preprocessor.tokenizer,
    # The Trainer calls this with one argument; the label list is bound here.
    compute_metrics=lambda eval_pred: compute_metrics(p=eval_pred, label_list=dataset.labels),
)

In [15]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

  0%|          | 0/11520 [00:00<?, ?it/s]You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          | 1/11520 [03:02<584:28:22, 182.66s/it]

KeyboardInterrupt: 

In [None]:
    # Evaluate with the Trainer; no dataset is passed here, so it relies on the
    # eval_dataset given at construction (the "validation" split).
    trainer.evaluate()


In [None]:
    # Run the model on the test split, then reduce the raw outputs to one
    # predicted label id per position with an arg-max over axis 2.
    raw_predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
    predictions = np.argmax(raw_predictions, axis=2)


In [None]:
# Turn label ids into tag strings for the test split, skipping every position
# whose gold label is -100. Both lists are built from the same (prediction,
# label) pairs so they stay aligned position by position.
true_predictions = [
    [dataset.labels[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

true_labels = [
    [dataset.labels[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]


In [None]:

    # Score the decoded test predictions and print the report between two
    # 200-character rules.
    results = metric.compute(predictions=true_predictions, references=true_labels)
    rule = "*" * 200
    print(rule, results, rule, sep="\n")


In [None]:

    # Save results to a JSON file
    def _to_builtin(value):
        """Convert NumPy scalars/arrays to plain Python objects for json.dump."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    results_dir = "./results"
    # open() does not create missing parent directories, so make sure it exists.
    os.makedirs(results_dir, exist_ok=True)
    results_file_path = os.path.join(results_dir, "results.json")
    with open(results_file_path, 'w') as results_file:
        # NOTE(review): the metric report presumably contains NumPy scalars,
        # which json cannot encode on its own — hence the converter.
        json.dump(results, results_file, default=_to_builtin)

    print(f"Results saved to: {results_file_path}")


In [None]:

    # Save the model under <model_root_dir>/<model_n_version>, expanding a
    # leading "~" in the root directory first.
    out_dir = f"{os.path.expanduser(model_root_dir)}/{model_n_version}"
    trainer.save_model(out_dir)