# Fine-tuned CamemBERT model

In [58]:
%load_ext autoreload
%autoreload 2

import json
import os
import sys
from collections import Counter
from os import path

from IPython.display import display, HTML

# Add ../src (relative to the notebook's working directory) to the import
# search path. NOTE(review): presumably this is where the `data.textmine`
# package imported further down lives -- confirm.
sys.path.append("./../src")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Data preprocessing

In [61]:
import pandas as pd
import numpy as np

DATA_PATH = path.join('..', 'dataset')
CACHE_DIR = path.join('..', '.cache')

# Read the two cached corpora (jdf and jdr).
df_jdf = pd.read_parquet(path.join(CACHE_DIR, 'jdf.parquet'))
df_jdr = pd.read_parquet(path.join(CACHE_DIR, 'jdr.parquet'))

# Stack both frames into one, renumbering rows from 0.
full_data = pd.concat([df_jdf, df_jdr], ignore_index=True)

# Columns whose first cell holds a NumPy array are converted, cell by cell,
# to plain Python lists before the merged frame is written out.
array_columns = [
    name
    for name in full_data.columns
    if isinstance(full_data.loc[0, name], np.ndarray)
]
for name in array_columns:
    full_data[name] = full_data[name].apply(lambda cell: cell.tolist())

full_data.to_parquet(path.join(CACHE_DIR, 'full.parquet'))

# split and generate dataset
from sklearn.model_selection import train_test_split
import numpy as np

# Fixed seed so that re-running the notebook writes the same train/val
# partition; without it every run produced a different split and the cached
# parquet files (and any metric computed from them) were not reproducible.
SPLIT_SEED = 42

# 85 % train / 15 % validation.
train, val = train_test_split(full_data, test_size=.15, random_state=SPLIT_SEED)
train.to_parquet(path.join(CACHE_DIR, 'train.parquet'), index=False)
val.to_parquet(path.join(CACHE_DIR, 'val.parquet'), index=False)

In [80]:
from data.textmine import TextMineDataset
from transformers import CamembertTokenizerFast, TrainingArguments, Trainer, DataCollatorForTokenClassification, AutoModelForTokenClassification, EarlyStoppingCallback
from datasets import load_metric
from transformers.integrations import TensorBoardCallback


tokenizer = CamembertTokenizerFast.from_pretrained(
    "camembert-base",
    cache_dir=path.join(CACHE_DIR, 'transformers'),
)

# One TextMineDataset per split name, built in the order jdr, jdf, train, val;
# all of them share the tokenizer, the data directory and the cache directory.
jdr, jdf, trainset, valset = [
    TextMineDataset(split_name, tokenizer=tokenizer, data_path=DATA_PATH, cache=CACHE_DIR)
    for split_name in ('jdr', 'jdf', 'train', 'val')
]

loading file sentencepiece.bpe.model from cache at ../.cache/transformers/models--camembert-base/snapshots/3f452b6e5a89b0e6c828c9bba2642bc577086eae/sentencepiece.bpe.model
loading file tokenizer.json from cache at ../.cache/transformers/models--camembert-base/snapshots/3f452b6e5a89b0e6c828c9bba2642bc577086eae/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at ../.cache/transformers/models--camembert-base/snapshots/3f452b6e5a89b0e6c828c9bba2642bc577086eae/config.json
Model config CamembertConfig {
  "_name_or_path": "camembert-base",
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate

Load cache data from ../.cache/jdr.parquet
Generate new label2idx
Load cache data from ../.cache/jdf.parquet
Generate new label2idx
Load cache data from ../.cache/train.parquet
Generate new label2idx
Load cache data from ../.cache/val.parquet
Generate new label2idx


In [79]:
from datasets import load_metric

metric = load_metric("seqeval")


def compute_metrics(p):
    """Turn raw evaluation predictions into a flat dict of seqeval scores.

    Args:
        p: a pair ``(predictions, labels)``. ``predictions`` is arg-maxed over
            axis 2 to obtain one label id per position; positions whose gold
            label is ``-100`` are dropped from both sides before scoring.

    Returns:
        dict with the four ``overall_*`` scores reported by the metric, plus
        ``<key>_f1``, ``<key>_recall`` and ``<key>_precision`` for every other
        key of the metric's result.

    Relies on the notebook-level names ``metric`` and ``id2label``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label (gold id != -100).
        kept_pairs = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([id2label[predicted] for predicted, _ in kept_pairs])
        true_labels.append([id2label[gold] for _, gold in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    overall_keys = (
        "overall_precision",
        "overall_recall",
        "overall_f1",
        "overall_accuracy",
    )
    flattened_results = {key: results[key] for key in overall_keys}

    # Every remaining key maps to a nested dict of scores; flatten each one
    # into three top-level entries.
    for key, nested_scores in results.items():
        if key in flattened_results:
            continue
        for score_name in ("f1", "recall", "precision"):
            flattened_results[key + "_" + score_name] = nested_scores[score_name]

    return flattened_results


In [64]:
# Position -> label name, taken from the order of the training set's label list.
id2label = dict(enumerate(trainset.id2label))

# camembert-base with a token-classification head sized to the label list.
model = AutoModelForTokenClassification.from_pretrained(
    "camembert-base",
    num_labels=len(trainset.id2label),
    id2label=id2label,
    label2id=trainset.label2id,
    cache_dir=CACHE_DIR,
)

# Collator handed to the trainers below to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

loading configuration file config.json from cache at ../.cache/models--camembert-base/snapshots/3f452b6e5a89b0e6c828c9bba2642bc577086eae/config.json
Model config CamembertConfig {
  "_name_or_path": "camembert-base",
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-Email",
    "2": "I-Email",
    "3": "B-Function",
    "4": "I-Function",
    "5": "B-Human",
    "6": "I-Human",
    "7": "B-Location",
    "8": "I-Location",
    "9": "B-Organization",
    "10": "I-Organization",
    "11": "B-Project",
    "12": "B-Phone_Number",
    "13": "I-Phone_Number",
    "14": "I-Project",
    "15": "B-Reference_CEDEX",
    "16": "B-Reference_User",
    "17": "B-Reference_CS",
    "18": "B-Reference_Code_Postal",
    "19": "I-Reference_CS",
    "20": "I-Reference_User",


In [72]:
# Settings for the "final" run (the one trained with the class-weighted loss).
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=path.join('..', '.cache', 'results', 'final'),
    logging_dir=path.join('..', '.cache', 'logs', 'final'),
    report_to='tensorboard',
    # Optimisation schedule.
    num_train_epochs=30,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Log every step; evaluate and checkpoint once per epoch.
    logging_steps=1,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Checkpoint retention and best-model reload at the end of training.
    save_total_limit=2,
    load_best_model_at_end=True,
)

PyTorch: setting up devices


In [73]:
# Inverse-frequency class weights for the weighted cross-entropy loss.

# Every label id of every training example, flattened into one list.
flat_labels = [l for label_array in trainset.data['labels'] for l in label_array]

# label id -> number of occurrences in the training set.
label_frequency = Counter(flat_labels)

idx_labels = list(label_frequency.keys())

# One slot per class of the model (same count that is passed as `num_labels`
# when the model is created), NOT one per label that happens to occur in this
# particular split: a class missing from the training split would otherwise
# shift/shorten the vector and break the loss. Such classes keep weight 0.
label_weight = [0.0] * len(trainset.id2label)
for idx, f in label_frequency.items():
    if idx < 0:
        # Negative ids (e.g. the -100 ignore index) are not classes.
        continue
    label_weight[idx] = 1 / f

# Normalise so that the weights sum to 1.
sum_weight = sum(label_weight)
label_weight = [w / sum_weight for w in label_weight]

In [82]:
from torch import nn
import torch 

class CustomTrainer(Trainer):
    """Trainer that replaces the model's loss with a class-weighted cross-entropy.

    Args:
        label_weight: one weight per class, indexed by label id (the notebook
            passes a plain list of floats).
        **kwargs: forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, label_weight: list, **kwargs):
        self.label_weight = label_weight
        super(CustomTrainer, self).__init__(**kwargs)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy between the logits and ``inputs["labels"]``.

        Args:
            model: the model to run the forward pass on.
            inputs: batch dict; must contain ``"labels"``.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: accepted (and ignored) so that the override
                stays call-compatible with Trainer versions that pass it.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # The weight tensor must live on the same device (and use the same
        # floating dtype) as the logits, otherwise the loss fails as soon as
        # the model runs on an accelerator or in reduced precision.
        class_weight = torch.as_tensor(
            self.label_weight, dtype=logits.dtype, device=logits.device
        )
        loss_fct = nn.CrossEntropyLoss(weight=class_weight)
        # Flatten to (positions, classes) vs (positions,) for the loss.
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
    
# Trainer for the class-weighted run: stops early once the monitored
# evaluation metric has gone 3 evaluations without improving.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    label_weight=label_weight,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=trainset,
    eval_dataset=valset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [83]:
# Run the training loop of the class-weighted `trainer` defined above.
trainer.train()

***** Running training *****
  Num examples = 827
  Num Epochs = 30
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 780
  Number of trainable parameters = 110052123


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Email F1,Email Recall,Email Precision,Function F1,Function Recall,Function Precision,Human F1,Human Recall,Human Precision,Location F1,Location Recall,Location Precision,Organization F1,Organization Recall,Organization Precision,Phone Number F1,Phone Number Recall,Phone Number Precision,Project F1,Project Recall,Project Precision,Reference Cedex F1,Reference Cedex Recall,Reference Cedex Precision,Reference Cs F1,Reference Cs Recall,Reference Cs Precision,Reference Code Postal F1,Reference Code Postal Recall,Reference Code Postal Precision,Reference User F1,Reference User Recall,Reference User Precision,Social Network F1,Social Network Recall,Social Network Precision,Url F1,Url Recall,Url Precision
1,1.6702,2.492454,0.873171,0.77691,0.822232,0.942541,1.0,1.0,1.0,0.83871,0.928571,0.764706,0.973154,0.973154,0.973154,0.880795,0.923611,0.841772,0.839216,0.798507,0.884298,1.0,1.0,1.0,0.454545,0.714286,0.333333,0.96,1.0,0.923077,1.0,1.0,1.0,0.974619,1.0,0.950495,1.0,1.0,1.0,0.108108,0.057143,1.0,0.0,0.0,0.0
2,1.4627,2.398289,0.897817,0.78559,0.837963,0.946867,1.0,1.0,1.0,0.911111,0.97619,0.854167,0.963211,0.966443,0.96,0.953528,0.961806,0.945392,0.804598,0.783582,0.826772,1.0,1.0,1.0,0.344828,0.714286,0.227273,1.0,1.0,1.0,1.0,1.0,1.0,0.989691,1.0,0.979592,1.0,1.0,1.0,0.108108,0.057143,1.0,0.0,0.0,0.0
3,1.2883,2.347303,0.911178,0.792535,0.847725,0.951367,1.0,1.0,1.0,0.91954,0.952381,0.888889,0.963211,0.966443,0.96,0.957118,0.96875,0.945763,0.85283,0.843284,0.862595,1.0,1.0,1.0,0.380952,0.571429,0.285714,1.0,1.0,1.0,1.0,1.0,1.0,0.994819,1.0,0.989691,1.0,1.0,1.0,0.108108,0.057143,1.0,0.0,0.0,0.0
4,1.1924,2.314426,0.911178,0.792535,0.847725,0.951367,1.0,1.0,1.0,0.909091,0.952381,0.869565,0.966443,0.966443,0.966443,0.960549,0.972222,0.949153,0.834586,0.828358,0.840909,1.0,1.0,1.0,0.5,0.714286,0.384615,1.0,1.0,1.0,1.0,1.0,1.0,0.994819,1.0,0.989691,1.0,1.0,1.0,0.108108,0.057143,1.0,0.0,0.0,0.0
5,1.0466,2.300604,0.918675,0.794271,0.851955,0.951886,1.0,1.0,1.0,0.91954,0.952381,0.888889,0.959732,0.959732,0.959732,0.965517,0.972222,0.958904,0.870229,0.850746,0.890625,1.0,1.0,1.0,0.47619,0.714286,0.357143,0.96,1.0,0.923077,1.0,1.0,1.0,0.994819,1.0,0.989691,1.0,1.0,1.0,0.108108,0.057143,1.0,0.0,0.0,0.0
6,0.9958,2.286223,0.918919,0.796875,0.853556,0.95206,1.0,1.0,1.0,0.91954,0.952381,0.888889,0.966443,0.966443,0.966443,0.958904,0.972222,0.945946,0.893939,0.880597,0.907692,1.0,1.0,1.0,0.315789,0.428571,0.25,1.0,1.0,1.0,1.0,1.0,1.0,0.994819,1.0,0.989691,1.0,1.0,1.0,0.108108,0.057143,1.0,0.0,0.0,0.0
7,0.8986,2.277956,0.919563,0.803819,0.857805,0.952752,0.984615,0.989691,0.979592,0.91954,0.952381,0.888889,0.963211,0.966443,0.96,0.97094,0.986111,0.956229,0.907749,0.91791,0.89781,1.0,1.0,1.0,0.333333,0.428571,0.272727,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.108108,0.057143,1.0,0.0,0.0,0.0
8,0.8238,2.266903,0.922542,0.806424,0.860584,0.955175,0.984615,0.989691,0.979592,0.930233,0.952381,0.909091,0.966443,0.966443,0.966443,0.97094,0.986111,0.956229,0.933333,0.940299,0.926471,1.0,1.0,1.0,0.285714,0.428571,0.214286,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.108108,0.057143,1.0,0.0,0.0,0.0
9,0.8647,2.261973,0.925224,0.805556,0.861253,0.954483,0.984615,0.989691,0.979592,0.930233,0.952381,0.909091,0.966443,0.966443,0.966443,0.981132,0.993056,0.969492,0.904412,0.91791,0.891304,1.0,1.0,1.0,0.352941,0.428571,0.3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.108108,0.057143,1.0,0.0,0.0,0.0
10,0.8655,2.269711,0.925,0.802951,0.859665,0.953098,0.984615,0.989691,0.979592,0.91954,0.952381,0.888889,0.963211,0.966443,0.96,0.979452,0.993056,0.966216,0.892193,0.895522,0.888889,1.0,1.0,1.0,0.428571,0.428571,0.428571,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.108108,0.057143,1.0,0.0,0.0,0.0


***** Running Evaluation *****
  Num examples = 146
  Batch size = 32
Saving model checkpoint to ../.cache/results/final/checkpoint-26
Configuration saved in ../.cache/results/final/checkpoint-26/config.json
Model weights saved in ../.cache/results/final/checkpoint-26/pytorch_model.bin
tokenizer config file saved in ../.cache/results/final/checkpoint-26/tokenizer_config.json
Special tokens file saved in ../.cache/results/final/checkpoint-26/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 146
  Batch size = 32
Saving model checkpoint to ../.cache/results/final/checkpoint-52
Configuration saved in ../.cache/results/final/checkpoint-52/config.json
Model weights saved in ../.cache/results/final/checkpoint-52/pytorch_model.bin
tokenizer config file saved in ../.cache/results/final/checkpoint-52/tokenizer_config.json
Special tokens file saved in ../.cache/results/final/checkpoint-52/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 146
  Batch si

TrainOutput(global_step=364, training_loss=1.0484168075925702, metrics={'train_runtime': 6014.2827, 'train_samples_per_second': 4.125, 'train_steps_per_second': 0.13, 'total_flos': 406589507285670.0, 'train_loss': 1.0484168075925702, 'epoch': 14.0})

In [None]:
# Evaluate the class-weighted `trainer` on the validation set (`valset`).
trainer.evaluate(valset)

In [None]:
# Second run: a fresh camembert-base head trained with the stock `Trainer`
# (no class weighting), on the same train/validation datasets.
model2 = AutoModelForTokenClassification.from_pretrained(
    "camembert-base",
    num_labels=len(trainset.id2label),
    id2label=id2label,
    label2id=trainset.label2id,
    cache_dir=CACHE_DIR,
)

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=path.join('..', '.cache', 'results', 'train_val_split'),
    logging_dir=path.join('..', '.cache', 'logs', 'train_val_split'),
    report_to='tensorboard',
    # Optimisation schedule.
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Log every step; evaluate and checkpoint once per epoch.
    logging_steps=1,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Checkpoint retention and best-model reload at the end of training.
    save_total_limit=1,
    load_best_model_at_end=True,
)

trainer2 = Trainer(
    model=model2,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=trainset,
    eval_dataset=valset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer2.train()

loading configuration file config.json from cache at ../.cache/models--camembert-base/snapshots/3f452b6e5a89b0e6c828c9bba2642bc577086eae/config.json
Model config CamembertConfig {
  "_name_or_path": "camembert-base",
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-Email",
    "2": "I-Email",
    "3": "B-Function",
    "4": "I-Function",
    "5": "B-Human",
    "6": "I-Human",
    "7": "B-Location",
    "8": "I-Location",
    "9": "B-Organization",
    "10": "I-Organization",
    "11": "B-Project",
    "12": "B-Phone_Number",
    "13": "I-Phone_Number",
    "14": "I-Project",
    "15": "B-Reference_CEDEX",
    "16": "B-Reference_User",
    "17": "B-Reference_CS",
    "18": "B-Reference_Code_Postal",
    "19": "I-Reference_CS",
    "20": "I-Reference_User",


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Email F1,Email Recall,Email Precision,Function F1,Function Recall,Function Precision,Human F1,Human Recall,Human Precision,Location F1,Location Recall,Location Precision,Organization F1,Organization Recall,Organization Precision,Phone Number F1,Phone Number Recall,Phone Number Precision,Project F1,Project Recall,Project Precision,Reference Cedex F1,Reference Cedex Recall,Reference Cedex Precision,Reference Cs F1,Reference Cs Recall,Reference Cs Precision,Reference Code Postal F1,Reference Code Postal Recall,Reference Code Postal Precision,Reference User F1,Reference User Recall,Reference User Precision,Social Network F1,Social Network Recall,Social Network Precision,Url F1,Url Recall,Url Precision
1,2.2302,2.183944,0.467532,0.28125,0.35122,0.712184,0.761905,0.907216,0.656716,0.0,0.0,0.0,0.801262,0.852349,0.755952,0.101053,0.083333,0.128342,0.111628,0.089552,0.148148,0.595918,0.598361,0.593496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.7609,1.69898,0.838677,0.726562,0.778605,0.869505,0.801802,0.917526,0.712,0.0,0.0,0.0,0.959732,0.959732,0.959732,0.898223,0.965278,0.839879,0.724832,0.80597,0.658537,0.983607,0.983607,0.983607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034091,0.018072,0.3
3,1.4344,1.425219,0.869439,0.780382,0.822507,0.928868,0.979592,0.989691,0.969697,0.906977,0.928571,0.886364,0.959732,0.959732,0.959732,0.903021,0.986111,0.832845,0.882353,0.895522,0.869565,0.991803,0.991803,0.991803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.2435,1.259483,0.914915,0.793403,0.849837,0.94081,0.974619,0.989691,0.96,0.906977,0.928571,0.886364,0.956522,0.959732,0.953333,0.967521,0.982639,0.952862,0.923077,0.940299,0.906475,1.0,1.0,1.0,0.0,0.0,0.0,0.72,0.75,0.692308,0.0,0.0,0.0,0.979592,1.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


***** Running Evaluation *****
  Num examples = 146
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ../.cache/results/train_val_split/checkpoint-26
Configuration saved in ../.cache/results/train_val_split/checkpoint-26/config.json
Model weights saved in ../.cache/results/train_val_split/checkpoint-26/pytorch_model.bin
tokenizer config file saved in ../.cache/results/train_val_split/checkpoint-26/tokenizer_config.json
Special tokens file saved in ../.cache/results/train_val_split/checkpoint-26/special_tokens_map.json
Deleting older checkpoint [../.cache/results/train_val_split/checkpoint-52] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 146
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ../.cache/results/train_val_split/checkpoint-52
Configuration saved in ../.cache/results/train_val_split/checkpoint-52/config.json
Model weights saved in ../.cache/resul

## Inference

In [None]:
JDA_PATH = path.join(DATA_PATH, 'JDA.json')

# Decode explicitly as UTF-8: relying on the platform default encoding can
# corrupt (or fail on) accented characters when the notebook runs on a system
# whose default is not UTF-8.
with open(JDA_PATH, 'r', encoding='utf-8') as f:
    data_jda = json.load(f)

In [None]:
def most_frequent(List):
    """Return the element that occurs most often in ``List``.

    Ties are resolved deterministically in favour of the element that appears
    first in ``List``. Counting is done in a single pass.

    Args:
        List: non-empty sequence of hashable items.

    Returns:
        The most common item.

    Raises:
        ValueError: if ``List`` is empty.
    """
    if not List:
        raise ValueError("most_frequent() requires a non-empty sequence")
    # most_common keeps first-encountered order among equal counts.
    return Counter(List).most_common(1)[0][0]

# Label every annotated entity of every JDA sentence with the fine-tuned
# `model`: each entity receives, under the key 'label_bert', the majority label
# of the tokens overlapping its [begin, end) span.
# NOTE(review): this compares entity['begin'] / entity['end'] directly with the
# tokenizer's offset mapping, i.e. it assumes they are character offsets into
# sentence['text'] -- confirm against the JDA file format.

model.eval()  # inference mode (no dropout)

for sentence in data_jda:
    encoding = tokenizer(
        sentence['text'],
        return_offsets_mapping=True,
        return_tensors='pt',
        truncation=True,  # texts longer than the model's maximum are cut instead of crashing
    )
    # (start, end) span of every token; special tokens come out as (0, 0).
    offsets = encoding.pop('offset_mapping')[0].tolist()
    # Run the forward pass on whatever device the model currently lives on.
    encoding = {name: tensor.to(model.device) for name, tensor in encoding.items()}

    with torch.no_grad():
        logits = model(**encoding).logits[0]
    token_label_ids = logits.argmax(dim=-1).tolist()

    for entity in sentence['annotations']:
        predictions = list()

        for (token_begin, token_end), label_id in zip(offsets, token_label_ids):
            if token_end <= token_begin:
                # Empty span: special token, never part of an entity.
                continue
            if token_begin < entity['end'] and token_end > entity['begin']:
                # Decode with the model's own mapping so the label vocabulary
                # is the one the model was trained with.
                entity_label = model.config.id2label[label_id]
                # Drop the two-character "B-" / "I-" prefix; one-character
                # tags ("O") are kept as they are.
                entity_label = entity_label if len(entity_label) == 1 else entity_label[2:]
                predictions.append(entity_label)

        # No overlapping token (e.g. the entity lies beyond the truncation
        # point): record the absence of a prediction rather than failing.
        entity['label_bert'] = most_frequent(predictions) if predictions else None

In [None]:
# Persist the annotated sentences (now carrying 'label_bert') next to the inputs.
jda_bert_path = path.join(DATA_PATH, 'jda_bert.json')
with open(jda_bert_path, 'w') as f:
    json.dump(data_jda, f)