In [1]:
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
import sys
import os
from os import path

sys.path.append("./../src")

In [2]:
import pandas as pd
import json

DATA_PATH = path.join('..', 'dataset')
JDF_PATH = path.join(DATA_PATH, 'JDF.json')
JDR_PATH = path.join(DATA_PATH, 'JDR.json')

data = dict()

with open(JDR_PATH, 'r') as f:
    data['jdr'] = json.load(f)
    
with open(JDF_PATH, 'r') as f:
    data['jdf'] = json.load(f)

print('JDR #examples :',len(data['jdr']))
print('JDF #examples :',len(data['jdf']))

annotations = [a for d in data['jdr'] for a in d['annotations']]
df_annotations = pd.DataFrame(annotations)
df_annotations["label"] = df_annotations["label"].astype("category")
display(HTML('<h3>Les entités nommées</h3>'))
display(df_annotations.head())

print('Labels:', df_annotations['label'].unique())

JDR #examples : 473
JDF #examples : 500


Unnamed: 0,form,label,begin,end
0,Faustin,Human,0,7
1,Chabot,Human,8,14
2,19,Location,28,30
3,rue,Location,31,34
4,Descartes,Location,35,44


Labels: ['Human', 'Location', 'Reference_Code_Postal', 'Reference_CEDEX', 'Reference_CS', ..., 'Phone_Number', 'Social_Network', 'Reference_User', 'Organization', 'Url']
Length: 13
Categories (13, object): ['Email', 'Function', 'Human', 'Location', ..., 'Reference_Code_Postal', 'Reference_User', 'Social_Network', 'Url']


<div class="alert alert-block alert-info">Format d'une pharse donnée</div>

In [3]:
data['jdr'][0]

{'identifier': 500,
 'text': 'Faustin Chabot\\r\\nAdresse : 19 rue Descartes 94370 Sucy-en-Brie (France)\\r\\nCedex 9 CS 12468\\r\\nData Engineer / Algorithm XZ Project\\r\\nfaustinchabot@teleworm.com / Tel : +33 0134354919\\r\\nLinkedin : https://fr.linkedin.com/in/fauchab\\r\\nTeleworm France\\r\\nteleworm.france.com',
 'annotations': [{'form': 'Faustin', 'label': 'Human', 'begin': 0, 'end': 7},
  {'form': 'Chabot', 'label': 'Human', 'begin': 8, 'end': 14},
  {'form': '19', 'label': 'Location', 'begin': 28, 'end': 30},
  {'form': 'rue', 'label': 'Location', 'begin': 31, 'end': 34},
  {'form': 'Descartes', 'label': 'Location', 'begin': 35, 'end': 44},
  {'form': '94370', 'label': 'Reference_Code_Postal', 'begin': 45, 'end': 50},
  {'form': 'Sucy-en-Brie', 'label': 'Location', 'begin': 51, 'end': 63},
  {'form': 'France', 'label': 'Location', 'begin': 65, 'end': 71},
  {'form': 'Cedex', 'label': 'Reference_CEDEX', 'begin': 76, 'end': 81},
  {'form': '9', 'label': 'Reference_CEDEX', 'be

<div class="alert alert-block alert-info">Est-ce qu'il y a de motif pour le numéro de téléphone?</div>

> Apparemment non

In [4]:
df_annotations[df_annotations['label']=='Phone_Number'].head(20)

Unnamed: 0,form,label,begin,end
18,+33 0134354919,Phone_Number,171,185
45,03.18.38.37.37,Phone_Number,136,150
72,+ 03 81 20 48 27,Phone_Number,152,168
91,01.75.88.25.30,Phone_Number,77,91
122,+ 33 01 77 83 74 05,Phone_Number,157,176
134,+33 0365962110,Phone_Number,70,84
170,01.55.29.21.75,Phone_Number,118,132
191,+ 33 01 79 28 30 87,Phone_Number,75,94
225,33 0147908347,Phone_Number,150,163
250,03.54.57.86.42,Phone_Number,162,176


<div class="alert alert-block alert-info">Liste des étiquettes à prédire</div>

In [5]:
list(df_annotations['label'].unique())

['Human',
 'Location',
 'Reference_Code_Postal',
 'Reference_CEDEX',
 'Reference_CS',
 'Function',
 'Project',
 'Email',
 'Phone_Number',
 'Social_Network',
 'Reference_User',
 'Organization',
 'Url']

<div class="alert alert-block alert-info">Les étiquettes sont-elles chevauchées?</div>

> Oui, il semble un bruit dans l'outil d'annotation. Il suffit de surrprimer celui contenu dans la vraie étiquette

In [5]:
def check_overlapping(data):
    for data_row in data:
        annotations = data_row['annotations']
        for pre, post in zip(annotations[:-1], annotations[1:]):
            if pre['end'] > post['begin']:
                print('TEXT:', data_row['text'])
                print(pre)
                print(post)
                print('='*15)

def remove_overlapping(data):
    for data_row in data:
        annotations = data_row['annotations']
        for pre, post in zip(annotations[:-1], annotations[1:]):
            if pre['end'] >= post['end']:
                annotations.remove(post)

for split in data:
    display(HTML('<h3>'+split+'</h3>'))
    check_overlapping(data[split])

for split in data:
    remove_overlapping(data[split])
    check_overlapping(data[split])

TEXT: Sibyla Chandonnet\nTechnicienne en radiologie / Health For All\nTel : +33 0365962110 \nsibylachandonnet@yahoo.fr\n89 rue du Général Ailleret 62300 Lens\nCedex 08 CS 40362\nThe Happy Bear / happybearhealth.com\nLinkedin : https://fr.linkedin.com/in/sibylac
{'form': 'Technicienne', 'label': 'Function', 'begin': 19, 'end': 31}
{'form': 'en', 'label': 'Function', 'begin': 27, 'end': 29}
TEXT: Dorothée Charrette\n94 rue du Faubourg National 94320 Thiais\nCedex 02\n+33 01 70 92 06 69 - dorotheecharette@outlook.com\nEscrow Papers - escrowpapers.fr\nPoste : Technicienne en téléphonie mobile
{'form': 'Technicienne', 'label': 'Function', 'begin': 164, 'end': 176}
{'form': 'en', 'label': 'Function', 'begin': 172, 'end': 174}
TEXT: Coralie Sanschagrin\nVendeuse en magasin - Supermarket Lists\nsupermarketlists.fr\nAdresse : 12 place Maurice-Charretier 94220 Charenton-sur-le-Pont\nTél : 0171220748\nEmail : coraliesanschagrin@gmail.com
{'form': 'Vendeuse', 'label': 'Function', 'begin': 21, 'end

<div class="alert alert-block alert-info">Est-ce qu'il existe un exemple qui manque d'annotation? Combien? Lesquels?</div>

> Non, apparemment très cohérent!

In [6]:
for split in data:
    for data_row in data[split]:
        annotations = data_row['annotations']
        if len(annotations) == 0:
            print(data_row)

## Tokenization test

In [7]:
from transformers import CamembertTokenizerFast

MAX_LINE = 1000000
CACHE_DIR = path.join('..', '.cache')
TRANSFORMERS_DIR = path.join(CACHE_DIR, 'transformers')


texts = [d['text'] for d in data['jdr'][:MAX_LINE]]
print('Testing text:')
display(texts[:2])

#fast_tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base", cache_dir=CACHE_DIR, additional_special_tokens=['\\n', '\\r', 'https://'])
fast_tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base", cache_dir=TRANSFORMERS_DIR)

token_encodings = fast_tokenizer(texts, return_offsets_mapping=True)

token_strings = [fast_tokenizer.convert_ids_to_tokens(input_ids) for input_ids in token_encodings.input_ids]

for tkstr in token_strings[:2]:
    print(tkstr)

Testing text:


['Faustin Chabot\\r\\nAdresse : 19 rue Descartes 94370 Sucy-en-Brie (France)\\r\\nCedex 9 CS 12468\\r\\nData Engineer / Algorithm XZ Project\\r\\nfaustinchabot@teleworm.com / Tel : +33 0134354919\\r\\nLinkedin : https://fr.linkedin.com/in/fauchab\\r\\nTeleworm France\\r\\nteleworm.france.com',
 'Vallis Lachance\\r\\nConcepteur de publications web - Un Site, une BD\\r\\n14 rue Victor Hugo 60200 Compiègne\\r\\nCedex 12 CS 10202\\r\\nTel : 03.18.38.37.37\\r\\nEmail : vallislachance@monwax.com\\r\\nMonwax \\r\\nmonwax.com\\r\\nFacebook : https://www.facebook.com/vallislachance']

['<s>', '▁Faust', 'in', '▁Cha', 'bot', '\\', 'r', '\\', 'n', 'Ad', 'resse', '▁:', '▁19', '▁rue', '▁Descartes', '▁94', '370', '▁Suc', 'y', '-', 'en', '-', 'B', 'rie', '▁(', 'France', ')', '\\', 'r', '\\', 'n', 'Ce', 'dex', '▁9', '▁CS', '▁124', '68', '\\', 'r', '\\', 'n', 'D', 'ata', '▁Engine', 'er', '▁/', '▁Al', 'gor', 'ith', 'm', '▁X', 'Z', '▁Project', '\\', 'r', '\\', 'n', 'fa', 'ustin', 'cha', 'bot', '@', 'tel', 'e', 'w', 'orm', '.', 'com', '▁/', '▁Tel', '▁:', '▁+', '33', '▁01', '34', '35', '49', '19', '\\', 'r', '\\', 'n', 'L', 'ink', 'e', 'din', '▁:', '▁https', '://', 'fr', '.', 'link', 'e', 'din', '.', 'com', '/', 'in', '/', 'fa', 'uch', 'ab', '\\', 'r', '\\', 'n', 'T', 'ele', 'w', 'orm', '▁France', '\\', 'r', '\\', 'nt', 'ele', 'w', 'orm', '.', 'france', '.', 'com', '</s>']
['<s>', '▁Val', 'lis', '▁La', 'ch', 'ance', '\\', 'r', '\\', 'n', 'Con', 'cept', 'eur', '▁de', '▁publications', '▁web', '▁-', '▁Un', '▁Site', ',', '▁une', '▁BD', '\\', 'r', '\\', 'n', '14', '▁rue', '▁Victor', 

In [8]:
def mapping_label_token(token_span_batch, annotations_batch):
    """
    Remap IOB tag to each token generated by tokenizer. Should provide the span (begin/end)
    """
    
    labels = list()
    
    for token_span_sent, annotations in zip(token_span_batch, annotations_batch):
        
        annotations = annotations.copy()
        entity = annotations.pop(0)
        
        last_label = 'O'
        token_label = list()
        
        for token in token_span_sent:
            
            while entity['end'] < token['begin']: entity = annotations.pop(0)

            if token['begin'] == token['end']:
                label = 'O'    
            elif entity['begin'] <= token['begin'] and token['end'] <= entity['end']:
                prefix = 'B-' if last_label == 'O' or last_label[2:] != entity['label'] else 'I-'
                label = prefix + entity['label']
            else:
                label = 'O'
                
            token_label.append(label)
            last_label = label
                
        labels.append(token_label)
        
    return labels

def tokenize_text(texts, annotations, tokenizer):
    
    # Tokenize text
    token_encodings = tokenizer(texts, return_offsets_mapping=True)
    token_encodings['tokens'] = [fast_tokenizer.convert_ids_to_tokens(input_ids) for input_ids in token_encodings.input_ids]
    
    # Mapping labels
    token_span = token_encodings.offset_mapping
    token_span_dict = [[{'begin': span[0], 'end': span[1]} for span in token_sent ] for token_sent in token_span]
    token_encodings['ner_tags'] = mapping_label_token(token_span_dict, annotations)
    
    return token_encodings

annotations = [d['annotations'] for d in data['jdr'][:MAX_LINE]]
texts = [d['text'] for d in data['jdr'][:MAX_LINE]]
tokenized = tokenize_text(texts, annotations, fast_tokenizer)

all_labels = [i for l in tokenized['ner_tags'] for i in l ]
unique_label = set(all_labels)
id2label = list(unique_label)
print('id2label =',id2label)
label2id = {label: idx for idx, label in enumerate(id2label)}
print('label2id =',label2id)

tokenized['labels'] = [[label2id[label] for label in label_sentence] for label_sentence in tokenized['ner_tags']]

id2label = ['I-Reference_CS', 'B-Reference_Code_Postal', 'I-Project', 'I-Reference_Code_Postal', 'I-Human', 'B-Human', 'I-Reference_CEDEX', 'I-Email', 'I-Social_Network', 'I-Location', 'B-Reference_CEDEX', 'B-Reference_CS', 'B-Email', 'B-Phone_Number', 'B-Social_Network', 'I-Url', 'I-Organization', 'I-Phone_Number', 'B-Location', 'B-Organization', 'B-Url', 'B-Reference_User', 'I-Function', 'I-Reference_User', 'B-Project', 'B-Function', 'O']
label2id = {'I-Reference_CS': 0, 'B-Reference_Code_Postal': 1, 'I-Project': 2, 'I-Reference_Code_Postal': 3, 'I-Human': 4, 'B-Human': 5, 'I-Reference_CEDEX': 6, 'I-Email': 7, 'I-Social_Network': 8, 'I-Location': 9, 'B-Reference_CEDEX': 10, 'B-Reference_CS': 11, 'B-Email': 12, 'B-Phone_Number': 13, 'B-Social_Network': 14, 'I-Url': 15, 'I-Organization': 16, 'I-Phone_Number': 17, 'B-Location': 18, 'B-Organization': 19, 'B-Url': 20, 'B-Reference_User': 21, 'I-Function': 22, 'I-Reference_User': 23, 'B-Project': 24, 'B-Function': 25, 'O': 26}


In [9]:
import json

id2label = list(unique_label)
def fn_sort(x):
    val = 0 if len(x) == 1 else (ord(x[2]))*1e3 + ord(x[0])
    return val

id2label = sorted(id2label, key=fn_sort, reverse=False)
label2id = {label: idx for idx, label in enumerate(id2label)}
label2id

with open(path.join(CACHE_DIR, 'label_idx.json'), "w") as f:
    json.dump({
        'label2id': label2id,
        'id2label': id2label
    }, f, indent='\t')

In [11]:
df = pd.DataFrame(dict(tokenized))
os.makedirs(path.join('..', '.cache'), exist_ok=True)
df.to_parquet(path.join('..','.cache', 'jdr.parquet'))
df

Unnamed: 0,input_ids,attention_mask,offset_mapping,tokens,ner_tags,labels
0,"[5, 28119, 236, 2614, 8674, 3155, 81, 3155, 25...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 5), (5, 7), (8, 11), (11, 14), (1...","[<s>, ▁Faust, in, ▁Cha, bot, \, r, \, n, Ad, r...","[O, B-Human, I-Human, I-Human, I-Human, O, O, ...","[18, 3, 7, 7, 7, 18, 18, 18, 18, 18, 18, 18, 2..."
1,"[5, 1598, 4026, 61, 751, 1269, 3155, 81, 3155,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 3), (3, 6), (7, 9), (9, 11), (11,...","[<s>, ▁Val, lis, ▁La, ch, ance, \, r, \, n, Co...","[O, B-Human, I-Human, I-Human, I-Human, I-Huma...","[18, 3, 7, 7, 7, 7, 18, 18, 18, 18, 11, 8, 8, ..."
2,"[5, 11904, 73, 6445, 276, 8348, 88, 3155, 81, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (4, 6), (6, 8), (8, 10), (11,...","[<s>, ▁Arch, ai, mb, au, ▁Mass, on, \, r, \, n...","[O, B-Human, I-Human, I-Human, I-Human, I-Huma...","[18, 3, 7, 7, 7, 7, 7, 18, 18, 18, 18, 11, 8, ..."
3,"[5, 470, 1606, 9313, 2265, 3155, 81, 3155, 255...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (4, 8), (9, 12), (12, 16), (1...","[<s>, ▁Jean, ette, ▁Fre, mont, \, r, \, n, 8, ...","[O, B-Human, I-Human, I-Human, I-Human, O, O, ...","[18, 3, 7, 7, 7, 18, 18, 18, 18, 2, 10, 10, 10..."
4,"[5, 3696, 19483, 236, 3155, 81, 3155, 255, 137...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (5, 8), (8, 10), (10, 11), (1...","[<s>, ▁Cher, ▁Baz, in, \, r, \, n, Mé, can, ic...","[O, B-Human, I-Human, I-Human, O, O, O, O, B-F...","[18, 3, 7, 7, 18, 18, 18, 18, 11, 8, 8, 8, 8, ..."
...,...,...,...,...,...,...
468,"[5, 4114, 61, 29807, 3155, 255, 6179, 7148, 43...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 6), (7, 9), (9, 13), (13, 14), (1...","[<s>, ▁Claude, ▁La, ndry, \, n, Ad, resse, ▁:,...","[O, B-Human, I-Human, I-Human, O, O, O, O, O, ...","[18, 3, 7, 7, 18, 18, 18, 18, 18, 2, 10, 10, 1..."
469,"[5, 11853, 9625, 10, 1981, 3155, 255, 3853, 30...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 9), (10, 13), (13, 14), (14, 18),...","[<s>, ▁Charlotte, ▁Bus, s, ière, \, n, 59, ▁co...","[O, B-Human, I-Human, I-Human, I-Human, O, O, ...","[18, 3, 7, 7, 7, 18, 18, 2, 10, 10, 10, 10, 12..."
470,"[5, 18467, 24817, 3155, 255, 3225, 9, 3220, 9,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 6), (7, 14), (14, 15), (15, 16), ...","[<s>, ▁Cédric, ▁Garnier, \, n, 04, ., 27, ., 1...","[O, B-Human, I-Human, O, O, B-Phone_Number, I-...","[18, 3, 7, 18, 18, 16, 0, 0, 0, 0, 0, 0, 0, 0,..."
471,"[5, 14147, 10223, 11734, 4461, 3155, 255, 3395...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 3), (3, 9), (10, 13), (13, 16), (...","[<s>, ▁Fla, vienne, ▁Dev, ost, \, n, 02, ., 56...","[O, B-Human, I-Human, I-Human, I-Human, O, O, ...","[18, 3, 7, 7, 7, 18, 18, 16, 0, 0, 0, 0, 0, 0,..."


In [15]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification, AutoModelForTokenClassification, EarlyStoppingCallback
from datasets import load_metric
import numpy as np
from transformers.integrations import TensorBoardCallback

metric = load_metric("seqeval")


from torch.utils.data import Dataset

class TextMineDataset(Dataset):
    
    def __init__(self, df):
        self.data = df

    def __getitem__(self, idx):
        if idx >= len(self): raise IndexError  # meet the end of dataset
        sample = self.data.loc[idx].to_dict()
        #for k, v in sample.items():
        #   print(k, '=', len(v))
        return {'input_ids': sample['input_ids'], 'attention_mask': sample['attention_mask'],'labels': sample['labels']}

    def __len__(self):
        return len(self.data)

dataset = TextMineDataset(df)

def compute_metrics(p):
    
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }
    for k in results.keys():
        if(k not in flattened_results.keys()):
            flattened_results[k+"_f1"]=results[k]["f1"]

    return flattened_results

model = AutoModelForTokenClassification.from_pretrained("camembert-base", num_labels=len(id2label), id2label=id2label, label2id=label2id, cache_dir=CACHE_DIR)
data_collator = DataCollatorForTokenClassification(fast_tokenizer)

training_args = TrainingArguments(
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    num_train_epochs=20,
    output_dir=path.join('..', '.cache', 'results'),
    logging_dir=path.join('..', '.cache', 'logs'),
    logging_steps=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    data_collator=data_collator,
    tokenizer=fast_tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3),
        TensorBoardCallback(),
    ]
)

trainer.train()

  metric = load_metric("seqeval")
Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForTokenClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing CamembertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRA

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## Make Pytorch TextMine Dataset from raw files

In [41]:
from data.textmine import TextMineDataset
from transformers import CamembertTokenizerFast

DATA_PATH = path.join('..', 'dataset')
CACHE_DIR = path.join('..', '.cache')

tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base", cache_dir=path.join(CACHE_DIR, 'transformers'))
jdr = TextMineDataset('jdr', tokenizer=tokenizer, data_path=DATA_PATH, cache=CACHE_DIR)
jdf = TextMineDataset('jdf', tokenizer=tokenizer, data_path=DATA_PATH, cache=CACHE_DIR)

Load cache data from ../.cache/jdr.parquet
Load label idx from ../.cache/label_idx.json
Load cache data from ../.cache/jdf.parquet
Load label idx from ../.cache/label_idx.json


In [42]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification, AutoModelForTokenClassification, EarlyStoppingCallback
from datasets import load_metric
import numpy as np
from transformers.integrations import TensorBoardCallback
from data.textmine import TextMineDataset

metric = load_metric("seqeval")

def compute_metrics(p): 
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Remove ignored index (special tokens)
    true_predictions = [ [jdf.id2label[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels) ]
    true_labels = [ [jdf.id2label[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels) ]
    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=-1.)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }
    for k in results.keys():
        if(k not in flattened_results.keys()):
            flattened_results[k+"_f1"]=results[k]["f1"]

    return flattened_results

model = AutoModelForTokenClassification.from_pretrained("camembert-base", num_labels=len(jdf.id2label), id2label=jdf.id2label, label2id=jdf.label2id, cache_dir=CACHE_DIR)
data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    num_train_epochs=20,
    output_dir=path.join('..', '.cache', 'results', 'train_jdf_val_jdr'),
    logging_dir=path.join('..', '.cache', 'logs', 'train_jdf_val_jdr'),
    logging_steps=1,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True
)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing CamembertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream tas

Train on `jdr`, val on `jdf`

In [43]:
print('JDF:')
labels = [set(l) for l in jdf.data.labels]
labels = [l for l_array in labels for l in l_array]
labels = set(labels)
print('idx_labels:', labels)
print('str_labels:', [jdf.id2label[l] for l in labels])

JDF:
idx_labels: {0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 14, 15, 20}
str_labels: ['O', 'B-Email', 'I-Email', 'B-Human', 'I-Human', 'B-Location', 'I-Location', 'B-Organization', 'I-Organization', 'B-Phone_Number', 'I-Phone_Number', 'B-Reference_Code_Postal', 'I-Reference_Code_Postal']


In [44]:
print('JDR:')
labels = [set(l) for l in jdr.data.labels]
labels = [l for l_array in labels for l in l_array]
labels = set(labels)
print('idx_labels:', labels)
print('str_labels:', [jdr.id2label[l] for l in labels])

JDR:
idx_labels: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}
str_labels: ['O', 'B-Email', 'I-Email', 'B-Function', 'I-Function', 'B-Human', 'I-Human', 'B-Location', 'I-Location', 'B-Organization', 'I-Organization', 'B-Phone_Number', 'B-Project', 'I-Project', 'I-Phone_Number', 'B-Reference_Code_Postal', 'B-Reference_CEDEX', 'B-Reference_CS', 'B-Reference_User', 'I-Reference_CS', 'I-Reference_Code_Postal', 'I-Reference_CEDEX', 'I-Reference_User', 'B-Social_Network', 'I-Social_Network', 'B-Url', 'I-Url']


In [88]:
from transformers import AdamW

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=jdf,
    eval_dataset=jdr,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3),
    ]
)

trainer.train()

***** Running training *****
  Num examples = 500
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 640
  Number of trainable parameters = 110052123
You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Email F1,Function F1,Human F1,Location F1,Organization F1,Phone Number F1,Project F1,Reference Cedex F1,Reference Cs F1,Reference Code Postal F1,Reference User F1,Social Network F1,Url F1
1,2.0968,2.249593,0.455177,0.389309,0.419674,0.648971,0.200466,0.0,0.905847,0.609732,0.023064,0.477642,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.5472,1.808935,0.599062,0.62041,0.609549,0.772674,0.37785,0.0,0.889324,0.813008,0.38162,0.618474,0.0,0.0,0.0,0.962162,0.0,0.0,0.0
3,1.2824,1.631734,0.572599,0.603672,0.587725,0.7845,0.350679,0.0,0.865878,0.785948,0.434583,0.488281,0.0,0.0,0.0,0.938918,0.0,0.0,0.0
4,1.103,1.503148,0.539953,0.62392,0.578908,0.792828,0.346112,0.0,0.881592,0.790226,0.362292,0.563654,0.0,0.0,0.0,0.938918,0.0,0.0,0.0
5,0.9654,1.381335,0.585372,0.646058,0.61422,0.805681,0.424173,0.0,0.873767,0.809204,0.434048,0.614341,0.0,0.0,0.0,0.940559,0.0,0.0,0.0
6,0.8779,1.325044,0.549306,0.63013,0.586948,0.796479,0.397714,0.0,0.859406,0.794159,0.383595,0.579512,0.0,0.0,0.0,0.908475,0.0,0.0,0.0
7,0.8526,1.248609,0.568321,0.6277,0.596536,0.801612,0.416953,0.0,0.865135,0.781991,0.397653,0.608016,0.0,0.0,0.0,0.924399,0.0,0.0,0.0
8,0.6654,1.181911,0.542783,0.618251,0.578064,0.806518,0.416465,0.0,0.91875,0.789174,0.298524,0.61629,0.0,0.0,0.0,0.930796,0.0,0.0,0.0
9,0.6503,1.152512,0.529952,0.62338,0.572882,0.800966,0.419394,0.0,0.86742,0.788973,0.314903,0.624506,0.0,0.0,0.0,0.934028,0.0,0.0,0.0
10,0.5794,1.074131,0.598232,0.657667,0.626543,0.816785,0.457831,0.0,0.89613,0.802804,0.376187,0.761715,0.0,0.0,0.0,0.940351,0.0,0.0,0.0


***** Running Evaluation *****
  Num examples = 473
  Batch size = 16
Saving model checkpoint to ../.cache/results/train_jdf_val_jdr/checkpoint-32
Configuration saved in ../.cache/results/train_jdf_val_jdr/checkpoint-32/config.json
Model weights saved in ../.cache/results/train_jdf_val_jdr/checkpoint-32/pytorch_model.bin
tokenizer config file saved in ../.cache/results/train_jdf_val_jdr/checkpoint-32/tokenizer_config.json
Special tokens file saved in ../.cache/results/train_jdf_val_jdr/checkpoint-32/special_tokens_map.json
Deleting older checkpoint [../.cache/results/train_jdf_val_jdr/checkpoint-96] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 473
  Batch size = 16
Saving model checkpoint to ../.cache/results/train_jdf_val_jdr/checkpoint-64
Configuration saved in ../.cache/results/train_jdf_val_jdr/checkpoint-64/config.json
Model weights saved in ../.cache/results/train_jdf_val_jdr/checkpoint-64/pytorch_model.bin
tokenizer config file saved in ../.cache/

TrainOutput(global_step=576, training_loss=0.8499794393881328, metrics={'train_runtime': 3730.2654, 'train_samples_per_second': 2.681, 'train_steps_per_second': 0.172, 'total_flos': 241727585790744.0, 'train_loss': 0.8499794393881328, 'epoch': 18.0})

## Reshuffle JDR and JDF, get new splits

In [38]:
# load jdf and jdr
df_jdf = pd.read_parquet(path.join(CACHE_DIR, 'jdf.parquet'))
df_jdr = pd.read_parquet(path.join(CACHE_DIR, 'jdr.parquet'))
df_jdr = df_jdr.drop(columns=['labels'])

# fusion into full data
full_data = pd.concat([df_jdf, df_jdr], ignore_index=True)
for col in full_data.columns:
    if isinstance(full_data.loc[0, col], np.ndarray):
        full_data[col] = full_data[col].apply(lambda x: x.tolist())
full_data.to_parquet(path.join(CACHE_DIR, 'full.parquet'))

# split and generate dataset
from sklearn.model_selection import train_test_split
import numpy as np

train, val = train_test_split(full_data, test_size=.15)
train.to_parquet(path.join(CACHE_DIR, 'train.parquet'), index=False)
val.to_parquet(path.join(CACHE_DIR, 'val.parquet'), index=False)

trainset = TextMineDataset('train', tokenizer=tokenizer, data_path=DATA_PATH, cache=CACHE_DIR)
valset = TextMineDataset('val', tokenizer=tokenizer, data_path=DATA_PATH, cache=CACHE_DIR)

Load cache data from ../.cache/train.parquet
Load label idx from ../.cache/label_idx.json
Load cache data from ../.cache/val.parquet
Load label idx from ../.cache/label_idx.json


In [None]:
training_args = TrainingArguments(
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    num_train_epochs=20,
    output_dir=path.join('..', '.cache', 'results', 'train_vaL_split'),
    logging_dir=path.join('..', '.cache', 'logs', 'train_vaL_split'),
    logging_steps=1,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=valset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3),
    ]
)

trainer.train()

***** Running training *****
  Num examples = 827
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1040
  Number of trainable parameters = 110052123
You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Email F1,Function F1,Human F1,Location F1,Organization F1,Phone Number F1,Project F1,Reference Cedex F1,Reference Cs F1,Reference Code Postal F1,Reference User F1,Social Network F1,Url F1
1,1.7417,1.658992,0.908745,0.918348,0.913521,0.891095,0.953271,0.734694,0.983051,0.962233,0.854167,0.928302,0.0,0.0,0.0,0.924528,0.0,0.0,0.86747
2,1.302,1.210943,0.945817,0.955812,0.950788,0.963212,1.0,0.90566,0.986395,0.970492,0.907143,0.96063,0.0,0.866667,0.0,0.947867,0.0,0.0,0.930233
3,1.0465,0.960262,0.946869,0.958694,0.952745,0.968613,0.995215,0.90566,0.986395,0.983498,0.921986,0.97992,0.0,0.606061,0.0,0.930233,0.0,0.0,0.942529
4,0.7879,0.771485,0.960615,0.960615,0.960615,0.981606,0.990385,0.912621,0.986395,0.985124,0.943262,0.991935,0.0,0.666667,0.0,0.952381,0.0,0.0,0.931818
5,0.6497,0.626723,0.954286,0.962536,0.958393,0.985109,0.995215,0.914286,0.986395,0.980198,0.946996,0.991935,0.0,0.666667,0.0,0.952381,0.666667,0.0,0.941176
6,0.5807,0.517963,0.955238,0.963497,0.95935,0.985693,1.0,0.903846,0.989831,0.983498,0.939929,0.991935,0.0,0.666667,0.0,0.952381,0.5,0.0,0.964706
7,0.4344,0.434567,0.964695,0.971182,0.967927,0.986277,1.0,0.914286,0.989831,0.98843,0.95,0.991935,0.0,0.866667,0.6,0.952381,0.4,0.0,0.964706
8,0.3582,0.372642,0.965682,0.973103,0.969378,0.987299,1.0,0.90566,0.989831,0.98843,0.942446,0.991935,0.0,1.0,1.0,0.952381,0.4,0.0,0.953488
9,0.4113,0.32801,0.964829,0.975024,0.9699,0.988029,1.0,0.90566,0.986395,0.98843,0.960289,0.991935,0.25,1.0,1.0,0.952381,0.4,0.0,0.942529
10,0.2706,0.295367,0.966667,0.975024,0.970827,0.989489,1.0,0.914286,0.97973,0.986799,0.952727,0.991935,0.666667,1.0,1.0,0.952381,1.0,0.0,0.953488


***** Running Evaluation *****
  Num examples = 146
  Batch size = 16
Saving model checkpoint to ../.cache/results/train_jdf_val_jdr/checkpoint-52
Configuration saved in ../.cache/results/train_jdf_val_jdr/checkpoint-52/config.json
Model weights saved in ../.cache/results/train_jdf_val_jdr/checkpoint-52/pytorch_model.bin
tokenizer config file saved in ../.cache/results/train_jdf_val_jdr/checkpoint-52/tokenizer_config.json
Special tokens file saved in ../.cache/results/train_jdf_val_jdr/checkpoint-52/special_tokens_map.json
Deleting older checkpoint [../.cache/results/train_jdf_val_jdr/checkpoint-480] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 146
  Batch size = 16
Saving model checkpoint to ../.cache/results/train_jdf_val_jdr/checkpoint-104
Configuration saved in ../.cache/results/train_jdf_val_jdr/checkpoint-104/config.json
Model weights saved in ../.cache/results/train_jdf_val_jdr/checkpoint-104/pytorch_model.bin
tokenizer config file saved in ../.ca

Classify subset of entity

In [None]:
target_class = ['Human', 'Project', 'Organization', 'Reference_User']
new_spec_tokens = list(set(train.id2label) - set() )

model.resize_token_embeddings(len(tokenizer))

In [None]:
'Human',
 'Location',
 'Reference_Code_Postal',
 'Reference_CEDEX',
 'Reference_CS',
 'Function',
 'Project',
 'Email',
 'Phone_Number',
 'Social_Network',
 'Reference_User',
 'Organization',
 'Url'