In [1]:
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
import sys
import os
from os import path

sys.path.append("./../src")

In [2]:
import pandas as pd
import json

DATA_PATH = path.join('..','dataset')
JDF_PATH = path.join(DATA_PATH, 'JDF.json')
JDR_PATH = path.join(DATA_PATH, 'JDR.json')

data = dict()

with open(JDR_PATH, 'r') as f:
    data['jdr'] = json.load(f)
    
with open(JDF_PATH, 'r') as f:
    data['jdf'] = json.load(f)

print('JDR #examples :',len(data['jdr']))
print('JDF #examples :',len(data['jdf']))

annotations = [a for d in data['jdr'] for a in d['annotations']]
df_annotations = pd.DataFrame(annotations)
df_annotations["label"] = df_annotations["label"].astype("category")
display(HTML('<h3>Les entités nommées</h3>'))
display(df_annotations.head())

print('Labels:', df_annotations['label'].unique())

JDR #examples : 473
JDF #examples : 500


Unnamed: 0,form,label,begin,end
0,Faustin,Human,0,7
1,Chabot,Human,8,14
2,19,Location,28,30
3,rue,Location,31,34
4,Descartes,Location,35,44


Labels: ['Human', 'Location', 'Reference_Code_Postal', 'Reference_CEDEX', 'Reference_CS', ..., 'Phone_Number', 'Social_Network', 'Reference_User', 'Organization', 'Url']
Length: 13
Categories (13, object): ['Email', 'Function', 'Human', 'Location', ..., 'Reference_Code_Postal', 'Reference_User', 'Social_Network', 'Url']


<div class="alert alert-block alert-info">Format d'une pharse donnée</div>

In [3]:
data['jdr'][0]

{'identifier': 500,
 'text': 'Faustin Chabot\\r\\nAdresse : 19 rue Descartes 94370 Sucy-en-Brie (France)\\r\\nCedex 9 CS 12468\\r\\nData Engineer / Algorithm XZ Project\\r\\nfaustinchabot@teleworm.com / Tel : +33 0134354919\\r\\nLinkedin : https://fr.linkedin.com/in/fauchab\\r\\nTeleworm France\\r\\nteleworm.france.com',
 'annotations': [{'form': 'Faustin', 'label': 'Human', 'begin': 0, 'end': 7},
  {'form': 'Chabot', 'label': 'Human', 'begin': 8, 'end': 14},
  {'form': '19', 'label': 'Location', 'begin': 28, 'end': 30},
  {'form': 'rue', 'label': 'Location', 'begin': 31, 'end': 34},
  {'form': 'Descartes', 'label': 'Location', 'begin': 35, 'end': 44},
  {'form': '94370', 'label': 'Reference_Code_Postal', 'begin': 45, 'end': 50},
  {'form': 'Sucy-en-Brie', 'label': 'Location', 'begin': 51, 'end': 63},
  {'form': 'France', 'label': 'Location', 'begin': 65, 'end': 71},
  {'form': 'Cedex', 'label': 'Reference_CEDEX', 'begin': 76, 'end': 81},
  {'form': '9', 'label': 'Reference_CEDEX', 'be

<div class="alert alert-block alert-info">Est-ce qu'il y a de motif pour le numéro de téléphone?</div>

> Apparemment non

In [4]:
df_annotations[df_annotations['label']=='Phone_Number'].head(20)

Unnamed: 0,form,label,begin,end
18,+33 0134354919,Phone_Number,171,185
45,03.18.38.37.37,Phone_Number,136,150
72,+ 03 81 20 48 27,Phone_Number,152,168
91,01.75.88.25.30,Phone_Number,77,91
122,+ 33 01 77 83 74 05,Phone_Number,157,176
134,+33 0365962110,Phone_Number,70,84
170,01.55.29.21.75,Phone_Number,118,132
191,+ 33 01 79 28 30 87,Phone_Number,75,94
225,33 0147908347,Phone_Number,150,163
250,03.54.57.86.42,Phone_Number,162,176


<div class="alert alert-block alert-info">Liste des étiquettes à prédire</div>

In [5]:
list(df_annotations['label'].unique())

['Human',
 'Location',
 'Reference_Code_Postal',
 'Reference_CEDEX',
 'Reference_CS',
 'Function',
 'Project',
 'Email',
 'Phone_Number',
 'Social_Network',
 'Reference_User',
 'Organization',
 'Url']

<div class="alert alert-block alert-info">Les étiquettes sont-elles chevauchées?</div>

> Oui, il semble un bruit dans l'outil d'annotation. Il suffit de surrprimer celui contenu dans la vraie étiquette

In [6]:
def check_overlapping(data):
    for data_row in data:
        annotations = data_row['annotations']
        for pre, post in zip(annotations[:-1], annotations[1:]):
            if pre['end'] > post['begin']:
                print('TEXT:', data_row['text'])
                print(pre)
                print(post)
                print('='*15)

def remove_overlapping(data):
    for data_row in data:
        annotations = data_row['annotations']
        for pre, post in zip(annotations[:-1], annotations[1:]):
            if pre['end'] >= post['end']:
                annotations.remove(post)

for split in data:
    display(HTML('<h3>'+split+'</h3>'))
    check_overlapping(data[split])

for split in data:
    remove_overlapping(data[split])
    check_overlapping(data[split])

TEXT: Sibyla Chandonnet\nTechnicienne en radiologie / Health For All\nTel : +33 0365962110 \nsibylachandonnet@yahoo.fr\n89 rue du Général Ailleret 62300 Lens\nCedex 08 CS 40362\nThe Happy Bear / happybearhealth.com\nLinkedin : https://fr.linkedin.com/in/sibylac
{'form': 'Technicienne', 'label': 'Function', 'begin': 19, 'end': 31}
{'form': 'en', 'label': 'Function', 'begin': 27, 'end': 29}
TEXT: Dorothée Charrette\n94 rue du Faubourg National 94320 Thiais\nCedex 02\n+33 01 70 92 06 69 - dorotheecharette@outlook.com\nEscrow Papers - escrowpapers.fr\nPoste : Technicienne en téléphonie mobile
{'form': 'Technicienne', 'label': 'Function', 'begin': 164, 'end': 176}
{'form': 'en', 'label': 'Function', 'begin': 172, 'end': 174}
TEXT: Coralie Sanschagrin\nVendeuse en magasin - Supermarket Lists\nsupermarketlists.fr\nAdresse : 12 place Maurice-Charretier 94220 Charenton-sur-le-Pont\nTél : 0171220748\nEmail : coraliesanschagrin@gmail.com
{'form': 'Vendeuse', 'label': 'Function', 'begin': 21, 'end

<div class="alert alert-block alert-info">Est-ce qu'il existe un exemple qui manque d'annotation? Combien? Lesquels?</div>

> Non, apparemment très cohérent!

In [7]:
for split in data:
    for data_row in data[split]:
        annotations = data_row['annotations']
        if len(annotations) == 0:
            print(data_row)

## Tokenization test

In [82]:
from transformers import CamembertTokenizerFast

MAX_LINE = 1000000
CACHE_DIR = path.join('.cache', 'transformers')


texts = [d['text'] for d in data['jdr'][:MAX_LINE]]
print('Testing text:')
display(texts[:2])

#fast_tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base", cache_dir=CACHE_DIR, additional_special_tokens=['\\n', '\\r', 'https://'])
fast_tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base", cache_dir=CACHE_DIR)

token_encodings = fast_tokenizer(texts, return_offsets_mapping=True)

token_strings = [fast_tokenizer.convert_ids_to_tokens(input_ids) for input_ids in token_encodings.input_ids]

for tkstr in token_strings[:2]:
    print(tkstr)

Testing text:


['Faustin Chabot\\r\\nAdresse : 19 rue Descartes 94370 Sucy-en-Brie (France)\\r\\nCedex 9 CS 12468\\r\\nData Engineer / Algorithm XZ Project\\r\\nfaustinchabot@teleworm.com / Tel : +33 0134354919\\r\\nLinkedin : https://fr.linkedin.com/in/fauchab\\r\\nTeleworm France\\r\\nteleworm.france.com',
 'Vallis Lachance\\r\\nConcepteur de publications web - Un Site, une BD\\r\\n14 rue Victor Hugo 60200 Compiègne\\r\\nCedex 12 CS 10202\\r\\nTel : 03.18.38.37.37\\r\\nEmail : vallislachance@monwax.com\\r\\nMonwax \\r\\nmonwax.com\\r\\nFacebook : https://www.facebook.com/vallislachance']

loading file sentencepiece.bpe.model from cache at .cache/transformers/models--camembert-base/snapshots/3f452b6e5a89b0e6c828c9bba2642bc577086eae/sentencepiece.bpe.model
loading file tokenizer.json from cache at .cache/transformers/models--camembert-base/snapshots/3f452b6e5a89b0e6c828c9bba2642bc577086eae/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at .cache/transformers/models--camembert-base/snapshots/3f452b6e5a89b0e6c828c9bba2642bc577086eae/config.json
Model config CamembertConfig {
  "_name_or_path": "camembert-base",
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3

['<s>', '▁Faust', 'in', '▁Cha', 'bot', '\\', 'r', '\\', 'n', 'Ad', 'resse', '▁:', '▁19', '▁rue', '▁Descartes', '▁94', '370', '▁Suc', 'y', '-', 'en', '-', 'B', 'rie', '▁(', 'France', ')', '\\', 'r', '\\', 'n', 'Ce', 'dex', '▁9', '▁CS', '▁124', '68', '\\', 'r', '\\', 'n', 'D', 'ata', '▁Engine', 'er', '▁/', '▁Al', 'gor', 'ith', 'm', '▁X', 'Z', '▁Project', '\\', 'r', '\\', 'n', 'fa', 'ustin', 'cha', 'bot', '@', 'tel', 'e', 'w', 'orm', '.', 'com', '▁/', '▁Tel', '▁:', '▁+', '33', '▁01', '34', '35', '49', '19', '\\', 'r', '\\', 'n', 'L', 'ink', 'e', 'din', '▁:', '▁https', '://', 'fr', '.', 'link', 'e', 'din', '.', 'com', '/', 'in', '/', 'fa', 'uch', 'ab', '\\', 'r', '\\', 'n', 'T', 'ele', 'w', 'orm', '▁France', '\\', 'r', '\\', 'nt', 'ele', 'w', 'orm', '.', 'france', '.', 'com', '</s>']
['<s>', '▁Val', 'lis', '▁La', 'ch', 'ance', '\\', 'r', '\\', 'n', 'Con', 'cept', 'eur', '▁de', '▁publications', '▁web', '▁-', '▁Un', '▁Site', ',', '▁une', '▁BD', '\\', 'r', '\\', 'n', '14', '▁rue', '▁Victor', 

In [83]:
def mapping_label_token(token_span_batch, annotations_batch):
    """
    Remap IOB tag to each token generated by tokenizer. Should provide the span (begin/end)
    """
    
    labels = list()
    
    for token_span_sent, annotations in zip(token_span_batch, annotations_batch):
        
        annotations = annotations.copy()
        entity = annotations.pop(0)
        
        last_label = 'O'
        token_label = list()
        
        for token in token_span_sent:
            
            while entity['end'] < token['begin']: entity = annotations.pop(0)

            if token['begin'] == token['end']:
                label = 'O'    
            elif entity['begin'] <= token['begin'] and token['end'] <= entity['end']:
                prefix = 'B-' if last_label == 'O' or last_label[2:] != entity['label'] else 'I-'
                label = prefix + entity['label']
            else:
                label = 'O'
                
            token_label.append(label)
            last_label = label
                
        labels.append(token_label)
        
    return labels

def tokenize_text(texts, annotations, tokenizer):
    
    # Tokenize text
    token_encodings = tokenizer(texts, return_offsets_mapping=True)
    token_encodings['tokens'] = [fast_tokenizer.convert_ids_to_tokens(input_ids) for input_ids in token_encodings.input_ids]
    
    # Mapping labels
    token_span = token_encodings.offset_mapping
    token_span_dict = [[{'begin': span[0], 'end': span[1]} for span in token_sent ] for token_sent in token_span]
    token_encodings['ner_tags'] = mapping_label_token(token_span_dict, annotations)
    
    return token_encodings

annotations = [d['annotations'] for d in data['jdr'][:MAX_LINE]]
texts = [d['text'] for d in data['jdr'][:MAX_LINE]]
tokenized = tokenize_text(texts, annotations, fast_tokenizer)

all_labels = [i for l in tokenized['ner_tags'] for i in l ]
unique_label = set(all_labels)
id2label = list(set(all_labels))
print('id2label =',id2label)
label2id = {label: idx for idx, label in enumerate(id2label)}
print('label2id =',label2id)

tokenized['labels'] = [[label2id[label] for label in label_sentence] for label_sentence in tokenized['ner_tags']]

id2label = ['I-Reference_CS', 'I-Reference_User', 'B-Reference_User', 'B-Organization', 'B-Reference_CS', 'I-Human', 'I-Email', 'O', 'B-Human', 'B-Project', 'I-Social_Network', 'I-Reference_CEDEX', 'I-Url', 'B-Url', 'I-Function', 'I-Location', 'B-Email', 'B-Social_Network', 'B-Reference_CEDEX', 'B-Phone_Number', 'B-Location', 'B-Function', 'I-Phone_Number', 'I-Organization', 'I-Reference_Code_Postal', 'I-Project', 'B-Reference_Code_Postal']
label2id = {'I-Reference_CS': 0, 'I-Reference_User': 1, 'B-Reference_User': 2, 'B-Organization': 3, 'B-Reference_CS': 4, 'I-Human': 5, 'I-Email': 6, 'O': 7, 'B-Human': 8, 'B-Project': 9, 'I-Social_Network': 10, 'I-Reference_CEDEX': 11, 'I-Url': 12, 'B-Url': 13, 'I-Function': 14, 'I-Location': 15, 'B-Email': 16, 'B-Social_Network': 17, 'B-Reference_CEDEX': 18, 'B-Phone_Number': 19, 'B-Location': 20, 'B-Function': 21, 'I-Phone_Number': 22, 'I-Organization': 23, 'I-Reference_Code_Postal': 24, 'I-Project': 25, 'B-Reference_Code_Postal': 26}


In [84]:
df = pd.DataFrame(dict(tokenized))
os.makedirs(path.join('..', '.cache'), exist_ok=True)
df.to_csv(path.join('..','.cache', 'jdr.csv'))

In [85]:
df

Unnamed: 0,input_ids,attention_mask,offset_mapping,tokens,ner_tags,labels
0,"[5, 28119, 236, 2614, 8674, 3155, 81, 3155, 25...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 5), (5, 7), (8, 11), (11, 14), (1...","[<s>, ▁Faust, in, ▁Cha, bot, \, r, \, n, Ad, r...","[O, B-Human, I-Human, I-Human, I-Human, O, O, ...","[7, 8, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 20, 15, 1..."
1,"[5, 1598, 4026, 61, 751, 1269, 3155, 81, 3155,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 3), (3, 6), (7, 9), (9, 11), (11,...","[<s>, ▁Val, lis, ▁La, ch, ance, \, r, \, n, Co...","[O, B-Human, I-Human, I-Human, I-Human, I-Huma...","[7, 8, 5, 5, 5, 5, 7, 7, 7, 7, 21, 14, 14, 14,..."
2,"[5, 11904, 73, 6445, 276, 8348, 88, 3155, 81, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (4, 6), (6, 8), (8, 10), (11,...","[<s>, ▁Arch, ai, mb, au, ▁Mass, on, \, r, \, n...","[O, B-Human, I-Human, I-Human, I-Human, I-Huma...","[7, 8, 5, 5, 5, 5, 5, 7, 7, 7, 7, 21, 14, 14, ..."
3,"[5, 470, 1606, 9313, 2265, 3155, 81, 3155, 255...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (4, 8), (9, 12), (12, 16), (1...","[<s>, ▁Jean, ette, ▁Fre, mont, \, r, \, n, 8, ...","[O, B-Human, I-Human, I-Human, I-Human, O, O, ...","[7, 8, 5, 5, 5, 7, 7, 7, 7, 20, 15, 15, 15, 15..."
4,"[5, 3696, 19483, 236, 3155, 81, 3155, 255, 137...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (5, 8), (8, 10), (10, 11), (1...","[<s>, ▁Cher, ▁Baz, in, \, r, \, n, Mé, can, ic...","[O, B-Human, I-Human, I-Human, O, O, O, O, B-F...","[7, 8, 5, 5, 7, 7, 7, 7, 21, 14, 14, 14, 14, 7..."
...,...,...,...,...,...,...
468,"[5, 4114, 61, 29807, 3155, 255, 6179, 7148, 43...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 6), (7, 9), (9, 13), (13, 14), (1...","[<s>, ▁Claude, ▁La, ndry, \, n, Ad, resse, ▁:,...","[O, B-Human, I-Human, I-Human, O, O, O, O, O, ...","[7, 8, 5, 5, 7, 7, 7, 7, 7, 20, 15, 15, 15, 15..."
469,"[5, 11853, 9625, 10, 1981, 3155, 255, 3853, 30...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 9), (10, 13), (13, 14), (14, 18),...","[<s>, ▁Charlotte, ▁Bus, s, ière, \, n, 59, ▁co...","[O, B-Human, I-Human, I-Human, I-Human, O, O, ...","[7, 8, 5, 5, 5, 7, 7, 20, 15, 15, 15, 15, 26, ..."
470,"[5, 18467, 24817, 3155, 255, 3225, 9, 3220, 9,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 6), (7, 14), (14, 15), (15, 16), ...","[<s>, ▁Cédric, ▁Garnier, \, n, 04, ., 27, ., 1...","[O, B-Human, I-Human, O, O, B-Phone_Number, I-...","[7, 8, 5, 7, 7, 19, 22, 22, 22, 22, 22, 22, 22..."
471,"[5, 14147, 10223, 11734, 4461, 3155, 255, 3395...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 3), (3, 9), (10, 13), (13, 16), (...","[<s>, ▁Fla, vienne, ▁Dev, ost, \, n, 02, ., 56...","[O, B-Human, I-Human, I-Human, I-Human, O, O, ...","[7, 8, 5, 5, 5, 7, 7, 19, 22, 22, 22, 22, 22, ..."


In [None]:
class TextMineDataset(Dataset):
    
    def __init__(self, split):
        self.split = split
        
        self.data = df

    def __getitem__(self, idx):
        if idx >= len(self): raise IndexError  # meet the end of dataset
        sample = self.data.loc[idx].to_dict()
        #for k, v in sample.items():
        #   print(k, '=', len(v))
        return {'input_ids': sample['input_ids'], 'attention_mask': sample['attention_mask'],'labels': sample['labels']}

    def __len__(self):
        return len(self.data)

In [87]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification, AutoModelForTokenClassification, EarlyStoppingCallback
from datasets import load_metric
import numpy as np
from transformers.integrations import TensorBoardCallback

metric = load_metric("seqeval")


from torch.utils.data import Dataset

class TextMineDataset(Dataset):
    
    def __init__(self, df):
        self.data = df

    def __getitem__(self, idx):
        if idx >= len(self): raise IndexError  # meet the end of dataset
        sample = self.data.loc[idx].to_dict()
        #for k, v in sample.items():
        #   print(k, '=', len(v))
        return {'input_ids': sample['input_ids'], 'attention_mask': sample['attention_mask'],'labels': sample['labels']}

    def __len__(self):
        return len(self.data)

dataset = TextMineDataset(df)

def compute_metrics(p):
    
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }
    for k in results.keys():
        if(k not in flattened_results.keys()):
            flattened_results[k+"_f1"]=results[k]["f1"]

    return flattened_results

model = AutoModelForTokenClassification.from_pretrained("camembert-base", num_labels=len(id2label), id2label=id2label, label2id=label2id, cache_dir=CACHE_DIR)
data_collator = DataCollatorForTokenClassification(fast_tokenizer)

training_args = TrainingArguments(
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    num_train_epochs=20,
    output_dir=path.join('..', '.cache', 'results'),
    logging_dir=path.join('..', '.cache', 'logs'),
    logging_steps=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3),
        TensorBoardCallback(),
    ]
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    data_collator=data_collator,
    tokenizer=fast_tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

loading configuration file config.json from cache at .cache/transformers/models--camembert-base/snapshots/3f452b6e5a89b0e6c828c9bba2642bc577086eae/config.json
Model config CamembertConfig {
  "_name_or_path": "camembert-base",
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": [
    "I-Reference_CS",
    "I-Reference_User",
    "B-Reference_User",
    "B-Organization",
    "B-Reference_CS",
    "I-Human",
    "I-Email",
    "O",
    "B-Human",
    "B-Project",
    "I-Social_Network",
    "I-Reference_CEDEX",
    "I-Url",
    "B-Url",
    "I-Function",
    "I-Location",
    "B-Email",
    "B-Social_Network",
    "B-Reference_CEDEX",
    "B-Phone_Number",
    "B-Location",
    "B-Function",
    "I-Phone_Number",
    "I-Organization",
    "I-Reference_Code_Postal",
    "I-Project",
    "B-

RuntimeError: TensorBoardCallback requires tensorboard to be installed. Either update your PyTorch version or install tensorboardX.