In [88]:
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
import sys
import os
from os import path

sys.path.append("./../src")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [93]:
import pandas as pd
import json

DATA_PATH = path.join('..', 'dataset')
JDF_PATH = path.join(DATA_PATH, 'JDF.json')
JDR_PATH = path.join(DATA_PATH, 'JDR.json')

data = dict()

with open(JDR_PATH, 'r') as f:
    data['jdr'] = json.load(f)
    
with open(JDF_PATH, 'r') as f:
    data['jdf'] = json.load(f)

print('JDR #examples :',len(data['jdr']))
print('JDF #examples :',len(data['jdf']))

annotations = [a for d in data['jdr'] for a in d['annotations']]
df_annotations = pd.DataFrame(annotations)
df_annotations["label"] = df_annotations["label"].astype("category")
display(HTML('<h3>Les entités nommées</h3>'))
display(df_annotations.head())

print('Labels:', df_annotations['label'].unique())

JDR #examples : 473
JDF #examples : 500


Unnamed: 0,form,label,begin,end
0,Faustin,Human,0,7
1,Chabot,Human,8,14
2,19,Location,25,27
3,rue,Location,28,31
4,Descartes,Location,32,41


Labels: ['Human', 'Location', 'Reference_Code_Postal', 'Reference_CEDEX', 'Reference_CS', ..., 'Phone_Number', 'Social_Network', 'Reference_User', 'Organization', 'Url']
Length: 13
Categories (13, object): ['Email', 'Function', 'Human', 'Location', ..., 'Reference_Code_Postal', 'Reference_User', 'Social_Network', 'Url']


<div class="alert alert-block alert-info">Format d'une pharse donnée</div>

In [94]:
data['jdr'][0]

{'identifier': 500,
 'text': 'Faustin Chabot\nAdresse : 19 rue Descartes 94370 Sucy-en-Brie (France)\nCedex 9 CS 12468\nData Engineer / Algorithm XZ Project\nfaustinchabot@teleworm.com / Tel : +33 0134354919\nLinkedin : https://fr.linkedin.com/in/fauchab\nTeleworm France\nteleworm.france.com',
 'annotations': [{'form': 'Faustin', 'label': 'Human', 'begin': 0, 'end': 7},
  {'form': 'Chabot', 'label': 'Human', 'begin': 8, 'end': 14},
  {'form': '19', 'label': 'Location', 'begin': 25, 'end': 27},
  {'form': 'rue', 'label': 'Location', 'begin': 28, 'end': 31},
  {'form': 'Descartes', 'label': 'Location', 'begin': 32, 'end': 41},
  {'form': '94370', 'label': 'Reference_Code_Postal', 'begin': 42, 'end': 47},
  {'form': 'Sucy-en-Brie', 'label': 'Location', 'begin': 48, 'end': 60},
  {'form': 'France', 'label': 'Location', 'begin': 62, 'end': 68},
  {'form': 'Cedex', 'label': 'Reference_CEDEX', 'begin': 70, 'end': 75},
  {'form': '9', 'label': 'Reference_CEDEX', 'begin': 76, 'end': 77},
  {'fo

<div class="alert alert-block alert-info">Est-ce qu'il y a de motif pour le numéro de téléphone?</div>

> Apparemment non

In [95]:
df_annotations[df_annotations['label']=='Phone_Number'].head(20)

Unnamed: 0,form,label,begin,end
18,+33 0134354919,Phone_Number,159,173
45,03.18.38.37.37,Phone_Number,124,138
72,+ 03 81 20 48 27,Phone_Number,137,153
91,01.75.88.25.30,Phone_Number,68,82
122,+ 33 01 77 83 74 05,Phone_Number,142,161
134,+33 0365962110,Phone_Number,68,82
170,01.55.29.21.75,Phone_Number,114,128
191,+ 33 01 79 28 30 87,Phone_Number,72,91
225,33 0147908347,Phone_Number,145,158
250,03.54.57.86.42,Phone_Number,156,170


<div class="alert alert-block alert-info">Liste des étiquettes à prédire</div>

In [96]:
list(df_annotations['label'].unique())

['Human',
 'Location',
 'Reference_Code_Postal',
 'Reference_CEDEX',
 'Reference_CS',
 'Function',
 'Project',
 'Email',
 'Phone_Number',
 'Social_Network',
 'Reference_User',
 'Organization',
 'Url']

<div class="alert alert-block alert-info">Les étiquettes sont-elles chevauchées?</div>

> Oui, il semble un bruit dans l'outil d'annotation. Il suffit de surrprimer celui contenu dans la vraie étiquette

In [97]:
def check_overlapping(data):
    for data_row in data:
        annotations = data_row['annotations']
        for pre, post in zip(annotations[:-1], annotations[1:]):
            if pre['end'] > post['begin']:
                print('TEXT:', data_row['text'])
                print(pre)
                print(post)
                print('='*15)

def remove_overlapping(data):
    for data_row in data:
        annotations = data_row['annotations']
        for pre, post in zip(annotations[:-1], annotations[1:]):
            if pre['end'] >= post['end']:
                annotations.remove(post)

for split in data:
    display(HTML('<h3>'+split+'</h3>'))
    check_overlapping(data[split])

for split in data:
    remove_overlapping(data[split])
    check_overlapping(data[split])

<div class="alert alert-block alert-info">Est-ce qu'il existe un exemple qui manque d'annotation? Combien? Lesquels?</div>

> Non, apparemment très cohérent!

In [98]:
for split in data:
    for data_row in data[split]:
        annotations = data_row['annotations']
        if len(annotations) == 0:
            print(data_row)

## Tokenization test

In [8]:
from transformers import CamembertTokenizerFast

MAX_LINE = 1000000
CACHE_DIR = path.join('..', '.cache')
TRANSFORMERS_DIR = path.join(CACHE_DIR, 'transformers')


texts = [d['text'] for d in data['jdr'][:MAX_LINE]]
print('Testing text:')
display(texts[:2])

#fast_tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base", cache_dir=CACHE_DIR, additional_special_tokens=['\\n', '\\r', 'https://'])
fast_tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base", cache_dir=TRANSFORMERS_DIR)

token_encodings = fast_tokenizer(texts, return_offsets_mapping=True)

token_strings = [fast_tokenizer.convert_ids_to_tokens(input_ids) for input_ids in token_encodings.input_ids]

for tkstr in token_strings[:2]:
    print(tkstr)

Testing text:


['Faustin Chabot\nAdresse : 19 rue Descartes 94370 Sucy-en-Brie (France)\nCedex 9 CS 12468\nData Engineer / Algorithm XZ Project\nfaustinchabot@teleworm.com / Tel : +33 0134354919\nLinkedin : https://fr.linkedin.com/in/fauchab\nTeleworm France\nteleworm.france.com',
 'Vallis Lachance\nConcepteur de publications web - Un Site, une BD\n14 rue Victor Hugo 60200 Compiègne\nCedex 12 CS 10202\nTel : 03.18.38.37.37\nEmail : vallislachance@monwax.com\nMonwax \nmonwax.com\nFacebook : https://www.facebook.com/vallislachance']

['<s>', '▁Faust', 'in', '▁Cha', 'bot', '▁Adresse', '▁:', '▁19', '▁rue', '▁Descartes', '▁94', '370', '▁Suc', 'y', '-', 'en', '-', 'B', 'rie', '▁(', 'France', ')', '▁Ce', 'dex', '▁9', '▁CS', '▁124', '68', '▁Data', '▁Engine', 'er', '▁/', '▁Al', 'gor', 'ith', 'm', '▁X', 'Z', '▁Project', '▁fa', 'ustin', 'cha', 'bot', '@', 'tel', 'e', 'w', 'orm', '.', 'com', '▁/', '▁Tel', '▁:', '▁+', '33', '▁01', '34', '35', '49', '19', '▁Link', 'e', 'din', '▁:', '▁https', '://', 'fr', '.', 'link', 'e', 'din', '.', 'com', '/', 'in', '/', 'fa', 'uch', 'ab', '▁Tele', 'w', 'orm', '▁France', '▁tele', 'w', 'orm', '.', 'france', '.', 'com', '</s>']
['<s>', '▁Val', 'lis', '▁La', 'ch', 'ance', '▁Concept', 'eur', '▁de', '▁publications', '▁web', '▁-', '▁Un', '▁Site', ',', '▁une', '▁BD', '▁14', '▁rue', '▁Victor', '▁Hugo', '▁60', '200', '▁Compiègne', '▁Ce', 'dex', '▁12', '▁CS', '▁102', '02', '▁Tel', '▁:', '▁03', '.', '18', '.', '38', '.', '37', '.', '37', '▁Email', '▁:', '▁val', 'lis', 'la', 'ch', 'ance', '@', 'mon', 'w

In [9]:
def mapping_label_token(token_span_batch, annotations_batch):
    """
    Remap IOB tag to each token generated by tokenizer. Should provide the span (begin/end)
    """
    
    labels = list()
    
    for token_span_sent, annotations in zip(token_span_batch, annotations_batch):
        
        annotations = annotations.copy()
        entity = annotations.pop(0)
        
        last_label = 'O'
        token_label = list()
        
        for token in token_span_sent:
            
            while entity['end'] < token['begin']: entity = annotations.pop(0)

            if token['begin'] == token['end']:
                label = 'O'    
            elif entity['begin'] <= token['begin'] and token['end'] <= entity['end']:
                prefix = 'B-' if last_label == 'O' or last_label[2:] != entity['label'] else 'I-'
                label = prefix + entity['label']
            else:
                label = 'O'
                
            token_label.append(label)
            last_label = label
                
        labels.append(token_label)
        
    return labels

def tokenize_text(texts, annotations, tokenizer):
    
    # Tokenize text
    token_encodings = tokenizer(texts, return_offsets_mapping=True)
    token_encodings['tokens'] = [fast_tokenizer.convert_ids_to_tokens(input_ids) for input_ids in token_encodings.input_ids]
    
    # Mapping labels
    token_span = token_encodings.offset_mapping
    token_span_dict = [[{'begin': span[0], 'end': span[1]} for span in token_sent ] for token_sent in token_span]
    token_encodings['ner_tags'] = mapping_label_token(token_span_dict, annotations)
    
    return token_encodings

annotations = [d['annotations'] for d in data['jdr'][:MAX_LINE]]
texts = [d['text'] for d in data['jdr'][:MAX_LINE]]
tokenized = tokenize_text(texts, annotations, fast_tokenizer)

all_labels = [i for l in tokenized['ner_tags'] for i in l ]
unique_label = set(all_labels)
id2label = list(unique_label)
print('id2label =',id2label)
label2id = {label: idx for idx, label in enumerate(id2label)}
print('label2id =',label2id)

tokenized['labels'] = [[label2id[label] for label in label_sentence] for label_sentence in tokenized['ner_tags']]

id2label = ['B-Url', 'B-Organization', 'O', 'I-Organization', 'I-Human', 'I-Reference_CS', 'I-Reference_CEDEX', 'B-Reference_User', 'B-Project', 'I-Social_Network', 'B-Email', 'I-Url', 'B-Reference_Code_Postal', 'B-Phone_Number', 'I-Email', 'I-Phone_Number', 'I-Reference_Code_Postal', 'B-Human', 'B-Reference_CEDEX', 'I-Function', 'B-Location', 'B-Function', 'B-Reference_CS', 'I-Location', 'B-Social_Network', 'I-Project', 'I-Reference_User']
label2id = {'B-Url': 0, 'B-Organization': 1, 'O': 2, 'I-Organization': 3, 'I-Human': 4, 'I-Reference_CS': 5, 'I-Reference_CEDEX': 6, 'B-Reference_User': 7, 'B-Project': 8, 'I-Social_Network': 9, 'B-Email': 10, 'I-Url': 11, 'B-Reference_Code_Postal': 12, 'B-Phone_Number': 13, 'I-Email': 14, 'I-Phone_Number': 15, 'I-Reference_Code_Postal': 16, 'B-Human': 17, 'B-Reference_CEDEX': 18, 'I-Function': 19, 'B-Location': 20, 'B-Function': 21, 'B-Reference_CS': 22, 'I-Location': 23, 'B-Social_Network': 24, 'I-Project': 25, 'I-Reference_User': 26}


In [10]:
import json

id2label = list(unique_label)
def fn_sort(x):
    val = 0 if len(x) == 1 else (ord(x[2]))*1e3 + ord(x[0])
    return val

id2label = sorted(id2label, key=fn_sort, reverse=False)
label2id = {label: idx for idx, label in enumerate(id2label)}
label2id

with open(path.join(CACHE_DIR, 'label_idx.json'), "w") as f:
    json.dump({
        'label2id': label2id,
        'id2label': id2label
    }, f, indent='\t')

In [11]:
df = pd.DataFrame(dict(tokenized))
os.makedirs(path.join('..', '.cache'), exist_ok=True)
df.to_parquet(path.join('..','.cache', 'jdr.parquet'))
df

Unnamed: 0,input_ids,attention_mask,offset_mapping,tokens,ner_tags,labels
0,"[5, 28119, 236, 2614, 8674, 13242, 43, 653, 83...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 5), (5, 7), (8, 11), (11, 14), (1...","[<s>, ▁Faust, in, ▁Cha, bot, ▁Adresse, ▁:, ▁19...","[O, B-Human, I-Human, I-Human, I-Human, O, O, ...","[2, 17, 4, 4, 4, 2, 2, 20, 23, 23, 12, 16, 20,..."
1,"[5, 1598, 4026, 61, 751, 1269, 15901, 601, 8, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 3), (3, 6), (7, 9), (9, 11), (11,...","[<s>, ▁Val, lis, ▁La, ch, ance, ▁Concept, eur,...","[O, B-Human, I-Human, I-Human, I-Human, I-Huma...","[2, 17, 4, 4, 4, 4, 21, 19, 19, 19, 19, 2, 8, ..."
2,"[5, 11904, 73, 6445, 276, 8348, 88, 22878, 8, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (4, 6), (6, 8), (8, 10), (11,...","[<s>, ▁Arch, ai, mb, au, ▁Mass, on, ▁Chargé, ▁...","[O, B-Human, I-Human, I-Human, I-Human, I-Huma...","[2, 17, 4, 4, 4, 4, 4, 21, 19, 19, 19, 8, 25, ..."
3,"[5, 470, 1606, 9313, 2265, 344, 5921, 4950, 27...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (4, 8), (9, 12), (12, 16), (1...","[<s>, ▁Jean, ette, ▁Fre, mont, ▁8, ▁Rue, ▁Jose...","[O, B-Human, I-Human, I-Human, I-Human, B-Loca...","[2, 17, 4, 4, 4, 20, 23, 23, 23, 23, 12, 16, 2..."
4,"[5, 3696, 19483, 236, 5177, 2701, 1151, 1972, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (5, 8), (8, 10), (11, 13), (1...","[<s>, ▁Cher, ▁Baz, in, ▁Mé, can, ic, ienne, ▁a...","[O, B-Human, I-Human, I-Human, B-Function, I-F...","[2, 17, 4, 4, 21, 19, 19, 19, 19, 2, 8, 25, 25..."
...,...,...,...,...,...,...
468,"[5, 4114, 61, 29807, 13242, 43, 5387, 839, 25,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 6), (7, 9), (9, 13), (14, 21), (2...","[<s>, ▁Claude, ▁La, ndry, ▁Adresse, ▁:, ▁46, ▁...","[O, B-Human, I-Human, I-Human, O, O, B-Locatio...","[2, 17, 4, 4, 2, 2, 20, 23, 23, 23, 23, 23, 23..."
469,"[5, 11853, 9625, 10, 1981, 7017, 307, 470, 505...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 9), (10, 13), (13, 14), (14, 18),...","[<s>, ▁Charlotte, ▁Bus, s, ière, ▁59, ▁cours, ...","[O, B-Human, I-Human, I-Human, I-Human, B-Loca...","[2, 17, 4, 4, 4, 20, 23, 23, 23, 23, 12, 16, 2..."
470,"[5, 18467, 24817, 5066, 9, 3220, 9, 2138, 9, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 6), (7, 14), (15, 17), (17, 18), ...","[<s>, ▁Cédric, ▁Garnier, ▁04, ., 27, ., 19, .,...","[O, B-Human, I-Human, B-Phone_Number, I-Phone_...","[2, 17, 4, 13, 15, 15, 15, 15, 15, 15, 15, 15,..."
471,"[5, 14147, 10223, 11734, 4461, 4670, 9, 4876, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 3), (3, 9), (10, 13), (13, 16), (...","[<s>, ▁Fla, vienne, ▁Dev, ost, ▁02, ., 56, ., ...","[O, B-Human, I-Human, I-Human, I-Human, B-Phon...","[2, 17, 4, 4, 4, 13, 15, 15, 15, 15, 15, 15, 1..."


## Make Pytorch TextMine Dataset from raw files

In [99]:
from data.textmine import TextMineDataset
from transformers import CamembertTokenizerFast

DATA_PATH = path.join('..', 'dataset')
CACHE_DIR = path.join('..', '.cache')

tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base", cache_dir=path.join(CACHE_DIR, 'transformers'))
jdr = TextMineDataset('jdr', tokenizer=tokenizer, data_path=DATA_PATH, cache=CACHE_DIR)
jdf = TextMineDataset('jdf', tokenizer=tokenizer, data_path=DATA_PATH, cache=CACHE_DIR)

Cache data at ../.cache/jdr.parquet
Generate new label2idx
Cache data at ../.cache/jdf.parquet
Generate new label2idx


## Reshuffle JDR and JDF, get new splits

In [102]:
# load jdf and jdr
df_jdf = pd.read_parquet(path.join(CACHE_DIR, 'jdf.parquet'))
df_jdr = pd.read_parquet(path.join(CACHE_DIR, 'jdr.parquet'))
#df_jdr = df_jdr.drop(columns=['labels'])

# fusion into full data
full_data = pd.concat([df_jdf, df_jdr], ignore_index=True)
for col in full_data.columns:
    if isinstance(full_data.loc[0, col], np.ndarray):
        full_data[col] = full_data[col].apply(lambda x: x.tolist())
full_data.to_parquet(path.join(CACHE_DIR, 'full.parquet'))

# split and generate dataset
from sklearn.model_selection import train_test_split
import numpy as np

train, val = train_test_split(full_data, test_size=.15)
train.to_parquet(path.join(CACHE_DIR, 'train.parquet'), index=False)
val.to_parquet(path.join(CACHE_DIR, 'val.parquet'), index=False)

trainset = TextMineDataset('train', tokenizer=tokenizer, data_path=DATA_PATH, cache=CACHE_DIR)
valset = TextMineDataset('val', tokenizer=tokenizer, data_path=DATA_PATH, cache=CACHE_DIR)

Load cache data from ../.cache/train.parquet
Generate new label2idx
Load cache data from ../.cache/val.parquet
Generate new label2idx


In [79]:
training_args = TrainingArguments(
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    num_train_epochs=20,
    output_dir=path.join('..', '.cache', 'results', 'train_vaL_split'),
    logging_dir=path.join('..', '.cache', 'logs', 'train_vaL_split'),
    logging_steps=1,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True
)

In [45]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=valset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3),
    ]
)

trainer.train()

***** Running training *****
  Num examples = 827
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1040
  Number of trainable parameters = 110052123
You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Email F1,Function F1,Human F1,Location F1,Organization F1,Phone Number F1,Project F1,Reference Cedex F1,Reference Cs F1,Reference Code Postal F1,Reference User F1,Social Network F1,Url F1
1,1.7417,1.658992,0.908745,0.918348,0.913521,0.891095,0.953271,0.734694,0.983051,0.962233,0.854167,0.928302,0.0,0.0,0.0,0.924528,0.0,0.0,0.86747
2,1.302,1.210943,0.945817,0.955812,0.950788,0.963212,1.0,0.90566,0.986395,0.970492,0.907143,0.96063,0.0,0.866667,0.0,0.947867,0.0,0.0,0.930233
3,1.0465,0.960262,0.946869,0.958694,0.952745,0.968613,0.995215,0.90566,0.986395,0.983498,0.921986,0.97992,0.0,0.606061,0.0,0.930233,0.0,0.0,0.942529
4,0.7879,0.771485,0.960615,0.960615,0.960615,0.981606,0.990385,0.912621,0.986395,0.985124,0.943262,0.991935,0.0,0.666667,0.0,0.952381,0.0,0.0,0.931818
5,0.6497,0.626723,0.954286,0.962536,0.958393,0.985109,0.995215,0.914286,0.986395,0.980198,0.946996,0.991935,0.0,0.666667,0.0,0.952381,0.666667,0.0,0.941176
6,0.5807,0.517963,0.955238,0.963497,0.95935,0.985693,1.0,0.903846,0.989831,0.983498,0.939929,0.991935,0.0,0.666667,0.0,0.952381,0.5,0.0,0.964706
7,0.4344,0.434567,0.964695,0.971182,0.967927,0.986277,1.0,0.914286,0.989831,0.98843,0.95,0.991935,0.0,0.866667,0.6,0.952381,0.4,0.0,0.964706
8,0.3582,0.372642,0.965682,0.973103,0.969378,0.987299,1.0,0.90566,0.989831,0.98843,0.942446,0.991935,0.0,1.0,1.0,0.952381,0.4,0.0,0.953488
9,0.4113,0.32801,0.964829,0.975024,0.9699,0.988029,1.0,0.90566,0.986395,0.98843,0.960289,0.991935,0.25,1.0,1.0,0.952381,0.4,0.0,0.942529
10,0.2706,0.295367,0.966667,0.975024,0.970827,0.989489,1.0,0.914286,0.97973,0.986799,0.952727,0.991935,0.666667,1.0,1.0,0.952381,1.0,0.0,0.953488


***** Running Evaluation *****
  Num examples = 146
  Batch size = 16
Saving model checkpoint to ../.cache/results/train_jdf_val_jdr/checkpoint-52
Configuration saved in ../.cache/results/train_jdf_val_jdr/checkpoint-52/config.json
Model weights saved in ../.cache/results/train_jdf_val_jdr/checkpoint-52/pytorch_model.bin
tokenizer config file saved in ../.cache/results/train_jdf_val_jdr/checkpoint-52/tokenizer_config.json
Special tokens file saved in ../.cache/results/train_jdf_val_jdr/checkpoint-52/special_tokens_map.json
Deleting older checkpoint [../.cache/results/train_jdf_val_jdr/checkpoint-480] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 146
  Batch size = 16
Saving model checkpoint to ../.cache/results/train_jdf_val_jdr/checkpoint-104
Configuration saved in ../.cache/results/train_jdf_val_jdr/checkpoint-104/config.json
Model weights saved in ../.cache/results/train_jdf_val_jdr/checkpoint-104/pytorch_model.bin
tokenizer config file saved in ../.ca

TrainOutput(global_step=1040, training_loss=0.5528511846581331, metrics={'train_runtime': 10510.2951, 'train_samples_per_second': 1.574, 'train_steps_per_second': 0.099, 'total_flos': 675013670845236.0, 'train_loss': 0.5528511846581331, 'epoch': 20.0})

In [76]:
id2label = {idx: label for idx, label in enumerate(trainset.id2label)}

model = AutoModelForTokenClassification.from_pretrained("camembert-base", num_labels=len(jdf.id2label), id2label=id2label, label2id=trainset.label2id, cache_dir=CACHE_DIR)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream tas

In [81]:
training_args.output_dir

'../.cache/results/train_vaL_split'

In [104]:
import torch
model.load_state_dict(torch.load('../.cache/results/train_jdf_val_jdr/checkpoint-1040/pytorch_model.bin'))

<All keys matched successfully>

### Inférence

In [105]:
JDA_PATH = path.join(DATA_PATH, 'JDA.json')

with open(JDA_PATH, 'r') as f:
    data['jda'] = json.load(f)

In [126]:
data_jda = data['jda']

def most_frequent(List):
    if 'O' in List and len(set(List)) > 1:
        List = [i for i in List if i != 'O']
    return max(set(List), key = List.count)

for idx_sent, sentence in enumerate(data_jda):
    tokens = tokenizer(sentence['text'], return_offsets_mapping=True, return_tensors='pt')
    offsets = tokens.pop('offset_mapping')
    offsets = offsets.squeeze()
    results = model(**tokens)
    label_bert = results.logits.squeeze().argmax(dim=1)
    label_bert = torch.cat([offsets, label_bert.unsqueeze(dim=1)], dim=1)
    idx_predict = 0
    
    annotations = list()
    
    for entity in sentence['annotations']:
        
        
        while label_bert[idx_predict][0] < entity['begin'] or label_bert[idx_predict][1] < entity['end']:
            idx_predict += 1
        
        predictions = list()
        while idx_predict < len(label_bert) and label_bert[idx_predict][0] < entity['end']:
            entity_label = int(label_bert[idx_predict, 2])
            entity_label = trainset.id2label[entity_label]
            entity_label = entity_label if len(entity_label) == 1 else entity_label[2:]
            predictions.append(entity_label)
            idx_predict += 1
        
        entity['label_bert'] = most_frequent(predictions)
        #print('predictions', predictions, 'decision=',entity['label_bert'])
        #if entity['label_bert'] == 'O':
            #print('='*50)
            #print('Predicted as O: ',entity)
        annotations.append(entity)
    #if idx_sent > 100:  
        #break
    data_jda[idx_sent]['annotations'] = annotations

data {'input_ids': [[5, 29708, 160, 491, 4260, 11739, 8591, 981, 10, 7015, 537, 3197, 958, 11592, 14330, 90, 11339, 1604, 17277, 9, 427, 14330, 90, 11339, 1604, 17277, 9, 427, 14330, 90, 11339, 1604, 17277, 9, 427, 13164, 14330, 90, 11339, 1604, 17277, 9, 427, 325, 5635, 839, 2027, 2263, 21, 29684, 3694, 25313, 26, 1305, 26, 10922, 3455, 9, 2121, 9, 2828, 9, 4959, 9, 1767, 6]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'offset_mapping': [[(0, 0), (0, 6), (7, 8), (8, 11), (11, 15), (16, 27), (28, 30), (30, 33), (33, 34), (34, 39), (40, 41), (42, 44), (44, 46), (46, 54), (55, 58), (58, 60), (60, 62), (62, 64), (64, 68), (68, 69), (69, 71), (72, 75), (75, 77), (77, 79), (79, 81), (81, 85), (85, 86), (86, 88), (89, 92), (92, 94), (94, 96), (96, 98), (98, 102), (102, 103), (103, 105), (106, 109), (110, 113), (113, 1

ValueError: max() arg is an empty sequence

In [157]:
def most_frequent(List):
    return max(set(List), key = List.count)

for idx_sent, sentence in enumerate(data_jda):
    tokens = tokenizer(sentence['text'], return_offsets_mapping=True, return_tensors='pt')
    offsets = tokens.pop('offset_mapping')
    offsets = offsets.squeeze()
    results = model(**tokens)
    label_bert = results.logits.squeeze().argmax(dim=1)
    label_bert = torch.cat([offsets, label_bert.unsqueeze(dim=1)], dim=1)
    idx_predict = 0
    
    annotations = list()
    
    for entity in sentence['annotations']:
        
        predictions = list()
        
        while label_bert[idx_predict][0] < entity['begin'] or label_bert[idx_predict][1] < entity['end']:
            idx_predict += 1
        
        while label_bert[idx_predix][0] < entity['end']:
            entity_label = int(label_bert[idx_predict, 2])
            entity_label = jdf.id2label[entity_label]
            entity_label = entity_label if len(entity_label) == 1 else entity_label[2:]
            predictions.append(entity_label)
        
        
        entity['label_bert'] = most_frequent(predictions)
        annotations.append(entity)
        
    data_jda[idx_sent]['annotations'] = annotations

data {'input_ids': [[5, 29708, 160, 491, 4260, 11739, 8591, 981, 10, 7015, 537, 3197, 958, 11592, 14330, 90, 11339, 1604, 17277, 9, 427, 14330, 90, 11339, 1604, 17277, 9, 427, 14330, 90, 11339, 1604, 17277, 9, 427, 13164, 14330, 90, 11339, 1604, 17277, 9, 427, 325, 5635, 839, 2027, 2263, 21, 29684, 3694, 25313, 26, 1305, 26, 10922, 3455, 9, 2121, 9, 2828, 9, 4959, 9, 1767, 6]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'offset_mapping': [[(0, 0), (0, 6), (7, 8), (8, 11), (11, 15), (16, 27), (28, 30), (30, 33), (33, 34), (34, 39), (40, 41), (42, 44), (44, 46), (46, 54), (55, 58), (58, 60), (60, 62), (62, 64), (64, 68), (68, 69), (69, 71), (72, 75), (75, 77), (77, 79), (79, 81), (81, 85), (85, 86), (86, 88), (89, 92), (92, 94), (94, 96), (96, 98), (98, 102), (102, 103), (103, 105), (106, 109), (110, 113), (113, 1

In [127]:
with open(path.join('jda_bert.json'), 'w') as f:
    json.dump(data_jda, f)

Classify subset of entity

In [55]:
for batch in df_jdr:
    print(batch)
    break

input_ids


In [None]:
target_class = {'Human', 'Project', 'Organization', 'Reference_User'}
new_tokens = list(set(train.id2label) - target_class)

tokenizer.add_tokens(list(new_tokens))

model.resize_token_embeddings(len(tokenizer))

## Rectify Organization with email

In [24]:
with open('preds_final.json', 'r') as f:
    data['final'] = json.load(f)

In [60]:
import re
data_jda = data['final'].copy()
cpt_replace = 0

for idx_sent, sentence in enumerate(data_jda):
    
    annotations = list()
    organization_name = list()
    
    for entity in sentence['annotations']:
        if entity['label'] == 'Email':
            result = re.search('@(.*)\.', entity['form'])
            if result is not None:
                organization_name.append(result.group(1))
                
    organization_name = set(organization_name)                
        
    for organ in organization_name:
        for entity in sentence['annotations']:
            if organ in entity['form'].lower().replace(' ','') and entity['label'] != 'Email':
                entity['label_rectify'] = 'Organization'
                
                print('organization name: ', organization_name)
                print('entity detected: ', entity['form'].lower())
                
                cpt_replace += 1
                print(entity)
                
    if cpt_replace > 0:
        break

In [45]:
import re

s = 'a=5;iwantthis123jasd'
result = re.search('asdf=5;(.*)123jasd', s)
print(result.group(1))

AttributeError: 'NoneType' object has no attribute 'group'

In [46]:
result