In [22]:
from tqdm.auto import tqdm
import re
def make_ner_sample(sample,
                    tokenizer,
                    label_dict,
                    lang = 'en',
                    text_name = 'text',
                    label_field = 'predictions',
                    label_list = None
                    ):
    """Tokenize one annotated sample and attach exactly one label id per token.

    The text is read from ``sample['data'][f'{text_name}_{lang}']`` and the
    annotations from ``sample[label_field][0]['result']``. Each annotation is
    expected to be a mapping with a ``'from_name'`` entry and a ``'value'``
    mapping holding ``'start'``, ``'end'`` (character offsets) and ``'labels'``;
    annotations missing any of these are skipped.

    Args:
        sample: Mapping with a ``'data'`` entry and a ``label_field`` entry.
        tokenizer: Callable tokenizer whose result supports ``token_to_chars``
            and which exposes ``decode``, ``cls_token_id``, ``sep_token_id``
            and ``pad_token_id``.
        label_dict: Mapping from tag strings (``'O'``, ``'B-X'``, ``'I-X'``)
            to integer ids.
        lang: Language suffix; selects the text field and keeps only
            annotations whose ``'from_name'`` equals ``f'label_{lang}'``.
        text_name: Prefix of the text field inside ``sample['data']``.
        label_field: Key of ``sample`` holding the annotation sets.
        label_list: Entity types to tag. ``None`` (the default) means every
            entity type that has both a ``'B-'`` and an ``'I-'`` entry in
            ``label_dict``. An explicit container is used as given, so an
            explicit empty list tags nothing.

    Returns:
        The tokenizer output, updated in place with a ``'labels'`` list that
        has one entry per input id: ``-100`` for CLS/SEP/padding and any other
        token with no character span, otherwise the id of the token's tag.

    Raises:
        KeyError: If a produced tag is missing from ``label_dict``.
    """
    sample_text = sample['data'][f'{text_name}_{lang}']
    tokens = tokenizer(sample_text,
                       truncation = True,
                       padding = 'max_length'
                       )

    if label_list is None:
        # No explicit filter: accept every entity type label_dict can encode,
        # so no produced tag can be missing from the mapping.
        allowed_labels = {name[2:] for name in label_dict
                          if isinstance(name, str)
                          and name.startswith('B-')
                          and 'I-' + name[2:] in label_dict}
    else:
        allowed_labels = label_list

    # Ask the tokenizer for its own special ids instead of assuming BERT's
    # 101/102, so other vocabularies are handled as well.
    ignored_ids = {tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id}
    punctuation = re.compile(r'[.,;]')  # compiled once, not once per annotation
    target_field = f'label_{lang}'
    annotation_sets = sample[label_field]
    # A sample without any annotation set is treated as having no entities.
    results = (annotation_sets[0]['result'] or []) if annotation_sets else []

    labels = []
    previous_match_span = (-1, -1)
    for i, token_id in enumerate(tokens['input_ids']):
        span = tokens.token_to_chars(i)
        # token_to_chars yields None for tokens that map to no characters.
        if span is None or int(token_id) in ignored_ids:
            labels.append(-100)
            continue

        tag = 'O'
        # Tokens containing '.', ',' or ';' are always left outside entities.
        if punctuation.search(tokenizer.decode(token_id)) is None:
            for result in results:
                value = result.get('value')
                if not value or result.get('from_name') != target_field:
                    continue
                entity_labels = value.get('labels')
                if not entity_labels or entity_labels[0] not in allowed_labels:
                    continue
                match_span = (value['start'], value['end'])
                if span.start >= match_span[0] and span.end <= match_span[1]:
                    # Same annotation as the previously tagged token means
                    # this token continues it; otherwise it starts a new one.
                    prefix = 'I-' if match_span == previous_match_span else 'B-'
                    tag = prefix + entity_labels[0]
                    previous_match_span = match_span
                    # First matching annotation wins: exactly one label per
                    # token keeps labels aligned with input_ids.
                    break
        labels.append(tag)

    labels = [int(label_dict[label] if isinstance(label, str) else label) for label in labels]
    tokens.update({'labels': labels})
    return tokens

def get_ner_classes(data=None, label_field='predictions', raw_labels=None):
    """Build the BIO tag inventory and its id mappings.

    When ``raw_labels`` is empty or ``None``, the entity types are collected
    from ``data['train']`` in order of first appearance, reading the first
    label of every non-empty ``value`` under ``line[label_field][...]['result']``.

    Returns:
        A 4-tuple ``(raw_labels, label_list, label2id, id2label)`` where
        ``label_list`` is ``'O'`` followed by ``'B-'``/``'I-'`` variants of
        each raw label, ``label2id`` maps tag to index and ``id2label`` is
        its inverse.
    """
    if not raw_labels:
        raw_labels = []
        for record in tqdm(data['train']):
            for annotation_set in record[label_field]:
                for item in annotation_set['result']:
                    if not item['value']:
                        continue
                    entity_type = item['value']['labels'][0]
                    if entity_type not in raw_labels:
                        raw_labels.append(entity_type)

    # 'O' always takes index 0; each entity type contributes B- then I-.
    label_list = ['O'] + [
        prefix + entity_type
        for entity_type in raw_labels
        for prefix in ('B-', 'I-')
    ]

    label2id = {tag: index for index, tag in enumerate(label_list)}
    id2label = {index: tag for tag, index in label2id.items()}

    return raw_labels, label_list, label2id, id2label

In [23]:
import json
json_path = '/home/pgajo/food/data/GZ/GZ-GOLD/GZ-GOLD-NER-ALIGN_105.json'
# Decode explicitly as UTF-8 so the non-ASCII recipe text does not depend on
# the machine's locale encoding.
with open(json_path, encoding='utf-8') as f:
    data = json.load(f)

sample = data[174]
print(sample['data']['ingredients_it'])
# Print the text covered by every Italian annotation of the first annotation
# set; results without a 'from_name' key are skipped.
for result in sample['annotations'][0]['result']:
    if result.get('from_name') == 'label_it':
        print(sample['data']['ingredients_it'][result['value']['start']:result['value']['end']])

Rigatoni 500 g;Passata di pomodoro 800 g;Provola 200 g;Parmigiano Reggiano DOP da grattugiare 200 g;Mortadella 160 g;Salame 130 g;Mozzarella 130 g;Uova 4;Cipolle piccola 1;Aglio 1 spicchio;Basilico q.b.;Timo q.b.;Olio extravergine d'oliva q.b.;Sale fino 1 pizzico
Rigatoni
Passata di pomodoro
Provola
Parmigiano Reggiano DOP
Mortadella
Salame
Mozzarella
Uova
Cipolle
Aglio
Basilico
Timo
Olio extravergine d'oliva
Sale
500
800
200
200
160
130
130
4
1
1
q.b.
q.b.
q.b.
1
g
g
g
g
g
g
g
pizzico
piccola
fino
spicchio
da grattugiare


In [27]:
from datasets import load_from_disk
from transformers import AutoTokenizer

data_name_test = '/home/pgajo/food/datasets/GZ-GOLD-NER-ALIGN_105_spaced_testonly'
dataset_test_raw = load_from_disk(data_name_test)
# Bare entity types. get_ner_classes returns them unchanged as `raw_labels`
# and rebinds `label_list` to the expanded O/B-/I- tag list.
label_list = ['QUANTITY', 'FOOD', 'UNIT', 'COLOR', 'PHYSICAL_QUALITY', 'PROCESS', 'PURPOSE', 'TASTE', 'PART']
raw_labels, label_list, label2id, id2label = get_ner_classes(raw_labels=label_list)
sample = dataset_test_raw['train'][174]

tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
# Pass the entity types explicitly so the tagging filter matches the label set
# that label2id was built from.
make_ner_sample(sample, tokenizer, label2id, label_field='annotations', label_list=raw_labels)


{'input_ids': [101, 34466, 54193, 123, 41506, 10107, 113, 10757, 175, 114, 132, 78079, 10340, 33166, 10112, 126, 41506, 10107, 113, 11900, 175, 114, 132, 14021, 72702, 10262, 13565, 128, 54434, 113, 10777, 175, 114, 132, 37475, 95734, 107990, 95734, 65535, 11403, 10262, 13565, 128, 54434, 113, 10777, 175, 114, 118, 63706, 11912, 132, 56027, 93429, 10330, 126, 119, 127, 54434, 113, 13849, 175, 114, 132, 27162, 10500, 126, 119, 127, 54434, 113, 13389, 175, 114, 132, 34987, 55306, 11083, 10262, 13565, 125, 119, 127, 54434, 113, 13389, 175, 114, 132, 95574, 10107, 125, 132, 60781, 13326, 122, 118, 12474, 132, 69699, 47234, 10350, 122, 171, 73477, 132, 59333, 10114, 84298, 132, 51635, 55460, 10114, 84298, 132, 50821, 13953, 18823, 105133, 21073, 10114, 84298, 132, 24846, 44253, 122, 59227, 10269, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
# Print the text covered by every English annotation of the first annotation
# set; results without a 'from_name' key are skipped.
for result in sample['annotations'][0]['result']:
    if result.get('from_name') != 'label_en':
        continue
    print(sample['data']['text_en'][result['value']['start']:result['value']['end']])