In [10]:
from datasets import load_dataset
import json
import os

# joelniklaus's MAPA dataset for out-of-distribution (OOD) evaluation

Source: <https://huggingface.co/datasets/joelniklaus/mapa>

## utils

In [11]:
# Output directory for every exported evaluation file in this notebook.
test_dir = "test_data_all"
# The export cells open files inside this directory with mode 'w', which does
# not create missing parent directories, so make sure it exists up front.
os.makedirs(test_dir, exist_ok=True)

In [12]:
def ner_tags_to_spans(samples, coarse=True):
    """
    Convert the B-/I-/O NER tags of one dataset sample into entity spans.

    Args:
        samples (dict): A single sample. Must contain "tokens" and the tag
            sequence selected by `coarse` ("coarse_grained" or "fine_grained").
        coarse (bool): If True (default) read the "coarse_grained" tags,
            otherwise read the "fine_grained" tags.

    Returns:
        dict: {"tokenized_text": <tokens>, "ner": <spans>}. Every span is a
        list [start, end, entity_type] where start/end are inclusive indices
        into the tag sequence and entity_type is the lower-cased tag name
        without its "B-" prefix.
    """
    key = "coarse_grained" if coarse else "fine_grained"
    ner_labels = samples[key]

    spans = []
    start_pos = None
    entity_name = None

    for i, label in enumerate(ner_labels):
        opens_entity = label.startswith("B-")
        if label == "O" or opens_entity:
            # Both 'O' and a new 'B-' tag terminate the entity that is
            # currently open (if any); it ended on the previous position.
            if entity_name is not None:
                # Always emit lists so every span has the same type, both in
                # memory and after a JSON round-trip.
                spans.append([start_pos, i - 1, entity_name])
                entity_name = None
                start_pos = None
            if opens_entity:
                entity_name = label[2:].lower()
                start_pos = i
        # 'I-' tags (and any unrecognised tag) neither open nor close an
        # entity; an open entity simply keeps extending over them.

    # Handle the last entity if the sentence ends with an entity. The end
    # index comes from the tag sequence, like every other index above.
    if entity_name is not None:
        spans.append([start_pos, len(ner_labels) - 1, entity_name])

    return {"tokenized_text": samples["tokens"], "ner": spans}

In [13]:
# Load only the "test" split of the dataset from the Hugging Face Hub.
mapa = load_dataset(
    "joelniklaus/mapa",
    split="test",
)

In [14]:
# Display the dataset summary (feature names and row count).
mapa

Dataset({
    features: ['language', 'type', 'file_name', 'sentence_number', 'tokens', 'coarse_grained', 'fine_grained'],
    num_rows: 10590
})

In [15]:
# Sanity check: show the extracted spans next to the raw coarse-grained tags
# of the same sample so the conversion can be verified by eye.
sample_row = mapa[5]
sample_spans = ner_tags_to_spans(sample_row)['ner']
print(sample_spans)

print(sample_row['coarse_grained'])

[[3, 5, 'organisation']]
['O', 'O', 'O', 'B-ORGANISATION', 'I-ORGANISATION', 'I-ORGANISATION']


In [16]:
# Export the coarse-grained spans, one JSON file per language.
# For each language: print the number of samples and the first converted
# sample, then write the full list to <test_dir>/mapa_<lang>_coarse.json.
coarse_by_language = {}
for lang in ("de", "fr", "it"):
    lang_samples = [
        ner_tags_to_spans(row, coarse=True)
        for row in mapa
        if row['language'] == lang
    ]
    print(len(lang_samples))
    print(lang_samples[0])

    with open(os.path.join(test_dir, f"mapa_{lang}_coarse.json"), 'w') as f:
        json.dump(lang_samples, f)

    coarse_by_language[lang] = lang_samples

# Keep the per-language variable names available for the rest of the notebook.
data_de_coarse = coarse_by_language["de"]
data_fr_coarse = coarse_by_language["fr"]
data_it_coarse = coarse_by_language["it"]

558
{'tokenized_text': ['SCHLUSSANTRÄGE', 'DES', 'GENERALANWALTS'], 'ner': []}
490
{'tokenized_text': ['CONCLUSIONS', 'DE', 'L', '’', 'AVOCAT', 'GÉNÉRAL', 'M', '.'], 'ner': [[4, 7, 'person']]}
550
{'tokenized_text': ['CONCLUSIONI', 'DELL', '’', 'AVVOCATO', 'GENERALE'], 'ner': []}


In [17]:
# Export the fine-grained spans, one JSON file per language.
# For each language: print the number of samples and the first converted
# sample, then write the full list to <test_dir>/mapa_<lang>_fine.json.
fine_by_language = {}
for lang in ("de", "fr", "it"):
    lang_samples = [
        ner_tags_to_spans(row, coarse=False)
        for row in mapa
        if row['language'] == lang
    ]
    print(len(lang_samples))
    print(lang_samples[0])

    with open(os.path.join(test_dir, f"mapa_{lang}_fine.json"), 'w') as f:
        json.dump(lang_samples, f)

    fine_by_language[lang] = lang_samples

# Keep the per-language variable names available for the rest of the notebook.
data_de_fine = fine_by_language["de"]
data_fr_fine = fine_by_language["fr"]
data_it_fine = fine_by_language["it"]

558
{'tokenized_text': ['SCHLUSSANTRÄGE', 'DES', 'GENERALANWALTS'], 'ner': []}
490
{'tokenized_text': ['CONCLUSIONS', 'DE', 'L', '’', 'AVOCAT', 'GÉNÉRAL', 'M', '.'], 'ner': [(4, 5, 'role'), [6, 7, 'title']]}
550
{'tokenized_text': ['CONCLUSIONI', 'DELL', '’', 'AVVOCATO', 'GENERALE'], 'ner': [[3, 4, 'role']]}


In [18]:
from utils import get_all_labels

In [19]:
# Show the entity labels found in the German coarse-grained export
# (get_all_labels comes from the local utils module; its exact semantics are
# not visible here — TODO confirm it returns each distinct label once).
get_all_labels(data_de_coarse)

['person', 'date', 'organisation', 'address', 'amount']

In [23]:
# Show the entity labels found in the French fine-grained export
# (get_all_labels comes from the local utils module; its exact semantics are
# not visible here — TODO confirm it returns each distinct label once).
get_all_labels(data_fr_fine)

['role',
 'title',
 'family name',
 'day',
 'month',
 'year',
 'country',
 'profession',
 'age',
 'url',
 'nationality',
 'value',
 'unit',
 'territory',
 'place']