In [14]:
import json

# Read the JSON file (despite the '.txt' extension, the content is parsed as JSON).
# The encoding is pinned to UTF-8: the texts contain non-ASCII characters
# (e.g. 'ö'), and without it open() falls back to the platform's locale
# encoding, which can mis-decode or fail on such input.
with open('ood_data/names.txt', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Number of top-level entries that were loaded
len(data)

48

In [15]:
from utils import get_all_labels

# Get all the labels reported by get_all_labels for the loaded data.
# In the recorded run this also lists labels ('amount', 'date of loss') other
# than the three ('person', 'org', 'loc') that filter_ood_data keeps.
labels = get_all_labels(data)
labels

['person', 'org', 'loc', 'amount', 'date of loss']

In [16]:
# view examples: show the annotated entities of the first item
from utils import view
# NOTE(review): in the recorded output every span shows one extra trailing
# token (e.g. 'Thomas', 'arbeitet'), which suggests the raw end index is
# exclusive while view() treats it as inclusive — confirm against utils.view.
view(data[0]['tokenized_text'], data[0]['ner'])

person: ['Thomas', 'arbeitet']
org: ['SBB', 'in']
loc: ['Aarau', '.']
loc: ['Sch', 'ö', 'nenwerd', 'versetzt']


In [17]:
def helper_map(label):
    """Translate a raw NER label into its 'a_*' output label.

    Args:
        label: Raw label; 'person', 'org' and 'loc' are supported.

    Returns:
        'a_name', 'a_organisation' or 'a_place', respectively.

    Raises:
        ValueError: If ``label`` is none of the supported values.
    """
    renames = (
        ('person', 'a_name'),
        ('org', 'a_organisation'),
        ('loc', 'a_place'),
    )
    for raw, target in renames:
        if label == raw:
            return target
    raise ValueError(f'Unknown label: {label}')

def helper_map_baseline(label):
    """Translate a raw NER label into its baseline output label.

    Args:
        label: Raw label; 'person', 'org' and 'loc' are supported.

    Returns:
        'person', 'organization' or 'place', respectively.

    Raises:
        ValueError: If ``label`` is none of the supported values.
    """
    renames = (
        ('person', 'person'),
        ('org', 'organization'),
        ('loc', 'place'),
    )
    for raw, target in renames:
        if label == raw:
            return target
    raise ValueError(f'Unknown label: {label}')

def filter_ood_data(data, helper_map):
    """Keep only person/org/loc annotations and rename their labels.

    Each annotation is read positionally as ``(start, end, label)``.
    Annotations whose label is not 'person', 'org' or 'loc' are dropped. The
    remaining ones get their label translated through ``helper_map`` and
    their end index lowered by one (the intent being that both span bounds
    are inclusive). Items left without any annotation are omitted.

    Args:
        data: Iterable of dicts with 'tokenized_text' and 'ner' entries.
        helper_map: Callable that maps a raw label to the output label.

    Returns:
        A new list of dicts containing only 'tokenized_text' (the same object
        as in the input item) and 'ner', a list of
        ``(start, end - 1, mapped_label)`` tuples.
    """
    wanted = ['person', 'org', 'loc']

    kept_items = []
    for example in data:
        spans = [
            (span[0], span[1] - 1, helper_map(span[2]))
            for span in example['ner']
            if span[2] in wanted
        ]
        # Skip examples that have no person/org/loc annotation at all.
        if not spans:
            continue
        kept_items.append({'tokenized_text': example['tokenized_text'], 'ner': spans})

    return kept_items

# Apply the same filtering twice, once per label vocabulary: the 'a_*' names
# from helper_map and the plain names from helper_map_baseline.
filtered_data = filter_ood_data(data, helper_map)
filtered_data_baseline = filter_ood_data(data, helper_map_baseline)
# Both counts are expected to match, since only the label names differ
# between the two runs.
len(filtered_data), len(filtered_data_baseline)

(48, 48)

In [20]:
# Write each filtered variant (regular and baseline) to its own JSON file,
# pretty-printed with a two-space indent.
for out_path, payload in (
    ('ood_data/test.json', filtered_data),
    ('ood_data/test_data_baseline.json', filtered_data_baseline),
):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file, indent=2)

