In [62]:
import json
import random
from sklearn.model_selection import train_test_split

In [63]:
# Load the raw annotated documents. Context managers close the file handles
# deterministically instead of leaving them to the garbage collector.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that the
# decoded text does not depend on the platform default; the annotation
# 'start'/'end' values are used as indices into this decoded text by later cells.
with open('../Dataset/NER_TRAIN_JUDGEMENT.json', encoding='utf-8') as train_file:
    data_train = json.load(train_file)
with open('../Dataset/NER_TEST_JUDGEMENT.json', encoding='utf-8') as test_file:
    data_test = json.load(test_file)

In [64]:
def get_token_indices(sentence, start, end):
    """Map a character span onto the space-separated tokens it covers.

    The sentence is tokenised with ``sentence.split(' ')`` (so consecutive
    spaces produce empty tokens), which keeps the returned indices aligned
    with any label list built from the same split.

    Args:
        sentence: The text the character offsets refer to.
        start: Character offset of the first character of the span.
        end: Character offset one past the last character of the span.

    Returns:
        A ``(start_index, end_index)`` tuple describing the half-open token
        range ``[start_index, end_index)``: ``start_index`` is the first token
        that extends past ``start`` and ``end_index`` is one past the last
        token that begins before ``end``. A token that is only partially
        covered (e.g. ``"Punjab),"`` for a span ending after ``"Punjab"``) is
        included. The two indices are equal only for an empty span.

    Raises:
        ValueError: If ``start`` lies at or beyond the end of the last token,
            i.e. the span does not touch any token of ``sentence``.
    """
    words = sentence.split(' ')
    # None (not 0) marks "start not located yet": 0 is a valid token index, and
    # using it as the sentinel made spans starting in the first token drift to 1.
    start_index = None
    offset = 0  # character offset of the current token within `sentence`
    for i, word in enumerate(words):
        if start_index is None and start < offset + len(word):
            start_index = i
        # The first token that begins at or after `end` is the exclusive bound.
        if start_index is not None and offset >= end:
            return start_index, i
        offset += len(word) + 1  # +1 skips the single separating space
    if start_index is None:
        raise ValueError(
            'character span (%r, %r) lies outside the sentence (length %d)'
            % (start, end, len(sentence))
        )
    # The span runs up to (or past) the final token.
    return start_index, len(words)

In [65]:
# Build, for every training document, a record holding the raw text and one
# label per space-separated token ('O' unless covered by an annotation).
data_train_preprocessed = {}

for doc in data_train:
    record = data_train_preprocessed[doc['id']] = {}
    text = doc['data']['text']
    record['text'] = text
    labels = record['labels'] = ['O'] * len(text.split(' '))

    # Only the first annotation set of each document is read.
    for annotation in doc['annotations'][0]['result']:
        span = annotation['value']
        start, end = get_token_indices(text, span['start'], span['end'])
        tag = span['labels'][0]
        # The first token of the entity gets the B_ prefix; any further tokens
        # up to (but excluding) `end` get I_. When start == end the range below
        # is empty, so only the B_ label is written.
        labels[start] = 'B_' + tag
        for pos in range(start + 1, end):
            labels[pos] = 'I_' + tag


In [66]:
# Build, for every test document, a record holding the raw text and one
# label per space-separated token ('O' unless covered by an annotation).
data_test_preprocessed = {}

for doc in data_test:
    record = data_test_preprocessed[doc['id']] = {}
    text = doc['data']['text']
    record['text'] = text
    labels = record['labels'] = ['O'] * len(text.split(' '))

    # Only the first annotation set of each document is read.
    for annotation in doc['annotations'][0]['result']:
        span = annotation['value']
        start, end = get_token_indices(text, span['start'], span['end'])
        tag = span['labels'][0]
        # The first token of the entity gets the B_ prefix; any further tokens
        # up to (but excluding) `end` get I_. When start == end the range below
        # is empty, so only the B_ label is written.
        labels[start] = 'B_' + tag
        for pos in range(start + 1, end):
            labels[pos] = 'I_' + tag

In [68]:
# Split the preprocessed training documents into train/validation subsets by
# document id, then write all three splits to disk as JSON.
train_ids = list(data_train_preprocessed.keys())
# NOTE: train_test_split shuffles by default, so this extra seeded shuffle is
# not needed for randomness. It is kept because removing it would change which
# ids end up in each split compared with previously generated files.
random.seed(42)
random.shuffle(train_ids)

train_ids, val_ids = train_test_split(train_ids, test_size=0.15, random_state=42)

# `doc_id` instead of `id` avoids shadowing the builtin inside the comprehension.
train_preprocessed = {doc_id: data_train_preprocessed[doc_id] for doc_id in train_ids}
val_preprocessed = {doc_id: data_train_preprocessed[doc_id] for doc_id in val_ids}

# Context managers guarantee each output file is flushed and closed as soon as
# it has been written, rather than whenever the file object is collected.
for out_path, split in (
    ('../Dataset/NER_train.json', train_preprocessed),
    ('../Dataset/NER_val.json', val_preprocessed),
    ('../Dataset/NER_test.json', data_test_preprocessed),
):
    with open(out_path, 'w', encoding='utf-8') as out_file:
        json.dump(split, out_file)