In [1]:
import json
import numpy as np
import pickle
from glob import glob

In [2]:
import json

# Load the base entity corpus into `ori`.
# The encoding is pinned to UTF-8: without it open() falls back to the
# platform locale, which can mis-decode (or fail on) any non-ASCII
# characters in the JSON text on machines whose default is not UTF-8.
with open('entities-data-v4.json', encoding='utf-8') as fopen:
    ori = json.load(fopen)

In [3]:
# Hold out the trailing 20% of ori['text'] as the test split.
# The cut point is an explicit non-negative index rather than a negative
# slice bound: when the corpus is too small for a hold-out
# (int(0.2 * n) == 0) a literal -0 bound would leave the training split
# empty and push every entry into the test split. With the explicit index
# everything stays in train and test is empty instead.
text_cut = len(ori['text']) - int(0.2 * len(ori['text']))
train_ori = ori['text'][:text_cut]
test_ori = ori['text'][text_cut:]

In [4]:
# Hold out the trailing labels, using the same hold-out size as the text
# split (20% of len(ori['text'])).
# The cut point is an explicit non-negative index rather than a negative
# slice bound: a hold-out size of 0 would otherwise turn into a -0 bound,
# leaving the training labels empty and putting every label in test.
# max(..., 0) keeps the old result when there are fewer labels than the
# hold-out size (empty train, all labels in test).
label_cut = max(len(ori['label']) - int(0.2 * len(ori['text'])), 0)
train_label = ori['label'][:label_cut]
test_label = ori['label'][label_cut:]

In [5]:
# Display the sizes of the training texts and training labels side by side
# (the two numbers are expected to match).
tuple(len(split) for split in (train_ori, train_label))

(52586, 52586)

In [6]:
# Collect every '*-augmentation.json' file in the working directory except
# the ones whose name contains 'entities-augmentation'.
# Note: glob() returns matches in filesystem order, which is not guaranteed
# to be the same on every machine.
augments = [
    path
    for path in glob('*-augmentation.json')
    if 'entities-augmentation' not in path
]
augments

['location-augmentation.json',
 'law-augmentation.json',
 'name-augmentation.json',
 'org-augmentation.json',
 'event-augmentation.json']

In [7]:
import itertools

# Sizes before augmentation, in the same column order as the per-file
# progress lines printed below: train text, train label, test text, test label.
print(len(train_ori), len(train_label), len(test_ori), len(test_label))

for f in augments:
    # Pin UTF-8 so decoding does not depend on the platform locale.
    with open(f, encoding='utf-8') as fopen:
        x = json.load(fopen)

    # Each split in the file is a list of sequences; flatten it and append
    # the items to the running splits. chain.from_iterable walks the outer
    # list lazily instead of unpacking every sequence as a separate call
    # argument, and extend() consumes the iterator directly, so no
    # intermediate flattened list is built.
    train_ori.extend(itertools.chain.from_iterable(x['train_X']))
    train_label.extend(itertools.chain.from_iterable(x['train_Y']))

    test_ori.extend(itertools.chain.from_iterable(x['test_X']))
    test_label.extend(itertools.chain.from_iterable(x['test_Y']))

    # Running totals after merging this file.
    print(f, len(train_ori), len(train_label), len(test_ori), len(test_label))

52586 52586 13146 13146
location-augmentation.json 4742529 4742529 1184573 1184573
law-augmentation.json 4778142 4778142 1193849 1193849
name-augmentation.json 5135500 5135500 1283292 1283292
org-augmentation.json 5282968 5282968 1320321 1320321
event-augmentation.json 5387355 5387355 1347110 1347110


In [8]:
# Display the sizes of the train and test texts after augmentation.
tuple(map(len, (train_ori, test_ori)))

(5387355, 1347110)

In [9]:
# Normalise the casing of capitalised entries: anything whose first
# character is uppercase is passed through str.title() (first letter of each
# word upper, the rest lower); everything else is left untouched.
# s[:1] is used instead of s[0] so that an empty string is passed through
# unchanged rather than raising IndexError ('' has no uppercase characters,
# so ''.isupper() is False).
train_ori = [s.title() if s[:1].isupper() else s for s in train_ori]
test_ori = [s.title() if s[:1].isupper() else s for s in test_ori]

In [10]:
# Map each tag name to its integer id. Ids are assigned by position in the
# tuple below, so 'PAD' is 0, 'X' is 1, 'OTHER' is 2, and so on.
tag2idx = {
    tag: idx
    for idx, tag in enumerate((
        'PAD',
        'X',
        'OTHER',
        'organization',
        'person',
        'time',
        'location',
        'quantity',
        'law',
        'event',
    ))
}

In [11]:
# Default number of consecutive items in one sliding window.
seq_len = 50


def iter_seq(x, window=None):
    """Return every contiguous window of `x`, advancing one item at a time.

    Parameters
    ----------
    x : sequence
        Anything that supports len() and slicing (list, str, ...).
    window : int, optional
        Number of items per window. Defaults to the module-level `seq_len`,
        read at call time.

    Returns
    -------
    list
        `len(x) - window + 1` slices of `x`, each `window` items long, in
        order of their start index. Empty when `x` is shorter than `window`.
    """
    size = seq_len if window is None else window
    # The stop bound is len(x) - size + 1 so that the final window (the one
    # ending on the last item) is included. Stopping at len(x) - size would
    # drop it, and would return nothing at all for an input of exactly
    # `size` items.
    return [x[start:start + size] for start in range(len(x) - size + 1)]


def to_train_seq(*args, window=None):
    """Apply `iter_seq` to each positional argument.

    Returns a list with one entry per argument, in the same order, so
    parallel sequences (e.g. tokens and their labels) can be unpacked
    together. `window` is forwarded to `iter_seq` unchanged.
    """
    return [iter_seq(x, window) for x in args]

In [None]:
# Slice the training texts and labels into aligned sliding windows.
train_windows = to_train_seq(train_ori, train_label)
train_X_seq = train_windows[0]
train_Y_seq = train_windows[1]

In [None]:
# Display the sizes of the test labels and test texts (labels first).
tuple(map(len, (test_label, test_ori)))

In [None]:
# Slice the test texts and labels into aligned sliding windows.
# NOTE(review): only the first 100 entries of each test split are windowed;
# the rest of the test data is not used here. Confirm this is intentional.
test_head_text = test_ori[:100]
test_head_label = test_label[:100]
test_X_seq, test_Y_seq = to_train_seq(test_head_text, test_head_label)

In [None]:
# Persist the windowed train/test splits as one pickled dict with the keys
# 'train_X', 'train_Y', 'test_X' and 'test_Y'.
with open('session-entities.pkl', 'wb') as fopen:
    payload = dict(
        train_X=train_X_seq,
        train_Y=train_Y_seq,
        test_X=test_X_seq,
        test_Y=test_Y_seq,
    )
    pickle.dump(payload, fopen)