In [2]:
prefix = 'https://f000.backblazeb2.com/file/malay-dataset/tagging/ontonotes5/'

urls = """
augmentation-address-ontonotes5.json
augmentation-event-ontonotes5.json
augmentation-fac-ontonotes5.json
augmentation-gpe-ontonotes5.json
augmentation-language-ontonotes5.json
augmentation-law-ontonotes5.json
augmentation-loc-ontonotes5.json
augmentation-norp-ontonotes5.json
augmentation-org-ontonotes5.json
augmentation-person-ontonotes5.json
augmentation-product-ontonotes5.json
augmentation-work-of-art-ontonotes5.json
ontonotes5-train-test.json
"""
urls = list(filter(None, urls.split('\n')))

import os

# uncomment to download
# for url in urls:
#     print(url)
#     os.system(f'wget {prefix}{url}')

In [3]:
import json

with open('ontonotes5-train-test.json') as fopen:
    data = json.load(fopen)
data.keys()

dict_keys(['train_X', 'train_Y', 'test_X', 'test_Y'])

In [4]:
train_X = data['train_X']
test_X = data['test_X']
train_Y = data['train_Y']
test_Y = data['test_Y']

In [5]:
d = [
    {'Tag': 'OTHER', 'Description': 'other'},
    {'Tag': 'ADDRESS', 'Description': 'Address of physical location.'},
    {'Tag': 'PERSON', 'Description': 'People, including fictional.'},
    {
        'Tag': 'NORP',
        'Description': 'Nationalities or religious or political groups.',
    },
    {
        'Tag': 'FAC',
        'Description': 'Buildings, airports, highways, bridges, etc.',
    },
    {
        'Tag': 'ORG',
        'Description': 'Companies, agencies, institutions, etc.',
    },
    {'Tag': 'GPE', 'Description': 'Countries, cities, states.'},
    {
        'Tag': 'LOC',
        'Description': 'Non-GPE locations, mountain ranges, bodies of water.',
    },
    {
        'Tag': 'PRODUCT',
        'Description': 'Objects, vehicles, foods, etc. (Not services.)',
    },
    {
        'Tag': 'EVENT',
        'Description': 'Named hurricanes, battles, wars, sports events, etc.',
    },
    {'Tag': 'WORK_OF_ART', 'Description': 'Titles of books, songs, etc.'},
    {'Tag': 'LAW', 'Description': 'Named documents made into laws.'},
    {'Tag': 'LANGUAGE', 'Description': 'Any named language.'},
    {
        'Tag': 'DATE',
        'Description': 'Absolute or relative dates or periods.',
    },
    {'Tag': 'TIME', 'Description': 'Times smaller than a day.'},
    {'Tag': 'PERCENT', 'Description': 'Percentage, including "%".'},
    {'Tag': 'MONEY', 'Description': 'Monetary values, including unit.'},
    {
        'Tag': 'QUANTITY',
        'Description': 'Measurements, as of weight or distance.',
    },
    {'Tag': 'ORDINAL', 'Description': '"first", "second", etc.'},
    {
        'Tag': 'CARDINAL',
        'Description': 'Numerals that do not fall under another type.',
    },
]
d = [d['Tag'] for d in d]
d = ['PAD', 'X'] + d
tag2idx = {i: no for no, i in enumerate(d)}
idx2tag = {no: i for no, i in enumerate(d)}

In [6]:
from glob import glob

augmented = glob('augmentation-*-ontonotes5.json')

for f in augmented:
    print(f)
    with open(f) as fopen:
        data = json.load(fopen)
        
    print(len(data.get('train_X', [])), len(data.get('train_Y', [])))
    print(len(data.get('test_X', [])), len(data.get('test_Y', [])))
    
    train_X.extend(data.get('train_X', []))
    train_Y.extend(data.get('train_Y', []))
    test_X.extend(data.get('test_X', []))
    test_Y.extend(data.get('test_Y', []))


augmentation-org-ontonotes5.json
35150 35150
6300 6300
augmentation-fac-ontonotes5.json
17040 17040
4260 4260
augmentation-loc-ontonotes5.json
13040 13040
2460 2460
augmentation-gpe-ontonotes5.json
21060 21060
0 0
augmentation-work-of-art-ontonotes5.json
4020 4020
1020 1020
augmentation-event-ontonotes5.json
1665 1665
0 0
augmentation-person-ontonotes5.json
37883 37883
2530 2530
augmentation-product-ontonotes5.json
7040 7040
1760 1760
augmentation-law-ontonotes5.json
12584 12584
3161 3161
augmentation-language-ontonotes5.json
6860 6860
0 0
augmentation-norp-ontonotes5.json
30660 30660
5940 5940
augmentation-address-ontonotes5.json
57502 57502
15106 15106


In [7]:
from malaya.text.bpe import WordPieceTokenizer

tokenizer = WordPieceTokenizer('BERT.wordpiece', do_lower_case = False)
# tokenizer.tokenize('halo nama sayacomel')

In [8]:
from tqdm import tqdm

def XY(strings):
    left_train, right_train = strings[0], strings[1]
    X, Y, MASK = [], [], []
    for i in tqdm(range(len(left_train))):
        left = [d for d in left_train[i]]
        right = [d for d in right_train[i]]
        bert_tokens = ['[CLS]']
        y = ['PAD']
        for no, orig_token in enumerate(left):
            t = tokenizer.tokenize(orig_token)
            bert_tokens.extend(t)
            if len(t):
                y.append(right[no])
            y.extend(['X'] * (len(t) - 1))
        bert_tokens.append('[SEP]')
        y.append('PAD')
        x = tokenizer.convert_tokens_to_ids(bert_tokens)
        y = [tag2idx[i] for i in y]
        input_mask = [1] * len(y)
        if len(x) != len(y):
            print(i)
        X.append(x)
        Y.append(y)
        MASK.append(input_mask)
    return X, Y

In [9]:
o = XY([train_X, train_Y])

100%|██████████| 571296/571296 [11:34<00:00, 822.33it/s] 


In [13]:
test_o = XY([test_X, test_Y])

100%|██████████| 78839/78839 [01:40<00:00, 780.89it/s]


In [15]:
len(o[0]), len(o[1]), len(test_o[0]), len(test_o[1])

(571296, 571296, 78839, 78839)

In [16]:
import pickle

with open('ontonotes5-fastformer.pkl', 'wb') as fopen:
    pickle.dump([o[0], o[1], test_o[0], test_o[1]], fopen)