In [1]:
import json

with open('translated-ontonotes5.json') as fopen:
    data = json.load(fopen)

In [2]:
len(data)

107361

In [3]:
from bs4 import BeautifulSoup
from tqdm import tqdm
import re

In [4]:
number = r"\b\d+(?:[\.,']\d+)?\b"
percentage = number + '%'
money = r"(?:(?:[$€£¢]|RM|rm)\s*\d+(?:[\.,']\d+)?(?:[MmKkBb](?:n|(?:il(?:lion)?))?)?)|(?:\d+(?:[\.,']\d+)?(?:[MmKkBb](?:n|(?:il(?:lion)?))?)?\s*(?:[$€£¢]|sen|ringgit|cent|penny))"
time = r'(?:(?:\d+)?\.?\d+\s*(?:AM|PM|am|pm|a\.m\.|p\.m\.))|(?:(?:[0-2]?[0-9]|[2][0-3]):(?:[0-5][0-9])(?::(?:[0-5][0-9]))?(?: ?(?:AM|PM|am|pm|a\.m\.|p\.m\.))?)'
_short_date = r'(?:\b(?<!\d\.)(?:(?:(?:[0123]?[0-9][\.\-\/])?[0123]?[0-9][\.\-\/][12][0-9]{3})|(?:[0123]?[0-9][\.\-\/][0123]?[0-9][\.\-\/][12]?[0-9]{2,3}))(?!\.\d)\b)'
_full_date_parts = [
    # prefix
    r'(?:(?<!:)\b\'?\d{1,4},? ?)',
    r'\b(?:[Jj]an(?:uari)?|[Ff]eb(?:ruari)?|[Mm]a(?:c)?|[Aa]pr(?:il)?|[Mm]ei|[Jj]u(?:n)?|[Jj]ula(?:i)?|[Aa]ug(?:ust)?|[Oo]gos|[Ss]ept?(?:ember)?|[Oo]kt(?:ober)?|[Nn]ov(?:ember)?|[Dd]is(?:ember)?)\b',
    r'(?:(?:,? ?\'?)?\d{1,4}(?:st|nd|rd|n?th)?\b(?:[,\/]? ?\'?\d{2,4}[a-zA-Z]*)?(?: ?- ?\d{2,4}[a-zA-Z]*)?(?!:\d{1,4})\b)',
]
_fd1 = '(?:{})'.format(
    ''.join(
        [_full_date_parts[0] + '?', _full_date_parts[1], _full_date_parts[2]]
    )
)
_fd2 = '(?:{})'.format(
    ''.join(
        [_full_date_parts[0], _full_date_parts[1], _full_date_parts[2] + '?']
    )
)
date = '(?:' + '(?:' + _fd1 + '|' + _fd2 + ')' + '|' + _short_date + ')'
repeat_puncts =  r'([!?.]){2,}'
quotes = r'\"(\\.|[^\"]){2,}\"'
word = r'(?:[\w_]+)'
hypen = r'\w+(?:-\w+)+'
hypen_left = r'\w+(?: -\w+)+'
hypen_right = r'\w+(?:- \w+)+'
hypen_both = r'\w+(?: - \w+)+'
pipeline = [hypen, hypen_left, hypen_right, hypen_both, percentage, money, time, date, repeat_puncts,
           number, word, '<ENAMEX.*?>(.+?)<.*?/.*?ENAMEX>',]
pipeline.append('(?:\S)')
tok = re.compile(r'({})'.format('|'.join(pipeline)))

In [5]:
tok.findall('memancing sial! 1:00 pagi')

[('memancing', '', ''),
 ('sial', '', ''),
 ('!', '', ''),
 ('1:00', '', ''),
 ('pagi', '', '')]

In [6]:
categories = ['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW',
        'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'OTHER', 'PERCENT',
        'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']

mapping = {'ACARA': 'EVENT', 'BAHASA': 'LANGUAGE', 'GEN': 'GPE',
           'KADINAL': 'CARDINAL', 'KUANTITI': 'QUANTITY',
           'Kawasan': 'LOC', 'MASA': 'TIME', 'ORANG': 'PERSON',
          'PERATUS': 'PERCENT', 'PERIBADI': 'PERSON',
          'PERISTIWA': 'EVENT', 'PRODUK': 'PRODUCT', 'TARIKH': 'DATE',
          'UNDANG': 'LAW', 'WAKTU': 'TIME', 'WANG': 'MONEY', 'WONE': 'MONEY'}

In [7]:
tok.findall(data[0][1][1])

[('<ENAMEX TYPE = "DATE"> Gerakan hari ini </ENAMEX>',
  '',
  ' Gerakan hari ini '),
 ('hanya', '', ''),
 ('mendengar', '', ''),
 ('sebahagian', '', ''),
 ('daripada', '', ''),
 ('pertengkaran', '', ''),
 ('awal', '', ''),
 ('.', '', '')]

In [8]:
tags = []
reject = '<>\\{}/`'
reject_words = ['DOC']
for t in tqdm(data):
    if t[1] is None:
        continue
    a = t
    for t in tok.findall(t[1][1]):
        if '<ENAMEX' in t[0] and 'ENAMEX>' in t[0]:
#             cat = re.findall('(?:[\w_]+)', t[0].split('>')[0])[-1]
#             bs = BeautifulSoup(t[0])
#             cat = bs.enamex.attrs.get('type', bs.enamex.attrs.get('jenis'))
#             if cat is None:
                
#                 print(t, cat)
            cat = re.findall('(?:[\w_]+)', t[0].split('">')[0].split('S_OFF')[0].split('E_OFF')[0])[-1]
            text = t[-1]
        else:
            cat = 'OTHER'
            text = t[0]
        if 'ENAMEX' in text:
            continue
        
        text = text.strip()
        text = [t[0] for t in tok.findall(text) if t[0] not in reject and t[0] not in reject_words]
        cat = cat.strip()
        if cat not in categories:
            try:
                new_cat = re.findall('(?:[\w_]+)', t[0].split('">')[0].split('S_OFF')[0].split('E_OFF')[0])[2]
                cat = new_cat
            except:
                cat = 'OTHER'
        if cat not in categories:
            cat = 'OTHER'
        cat = mapping.get(cat, cat)
        tags.extend([(t, cat) for t in text])

100%|██████████| 107361/107361 [00:13<00:00, 7809.72it/s]


In [9]:
tags[:10]

[('Gerakan', 'DATE'),
 ('hari', 'DATE'),
 ('ini', 'DATE'),
 ('hanya', 'OTHER'),
 ('mendengar', 'OTHER'),
 ('sebahagian', 'OTHER'),
 ('daripada', 'OTHER'),
 ('pertengkaran', 'OTHER'),
 ('awal', 'OTHER'),
 ('.', 'OTHER')]

In [10]:
len(tags)

1815562

In [11]:
test_index = int(0.1 * 1815562)
test_index

181556

In [12]:
train = tags[:-test_index]
test = tags[-test_index:]

In [14]:
import numpy as np

In [15]:
tagging = [t[1] for t in train]
np.unique(tagging, return_counts = True)

(array(['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW',
        'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'OTHER', 'PERCENT',
        'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART'],
       dtype='<U11'),
 array([  17180,   39578,    2700,    2941,   32172,     592,    1141,
           4096,    9354,   12803,    2881,   56691, 1391417,    5596,
          40814,    1827,    3280,    4324,    4619]))

In [16]:
tagging = [t[1] for t in test]
np.unique(tagging, return_counts = True)

(array(['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW',
        'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'OTHER', 'PERCENT',
        'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART'],
       dtype='<U11'),
 array([  2067,   6089,    109,    284,   3651,      8,    229,    447,
          1699,    985,    202,  10159, 149218,   1436,   3643,    218,
           368,    323,    421]))

In [None]:
with open('processed-train-ontonotes5.json', 'w') as fopen:
    json.dump(train, fopen)

In [None]:
with open('processed-test-ontonotes5.json', 'w') as fopen:
    json.dump(test, fopen)