In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import json

In [2]:
# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/crawl/ticket2u/ticketui.json

In [3]:
with open('ticketui.json') as fopen:
    ticket2u = json.load(fopen)

In [4]:
import re

def cjk_detect(texts):
    # korean
    if re.search("[\uac00-\ud7a3]", texts):
        return "ko"
    # japanese
    if re.search("[\u3040-\u30ff]", texts):
        return "ja"
    # chinese
    if re.search("[\u4e00-\u9FFF]", texts):
        return "zh"
    return None

In [5]:
names = [row['row']['name'].title() for row in ticket2u if cjk_detect(row['row']['name']) is None]
len(names)

3655

In [6]:
train_names, test_names = train_test_split(names, test_size = 0.2)

In [7]:
events = {
    1: ['Krismas', 'PRU14', 'Aidhilfitri', 'Aidhiladha', 'PRU', 'Ramadan'],
    2: ['Perayaan Krismas', 'Gawai Dayak', 'Perayaan Deepavali', 'Hari Natal',
       'Yom Kippur', 'Bar Mitzvah', 'Hari Wesak', 'Pesta Tanglung',
       'Hari Deepavali', 'Hari Thaipusam', 'Hari Thaiponggol',
       'Hari Gawai', 'Hari Kaamatan', 'Hari San Pedro',
       'Tahun Baru', 'Hari Kebangsaan',
       'Hari Malaysia', 'Hari Wilayah', 'Hari Pekerja'],
    3: ['Tahun Baru Cina', 'Perayaan Tadau Kaamatan', 'Hari Raya Aidilfitri', 'Hari Raya Aidiladha',
       'Hari Awal Muharram', 'Hari Maulidur Rasul', 'Hari Nuzul Quran', 'Hari Ahad Easter',
       'Hari Kuih Bulan', 'Perayaan Ching Ming', 'Hari Durga Puja',
       'Hari Hol negeri', 'Perayaan Hantu Lapar', 'Hari Kemerdekaan Malaysia'],
    4: ['Hari Raya Aidil Adha', 'Hari Raya Aidil Fitri', 'Hari Israk dan Mikraj',
       'Hari Chap Goh Mei', 'Hari Tahun Baru Cina', 'Hari Keputeraan Sultan-sultan Negeri'],
    6: ['Hari Kelahiran Yang di-Pertua-Yang di-Pertua Negeri'],
    8: ['Hari Keputeraan Seri Paduka Baginda Yang di-Pertuan Agong']
}

In [8]:
for k, v in events.items():
    print(k, [len(i.split()) for i in v])

1 [1, 1, 1, 1, 1, 1]
2 [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2]
3 [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
4 [4, 4, 4, 4, 4, 4]
6 [6]
8 [8]


In [9]:
import random

def generate_event(length):
    if length not in events:
        l = None
    else:
        l = events[length]
    return l

In [10]:
with open('entities-data-v4.json') as fopen:
    entities = json.load(fopen)

In [11]:
entities['label'][49573:49575], entities['text'][49573:49575]

(['OTHER', 'event'], ['ketibaan', 'Syawal'])

In [12]:
results = []
i = 0
while i < len(entities['label']):
    r = []
    if entities['label'][i] == 'event':
        while entities['label'][i] == 'event':
            r.append(i)
            i += 1
        print(r)
        results.append(r)
    i += 1

[47536, 47537]
[47994, 47995]
[48030, 48031, 48032]
[48711, 48712]
[48786, 48787]
[48799, 48800]
[48825, 48826, 48827, 48828, 48829, 48830, 48831, 48832]
[48866, 48867, 48868, 48869, 48870, 48871, 48872, 48873]
[48938, 48939]
[48973, 48974, 48975]
[49069, 49070, 49071, 49072]
[49185, 49186, 49187, 49188]
[49247]
[49537, 49538]
[49552, 49553]
[49574]
[49637, 49638, 49639]
[49669]
[49722, 49723]
[49746]
[50186, 50187]
[50190, 50191]
[50238, 50239, 50240, 50241, 50242, 50243, 50244, 50245, 50246, 50247, 50248, 50249, 50250, 50251, 50252, 50253]
[50273, 50274]
[50486, 50487]
[50500, 50501]
[51213, 51214, 51215]
[51426, 51427, 51428]
[51430, 51431]
[51462, 51463]
[51520, 51521]
[52712, 52713, 52714]
[52791, 52792, 52793, 52794]
[53150]
[53402]
[54363, 54364]
[54465, 54466, 54467, 54468]
[54470]
[54870]
[55338, 55339, 55340, 55341, 55342, 55343]
[56065]
[56342, 56343]
[56711]
[57077, 57078, 57079, 57080, 57081, 57082, 57083, 57084, 57085]
[57578, 57579, 57580, 57581]
[58181, 58182, 58183]
[5

In [13]:
for r in results:
    print(entities['text'][r[0] - 1:r[-1] + 1])

['dalam', 'Sesi', 'Pertama']
['menganjurkan', 'Solat', 'Hajat']
['kenyataan', 'Merestui', 'Cadangan', 'Kerajaan']
['mendapat', 'Pengampunan', 'Bebas']
['sebelum', 'Pilihan', 'Raya']
['tangguhkan', 'Pilihan', 'Raya']
['pada', 'Istiadat', 'Pembukaan', 'Mesyuarat', 'Pertama', 'Penggal', 'Pertama', 'Parlimen', 'Ke-']
['ketika', 'Istiadat', 'Pembukaan', 'Mesyuarat', 'Pertama', 'Penggal', 'Pertama', 'Parlimen', 'Ke-']
['ucapan', 'Perasmian', 'Pembukaan']
['semasa', 'Sidang', 'Dewan', 'Rakyat']
['Paduka', 'Mesyuarat', 'Dewan', 'Rakyat', 'Penggal']
['selepas', 'Pilihan', 'Raya', 'Umum', 'Pru']
['selepas', 'Pru']
['menerusi', 'Perutusan', 'Aidilfitri']
['akan', 'Menyambut', 'Aidilfitri']
['ketibaan', 'Syawal']
['menyambut', 'Hari', 'Lebaran', 'Syawal']
['Madrasah', 'Ramadan']
['pulang', 'Berhari', 'Raya']
['sambutan', 'Aidilfitri']
['penuh', 'Khutbah', 'Jumaat']
['ini', 'Sidang', 'Jumaat']
['bersempena', 'Sambutan', 'Ulang', 'Tahun', 'Hari', 'Keputeraan', 'Rasmi', 'Seri', 'Paduka', 'Baginda', '

In [22]:
import math

def generate_index(l, name, texts, labels, length):
    cp, indices = [], []
    b = length - len(l)
    left = math.ceil(b / 2)
    right = b - left
    minus = l[0] - left
    if minus < 0:
        absolute = np.abs(minus)
        right += absolute
        left -= absolute

    for i in range(l[0] - left, l[0]):
        cp.append(texts[i])
        indices.append(labels[i])

    cp.extend(name)
    indices.extend([labels[l[0]] for _ in range(len(name))])
    try:
        for i in range(l[-1] + 1, l[-1] + right + 1):
            cp.append(texts[i])
            indices.append(labels[i])
    except Exception as e:
        print(e)
        pass
    
    return cp, indices

In [16]:
train_results, test_results = train_test_split(results, test_size = 0.2)

In [23]:
train_X, train_Y = [], []

for r in train_results:
    ev = generate_event(len(r))
    if ev:
        for e in ev:
            x, y = generate_index(r, e.split(), entities['text'], entities['label'], 25)
            if len(x) != len(y):
                print('len not same')
                continue
            train_X.append(x)
            train_Y.append(y)

In [24]:
for t in train_names:
    x, y = generate_index(train_results[random.randint(0, len(train_results) - 1)], 
                          t.split(), entities['text'], entities['label'], 25)
    if len(x) != len(y):
        print('len not same')
        continue
    train_X.append(x)
    train_Y.append(y)

In [27]:
test_X, test_Y = [], []

for r in test_results:
    ev = generate_event(len(r))
    if ev:
        for e in ev:
            x, y = generate_index(r, e.split(), entities['text'], entities['label'], 25)
            if len(x) != len(y):
                print('len not same')
                continue
            test_X.append(x)
            test_Y.append(y)

In [28]:
for t in test_names:
    x, y = generate_index(test_results[random.randint(0, len(test_results) - 1)], 
                          t.split(), entities['text'], entities['label'], 25)
    if len(x) != len(y):
        print('len not same')
        continue
    test_X.append(x)
    test_Y.append(y)

In [31]:
len(train_X), len(test_X)

(3688, 919)

In [32]:
with open('event-augmentation.json', 'w') as fopen:
    json.dump({'train_X': train_X, 'train_Y': train_Y, 'test_X': test_X, 'test_Y': test_Y}, fopen)