In [1]:
import numpy as np
import json
from sklearn.model_selection import train_test_split

In [2]:
# Load the law-name corpus. Later cells read the keys '0'..'14', 'free' and
# 'free_num' from this dict.
# The encoding is pinned because open() otherwise falls back to the platform
# locale, which is not UTF-8 everywhere, while JSON text is UTF-8.
with open('malaysia-laws.json', encoding='utf-8') as fopen:
    laws = json.load(fopen)

In [3]:
# Bill listing taken from:
# https://www.parlimen.gov.my/bills-dewan-rakyat.html?uweb=dr
#
# Layout of the text below: every bill occupies two consecutive lines.
#   1. a tab-separated row "<bill number>\t<year>\t<title>\t" (note the trailing tab)
#   2. a row holding only the bill's status
# The parsing cell that follows relies on this strict two-line alternation.

ruu = """
D.R.43/2019	2019	RUU Kumpulan Wang Simpanan Pekerja (Pindaan) 2019	
Lulus
D.R.42/2019	2019	RUU Perlembagaan (Pindaan) (No. 2) 2019	
Bacaan Kali Pertama
D.R.41/2019	2019	RUU Petroleum (Cukai Pendapatan) (Pindaan) 2019	
Lulus
D.R.40/2019	2019	RUU Cukai Pendapatan (Pindaan) 2019	
Lulus
D.R.39/2019	2019	RUU Pengangkutan Barang Melalui Laut (Pindaan) 2019	
Lulus
D.R.38/2019	2019	RUU Cukai Aktiviti Perniagaan Labuan (Pindaan) 2019	
Lulus
D.R.37/2019	2019	RUU Racun (Pindaan) 2019	
Bacaan Kali Pertama
D.R.36/2019	2019	RUU Lembaga Promosi Kesihatan Malaysia (Pembubaran) 2019	
Lulus
D.R.35/2019	2019	RUU Hak Cipta (Pindaan) 2019	
Lulus
D.R.34/2019	2019	RUU Francais (Pindaan) 2019	
Lulus
D.R.33/2019	2019	RUU Kewangan 2019	
Lulus
D.R.32/2019	2019	RUU Perbekalan 2020	
Lulus
D.R.31/2019	2019	RUU Perhubungan Perusahaan (Pindaan) 2019	
Lulus
D.R.30/2019	2019	RUU Bank Negara Malaysia (Pindaan) 2019	
Lulus
D.R.29/2019	2019	RUU Mata Wang 2019	
Lulus
D.R.28/2019	2019	RUU Pusat Pencegahan Jenayah Kewangan Nasional 2019	
Lulus
D.R.27/2019	2019	RUU Perlindungan Penderma Makanan 2019	
Lulus
D.R.26/2019	2019	RUU Antiberita Tidak Benar (Pemansuhan) 2019	
Lulus
D.R.25/2019	2019	RUU Suruhanjaya Bebas Aduan Salah Laku Polis 2019	
Bacaan Kali Kedua dan Ketiga
D.R.24/2019	2019	RUU Bank Negara Malaysia (Pindaan) 2019	
Ditarik Balik
D.R.23/2019	2019	RUU Mata Wang 2019	
Ditarik Balik
D.R.22/2019	2019	RUU Penyemakan Undang-Undang (Pindaan) 2019	
Lulus
D.R.21/2019	2019	RUU Perlembagaan (Pindaan) 2019	
Lulus
D.R.20/2019	2019	RUU Standard Minimum Perumahan dan Kemudahan Pekerja (Pindaan) 2019	
Lulus
D.R.19/2019	2019	RUU Pentadbiran Undang-Undang Islam (Wilayah-Wilayah Persekutuan) (Pindaan) (No.2) 2019	
Lulus
D.R.18/2019	2019	RUU Profesion Guaman Syarie (Wilayah-Wilayah Persekutuan) 2019	
Lulus
D.R.17/2019	2019	RUU Syarikat (Pindaan) 2019	
Lulus
D.R.16/2019	2019	RUU Perikanan (Pindaan) 2019	
Lulus
D.R.15/2019	2019	RUU Perlindungan Pengguna (Pindaan) 2019	
Lulus
D.R.14/2019	2019	RUU Perlembagaan (Pindaan) 2019	
Ditarik Balik
D.R.13/2019	2019	RUU Perhimpunan Aman (Pindaan) 2019	
Lulus
D.R.12/2019	2019	RUU Pertubuhan Belia dan Pembangunan Belia (Pindaan) 2019	
Lulus
D.R.11/2019	2019	RUU Perihal Dagangan (Pindaan) 2019	
Lulus
D.R.10/2019	2019	RUU Cap Dagangan 2019	
Lulus
D.R 9/2019	2019	RUU Majlis Keselamatan Negara (Pindaan) 2019	
Ditarik Balik
D.R 8/2019	2019	RUU Levi Pelepasan 2019	
Lulus
D.R 7/2019	2019	RUU Perlembagaan (Pindaan) 2019	
Bacaan Kali Pertama
D.R 6/2019	2019	RUU Cukai Perkhidmatan (Pindaan) 2019	
Lulus
D.R 5/2019	2019	RUU Cukai Jualan (Pindaan) 2019	
Lulus
D.R 4/2019	2019	RUU Zon Bebas (Pindaan) 2019	
Lulus
D.R 3/2019	2019	RUU Eksais (Pindaan) 2019	
Lulus
D.R 2/2019	2019	RUU Kastam (Pindaan) 2019	
Lulus
D.R 1/2019	2019	RUU Perbekalan Tambahan (2018) 2019	
Lulus
"""

In [4]:
# Drop blank lines, keep every other remaining line (the bill rows, skipping the
# status rows), and take the last tab-separated field of each: the bill title.
ruu_list = [
    bill_row.strip().split('\t')[-1]
    for bill_row in [line for line in ruu.split('\n') if line][::2]
]

In [5]:
# Parenthesised qualifiers that occur in the bill titles above; generate_ruu()
# appends one of these, chosen at random, to each synthetic bill name.
ruu_ending = [
    '(Pindaan)',
    '(Pindaan) (No. 2)',
    '(Cukai Pendapatan) (Pindaan)',
    '(Pembubaran)',
    '(Pemansuhan)',
    '(Wilayah-Wilayah Persekutuan) (Pindaan) (No.2)',
    '(Wilayah-Wilayah Persekutuan)',
    '(2018)',
]

In [6]:
# Vocabulary of the bill titles: for each title keep the text after "RUU " and
# before the first "(", remove the 2019/2020 year, and split it into words.
# sorted() rather than list(): iteration order of a set of strings can change
# from one interpreter run to the next, which would feed a differently ordered
# list into the later train/test split.
ruu_words = sorted({
    word
    for title in ruu_list
    for word in title.split('(')[0].split('RUU ')[-1].replace('2019', '').replace('2020', '').split()
})
len(ruu_words)

79

In [7]:
# Pool the law titles stored under the keys '0' .. '14' (missing keys contribute
# nothing) and de-duplicate them.
# sorted() rather than list(): iteration order of a set of strings can change
# from one interpreter run to the next, so the unsorted list would not be
# reproducible input for the later train/test split.
laws_list = sorted({
    title
    for i in range(15)
    for title in laws.get(str(i), [])
})
len(laws_list)

670

In [8]:
# Add the parliamentary bill titles to the pool of law names.
# NOTE: this grows laws_list in place, so re-running the cell appends the bills again.
laws_list += ruu_list
len(laws_list)

713

In [9]:
# Hold out 20% of the law/bill titles. random_state pins the shuffle so the same
# input list always yields the same partition.
train_laws_list, test_laws_list = train_test_split(laws_list, test_size = 0.2, random_state = 42)

In [10]:
# Split the word pools used by the synthetic-title generators 80/20, so generated
# test titles are built only from words held out of training. random_state pins
# the shuffle so the same input always yields the same partition.
train_laws, test_laws = train_test_split(laws['free'], test_size = 0.2, random_state = 42)
train_ruu_words, test_ruu_words = train_test_split(ruu_words, test_size = 0.2, random_state = 42)

In [11]:
import random

def generate_akta(length, sample, years=None):
    """Build a synthetic act title of the form 'Akta <words> <suffix>'.

    Parameters
    ----------
    length : int
        Number of distinct words to draw from `sample`. Capped at len(sample),
        so a pool smaller than the requested span yields a shorter title
        instead of raising ValueError from random.sample.
    sample : list of str
        Word pool to draw from (without replacement).
    years : sequence of str, optional
        Candidates for the trailing token (presumably year strings such as
        '1933'). Defaults to the module-level ``laws['free_num']``.

    Returns
    -------
    str
        The generated title; tokens are separated by single spaces.
    """
    if years is None:
        years = laws['free_num']
    words = random.sample(sample, min(length, len(sample)))
    return 'Akta ' + ' '.join(words) + ' ' + random.choice(years)

def generate_ruu(length, sample, year=2019, endings=None):
    """Build a synthetic bill title of the form 'RUU <words> <ending> <year>'.

    Parameters
    ----------
    length : int
        Number of distinct words to draw from `sample`. Capped at len(sample),
        so a pool smaller than the requested span yields a shorter title
        instead of raising ValueError from random.sample.
    sample : list of str
        Word pool to draw from (without replacement).
    year : int or str, optional
        Trailing year token; defaults to 2019, the value that used to be
        hard-coded.
    endings : sequence of str, optional
        Candidate parenthesised qualifiers. Defaults to the module-level
        ``ruu_ending`` list.

    Returns
    -------
    str
        The generated title; tokens are separated by single spaces.
    """
    if endings is None:
        endings = ruu_ending
    words = ' '.join(random.sample(sample, min(length, len(sample))))
    ending = random.choice(endings)
    return f'RUU {words} {ending} {year}'

In [12]:
# Quick look at one five-word sample from each generator.
akta_demo = generate_akta(5, train_laws)
ruu_demo = generate_ruu(5, test_ruu_words)
akta_demo, ruu_demo

('Akta Pasport Pengangkatan Bersepadu Komputer Sabah 1933',
 'RUU Syarikat Perikanan Perlembagaan Pengguna Simpanan (Wilayah-Wilayah Persekutuan) 2019')

In [13]:
# Load the tagged token corpus. Later cells index entities['text'] and
# entities['label'] by the same position, one label per token.
# The encoding is pinned because open() otherwise falls back to the platform
# locale, which is not UTF-8 everywhere, while JSON text is UTF-8.
with open('entities-data-v4.json', encoding='utf-8') as fopen:
    entities = json.load(fopen)

In [14]:
# Spot-check one position to see how labels and tokens line up.
probe = 46775
entities['label'][probe], entities['text'][probe]

('OTHER', 'saat')

In [15]:
# Collect every maximal run of consecutive tokens labelled 'law'.
# Each entry of `results` is the list of token positions of one such run.
results = []
i = 0
n_labels = len(entities['label'])
while i < n_labels:
    if entities['label'][i] == 'law':
        r = []
        # The bounds check stops a run that reaches the very last token from
        # indexing past the end of the label list.
        while i < n_labels and entities['label'][i] == 'law':
            r.append(i)
            i += 1
        print(r)
        results.append(r)
    else:
        i += 1

[47488, 47489]
[47492, 47493]
[47495, 47496, 47497, 47498, 47499]
[47746, 47747, 47748, 47749, 47750, 47751, 47752]
[47760, 47761]
[48194, 48195, 48196, 48197]
[48434, 48435, 48436]
[48560, 48561]
[48592, 48593]
[48626, 48627, 48628]
[48662, 48663]
[48683]
[48688, 48689]
[49150]
[49263]
[49265, 49266]
[49296, 49297, 49298, 49299]
[49359, 49360]
[49391]
[49464, 49465]
[49942, 49943, 49944, 49945, 49946, 49947, 49948, 49949]
[50120, 50121, 50122, 50123]
[50125, 50126, 50127, 50128]
[50158, 50159, 50160]
[50348, 50349, 50350, 50351, 50352, 50353]
[50391]
[50393]
[53067, 53068, 53069, 53070]
[53771, 53772, 53773, 53774]
[53823]
[53855]
[53895]
[53969, 53970, 53971, 53972, 53973, 53974]
[53980]
[53983, 53984]
[53995]
[53999]
[54022]
[54026, 54027, 54028, 54029]
[54031, 54032, 54033]
[54037]
[54059]
[54268, 54269, 54270, 54271]
[54789, 54790, 54791, 54792, 54793, 54794, 54795]
[55259, 55260, 55261, 55262]
[56972, 56973, 56974, 56975]
[57613, 57614]
[57636, 57637]
[57690, 57691]
[57716, 57717

In [18]:
# Hold out 20% of the 'law' spans, so test sentences are built around contexts
# never used for training. random_state pins the shuffle so the same input
# always yields the same partition.
train_results, test_results = train_test_split(results, test_size = 0.2, random_state = 42)

In [19]:
import math

def generate_index(l, name, texts, labels, length):
    """Swap a labelled span for `name` and return it with surrounding context.

    The context budget is ``length - len(l)`` tokens, split between the left
    side (the larger half) and the right side. If fewer tokens than the left
    share exist before the span, the shortfall is moved to the right side. The
    right side is truncated at the end of the corpus.

    Note that the budget is computed from the ORIGINAL span length, so the
    result holds ``length - len(l) + len(name)`` tokens when nothing is
    truncated, not necessarily `length`.

    Parameters
    ----------
    l : list of int
        Consecutive token positions of the span being replaced (non-empty).
    name : list of str
        Replacement tokens; each receives the label found at ``l[0]``.
    texts, labels : sequences indexed by token position
        The corpus tokens and their labels.
    length : int
        Target window size used to compute the context budget.

    Returns
    -------
    (list, list)
        Tokens and their labels, equal in length.
    """
    span_start, span_end = l[0], l[-1]
    budget = length - len(l)
    left = math.ceil(budget / 2)
    right = budget - left

    # Not enough tokens before the span: give the missing amount to the right.
    shortfall = left - span_start
    if shortfall > 0:
        right += shortfall
        left = span_start

    before = range(span_start - left, span_start)
    # Clamp to the corpus end rather than relying on a caught IndexError; this
    # also keeps tokens and labels the same length if the two inputs differ.
    after_stop = min(span_end + right + 1, len(texts), len(labels))
    after = range(span_end + 1, after_stop)

    cp = [texts[i] for i in before]
    indices = [labels[i] for i in before]

    cp.extend(name)
    indices.extend([labels[span_start]] * len(name))

    cp.extend(texts[i] for i in after)
    indices.extend(labels[i] for i in after)

    return cp, indices

In [17]:
# Demo: put the last training title into the context of the first 'law' span
# and show the resulting (token, label) pairs.
p = generate_index(
    results[0],
    train_laws_list[-1].split(),
    entities['text'],
    entities['label'],
    30,
)
list(zip(p[0], p[1]))

[('katanya', 'OTHER'),
 ('dalam', 'OTHER'),
 ('pertemuan', 'OTHER'),
 ('itu', 'OTHER'),
 ('katanya', 'OTHER'),
 ('beliau', 'OTHER'),
 ('juga', 'OTHER'),
 ('memberi', 'OTHER'),
 ('jaminan', 'OTHER'),
 ('harapan', 'OTHER'),
 ('kekal', 'OTHER'),
 ('komited', 'OTHER'),
 ('mempertahan', 'OTHER'),
 ('peruntukan', 'OTHER'),
 ('Akta', 'law'),
 ('Pelan', 'law'),
 ('dan', 'law'),
 ('Dokumen', 'law'),
 ('Tanah', 'law'),
 ('dan', 'law'),
 ('Lombong', 'law'),
 ('(Salinan', 'law'),
 ('Fotograf)', 'law'),
 ('1950', 'law'),
 ('dalam', 'OTHER'),
 ('soal', 'OTHER'),
 ('Hak', 'law'),
 ('Melayu', 'law'),
 ('serta', 'OTHER'),
 ('Kedudukan', 'law'),
 ('Bahasa', 'law'),
 ('Melayu', 'law'),
 ('Dan', 'law'),
 ('Islam', 'law'),
 ('katanya', 'OTHER'),
 ('lagi', 'OTHER'),
 ('memang', 'OTHER'),
 ('sukar', 'OTHER')]

In [21]:
# Training examples from real titles: each training title is dropped into the
# context of a randomly picked training 'law' span.
train_X, train_Y = [], []

for title in train_laws_list:
    pick = random.randint(0, len(train_results) - 1)
    x, y = generate_index(train_results[pick], title.split(),
                          entities['text'], entities['label'], 30)
    if len(x) == len(y):
        train_X.append(x)
        train_Y.append(y)
    else:
        # Tokens and labels must stay aligned; skip the example otherwise.
        print('len not same')

In [22]:
# Training examples from synthetic titles: for every training span, generate
# 8 'Akta ...' names and then 3 'RUU ...' names of the span's own length and
# place each one in that span's context.
# NOTE: appends to train_X / train_Y in place, so re-running the cell adds
# the examples again.
for r in train_results:
    for repeats, make_name, word_pool in ((8, generate_akta, train_laws),
                                          (3, generate_ruu, train_ruu_words)):
        for _ in range(repeats):
            x, y = generate_index(r, make_name(len(r), word_pool).split(),
                                  entities['text'], entities['label'], 30)
            if len(x) == len(y):
                train_X.append(x)
                train_Y.append(y)
            else:
                # Tokens and labels must stay aligned; skip the example otherwise.
                print('len not same')

In [24]:
# Test examples from real titles: each held-out title is dropped into the
# context of a randomly picked held-out 'law' span.
test_X, test_Y = [], []

for title in test_laws_list:
    pick = random.randint(0, len(test_results) - 1)
    x, y = generate_index(test_results[pick], title.split(),
                          entities['text'], entities['label'], 30)
    if len(x) == len(y):
        test_X.append(x)
        test_Y.append(y)
    else:
        # Tokens and labels must stay aligned; skip the example otherwise.
        print('len not same')

In [25]:
# Test examples from synthetic titles: for every held-out span, generate
# 8 'Akta ...' names and then 3 'RUU ...' names of the span's own length and
# place each one in that span's context.
# NOTE: appends to test_X / test_Y in place, so re-running the cell adds
# the examples again.
for r in test_results:
    for repeats, make_name, word_pool in ((8, generate_akta, test_laws),
                                          (3, generate_ruu, test_ruu_words)):
        for _ in range(repeats):
            x, y = generate_index(r, make_name(len(r), word_pool).split(),
                                  entities['text'], entities['label'], 30)
            if len(x) == len(y):
                test_X.append(x)
                test_Y.append(y)
            else:
                # Tokens and labels must stay aligned; skip the example otherwise.
                print('len not same')

In [27]:
# Persist the augmented dataset: token sequences (X) and their per-token
# labels (Y) for both partitions, in a single JSON object.
payload = {
    'train_X': train_X,
    'train_Y': train_Y,
    'test_X': test_X,
    'test_Y': test_Y,
}
with open('law-augmentation.json', 'w') as fopen:
    json.dump(payload, fopen)