In [1]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/tagging/ontonotes5/processed-ontonotes5.json

In [2]:
import json
from sklearn.model_selection import train_test_split
import random

In [3]:
with open('processed-ontonotes5.json') as fopen:
    data = json.load(fopen)

In [4]:
from collections import defaultdict

entities = defaultdict(list)
for i in data:
    entities['text'].append(i[0])
    entities['label'].append(i[1])

In [5]:
# !wget https://raw.githubusercontent.com/nizulzaim/json-malaysia-postalcode/master/postalcode.json

In [6]:
with open('postalcode.json') as fopen:
    postal = json.load(fopen)

In [7]:
def generate_address(row):
    return [f"{row['place']}, {row['code']}, {row['city']}",
           f"{row['place']}, {row['code']}, {row['city']} {row['state']}"]

In [8]:
addresses = []

for row in postal['data']:
    addresses.extend(generate_address(row))
    
addresses = list(set(addresses))
    
len(addresses)

108056

In [9]:
addresses

['Jalan Pasir Belanda, 11920, Bayan Lepas',
 'Serian - Beg Berkunci, 94709, Serian Sarawak',
 'Jalan (U1/89A - U1/89E), 40150, Shah Alam Selangor',
 'Kampung Telaga Gajah, 15350, Kota Bharu',
 'Persiaran Stonor, 50450, Kuala Lumpur WP Kuala Lumpur',
 'Bintangor - Beg Berkunci, 96509, Bintangor Sarawak',
 'Bintulu - Peti Surat \xa0(2501 - 3000), 97012, Bintulu',
 'Kampung Lanterom, 06050, Bukit Kayu Hitam',
 'Lengkok Harimau, 31400, Ipoh Perak',
 'Ladang St Helier, 72100, Bahau Negeri Sembilan',
 'Taman Jalan Galing, 25250, Kuantan',
 'Taman Segamat Baru, 85000, Segamat Johor',
 'Kampung Padang, 09100, Baling',
 'Kampung Sukamari Lubok Merbau, 06710, Pendang Kedah',
 'Kampung Tanjong (Tawang), 16020, Bachok Kelantan',
 'Jalan 1/108C (Salak Selatan), 57100, Kuala Lumpur WP Kuala Lumpur',
 'Taman Ulu Bendol, 71500, Tanjong Ipoh',
 'Taman Bukit Indah (Fasa H & Fasa E), 07000, Langkawi',
 'Bersia (Km 10), 33320, Gerik',
 'Jalan Cempaka SD 12/5, 52200, Kuala Lumpur WP Kuala Lumpur',
 'Seri M

In [10]:
import pandas as pd

df = pd.read_csv('senarai-anak-syarikat-kplb.csv')['ALAMAT SYARIKAT'].dropna()
company_address = df.tolist()
company_address = [c.split('TEL')[0].strip() for c in company_address]
company_address = [c.split('Tel')[0].strip().title() for c in company_address]

In [11]:
df = pd.read_csv('penganjur-acara.csv')
for i in range(len(df)):
    row = df.iloc[i]
    l = row['Alamat 1'].replace('\n', ' ').strip().title()
    r = row['Alamat 2'].replace('\n', ' ').strip().title()
    company_address.extend([l, f'{l}, {r}'])

In [12]:
df = pd.read_csv('maklumat-syarikat-jualan-langsung.csv')['ALAMAT TERKINI (TELEFON & FAKS)'].dropna()
company_address.extend(df.tolist())

company_address = list(set(company_address))

In [13]:
import re

company_address = [r.strip().title() for r in company_address]
company_address = [t.split('(T')[0].split('( Tel')[0].split('( T')[0].split('(0')[0].strip() for t in company_address]
company_address = [re.sub(r'[ ]+', ' ', t).strip() for t in company_address]
company_address = [t[:-1] if t[-1] == '.' else t for t in company_address]
len(company_address)

1781

In [14]:
results = []
i = 0
while i < len(entities['label']):
    r = []
    if entities['label'][i] == 'LOC':
        while entities['label'][i] == 'LOC':
            r.append(i)
            i += 1
        results.append(r)
    i += 1

In [15]:
import math

def generate_index(l, name, texts, labels, length):
    cp, indices = [], []
    b = length - len(l)
    left = math.ceil(b / 2)
    right = b - left
    minus = l[0] - left
    if minus < 0:
        absolute = np.abs(minus)
        right += absolute
        left -= absolute

    for i in range(l[0] - left, l[0]):
        cp.append(texts[i])
        indices.append(labels[i])

    cp.extend(name)
    indices.extend(['ADDRESS' for _ in range(len(name))])
    try:
        for i in range(l[-1] + 1, l[-1] + right + 1):
            cp.append(texts[i])
            indices.append(labels[i])
    except Exception as e:
        print(e)
        pass
    
    return cp, indices

In [16]:
train_results, test_results = train_test_split(results, test_size = 0.2)
train_addresses, test_addresses = train_test_split(addresses, test_size = 0.2)

In [17]:
train_X, train_Y = [], []
repeat = 8

for t in company_address:
    for i in range(repeat):
        x, y = generate_index(train_results[random.randint(0, len(train_results) - 1)], 
                              t.split(), entities['text'], entities['label'], 50)
        if len(x) != len(y):
            print('len not same')
            continue
        train_X.append(x)
        train_Y.append(y)
            
len(train_X)

14248

In [18]:
repeat = 1
for t in train_addresses:
    for i in range(repeat):
        if random.random() > 0.5:
            x, y = generate_index(train_results[random.randint(0, len(train_results) - 1)], 
                                  t.split(), entities['text'], entities['label'], 50)
            if len(x) != len(y):
                print('len not same')
                continue
            train_X.append(x)
            train_Y.append(y)
            
len(train_X)

57502

In [19]:
test_X, test_Y = [], []
repeat = 1

for t in test_addresses:
    for i in range(repeat):
        if random.random() > 0.3:
            x, y = generate_index(test_results[random.randint(0, len(test_results) - 1)], 
                                  t.split(), entities['text'], entities['label'], 50)
            if len(x) != len(y):
                print('len not same')
                continue
            test_X.append(x)
            test_Y.append(y)
            
len(test_X)

15106

In [20]:
len(train_X), len(test_X)

(57502, 15106)

In [None]:
with open('augmentation-address-ontonotes5.json', 'w') as fopen:
    json.dump({'train_X': train_X, 'train_Y': train_Y,
              'test_X': test_X, 'test_Y': test_Y}, fopen)