In [1]:
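# Input data: the two wiki dump files loaded in the cells below.
# Uncomment these lines to download them before running on a fresh machine.
# !wget https://huseinhouse-storage.s3-ap-southeast-1.amazonaws.com/bert-bahasa/dumping-wiki-6-july-2019.json
# !wget https://huseinhouse-storage.s3-ap-southeast-1.amazonaws.com/bert-bahasa/dumping-wiki-20-july-2019.json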

In [2]:
import json

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's default locale encoding.
with open('dumping-wiki-6-july-2019.json', encoding='utf-8') as fopen:
    wiki = json.load(fopen)
# Number of entries loaded from the first dump.
len(wiki)

1663373

In [3]:
# Append the second dump to the entries already held in `wiki`.
# NOTE: `+=` extends the existing list, so re-running this cell without
# re-running the previous one appends the same entries again.
# The encoding is explicit so decoding does not depend on the platform locale.
with open('dumping-wiki-20-july-2019.json', encoding='utf-8') as fopen:
    wiki += json.load(fopen)
# Combined number of entries from both dumps.
len(wiki)

2967217

In [4]:
import malaya
import re
from unidecode import unidecode
from tqdm import tqdm

# Tokenising callable used by `preprocessing`: the bound `tokenize` method of a
# single SocialTokenizer instance, created once here and reused for every text.
tokenizer = malaya.preprocessing.SocialTokenizer().tokenize

def is_number_regex(s):
    """Return True if `s` is an unsigned integer or decimal number.

    Accepted forms are a string consisting only of digits (``"42"``) or
    ``digits.digits`` (``"3.14"``). Signs, exponents, thousands separators
    and a bare leading or trailing dot are rejected, as is the empty string.

    Parameters
    ----------
    s : str
        Token to test.

    Returns
    -------
    bool
    """
    # Raw string: "\d" and "\." are invalid escape sequences in a plain string
    # literal. fullmatch anchors both ends strictly, so a token with a trailing
    # newline is not accepted the way it would be with "$".
    if re.fullmatch(r"\d+\.\d+", s) is None:
        # Not a decimal: fall back to the plain all-digits check.
        return s.isdigit()
    return True

def detect_money(word):
    """Return True when `word` is the prefix ``rm`` followed by a number.

    The first two characters must be exactly ``'rm'`` (case-sensitive) and the
    remainder must be accepted by `is_number_regex`, e.g. ``rm10`` or
    ``rm2.50``. Anything else yields False.
    """
    # Guard clause: without the prefix there is no need to inspect the rest.
    if word[:2] != 'rm':
        return False
    return bool(is_number_regex(word[2:]))
    
def preprocessing(string):
    """Clean one text and return its tokens joined by single spaces.

    The text is lowercased, passed through `unidecode`, and tokenised with
    the module-level `tokenizer`. Each token is then mapped as follows:

    * tokens accepted by `is_number_regex` become ``<NUM>``;
    * otherwise, tokens accepted by `detect_money` become ``<MONEY>``;
    * tokens that are still a single character long are dropped.
    """
    kept = []
    for token in tokenizer(unidecode(string.lower())):
        # The number check runs first; a token replaced by '<NUM>' can never
        # match the money check, so an elif chain gives the same result as
        # two successive passes.
        if is_number_regex(token):
            token = '<NUM>'
        elif detect_money(token):
            token = '<MONEY>'
        # The length filter is applied after substitution, so single-digit
        # numbers survive as '<NUM>'.
        if len(token) > 1:
            kept.append(token)
    return ' '.join(kept)

In [5]:
# Clean every entry in place: each raw text in `wiki` is overwritten by its
# preprocessed form. Re-running this cell would preprocess the already-cleaned
# text again.
for idx, raw_text in enumerate(tqdm(wiki)):
    wiki[idx] = preprocessing(raw_text)

100%|██████████| 2967217/2967217 [05:40<00:00, 8707.10it/s] 


In [10]:
import collections

def build_dataset(words, n_words, atleast=3):
    """Build word -> id and id -> word lookup tables from a token sequence.

    Ids 0-3 are reserved for the special tokens ``PAD``, ``GO``, ``EOS`` and
    ``UNK``. The remaining ids are handed out consecutively in descending
    frequency order.

    Parameters
    ----------
    words : iterable of str
        Flat sequence of tokens to count.
    n_words : int or None
        Maximum number of distinct words taken from the frequency table
        (passed straight to ``Counter.most_common``).
    atleast : int, default 3
        Minimum number of occurrences a word needs to get an id.

    Returns
    -------
    (dict, dict)
        ``dictionary`` mapping word -> id and ``reversed_dictionary`` mapping
        id -> word.
    """
    dictionary = {'PAD': 0, 'GO': 1, 'EOS': 2, 'UNK': 3}
    for word, freq in collections.Counter(words).most_common(n_words):
        if freq < atleast:
            # most_common() is sorted by descending count, so no later word
            # can reach the threshold either.
            break
        if word in dictionary:
            # A corpus token that equals a reserved name must not overwrite
            # the reserved id; doing so would leave two words sharing one id.
            continue
        dictionary[word] = len(dictionary)
    reversed_dictionary = {idx: word for word, idx in dictionary.items()}
    return dictionary, reversed_dictionary

In [11]:
# Flatten the cleaned texts into one token list. Splitting each text on its
# own yields exactly the same tokens as joining everything first, without
# materialising the whole corpus as a single giant string.
split = [token for text in wiki for token in text.split()]
# Passing the number of distinct tokens makes most_common() consider every
# word; only build_dataset's `atleast` threshold prunes the vocabulary.
vocabulary_size = len(set(split))
dictionary, rev_dictionary = build_dataset(split, vocabulary_size)

In [12]:
# Number of entries in the word -> id mapping returned by build_dataset.
len(dictionary)

340755

In [14]:
# Persist the preprocessed texts (the `wiki` list) as a JSON array.
with open('cleaned-wiki.json', 'w') as fopen:
    json.dump(wiki, fopen)

In [15]:
# Persist both lookup tables in one JSON object.
# NOTE: JSON object keys are always strings, so the integer ids used as keys in
# `rev_dictionary` are written as "0", "1", ...; convert them back with int()
# after loading if integer keys are needed.
with open('wiki-dictionary.json', 'w') as fopen:
    json.dump({'dictionary': dictionary, 'rev_dictionary': rev_dictionary}, fopen)