In [2]:
import re
import itertools
from unidecode import unidecode
import malaya
import json

# Bound `tokenize` method of a malaya `_SocialTokenizer` instance; preprocessing()
# uses it to split the cleaned string into tokens.
_tokenizer = malaya.preprocessing._SocialTokenizer().tokenize
# Lookup table consulted with `.get(w, w)` in preprocessing(): tokens present as
# keys are replaced by the mapped value, every other token passes through as-is.
rules_normalizer = malaya.texts._tatabahasa.rules_normalizer
# preprocessing() drops any token that CONTAINS one of these substrings
# (substring test, not whole-token equality).
rejected = ['wkwk', 'http', 'https', 'lolol', 'hahaha']

def is_number_regex(s):
    """Return True if ``s`` is written as an unsigned number.

    Two spellings are accepted: a decimal of the form ``<digits>.<digits>``,
    or any string for which ``str.isdigit()`` is true. Signs, exponents,
    thousands separators and surrounding whitespace are not accepted.

    Args:
        s (str): token to test.

    Returns:
        bool: True when ``s`` is a decimal or an all-digit string.
    """
    # Raw string: '\d' and '\.' are invalid escape sequences in a plain literal.
    # fullmatch instead of '^...$': a '$' anchor also matches just before a
    # trailing newline, which let strings such as '1.5\n' count as numbers.
    if re.fullmatch(r'\d+\.\d+', s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    """Tell whether ``word`` is the prefix ``rm`` immediately followed by a number.

    The prefix comparison is case-sensitive; whatever follows the first two
    characters is judged by ``is_number_regex``.

    Args:
        word (str): token to test.

    Returns:
        bool: True for tokens such as ``'rm10'`` or ``'rm2.50'``, else False.
    """
    # Guard clause: anything not starting with 'rm' cannot be a money token.
    if word[:2] != 'rm':
        return False
    return bool(is_number_regex(word[2:]))

def preprocessing(string):
    """Turn raw text into a list of normalised tokens.

    Steps, applied in this order:
      1. pass the text through ``unidecode`` and cap every run of one
         repeated character at two occurrences;
      2. split it with ``_tokenizer``;
      3. stem each token with ``malaya.stem.naive``;
      4. drop stems of length 1 or less and lowercase the rest;
      5. drop tokens that contain any substring listed in ``rejected``;
      6. replace tokens found in ``rules_normalizer`` with their mapped value;
      7. replace numbers with ``'<NUM>'`` and ``rm<number>`` tokens with
         ``'<MONEY>'``.

    Args:
        string (str): raw input text.

    Returns:
        list[str]: the processed tokens.
    """
    ascii_text = unidecode(string)
    # groupby yields one group per run of identical characters; keep at most 2.
    squeezed = ''.join(''.join(run)[:2] for _, run in itertools.groupby(ascii_text))

    cleaned = []
    for raw in _tokenizer(squeezed):
        stem = malaya.stem.naive(raw)
        # Length is checked on the stem, before lowercasing.
        if len(stem) <= 1:
            continue
        token = stem.lower()
        if any(bad in token for bad in rejected):
            continue
        token = rules_normalizer.get(token, token)
        # Number check comes first; a token already turned into '<NUM>' can
        # never qualify as money, so the two checks are mutually exclusive.
        if is_number_regex(token):
            cleaned.append('<NUM>')
        elif detect_money(token):
            cleaned.append('<MONEY>')
        else:
            cleaned.append(token)
    return cleaned

# Load the vocabulary mappings from dictionary.json (path is relative to the
# working directory). JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's default encoding.
with open('dictionary.json', encoding='utf-8') as fopen:
    d = json.load(fopen)
# Used as the token -> id mapping by Tokenizer.convert_tokens_to_ids.
dictionary = d['dictionary']
# Used as the id -> token mapping by Tokenizer.convert_ids_to_tokens.
# NOTE(review): if this is stored as a JSON object its keys load as strings
# ('8', not 8) — confirm how the file was written.
rev_dictionary = d['reverse_dictionary']

class Tokenizer:
    """Pairs ``preprocessing`` with token <-> id vocabulary lookups.

    Args:
        vocab (dict): mapping token -> id.
        rev_dictionary (dict): mapping id -> token. Keys may be the ids
            themselves or their string form (JSON object keys are always
            strings, so a mapping loaded from JSON is keyed by ``'8'``
            rather than ``8``).
        unk_id (int): id returned for tokens missing from ``vocab``.
            Defaults to 1, the value that used to be hard-coded.
    """

    def __init__(self, vocab, rev_dictionary, unk_id=1):
        self.vocab = vocab
        self.inv_vocab = rev_dictionary
        self.unk_id = unk_id

    def tokenize(self, string):
        """Split ``string`` into processed tokens via ``preprocessing``."""
        return preprocessing(string)

    def convert_tokens_to_ids(self, tokens):
        """Map each token to its id; unknown tokens map to ``self.unk_id``."""
        return [self.vocab.get(t, self.unk_id) for t in tokens]

    def _id_to_token(self, idx):
        """Look up one id, falling back to its string form on a miss."""
        try:
            return self.inv_vocab[idx]
        except KeyError:
            pass
        # JSON-loaded mappings are keyed by str; retry with the string form
        # so integer ids produced by convert_tokens_to_ids still resolve.
        try:
            return self.inv_vocab[str(idx)]
        except KeyError:
            # Report the id the caller actually passed in.
            raise KeyError(idx) from None

    def convert_ids_to_tokens(self, ids):
        """Map each id back to its token.

        Raises:
            KeyError: if an id is present neither as-is nor as a string key.
        """
        return [self._id_to_token(i) for i in ids]
    
# Module-level tokenizer instance built from the mappings read from dictionary.json.
tokenizer = Tokenizer(dictionary, rev_dictionary)

In [3]:
# Sanity check: run the full preprocessing pipeline on a short sentence.
# NOTE(review): the recorded output shows 'makan' reduced to 'ma' — presumably
# malaya.stem.naive stripping a '-kan' suffix; confirm this is the intended form.
tokenizer.tokenize('saya suka makan ayam')

['saya', 'suka', 'ma', 'ayam']

In [4]:
# Same sentence mapped to vocabulary ids; tokens missing from the vocabulary map to 1.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize('saya suka makan ayam'))

[8, 73, 166, 988]