In [1]:
!wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/entities/entities-data-v3.json

--2019-06-17 16:15:52--  https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/entities/entities-data-v3.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.8.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.8.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1145908 (1.1M) [text/plain]
Saving to: ‘entities-data-v3.json’


2019-06-17 16:15:53 (25.7 MB/s) - ‘entities-data-v3.json’ saved [1145908/1145908]



In [2]:
import json

In [3]:
with open('entities-data-v3.json') as fopen:
    data = json.load(fopen)
data.keys()

dict_keys(['text', 'label'])

In [4]:
left_train = data['text']
right_train = data['label']

In [5]:
import re
import numpy as np
import tensorflow as tf
from tqdm import tqdm

In [6]:
seq_len = 50
def iter_seq(x):
    return np.array([x[i: i+seq_len] for i in range(0, len(x)-seq_len, 1)])

def to_train_seq(*args):
    return [iter_seq(x) for x in args]

In [7]:
left_train, right_train = to_train_seq(left_train, right_train)

In [8]:
tag2idx = {'PAD': 0}
for no, u in enumerate(np.unique(right_train)):
    tag2idx[u] = no + 1
tag2idx

{'PAD': 0,
 'OTHER': 1,
 'law': 2,
 'location': 3,
 'organization': 4,
 'person': 5,
 'quantity': 6,
 'time': 7}

In [9]:
left_train

array([['Pengamat', 'politik', 'dari', ..., 'perpecahan', 'di',
        'kalangan'],
       ['politik', 'dari', 'Universitas', ..., 'di', 'kalangan', 'kader'],
       ['dari', 'Universitas', 'Gadjah', ..., 'kalangan', 'kader',
        'Golkar'],
       ...,
       ['golongan', 'OKU', 'penglihatan', ..., 'dibantu', 'oleh', 'ahli'],
       ['OKU', 'penglihatan', 'Penganjurannya', ..., 'oleh', 'ahli',
        'jawatankuasa'],
       ['penglihatan', 'Penganjurannya', 'sekali', ..., 'ahli',
        'jawatankuasa', 'program']], dtype='<U23')

In [10]:
BERT_INIT_CHKPNT = 'bert-pos/model.ckpt'
BERT_CONFIG = 'bert-bahasa/config.json'

In [11]:
from tqdm import tqdm
import itertools
from unidecode import unidecode
import malaya

_tokenizer = malaya.preprocessing._SocialTokenizer().tokenize
rules_normalizer = malaya.texts._tatabahasa.rules_normalizer
rejected = ['wkwk', 'http', 'https', 'lolol', 'hahaha']

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    string = ''.join(''.join(s)[:2] for _, s in itertools.groupby(unidecode(string)))
    tokenized = _tokenizer(string)
    tokenized = [malaya.stem.naive(w) for w in tokenized]
    tokenized = [w.lower() for w in tokenized if len(w) > 1]
    tokenized = [w for w in tokenized if all([r not in w for r in rejected])]
    tokenized = [rules_normalizer.get(w, w) for w in tokenized]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

with open('dictionary.json') as fopen:
    d = json.load(fopen)
dictionary = d['dictionary']
rev_dictionary = d['reverse_dictionary']

class Tokenizer:
    def __init__(self, vocab, rev_dictionary):
        self.vocab = vocab
        self.inv_vocab = rev_dictionary
    
    def tokenize(self, string):
        return preprocessing(string)
    
    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.get(t, 1) for t in tokens]
    
    def convert_ids_to_tokens(self, ids):
        return [self.inv_vocab[i] for i in ids]
    
tokenizer = Tokenizer(dictionary, rev_dictionary)

In [12]:
def XY(left_train, right_train):
    X, Y = [], []
    for i in tqdm(range(len(left_train))):
        left = left_train[i]
        right = right_train[i]
        bert_tokens = ['[CLS]']
        y = ['PAD']
        for no, orig_token in enumerate(left):
            t = tokenizer.tokenize(orig_token)
            if len(t):
                y.append(right[no])
                bert_tokens.extend(t)
                y.extend(['PAD'] * (len(t) - 1))
        bert_tokens.append("[SEP]")
        y.append('PAD')
        X.append(tokenizer.convert_tokens_to_ids(bert_tokens))
        Y.append([tag2idx[i] for i in y])
    return X, Y

In [13]:
train_X, train_Y = XY(left_train, right_train)

100%|██████████| 58397/58397 [01:14<00:00, 786.47it/s]


In [14]:
import keras
train_X = keras.preprocessing.sequence.pad_sequences(train_X, padding='post')
train_Y = keras.preprocessing.sequence.pad_sequences(train_Y, padding='post')
train_X.shape, train_Y.shape

Using TensorFlow backend.


((58397, 58), (58397, 58))

In [15]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling

In [16]:
epoch = 10
batch_size = 16
warmup_proportion = 0.1
num_train_steps = int(len(train_X) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [17]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.maxlen = tf.shape(self.X)[1]
        self.lengths = tf.count_nonzero(self.X, 1)
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=True,
            input_ids=self.X,
            use_one_hot_embeddings=False)
        output_layer = model.get_sequence_output()
        logits = tf.layers.dense(output_layer, dimension_output)
        y_t = self.Y
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            logits, y_t, self.lengths
        )
        self.cost = tf.reduce_mean(-log_likelihood)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        mask = tf.sequence_mask(self.lengths, maxlen = self.maxlen)
        self.tags_seq, tags_score = tf.contrib.crf.crf_decode(
            logits, transition_params, self.lengths
        )
        self.tags_seq = tf.identity(self.tags_seq, name = 'logits')

        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(self.tags_seq, mask)
        mask_label = tf.boolean_mask(y_t, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [18]:
dimension_output = len(tag2idx)
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from bert-pos/model.ckpt


In [None]:
import time

for e in range(10):
    lasttime = time.time()
    train_acc, train_loss = 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
    
    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f\n'
        % (e, train_loss, train_acc)
    )

train minibatch loop: 100%|██████████| 3650/3650 [07:53<00:00,  7.67it/s, accuracy=0.944, cost=6.52] 
train minibatch loop:   0%|          | 1/3650 [00:00<07:32,  8.07it/s, accuracy=0.843, cost=19]

time taken: 473.56216311454773
epoch: 0, training loss: 22.055536, training acc: 0.864788



train minibatch loop:  33%|███▎      | 1205/3650 [02:36<05:21,  7.59it/s, accuracy=0.95, cost=6.21]  

In [None]:
saver.save(sess, 'bert-entities/model.ckpt')