In [1]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
from unidecode import unidecode
import re

In [2]:
BERT_INIT_CHKPNT = 'bert-entities/model.ckpt'
BERT_CONFIG = 'bert-bahasa/config.json'

In [3]:
from tqdm import tqdm
import malaya

In [4]:
_tokenizer = malaya.preprocessing._SocialTokenizer().tokenize
rules_normalizer = malaya.texts._tatabahasa.rules_normalizer
rejected = ['wkwk', 'http', 'https', 'lolol', 'hahaha']

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    string = ''.join(''.join(s)[:2] for _, s in itertools.groupby(unidecode(string)))
    tokenized = _tokenizer(string)
    tokenized = [malaya.stem.naive(w) for w in tokenized]
    tokenized = [w.lower() for w in tokenized if len(w) > 1]
    tokenized = [w for w in tokenized if all([r not in w for r in rejected])]
    tokenized = [rules_normalizer.get(w, w) for w in tokenized]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

with open('dictionary.json') as fopen:
    d = json.load(fopen)
dictionary = d['dictionary']
rev_dictionary = d['reverse_dictionary']

class Tokenizer:
    def __init__(self, vocab, rev_dictionary):
        self.vocab = vocab
        self.inv_vocab = rev_dictionary
    
    def tokenize(self, string):
        return preprocessing(string)
    
    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.get(t, 1) for t in tokens]
    
    def convert_ids_to_tokens(self, ids):
        return [self.inv_vocab[i] for i in ids]
    
tokenizer = Tokenizer(dictionary, rev_dictionary)

In [5]:
with open('selected-topics.json') as fopen:
    x = json.load(fopen)
texts = x['X']
labels = x['Y']

In [6]:
MAX_SEQ_LENGTH = 100

In [7]:
tokenizer.tokenize(texts[1])

['kempen', 'misi', 'denggi']

In [8]:
input_ids, input_masks, segment_ids = [], [], []

for text in tqdm(texts):
    tokens_a = tokenizer.tokenize(text)
    if len(tokens_a) > MAX_SEQ_LENGTH - 2:
        tokens_a = tokens_a[:(MAX_SEQ_LENGTH - 2)]
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_id = [0] * len(tokens)
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    padding = [0] * (MAX_SEQ_LENGTH - len(input_id))
    input_id += padding
    input_mask += padding
    segment_id += padding
    
    input_ids.append(input_id)
    input_masks.append(input_mask)
    segment_ids.append(segment_id)

100%|██████████| 65000/65000 [00:19<00:00, 3275.50it/s]


In [9]:
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [10]:
epoch = 10
batch_size = 60
warmup_proportion = 0.1
num_train_steps = int(len(texts) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [11]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=True,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)
        
        output_layer = model.get_pooled_output()
        self.logits = tf.layers.dense(output_layer, dimension_output)
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [12]:
unique_labels = np.unique(labels)

In [13]:
dimension_output = len(unique_labels)
learning_rate = 1e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from bert-entities/model.ckpt


In [14]:
from sklearn.cross_validation import train_test_split

train_input_ids, test_input_ids, train_input_masks, test_input_masks, train_segment_ids, test_segment_ids, train_Y, test_Y = train_test_split(
    input_ids, input_masks, segment_ids, labels, test_size = 0.2
)

In [15]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_input_ids))
        batch_x = train_input_ids[i: index]
        batch_masks = train_input_masks[i: index]
        batch_segment = train_segment_ids[i: index]
        batch_y = train_Y[i: index]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_input_ids))
        batch_x = test_input_ids[i: index]
        batch_masks = test_input_masks[i: index]
        batch_segment = test_segment_ids[i: index]
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_input_ids) / batch_size
    train_acc /= len(train_input_ids) / batch_size
    test_loss /= len(test_input_ids) / batch_size
    test_acc /= len(test_input_ids) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 867/867 [02:58<00:00,  5.27it/s, accuracy=0.875, cost=0.255] 
test minibatch loop: 100%|██████████| 217/217 [00:14<00:00, 16.30it/s, accuracy=0.875, cost=0.346]
train minibatch loop:   0%|          | 0/867 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.909500
time taken: 192.36621642112732
epoch: 0, training loss: 1.321438, training acc: 0.599721, valid loss: 0.303427, valid acc: 0.909500



train minibatch loop: 100%|██████████| 867/867 [02:57<00:00,  5.28it/s, accuracy=0.95, cost=0.0972] 
test minibatch loop: 100%|██████████| 217/217 [00:13<00:00, 15.52it/s, accuracy=0.875, cost=0.367] 
train minibatch loop:   0%|          | 0/867 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.909500, current acc: 0.943269
time taken: 191.0646493434906
epoch: 1, training loss: 0.190147, training acc: 0.938962, valid loss: 0.167643, valid acc: 0.943269



train minibatch loop: 100%|██████████| 867/867 [02:57<00:00,  5.27it/s, accuracy=0.975, cost=0.0655]
test minibatch loop: 100%|██████████| 217/217 [00:13<00:00, 15.51it/s, accuracy=0.9, cost=0.229]   
train minibatch loop:   0%|          | 0/867 [00:00<?, ?it/s]

epoch: 2, pass acc: 0.943269, current acc: 0.947692
time taken: 191.02010822296143
epoch: 2, training loss: 0.127806, training acc: 0.954702, valid loss: 0.152384, valid acc: 0.947692



train minibatch loop: 100%|██████████| 867/867 [02:57<00:00,  5.27it/s, accuracy=0.975, cost=0.0468]
test minibatch loop: 100%|██████████| 217/217 [00:13<00:00, 15.51it/s, accuracy=0.85, cost=0.332]  
train minibatch loop:   0%|          | 0/867 [00:00<?, ?it/s]

epoch: 3, pass acc: 0.947692, current acc: 0.949000
time taken: 191.0524821281433
epoch: 3, training loss: 0.105019, training acc: 0.960202, valid loss: 0.149048, valid acc: 0.949000



train minibatch loop: 100%|██████████| 867/867 [02:57<00:00,  5.28it/s, accuracy=1, cost=0.0412]    
test minibatch loop: 100%|██████████| 217/217 [00:13<00:00, 15.51it/s, accuracy=0.875, cost=0.302] 
train minibatch loop:   0%|          | 0/867 [00:00<?, ?it/s]

time taken: 191.06097269058228
epoch: 4, training loss: 0.094668, training acc: 0.963308, valid loss: 0.151800, valid acc: 0.947115



train minibatch loop: 100%|██████████| 867/867 [02:57<00:00,  5.27it/s, accuracy=1, cost=0.0342]    
test minibatch loop: 100%|██████████| 217/217 [00:13<00:00, 15.52it/s, accuracy=0.9, cost=0.239]   
train minibatch loop:   0%|          | 0/867 [00:00<?, ?it/s]

epoch: 5, pass acc: 0.949000, current acc: 0.949000
time taken: 191.11428141593933
epoch: 5, training loss: 0.085015, training acc: 0.966981, valid loss: 0.152551, valid acc: 0.949000



train minibatch loop: 100%|██████████| 867/867 [02:57<00:00,  5.26it/s, accuracy=0.975, cost=0.0405]
test minibatch loop: 100%|██████████| 217/217 [00:13<00:00, 15.51it/s, accuracy=0.85, cost=0.352]  
train minibatch loop:   0%|          | 0/867 [00:00<?, ?it/s]

time taken: 191.04737639427185
epoch: 6, training loss: 0.076038, training acc: 0.969760, valid loss: 0.156909, valid acc: 0.946154



train minibatch loop: 100%|██████████| 867/867 [02:57<00:00,  5.28it/s, accuracy=0.975, cost=0.0459]
test minibatch loop: 100%|██████████| 217/217 [00:14<00:00, 15.50it/s, accuracy=0.9, cost=0.354]   
train minibatch loop:   0%|          | 0/867 [00:00<?, ?it/s]

time taken: 191.076397895813
epoch: 7, training loss: 0.070667, training acc: 0.971702, valid loss: 0.164149, valid acc: 0.947538



train minibatch loop: 100%|██████████| 867/867 [02:56<00:00,  5.27it/s, accuracy=0.975, cost=0.0246]
test minibatch loop: 100%|██████████| 217/217 [00:13<00:00, 15.54it/s, accuracy=0.875, cost=0.307] 

time taken: 190.93834352493286
epoch: 8, training loss: 0.063827, training acc: 0.974740, valid loss: 0.165106, valid acc: 0.946115

break epoch:9






In [16]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_input_ids), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i + batch_size, len(test_input_ids))
    batch_x = test_input_ids[i: index]
    batch_masks = test_input_masks[i: index]
    batch_segment = test_segment_ids[i: index]
    batch_y = test_Y[i: index]
    predict_Y += np.argmax(sess.run(model.logits,
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
    ), 1, ).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 217/217 [00:14<00:00, 16.50it/s]


In [17]:
labels = ['kesihatan',
 'kes lemas',
 'kes pecah rumah',
 'kes tangkap basah',
 'kewangan dan perniagaan',
 'kos sara hidup',
 'suruhanjaya pilihan raya malaysia',
 'tentera malaysia',
 'nilai ringgit jatuh',
 'kes buang bayi',
 'isu kemiskinan',
 'infrastruktur',
 'harga minyak']

from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = labels, digits=5
    )
)

                                   precision    recall  f1-score   support

                        kesihatan    0.98063   0.98465   0.98264       977
                        kes lemas    0.97217   0.97124   0.97170      1043
                  kes pecah rumah    0.94153   0.90680   0.92384      1030
                kes tangkap basah    0.95635   0.96477   0.96055      1022
          kewangan dan perniagaan    0.93193   0.94095   0.93642      1033
                   kos sara hidup    0.95504   0.92816   0.94141      1030
suruhanjaya pilihan raya malaysia    0.92270   0.93100   0.92683      1000
                 tentera malaysia    0.96846   0.97842   0.97342       973
              nilai ringgit jatuh    0.97793   0.95029   0.96391      1026
                   kes buang bayi    0.95125   0.93404   0.94257       940
                   isu kemiskinan    0.86634   0.91358   0.88933       972
                    infrastruktur    0.91354   0.89857   0.90599       976
                     har