In [1]:
# !pip3 install bert-tensorflow --user
# !wget https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
# !unzip multi_cased_L-12_H-768_A-12.zip

In [2]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import tensorflow as tf

In [3]:
from tqdm import tqdm
import json

In [4]:
with open('selected-topics.json') as fopen:
    x = json.load(fopen)
texts = x['X']
labels = x['Y']

In [5]:
MAX_SEQ_LENGTH = 100

In [6]:
BERT_VOCAB = 'multi_cased_L-12_H-768_A-12/vocab.txt'
BERT_INIT_CHKPNT = 'multi_cased_L-12_H-768_A-12/bert_model.ckpt'
BERT_CONFIG = 'multi_cased_L-12_H-768_A-12/bert_config.json'

tokenization.validate_case_matches_checkpoint(False, '')
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_VOCAB, do_lower_case=False)

In [7]:
tokenizer.tokenize(texts[1])

['Kemp',
 '##en',
 'misi',
 'den',
 '##ggi',
 'https',
 ':',
 '/',
 '/',
 't',
 '.',
 'co',
 '/',
 '2',
 '##Q',
 '##P',
 '##o',
 '##D',
 '##L',
 '##y',
 '##Y',
 '##6',
 '##Z']

In [8]:
input_ids, input_masks, segment_ids = [], [], []

for text in tqdm(texts):
    tokens_a = tokenizer.tokenize(text)
    if len(tokens_a) > MAX_SEQ_LENGTH - 2:
        tokens_a = tokens_a[:(MAX_SEQ_LENGTH - 2)]
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_id = [0] * len(tokens)
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    padding = [0] * (MAX_SEQ_LENGTH - len(input_id))
    input_id += padding
    input_mask += padding
    segment_id += padding
    
    input_ids.append(input_id)
    input_masks.append(input_mask)
    segment_ids.append(segment_id)

100%|██████████| 65000/65000 [00:21<00:00, 3028.02it/s]


In [9]:
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [10]:
epoch = 10
batch_size = 60
warmup_proportion = 0.1
num_train_steps = int(len(texts) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [11]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=True,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)
        
        output_layer = model.get_pooled_output()
        self.logits = tf.layers.dense(output_layer, dimension_output)
        self.logits = tf.identity(self.logits, name = 'logits')
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [12]:
unique_labels = np.unique(labels)

In [13]:
dimension_output = len(unique_labels)
learning_rate = 1e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)

Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from multi_cased_L-12_H-768_A-12/bert_model.ckpt


In [14]:
from sklearn.model_selection import train_test_split

train_input_ids, test_input_ids, train_input_masks, test_input_masks, train_segment_ids, test_segment_ids, train_Y, test_Y = train_test_split(
    input_ids, input_masks, segment_ids, labels, test_size = 0.2
)

In [15]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_input_ids))
        batch_x = train_input_ids[i: index]
        batch_masks = train_input_masks[i: index]
        batch_segment = train_segment_ids[i: index]
        batch_y = train_Y[i: index]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_input_ids))
        batch_x = test_input_ids[i: index]
        batch_masks = test_input_masks[i: index]
        batch_segment = test_segment_ids[i: index]
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_input_ids) / batch_size
    train_acc /= len(train_input_ids) / batch_size
    test_loss /= len(test_input_ids) / batch_size
    test_acc /= len(test_input_ids) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 867/867 [05:59<00:00,  2.62it/s, accuracy=0.975, cost=0.133] 
test minibatch loop: 100%|██████████| 217/217 [00:30<00:00,  7.16it/s, accuracy=0.95, cost=0.24]   
train minibatch loop:   0%|          | 0/867 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.930308
time taken: 389.8563537597656
epoch: 0, training loss: 0.996093, training acc: 0.694856, valid loss: 0.237997, valid acc: 0.930308



train minibatch loop: 100%|██████████| 867/867 [05:58<00:00,  2.62it/s, accuracy=0.975, cost=0.0464]
test minibatch loop: 100%|██████████| 217/217 [00:30<00:00,  7.21it/s, accuracy=0.95, cost=0.158]  
train minibatch loop:   0%|          | 0/867 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.930308, current acc: 0.948769
time taken: 388.7940900325775
epoch: 1, training loss: 0.175669, training acc: 0.943298, valid loss: 0.158832, valid acc: 0.948769



train minibatch loop: 100%|██████████| 867/867 [05:58<00:00,  2.61it/s, accuracy=0.975, cost=0.0332]
test minibatch loop: 100%|██████████| 217/217 [00:30<00:00,  7.18it/s, accuracy=0.95, cost=0.0897] 
train minibatch loop:   0%|          | 0/867 [00:00<?, ?it/s]

epoch: 2, pass acc: 0.948769, current acc: 0.954231
time taken: 389.1874635219574
epoch: 2, training loss: 0.110326, training acc: 0.959529, valid loss: 0.135784, valid acc: 0.954231



train minibatch loop: 100%|██████████| 867/867 [05:58<00:00,  2.62it/s, accuracy=0.975, cost=0.118] 
test minibatch loop: 100%|██████████| 217/217 [00:30<00:00,  7.22it/s, accuracy=0.95, cost=0.104]  
train minibatch loop:   0%|          | 0/867 [00:00<?, ?it/s]

epoch: 3, pass acc: 0.954231, current acc: 0.956846
time taken: 389.04965686798096
epoch: 3, training loss: 0.088524, training acc: 0.964702, valid loss: 0.126539, valid acc: 0.956846



train minibatch loop: 100%|██████████| 867/867 [05:58<00:00,  2.61it/s, accuracy=0.975, cost=0.0377]
test minibatch loop: 100%|██████████| 217/217 [00:30<00:00,  7.21it/s, accuracy=0.975, cost=0.112] 
train minibatch loop:   0%|          | 0/867 [00:00<?, ?it/s]

time taken: 388.8955490589142
epoch: 4, training loss: 0.075868, training acc: 0.969644, valid loss: 0.130936, valid acc: 0.955269



train minibatch loop: 100%|██████████| 867/867 [05:58<00:00,  2.61it/s, accuracy=1, cost=0.0218]    
test minibatch loop: 100%|██████████| 217/217 [00:30<00:00,  7.21it/s, accuracy=0.975, cost=0.0696]
train minibatch loop:   0%|          | 0/867 [00:00<?, ?it/s]

time taken: 388.9079146385193
epoch: 5, training loss: 0.067325, training acc: 0.972481, valid loss: 0.139464, valid acc: 0.954962



train minibatch loop: 100%|██████████| 867/867 [05:58<00:00,  2.61it/s, accuracy=1, cost=0.0175]    
test minibatch loop: 100%|██████████| 217/217 [00:30<00:00,  7.20it/s, accuracy=0.975, cost=0.12]  

time taken: 389.0100209712982
epoch: 6, training loss: 0.060811, training acc: 0.974385, valid loss: 0.141177, valid acc: 0.954346

break epoch:7






In [16]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_input_ids), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i + batch_size, len(test_input_ids))
    batch_x = test_input_ids[i: index]
    batch_masks = test_input_masks[i: index]
    batch_segment = test_segment_ids[i: index]
    batch_y = test_Y[i: index]
    predict_Y += np.argmax(sess.run(model.logits,
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
    ), 1, ).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 217/217 [00:30<00:00,  7.19it/s]


In [18]:
labels = ['kesihatan',
 'kes lemas',
 'kes pecah rumah',
 'kes tangkap basah',
 'kewangan dan perniagaan',
 'kos sara hidup',
 'suruhanjaya pilihan raya malaysia',
 'tentera malaysia',
 'nilai ringgit jatuh',
 'kes buang bayi',
 'isu kemiskinan',
 'infrastruktur',
 'harga minyak']

In [21]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = labels, digits=5
    )
)

                                   precision    recall  f1-score   support

                        kesihatan    0.97542   0.98805   0.98169      1004
                        kes lemas    0.98827   0.97068   0.97940       955
                  kes pecah rumah    0.94521   0.95549   0.95032      1011
                kes tangkap basah    0.95064   0.97568   0.96300       987
          kewangan dan perniagaan    0.94614   0.96178   0.95389       968
                   kos sara hidup    0.95679   0.94801   0.95238       981
suruhanjaya pilihan raya malaysia    0.93681   0.93214   0.93447      1002
                 tentera malaysia    0.97431   0.96422   0.96924      1062
              nilai ringgit jatuh    0.91910   0.97505   0.94625      1002
                   kes buang bayi    0.95591   0.95400   0.95495      1000
                   isu kemiskinan    0.94271   0.90541   0.92368      1036
                    infrastruktur    0.98000   0.93531   0.95714       943
                     har