In [1]:
!wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/part-of-speech/pos-data-v3.json

--2019-06-17 11:16:24--  https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/part-of-speech/pos-data-v3.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.8.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.8.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3085086 (2.9M) [text/plain]
Saving to: ‘pos-data-v3.json.1’


2019-06-17 11:16:24 (47.7 MB/s) - ‘pos-data-v3.json.1’ saved [3085086/3085086]



In [2]:
import json

In [3]:
with open('pos-data-v3.json') as fopen:
    data = json.load(fopen)

In [4]:
left_train = [i[0] for i in data]
right_train = [i[-1] for i in data]

In [5]:
import re
import numpy as np
import tensorflow as tf
from tqdm import tqdm

In [6]:
seq_len = 50
def iter_seq(x):
    return np.array([x[i: i+seq_len] for i in range(0, len(x)-seq_len, 1)])

def to_train_seq(*args):
    return [iter_seq(x) for x in args]

In [7]:
left_train, right_train = to_train_seq(left_train, right_train)

In [8]:
tag2idx = {'PAD': 0}
for no, u in enumerate(np.unique(right_train)):
    tag2idx[u] = no + 1
tag2idx

{'PAD': 0,
 'ADJ': 1,
 'ADP': 2,
 'ADV': 3,
 'AUX': 4,
 'CCONJ': 5,
 'DET': 6,
 'NOUN': 7,
 'NUM': 8,
 'PART': 9,
 'PRON': 10,
 'PROPN': 11,
 'SCONJ': 12,
 'SYM': 13,
 'VERB': 14,
 'X': 15}

In [9]:
left_train

array([['Sampul', 'dari', 'dua', ..., 'dan', 'dirilis', 'pada'],
       ['dari', 'dua', 'singel', ..., 'dirilis', 'pada', 'tanggal'],
       ['dua', 'singel', 'pertama', ..., 'pada', 'tanggal', '21'],
       ...,
       ['di', 'kalender', 'Yahudi', ..., 'General', 'Superintendent',
        'UPC'],
       ['kalender', 'Yahudi', 'Cibodas', ..., 'Superintendent', 'UPC',
        'di'],
       ['Yahudi', 'Cibodas', 'Baru', ..., 'UPC', 'di', 'Amerika']],
      dtype='<U25')

In [10]:
BERT_INIT_CHKPNT = 'bert-emotion/model.ckpt'
BERT_CONFIG = 'bert-bahasa/config.json'

In [11]:
from tqdm import tqdm
import itertools
from unidecode import unidecode
import malaya

_tokenizer = malaya.preprocessing._SocialTokenizer().tokenize
rules_normalizer = malaya.texts._tatabahasa.rules_normalizer
rejected = ['wkwk', 'http', 'https', 'lolol', 'hahaha']

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    string = ''.join(''.join(s)[:2] for _, s in itertools.groupby(unidecode(string)))
    tokenized = _tokenizer(string)
    tokenized = [malaya.stem.naive(w) for w in tokenized]
    tokenized = [w.lower() for w in tokenized if len(w) > 1]
    tokenized = [w for w in tokenized if all([r not in w for r in rejected])]
    tokenized = [rules_normalizer.get(w, w) for w in tokenized]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

with open('dictionary.json') as fopen:
    d = json.load(fopen)
dictionary = d['dictionary']
rev_dictionary = d['reverse_dictionary']

class Tokenizer:
    def __init__(self, vocab, rev_dictionary):
        self.vocab = vocab
        self.inv_vocab = rev_dictionary
    
    def tokenize(self, string):
        return preprocessing(string)
    
    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.get(t, 1) for t in tokens]
    
    def convert_ids_to_tokens(self, ids):
        return [self.inv_vocab[i] for i in ids]
    
tokenizer = Tokenizer(dictionary, rev_dictionary)

In [12]:
def XY(left_train, right_train):
    X, Y = [], []
    for i in tqdm(range(len(left_train))):
        left = left_train[i]
        right = right_train[i]
        bert_tokens = ['[CLS]']
        y = ['PAD']
        for no, orig_token in enumerate(left):
            t = tokenizer.tokenize(orig_token)
            if len(t):
                y.append(right[no])
                bert_tokens.extend(t)
                y.extend(['PAD'] * (len(t) - 1))
        bert_tokens.append("[SEP]")
        y.append('PAD')
        X.append(tokenizer.convert_tokens_to_ids(bert_tokens))
        Y.append([tag2idx[i] for i in y])
    return X, Y

In [13]:
train_X, train_Y = XY(left_train, right_train)

100%|██████████| 103645/103645 [02:12<00:00, 783.64it/s]


In [14]:
import keras
train_X = keras.preprocessing.sequence.pad_sequences(train_X, padding='post')
train_Y = keras.preprocessing.sequence.pad_sequences(train_Y, padding='post')
train_X.shape, train_Y.shape

Using TensorFlow backend.


((103645, 55), (103645, 55))

In [15]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling

In [16]:
epoch = 3
batch_size = 16
warmup_proportion = 0.1
num_train_steps = int(len(train_X) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [17]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.maxlen = tf.shape(self.X)[1]
        self.lengths = tf.count_nonzero(self.X, 1)
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=True,
            input_ids=self.X,
            use_one_hot_embeddings=False)
        output_layer = model.get_sequence_output()
        logits = tf.layers.dense(output_layer, dimension_output)
        y_t = self.Y
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            logits, y_t, self.lengths
        )
        self.cost = tf.reduce_mean(-log_likelihood)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        mask = tf.sequence_mask(self.lengths, maxlen = self.maxlen)
        self.tags_seq, tags_score = tf.contrib.crf.crf_decode(
            logits, transition_params, self.lengths
        )
        self.tags_seq = tf.identity(self.tags_seq, name = 'logits')

        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(self.tags_seq, mask)
        mask_label = tf.boolean_mask(y_t, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [18]:
dimension_output = len(tag2idx)
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from bert-emotion/model.ckpt


In [20]:
import time

for e in range(8):
    lasttime = time.time()
    train_acc, train_loss = 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
    
    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f\n'
        % (e, train_loss, train_acc)
    )

train minibatch loop: 100%|██████████| 6478/6478 [13:24<00:00,  8.19it/s, accuracy=0.905, cost=9.56]
train minibatch loop:   0%|          | 1/6478 [00:00<12:42,  8.50it/s, accuracy=0.872, cost=15.2]

time taken: 804.5755860805511
epoch: 0, training loss: 16.775186, training acc: 0.874612



train minibatch loop: 100%|██████████| 6478/6478 [13:24<00:00,  8.27it/s, accuracy=0.951, cost=5.92] 
train minibatch loop:   0%|          | 1/6478 [00:00<13:03,  8.26it/s, accuracy=0.911, cost=12.4]

time taken: 804.7049980163574
epoch: 1, training loss: 13.124162, training acc: 0.900369



train minibatch loop: 100%|██████████| 6478/6478 [13:24<00:00,  8.22it/s, accuracy=0.977, cost=3.08] 
train minibatch loop:   0%|          | 1/6478 [00:00<13:19,  8.11it/s, accuracy=0.925, cost=11.3]

time taken: 804.7344207763672
epoch: 2, training loss: 9.865208, training acc: 0.924173



train minibatch loop: 100%|██████████| 6478/6478 [13:24<00:00,  8.23it/s, accuracy=0.975, cost=2.23] 
train minibatch loop:   0%|          | 1/6478 [00:00<12:58,  8.32it/s, accuracy=0.967, cost=5.3]

time taken: 804.8675396442413
epoch: 3, training loss: 7.126463, training acc: 0.944937



train minibatch loop: 100%|██████████| 6478/6478 [13:24<00:00,  8.24it/s, accuracy=0.973, cost=2.54] 
train minibatch loop:   0%|          | 1/6478 [00:00<12:52,  8.39it/s, accuracy=0.932, cost=7.06]

time taken: 804.5777688026428
epoch: 4, training loss: 5.068951, training acc: 0.961455



train minibatch loop: 100%|██████████| 6478/6478 [13:24<00:00,  8.28it/s, accuracy=0.981, cost=1.96] 
train minibatch loop:   0%|          | 1/6478 [00:00<13:07,  8.23it/s, accuracy=0.962, cost=5.22]

time taken: 804.8435492515564
epoch: 5, training loss: 3.750298, training acc: 0.971970



train minibatch loop: 100%|██████████| 6478/6478 [13:24<00:00,  8.25it/s, accuracy=0.991, cost=1.13] 
train minibatch loop:   0%|          | 1/6478 [00:00<13:29,  8.00it/s, accuracy=0.986, cost=1.74]

time taken: 804.6170637607574
epoch: 6, training loss: 2.928225, training acc: 0.978502



train minibatch loop: 100%|██████████| 6478/6478 [13:24<00:00,  8.29it/s, accuracy=0.981, cost=1.68]  

time taken: 804.8858578205109
epoch: 7, training loss: 2.379846, training acc: 0.982927






In [21]:
saver.save(sess, 'bert-pos/model.ckpt')

'bert-pos/model.ckpt'