In [1]:
import tensorflow as tf
import numpy as np
import re
import os
from sklearn.metrics import classification_report
import time

  from ._conv import register_converters as _register_converters


In [2]:
with open('entities-bm-normalize-v3.txt','r') as fopen:
    texts= list(filter(None, fopen.read().split('\n')))
len(texts)

15194

In [3]:
tag2idx = {'PAD': 0}
pos2idx = {'PAD': 0}
char2idx = {'PAD': 0}
tag_idx = 1
char_idx = 1
pos_idx = 1

In [4]:
train_texts = texts
# dataset is too small
test_texts = texts

In [5]:
def process_word(word, lower=True):
    if lower:
        word = word.lower()
    else:
        if word.isupper():
            word = word.title()
    word = re.sub('[^A-Za-z0-9\- ]+', '', word)
    if word.isdigit():
        word = 'NUM'
    return word

def read_file(f):
    global tag_idx, char_idx, pos_idx
    words, tags, poss, X, Y, Y_pos = [], [], [], [], [], []
    for line in f:
        line = line.strip()
        if (len(line) == 0 or line.startswith("-DOCSTART-")):
            continue
        else:
            ls = line.split(' ')
            try:
                word, tag, pos = ls[0],ls[1],ls[2]
            except:
                continue
            word = process_word(word)
            if len(word) < 1:
                continue
            char_ids = []
            for c in word:
                if c not in char2idx:
                    char2idx[c] = char_idx
                    char_idx += 1
                char_ids.append(char2idx[c])
            words += [word]
            tags += [tag]
            poss += [pos]
            X.append(char_ids)
            if tag not in tag2idx:
                tag2idx[tag] = tag_idx
                tag_idx += 1
            if pos not in pos2idx:
                pos2idx[pos] = pos_idx
                pos_idx += 1
            Y.append(tag2idx[tag])
            Y_pos.append(pos2idx[pos])
                        
    return words, tags, poss, X, np.array(Y), np.array(Y_pos)

In [6]:
train_words, train_tags, train_poss, train_X, train_Y, train_Y_pos = read_file(train_texts)
test_words, test_tags, test_poss, test_X, test_Y, test_Y_pos = read_file(test_texts)

In [7]:
class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, dimension_output, dimension_output_pos, learning_rate):
        
        def cells(size, reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size,initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        self.Y_pos = tf.placeholder(tf.float32, [None, dimension_output_pos])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(size_layer // 2),
                cell_bw = cells(size_layer // 2),
                inputs = encoder_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d'%(n))
            encoder_embedded = tf.concat((out_fw, out_bw), 2)
        W = tf.get_variable('w',shape=(size_layer, dimension_output),initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b',shape=(dimension_output),initializer=tf.zeros_initializer())
        W_pos = tf.get_variable('w_pos',shape=(size_layer, dimension_output_pos),initializer=tf.orthogonal_initializer())
        b_pos = tf.get_variable('b_pos',shape=(dimension_output_pos),initializer=tf.zeros_initializer())
        self.logits = tf.add(tf.matmul(encoder_embedded[:, -1], W),b,name='logits')
        self.logits_pos = tf.add(tf.matmul(encoder_embedded[:, -1], W_pos),b_pos,name='logits_pos')
        cost_entity = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        cost_pos = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits_pos, labels = self.Y_pos))
        self.cost = cost_entity + cost_pos
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        correct_pred_pos = tf.equal(tf.argmax(self.logits_pos, 1), tf.argmax(self.Y_pos, 1))
        self.accuracy_pos = tf.reduce_mean(tf.cast(correct_pred_pos, tf.float32))

In [8]:
train_onehot = np.zeros((train_Y.shape[0],len(tag2idx)))
train_onehot[np.arange(train_Y.shape[0]),train_Y] = 1.0
train_onehot_pos = np.zeros((train_Y_pos.shape[0],len(pos2idx)))
train_onehot_pos[np.arange(train_Y_pos.shape[0]),train_Y_pos] = 1.0

test_onehot = np.zeros((test_Y.shape[0],len(tag2idx)))
test_onehot[np.arange(test_Y.shape[0]),test_Y] = 1.0
test_onehot_pos = np.zeros((test_Y_pos.shape[0],len(pos2idx)))
test_onehot_pos[np.arange(test_Y_pos.shape[0]),test_Y_pos] = 1.0

In [10]:
size_layer = 128
num_layers = 2
embedded_size = 128
dimension_output = len(tag2idx)
dimension_output_pos = len(pos2idx)
learning_rate = 1e-3
batch_size = 32
idx2tag={idx: tag for tag, idx in tag2idx.items()}
idx2pos={idx: tag for tag, idx in pos2idx.items()}

In [29]:
import json
with open('char-settings.json','w') as fopen:
    fopen.write(json.dumps({'idx2tag':idx2tag,'idx2pos':idx2pos,'tag2idx':tag2idx,'pos2idx':pos2idx,'char2idx':char2idx}))

In [11]:
def str_idx(corpus, dic, UNK=3):
    maxlen = max([len(i) for i in corpus])
    X = np.zeros((len(corpus),maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen][::-1]):
            try:
                X[i,-1 - no]=dic[k]
            except Exception as e:
                X[i,-1 - no]=UNK
    return X

In [12]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(char2idx),
              dimension_output,dimension_output_pos,learning_rate)
sess.run(tf.global_variables_initializer())

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



In [13]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(train_words[i:i+batch_size],char2idx)
        acc_pos, acc, loss, _ = sess.run([model.accuracy_pos, model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x, model.Y : train_onehot[i:i+batch_size],
                                       model.Y_pos: train_onehot_pos[i:i+batch_size]})
        train_loss += loss
        train_acc += ((acc+acc_pos)/2)
    
    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(test_words[i:i+batch_size],char2idx)
        acc_pos, acc, loss = sess.run([model.accuracy_pos, model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x, model.Y : test_onehot[i:i+batch_size],
                                       model.Y_pos : test_onehot_pos[i:i+batch_size]})
        test_loss += loss
        test_acc += ((acc+acc_pos)/2)
    
    train_loss /= (len(train_X) // batch_size)
    train_acc /= (len(train_X) // batch_size)
    test_loss /= (len(test_X) // batch_size)
    test_acc /= (len(test_X) // batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1

epoch: 0, pass acc: 0.000000, current acc: 0.780716
time taken: 18.98798632621765
epoch: 0, training loss: 1.998942, training acc: 0.721154, valid loss: 1.550866, valid acc: 0.780716

epoch: 1, pass acc: 0.780716, current acc: 0.803385
time taken: 21.58083415031433
epoch: 1, training loss: 1.366895, training acc: 0.804387, valid loss: 1.301349, valid acc: 0.803385

epoch: 2, pass acc: 0.803385, current acc: 0.836104
time taken: 21.7098650932312
epoch: 2, training loss: 1.168944, training acc: 0.833267, valid loss: 1.146023, valid acc: 0.836104

epoch: 3, pass acc: 0.836104, current acc: 0.848524
time taken: 21.69019603729248
epoch: 3, training loss: 1.051429, training acc: 0.847990, valid loss: 1.027778, valid acc: 0.848524

epoch: 4, pass acc: 0.848524, current acc: 0.860143
time taken: 21.572304248809814
epoch: 4, training loss: 0.960130, training acc: 0.858774, valid loss: 0.942974, valid acc: 0.860143

epoch: 5, pass acc: 0.860143, current acc: 0.866687
time taken: 21.7069938182830

In [14]:
label_Y, label_pos, predicted_Y, predicted_pos = [], [], [], []
for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
    batch_x = str_idx(test_words[i:i+batch_size],char2idx)
    label_Y += test_Y[i:i+batch_size].tolist()
    label_pos += test_Y_pos[i:i+batch_size].tolist()
    logits, logits_pos = sess.run([tf.argmax(model.logits,1),tf.argmax(model.logits_pos,1)], 
                                  feed_dict = {model.X : batch_x})
    predicted_Y += logits.tolist()
    predicted_pos += logits_pos.tolist()

In [15]:
print(classification_report(label_Y, predicted_Y, target_names=tag2idx.keys()))

             precision    recall  f1-score   support

       TIME       0.74      0.82      0.78       474
        ART       0.95      0.96      0.96     11530
          I       0.86      0.80      0.83      1347
        LOC       0.88      0.90      0.89       174
        ORG       0.53      0.72      0.61       367
        PRN       0.65      0.80      0.72       173
        PAD       0.70      0.55      0.61       144
       NORP       0.72      0.60      0.65       575
        DOC       0.77      0.71      0.74        85
          O       0.86      0.51      0.64       105
      EVENT       0.00      0.00      0.00         1
        LAW       1.00      1.00      1.00         1

avg / total       0.91      0.91      0.91     14976



  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)


In [16]:
print(classification_report(label_pos, predicted_pos, target_names=pos2idx.keys()))

             precision    recall  f1-score   support

         GN       0.91      0.96      0.94      8478
         KJ       0.96      0.95      0.96       109
         NO       0.97      0.71      0.82      2606
       NORP       0.95      0.99      0.97      1170
         KP       0.98      0.99      0.99       374
          O       0.53      0.68      0.59       204
       ARAH       0.84      0.99      0.91       660
    KPEMERI       0.98      0.97      0.98       641
         KB       0.98      0.99      0.98       287
    PISAHAN       0.00      0.00      0.00        11
       NAFI       1.00      0.98      0.99       144
         KA       0.88      1.00      0.93         7
         KS       0.93      0.93      0.93       103
         KH       1.00      1.00      1.00        53
   KPENGUAT       0.95      0.98      0.97        58
   KPENEGAS       0.71      1.00      0.83         5
         KT       0.97      0.93      0.95        30
        PAD       1.00      1.00      1.00   

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)


In [17]:
def get_entity(string):
    batch_x = str_idx([process_word(w) for w in string.split()],char2idx)
    logits, logits_pos = sess.run([tf.argmax(model.logits,1),tf.argmax(model.logits_pos,1)],feed_dict={model.X:batch_x})
    for no, i in enumerate(string.split()):
        print(i,idx2tag[logits[no]],idx2pos[logits_pos[no]])

In [18]:
get_entity('KUALA LUMPUR: Sempena sambutan Aidilfitri minggu depan, Perdana Menteri Tun Dr Mahathir Mohamad dan Menteri Pengangkutan Anthony Loke Siew Fook menitipkan pesanan khas kepada orang ramai yang mahu pulang ke kampung halaman masing-masing. Dalam video pendek terbitan Jabatan Keselamatan Jalan Raya (JKJR) itu, Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar  sekiranya mengantuk ketika memandu.')

KUALA LOC KN
LUMPUR: LOC KN
Sempena O KN
sambutan O KN
Aidilfitri EVENT KN
minggu O KN
depan, O KN
Perdana PRN KN
Menteri PRN KN
Tun PRN KN
Dr PRN KN
Mahathir PRN KN
Mohamad PRN KN
dan O KH
Menteri PRN KN
Pengangkutan LAW KN
Anthony PRN KN
Loke PRN KN
Siew PRN KN
Fook PRN KN
menitipkan O KN
pesanan O KN
khas O KN
kepada O KS
orang O KN
ramai O KN
yang O KETERANGAN
mahu O KN
pulang O KN
ke O KS
kampung LOC KN
halaman LOC KN
masing-masing. O KN
Dalam O KS
video O KN
pendek O KN
terbitan O KN
Jabatan NORP KN
Keselamatan O KN
Jalan LOC KN
Raya ART KN
(JKJR) PRN KN
itu, O GN
Dr PRN KN
Mahathir PRN KN
menasihati O KJ
mereka O GN
supaya O KH
berhenti O KJ
berehat O KA
dan O KH
tidur O KN
sebentar O KETERANGAN
sekiranya O KN
mengantuk O KJ
ketika O KN
memandu. O KJ


In [19]:
saver = tf.train.Saver(tf.global_variables())
saver.save(sess, os.getcwd()+"/char/char-entity-pos.ckpt")

'/home/husein/space/text-dataset/entities/bm/char/char-entity-pos.ckpt'

In [20]:
# only load Variables, placeholder for input, and our logits
strings=','.join([n.name for n in tf.get_default_graph().as_graph_def().node if "Variable" in n.op or n.name.find('Placeholder') >= 0 or n.name.find('logits') == 0])

In [21]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            "directory: %s" % model_dir)

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path
    
    absolute_model_dir = "/".join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + "/frozen_model.pb"
    clear_devices = True
    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(",")
        ) 
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph." % len(output_graph_def.node))

In [22]:
freeze_graph("char", strings)

INFO:tensorflow:Restoring parameters from /home/husein/space/text-dataset/entities/bm/char/char-entity-pos.ckpt
INFO:tensorflow:Froze 41 variables.
Converted 41 variables to const ops.
550 ops in the final graph.


In [23]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [24]:
g=load_graph('char/frozen_model.pb')

In [25]:
for op in g.get_operations():
    print(op.name)

import/Placeholder
import/Placeholder_1
import/Placeholder_2
import/Variable
import/Variable/read
import/embedding_lookup
import/bidirectional_rnn_0/fw/fw/Rank
import/bidirectional_rnn_0/fw/fw/range/start
import/bidirectional_rnn_0/fw/fw/range/delta
import/bidirectional_rnn_0/fw/fw/range
import/bidirectional_rnn_0/fw/fw/concat/values_0
import/bidirectional_rnn_0/fw/fw/concat/axis
import/bidirectional_rnn_0/fw/fw/concat
import/bidirectional_rnn_0/fw/fw/transpose
import/bidirectional_rnn_0/fw/fw/Shape
import/bidirectional_rnn_0/fw/fw/strided_slice/stack
import/bidirectional_rnn_0/fw/fw/strided_slice/stack_1
import/bidirectional_rnn_0/fw/fw/strided_slice/stack_2
import/bidirectional_rnn_0/fw/fw/strided_slice
import/bidirectional_rnn_0/fw/fw/LSTMCellZeroState/ExpandDims/dim
import/bidirectional_rnn_0/fw/fw/LSTMCellZeroState/ExpandDims
import/bidirectional_rnn_0/fw/fw/LSTMCellZeroState/Const
import/bidirectional_rnn_0/fw/fw/LSTMCellZeroState/concat/axis
import/bidirectional_rnn_0/fw/fw/LSTM

In [30]:
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
pos_logits = g.get_tensor_by_name('import/logits_pos:0')
test_sess = tf.InteractiveSession(graph=g)

In [32]:
test_sess.run([logits,pos_logits], feed_dict={x:batch_x})[0].shape

(32, 13)