In [1]:
from bs4 import BeautifulSoup
import re
import numpy as np
import tensorflow as tf
from tqdm import tqdm

In [2]:
def process_string(string):
    string = re.sub('[^A-Za-z0-9\-\/ ]+', ' ', string).split()
    return [y.strip() for y in string]

def to_title(string):
    if string.isupper():
        string = string.title()
    return string

In [3]:
def parse_raw(filename):
    with open(filename, 'r') as fopen:
        entities = fopen.read()
    soup = BeautifulSoup(entities, 'html.parser')
    inside_tag = ''
    texts, labels = [], []
    for sentence in soup.prettify().split('\n'):
        if len(inside_tag):
            splitted = process_string(sentence)
            texts += splitted
            labels += [inside_tag] * len(splitted)
            inside_tag = ''
        else:
            if not sentence.find('</'):
                pass
            elif not sentence.find('<'):
                inside_tag = sentence.split('>')[0][1:]
            else:
                splitted = process_string(sentence)
                texts += splitted
                labels += ['OTHER'] * len(splitted)
    assert (len(texts)==len(labels)), "length texts and labels are not same"
    print('len texts and labels: ', len(texts))
    return texts,labels

In [4]:
train_texts, train_labels = parse_raw('data_train.txt')
test_texts, test_labels = parse_raw('data_test.txt')
train_texts += test_texts
train_labels += test_labels

len texts and labels:  34012
len texts and labels:  9249


In [5]:
np.unique(train_labels,return_counts=True)

(array(['OTHER', 'location', 'organization', 'person', 'quantity', 'time'],
       dtype='<U12'), array([35613,  1536,  1592,  2358,  1336,   826]))

In [6]:
with open('entities-bm-normalize-v3.txt','r') as fopen:
    entities_bm = fopen.read().split('\n')[:-1]
entities_bm = [i.split() for i in entities_bm]
entities_bm = [[i[0],'TIME' if i[0] in 'jam' else i[1]] for i in entities_bm]

In [7]:
replace_by = {'organizaiton':'organization','orgnization':'organization',
             'othoer': 'OTHER'}

with open('NER-part1.txt','r') as fopen:
    nexts = fopen.read().split('\n')[:-1]
nexts = [i.split() for i in nexts]
for i in nexts:
    if len(i) == 2:
        label = i[1].lower()
        if 'other' in label:
            label = label.upper()
        if label in replace_by:
            label = replace_by[label]
        train_labels.append(label)
        train_texts.append(i[0])

In [8]:
replace_by = {'LOC':'location','PRN':'person','NORP':'organization','ORG':'organization','LAW':'law',
             'EVENT':'event','FAC':'organization','TIME':'time','O':'OTHER','ART':'person','DOC':'law'}
for i in entities_bm:
    try:
        string = process_string(i[0])
        if len(string):
            train_labels.append(replace_by[i[1]])
            train_texts.append(process_string(i[0])[0])  
    except Exception as e:
        print(e)
        
assert (len(train_texts)==len(train_labels)), "length texts and labels are not same"

'KN'
'KA'


In [9]:
np.unique(train_labels,return_counts=True)

(array(['OTHER', 'event', 'law', 'location', 'organization', 'person',
        'quantity', 'time'], dtype='<U12'),
 array([49712,   234,   185,  2056,  2596,  4397,  1341,  1296]))

In [10]:
word2idx = {'PAD': 0,'NUM':1,'UNK':2}
tag2idx = {'PAD': 0}
char2idx = {'PAD': 0}
word_idx = 3
tag_idx = 1
char_idx = 1

def parse_XY(texts, labels):
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    X, Y = [], []
    for no, text in enumerate(texts):
        text = text.lower()
        tag = labels[no]
        for c in text:
            if c not in char2idx:
                char2idx[c] = char_idx
                char_idx += 1
        if tag not in tag2idx:
            tag2idx[tag] = tag_idx
            tag_idx += 1
        Y.append(tag2idx[tag])
        if text not in word2idx:
            word2idx[text] = word_idx
            word_idx += 1
        X.append(word2idx[text])
    return X, np.array(Y)

In [11]:
np.unique(train_labels)

array(['OTHER', 'event', 'law', 'location', 'organization', 'person',
       'quantity', 'time'], dtype='<U12')

In [12]:
X, Y = parse_XY(train_texts, train_labels)
idx2word={idx: tag for tag, idx in word2idx.items()}
idx2tag = {i: w for w, i in tag2idx.items()}

In [13]:
seq_len = 50
def iter_seq(x):
    return np.array([x[i: i+seq_len] for i in range(0, len(x)-seq_len, 1)])

def to_train_seq(*args):
    return [iter_seq(x) for x in args]

def generate_char_seq(batch):
    x = [[len(idx2word[i]) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((batch.shape[0],batch.shape[1],maxlen),dtype=np.int32)
    for i in range(batch.shape[0]):
        for k in range(batch.shape[1]):
            for no, c in enumerate(idx2word[batch[i,k]]):
                temp[i,k,-1-no] = char2idx[c]
    return temp

In [14]:
import json
with open('entity-ner.json','w') as fopen:
    fopen.write(json.dumps({'idx2tag':idx2tag,'idx2word':idx2word,
           'word2idx':word2idx,'tag2idx':tag2idx,'char2idx':char2idx}))

In [15]:
X_seq, Y_seq = to_train_seq(X, Y)
X_char_seq = generate_char_seq(X_seq)
X_seq.shape

(61767, 50)

In [16]:
from keras.utils import to_categorical
Y_seq_3d = np.array([to_categorical(i, num_classes=len(tag2idx)) for i in Y_seq])

Using TensorFlow backend.


In [17]:
from sklearn.cross_validation import train_test_split
train_X, test_X, train_Y, test_Y, train_char, test_char = train_test_split(X_seq, Y_seq_3d, X_char_seq, 
                                                                           test_size=0.2)



In [18]:
import tensorflow.contrib as tf_contrib
from tensorflow.contrib import rnn


class EntityNetwork:
    def __init__(
        self,
        learning_rate,
        decay_steps,
        decay_rate,
        story_length,
        vocab_size,
        embed_size,
        hidden_size,
        num_layers,
        dim_word,
        dim_char,
        hidden_size_char,
        hidden_size_word,
        word2idx,
        char2idx,
        dropout = 0.8,
        initializer = tf.random_normal_initializer(stddev = 0.1),
        clip_gradients = 5.0,
        use_bi_lstm = False,
    ):
        """init all hyperparameter here"""
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.learning_rate = tf.Variable(
            learning_rate, trainable = False, name = 'learning_rate'
        )
        self.learning_rate_decay_half_op = tf.assign(
            self.learning_rate, self.learning_rate * 0.5
        )
        self.initializer = initializer
        self.hidden_size = hidden_size
        self.clip_gradients = clip_gradients
        self.story_length = story_length
        self.block_size = 20
        self.use_bi_lstm = use_bi_lstm
        self.dimension = (
            self.hidden_size * 2 if self.use_bi_lstm else self.hidden_size
        )
        self.story = tf.placeholder(
            tf.int32, [None, self.story_length, None], name = 'story'
        )
        self.labels = tf.placeholder(
            tf.int32, shape = [None, None, None], name = 'input_y'
        )
        self.char_ids = tf.placeholder(
            tf.int32, shape = [None, None, None], name = 'char_ids'
        )
        self.query = tf.placeholder(tf.int32, [None, None], name = 'question')
        self.sequence_length = tf.shape(self.query)[1]
        self.batch_size = tf.shape(self.query)[0]
        self.lengths = tf.count_nonzero(self.query, 1)
        self.dropout_keep_prob = dropout

        self.global_step = tf.Variable(
            0, trainable = False, name = 'Global_Step'
        )
        self.epoch_step = tf.Variable(0, trainable = False, name = 'Epoch_Step')
        self.epoch_increment = tf.assign(
            self.epoch_step, tf.add(self.epoch_step, tf.constant(1))
        )
        self.decay_steps, self.decay_rate = decay_steps, decay_rate

        self.instantiate_weights()
        logits = self.inference()

        self.word_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(word2idx), dim_word], stddev = 1.0 / np.sqrt(dim_word)
            )
        )
        self.char_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(char2idx), dim_char], stddev = 1.0 / np.sqrt(dim_char)
            )
        )

        word_embedded = tf.nn.embedding_lookup(self.word_embeddings, self.query)
        char_embedded = tf.nn.embedding_lookup(
            self.char_embeddings, self.char_ids
        )
        s = tf.shape(char_embedded)
        char_embedded = tf.reshape(
            char_embedded, shape = [s[0] * s[1], s[-2], dim_char]
        )

        def cells(size, name, reuse = False):
            return tf.contrib.rnn.DropoutWrapper(
                rnn.LSTMCell(size, reuse = reuse, name=name),
                state_keep_prob = dropout,
                output_keep_prob = dropout,
            )

        cell_chars = tf.nn.rnn_cell.MultiRNNCell(
            [cells(hidden_size_char,'char') for _ in range(num_layers)],
            
        )
        cell_words = tf.nn.rnn_cell.MultiRNNCell(
            [cells(hidden_size_word,'word') for _ in range(num_layers)]
        )
        char_embedded, _ = tf.nn.dynamic_rnn(
            cell_chars, char_embedded, dtype = tf.float32
        )
        output = tf.reshape(
            char_embedded[:, -1], shape = [s[0], s[1], hidden_size_char]
        )
        word_embedded = tf.concat([word_embedded, output], axis = -1)
        word_embedded, _ = tf.nn.dynamic_rnn(
            cell_words, word_embedded, dtype = tf.float32
        )
        logits = tf.tile(tf.expand_dims(logits,axis=1),[1,self.sequence_length,1])
        word_embedded = tf.multiply(word_embedded, logits)
        logits = tf.layers.dense(word_embedded, len(idx2tag))
        y_t = tf.argmax(self.labels, 2)
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            logits, y_t, self.lengths
        )
        self.cost = tf.reduce_mean(-log_likelihood)
        mask = tf.sequence_mask(self.lengths, maxlen = self.sequence_length)
        self.tags_seq, tags_score = tf.contrib.crf.crf_decode(
            logits, transition_params, self.lengths
        )
        self.tags_seq = tf.identity(self.tags_seq, name = 'logits')

        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(self.tags_seq, mask)
        mask_label = tf.boolean_mask(y_t, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        self.optimizer = self.train()

    def inference(self):
        self.embedding_with_mask()
        if self.use_bi_lstm:
            self.input_encoder_bi_lstm()
        else:
            self.input_encoder_bow()
        self.hidden_state = self.rnn_story()
        logits = self.output_module()
        return logits

    def output_module(self):
        p = tf.nn.softmax(
            tf.multiply(
                tf.expand_dims(self.query_embedding, axis = 1),
                self.hidden_state,
            )
        )
        return tf.reduce_sum(tf.multiply(p, self.hidden_state), axis = 1)

    def rnn_story(self):
        input_split = tf.split(
            self.story_embedding, self.story_length, axis = 1
        )
        input_list = [tf.squeeze(x, axis = 1) for x in input_split]
        h_all = tf.get_variable(
            'hidden_states',
            shape = [self.block_size, self.dimension],
            initializer = self.initializer,
        )
        w_all = tf.get_variable(
            'keys',
            shape = [self.block_size, self.dimension],
            initializer = self.initializer,
        )
        w_all_expand = tf.tile(
            tf.expand_dims(w_all, axis = 0), [self.batch_size, 1, 1]
        )
        h_all_expand = tf.tile(
            tf.expand_dims(h_all, axis = 0), [self.batch_size, 1, 1]
        )
        for i, input in enumerate(input_list):
            h_all_expand = self.cell(input, h_all_expand, w_all_expand, i)
        return h_all_expand

    def embedding_with_mask(self):
        self.story_embedding = tf.nn.embedding_lookup(
            self.Embedding, self.story
        )
        self.query_embedding = tf.nn.embedding_lookup(
            self.Embedding, self.query
        )

    def input_encoder_bow(self):
        self.story_embedding = tf.reduce_sum(self.story_embedding, axis = 2)
        self.query_embedding = tf.reduce_sum(self.query_embedding, axis = 1)

    def input_encoder_bi_lstm(self):
        """
        use bi-directional lstm to encode query_embedding:[batch_size,sequence_length,embed_size]
        and story_embedding:[batch_size,story_length,sequence_length,embed_size]
        output:query_embedding:[batch_size,hidden_size*2]
        story_embedding:[batch_size,self.story_length,self.hidden_size*2]
        """
        lstm_fw_cell = rnn.BasicLSTMCell(self.hidden_size)
        lstm_bw_cell = rnn.BasicLSTMCell(self.hidden_size)
        if self.dropout_keep_prob is not None:
            lstm_fw_cell = rnn.DropoutWrapper(
                lstm_fw_cell, output_keep_prob = self.dropout_keep_prob
            )
            lstm_bw_cell == rnn.DropoutWrapper(
                lstm_bw_cell, output_keep_prob = self.dropout_keep_prob
            )
        query_hidden_output, _ = tf.nn.bidirectional_dynamic_rnn(
            lstm_fw_cell,
            lstm_bw_cell,
            self.query_embedding,
            dtype = tf.float32,
            scope = 'query_rnn',
        )
        query_hidden_output = tf.concat(query_hidden_output, axis = 2)
        self.query_embedding = tf.reduce_sum(query_hidden_output, axis = 1)
        self.story_embedding = tf.reshape(
            self.story_embedding,
            shape = (
                -1,
                self.story_length * self.sequence_length,
                self.embed_size,
            ),
        )
        lstm_fw_cell_story = rnn.BasicLSTMCell(self.hidden_size)
        lstm_bw_cell_story = rnn.BasicLSTMCell(self.hidden_size)
        if self.dropout_keep_prob is not None:
            lstm_fw_cell_story = rnn.DropoutWrapper(
                lstm_fw_cell_story, output_keep_prob = self.dropout_keep_prob
            )

    def instantiate_weights(self):
        """define all weights here"""

        with tf.variable_scope('dynamic_memory'):
            self.U = tf.get_variable(
                'U',
                shape = [self.dimension, self.dimension],
                initializer = self.initializer,
            )
            self.V = tf.get_variable(
                'V',
                shape = [self.dimension, self.dimension],
                initializer = self.initializer,
            )
            self.W = tf.get_variable(
                'W',
                shape = [self.dimension, self.dimension],
                initializer = self.initializer,
            )
            self.h_bias = tf.get_variable('h_bias', shape = [self.dimension])
            self.h2_bias = tf.get_variable('h2_bias', shape = [self.dimension])

        with tf.variable_scope('embedding_projection'):
            self.Embedding = tf.get_variable(
                'Embedding',
                shape = [self.vocab_size, self.embed_size],
                initializer = self.initializer,
            )

    def cell(self, s_t, h_all, w_all, i):
        s_t_expand = tf.expand_dims(s_t, axis = 1)
        g = tf.nn.sigmoid(
            tf.multiply(s_t_expand, h_all) + tf.multiply(s_t_expand, w_all)
        )

        h_candidate_part1 = (
            tf.matmul(tf.reshape(h_all, shape = (-1, self.dimension)), self.U)
            + tf.matmul(tf.reshape(w_all, shape = (-1, self.dimension)), self.V)
            + self.h_bias
        )

        h_candidate_part1 = tf.reshape(
            h_candidate_part1,
            shape = (self.batch_size, self.block_size, self.dimension),
        )
        h_candidate_part2 = tf.expand_dims(
            tf.matmul(s_t, self.W) + self.h2_bias, axis = 1
        )
        h_candidate = self.activation(
            h_candidate_part1 + h_candidate_part2,
            scope = 'h_candidate' + str(i),
        )

        h_all = h_all + tf.multiply(g, h_candidate)

        h_all = tf.nn.l2_normalize(h_all, -1)
        return h_all

    def activation(self, features, scope = None):
        with tf.variable_scope(scope, 'PReLU', initializer = self.initializer):
            alpha = tf.get_variable('alpha', features.get_shape().as_list()[1:])
            pos = tf.nn.relu(features)
            neg = alpha * (features - tf.abs(features)) * 0.5
            return pos + neg

    def train(self):
        """based on the loss, use SGD to update parameter"""
        learning_rate = tf.train.exponential_decay(
            self.learning_rate,
            self.global_step,
            self.decay_steps,
            self.decay_rate,
            staircase = True,
        )
        self.learning_rate_ = learning_rate
        train_op = tf_contrib.layers.optimize_loss(
            self.cost,
            global_step = self.global_step,
            learning_rate = learning_rate,
            optimizer = 'Adam',
            clip_gradients = self.clip_gradients,
        )
        return train_op


In [19]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

dim_word = 64
dim_char = 128
learning_rate = 1e-3
hidden_size_char = 64
hidden_size_word = 64
num_layers = 2
batch_size = 32
decay_step = 1e4
decay_rate = 1.0
story_len = 1

model = EntityNetwork(
    learning_rate,
    decay_step,
    decay_rate,
    story_len,
    len(idx2word),
    dim_word,
    dim_word,
    num_layers,
    dim_word,
    dim_char,
    hidden_size_char,
    hidden_size_word,
    word2idx,
    char2idx
)

sess.run(tf.global_variables_initializer())

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [20]:
import time

for e in range(7):
    lasttime = time.time()
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        batch_char = train_char[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.query: batch_x,
                model.story: batch_x_expand,
                model.char_ids: batch_char,
                model.labels: batch_y
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
    
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.query: batch_x,
                model.story: batch_x_expand,
                model.char_ids: batch_char,
                model.labels: batch_y
            },
        )
        assert not np.isnan(cost)
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
    
    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (e, train_loss, train_acc, test_loss, test_acc)
    )

train minibatch loop: 100%|██████████| 1545/1545 [07:12<00:00,  3.63it/s, accuracy=0.82, cost=23.7] 
test minibatch loop: 100%|██████████| 387/387 [01:09<00:00,  5.43it/s, accuracy=0.85, cost=18]   
train minibatch loop:   0%|          | 0/1545 [00:00<?, ?it/s]

time taken: 502.42081928253174
epoch: 0, training loss: 34.051046, training acc: 0.779488, valid loss: 25.495596, valid acc: 0.806997



train minibatch loop: 100%|██████████| 1545/1545 [07:17<00:00,  3.54it/s, accuracy=0.956, cost=4.55]
test minibatch loop: 100%|██████████| 387/387 [01:09<00:00,  5.65it/s, accuracy=0.96, cost=4.38] 
train minibatch loop:   0%|          | 0/1545 [00:00<?, ?it/s]

time taken: 507.0530540943146
epoch: 1, training loss: 19.501139, training acc: 0.843072, valid loss: 7.213836, valid acc: 0.938117



train minibatch loop: 100%|██████████| 1545/1545 [07:15<00:00,  3.62it/s, accuracy=0.992, cost=1.1] 
test minibatch loop: 100%|██████████| 387/387 [01:09<00:00,  5.64it/s, accuracy=1, cost=0.794]    
train minibatch loop:   0%|          | 0/1545 [00:00<?, ?it/s]

time taken: 505.02700638771057
epoch: 2, training loss: 4.142400, training acc: 0.966011, valid loss: 2.432646, valid acc: 0.983406



train minibatch loop: 100%|██████████| 1545/1545 [07:14<00:00,  3.58it/s, accuracy=1, cost=0.219]    
test minibatch loop: 100%|██████████| 387/387 [01:10<00:00,  5.56it/s, accuracy=0.99, cost=1.86]  
train minibatch loop:   0%|          | 0/1545 [00:00<?, ?it/s]

time taken: 504.5054397583008
epoch: 3, training loss: 1.691221, training acc: 0.987663, valid loss: 1.299461, valid acc: 0.992584



train minibatch loop: 100%|██████████| 1545/1545 [07:11<00:00,  3.89it/s, accuracy=1, cost=0.0904]   
test minibatch loop: 100%|██████████| 387/387 [01:08<00:00,  5.96it/s, accuracy=1, cost=0.302]    
train minibatch loop:   0%|          | 0/1545 [00:00<?, ?it/s]

time taken: 499.93957567214966
epoch: 4, training loss: 0.979031, training acc: 0.993341, valid loss: 0.828576, valid acc: 0.996121



train minibatch loop: 100%|██████████| 1545/1545 [06:55<00:00,  4.00it/s, accuracy=1, cost=0.073]    
test minibatch loop: 100%|██████████| 387/387 [01:07<00:00,  5.98it/s, accuracy=1, cost=0.429]    
train minibatch loop:   0%|          | 0/1545 [00:00<?, ?it/s]

time taken: 482.60799765586853
epoch: 5, training loss: 0.681862, training acc: 0.995498, valid loss: 0.649344, valid acc: 0.997589



train minibatch loop: 100%|██████████| 1545/1545 [06:56<00:00,  3.82it/s, accuracy=1, cost=0.0131]   
test minibatch loop: 100%|██████████| 387/387 [01:07<00:00,  5.80it/s, accuracy=0.99, cost=0.529] 

time taken: 483.89457631111145
epoch: 6, training loss: 0.518453, training acc: 0.996710, valid loss: 0.529990, valid acc: 0.998480






In [21]:
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(idx2tag[p])
        out.append(out_i)
    return out

In [22]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_x_expand = np.expand_dims(batch_x,axis = 1)
    batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    predicted = pred2label(sess.run(model.tags_seq,
            feed_dict = {
                model.query: batch_x,
                model.story: batch_x_expand,
                model.char_ids: batch_char,
            },
    ))
    real = pred2label(np.argmax(batch_y, axis = 2))
    predict_Y.extend(predicted)
    real_Y.extend(real)

validation minibatch loop: 100%|██████████| 387/387 [00:59<00:00,  6.79it/s]


In [23]:
from sklearn.metrics import classification_report
print(classification_report(np.array(real_Y).ravel(), np.array(predict_Y).ravel()))

              precision    recall  f1-score   support

       OTHER       1.00      1.00      1.00    497198
       event       0.98      0.95      0.96      2381
         law       0.99      0.97      0.98      1881
    location       0.99      0.99      0.99     20305
organization       0.99      0.98      0.98     26036
      person       0.99      0.99      0.99     43470
    quantity       0.99      0.99      0.99     13608
        time       0.98      0.99      0.98     12821

 avg / total       1.00      1.00      1.00    617700



In [24]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'entity/model.ckpt')

strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'OptimizeLoss' not in n.name
        and 'Global_Step' not in n.name
        and 'Epoch_Step' not in n.name
        and 'learning_rate' not in n.name
    ]
)
strings.split(',')

['dynamic_memory/U',
 'dynamic_memory/V',
 'dynamic_memory/W',
 'dynamic_memory/h_bias',
 'dynamic_memory/h2_bias',
 'embedding_projection/Embedding',
 'hidden_states',
 'keys',
 'h_candidate0/alpha',
 'Variable',
 'Variable_1',
 'rnn/multi_rnn_cell/cell_0/char/kernel',
 'rnn/multi_rnn_cell/cell_0/char/bias',
 'rnn/multi_rnn_cell/cell_1/char/kernel',
 'rnn/multi_rnn_cell/cell_1/char/bias',
 'rnn/multi_rnn_cell/cell_0/word/kernel',
 'rnn/multi_rnn_cell/cell_0/word/bias',
 'rnn/multi_rnn_cell/cell_1/word/kernel',
 'rnn/multi_rnn_cell/cell_1/word/bias',
 'dense/kernel',
 'dense/bias',
 'transitions',
 'logits']

In [25]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))
        
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [26]:
freeze_graph('entity', strings)

INFO:tensorflow:Restoring parameters from entity/model.ckpt
INFO:tensorflow:Froze 22 variables.
INFO:tensorflow:Converted 22 variables to const ops.
926 ops in the final graph.


In [27]:
g = load_graph('entity/frozen_model.pb')

In [28]:
string = 'KUALA LUMPUR: Sempena sambutan Aidilfitri minggu depan, Perdana Menteri Tun Dr Mahathir Mohamad dan Menteri Pengangkutan Anthony Loke Siew Fook menitipkan pesanan khas kepada orang ramai yang mahu pulang ke kampung halaman masing-masing. Dalam video pendek terbitan Jabatan Keselamatan Jalan Raya (JKJR) itu, Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar  sekiranya mengantuk ketika memandu.'

In [29]:
def char_str_idx(corpus, dic, UNK = 0):
    maxlen = max([len(i) for i in corpus])
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen][::-1]):
            val = dic[k] if k in dic else UNK
            X[i, -1 - no] = val
    return X

def generate_char_seq(batch, idx2word, char2idx):
    x = [[len(idx2word[i]) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((batch.shape[0], batch.shape[1], maxlen), dtype = np.int32)
    for i in range(batch.shape[0]):
        for k in range(batch.shape[1]):
            for no, c in enumerate(idx2word[batch[i, k]].lower()):
                temp[i, k, -1 - no] = char2idx[c]
    return temp

sequence = process_string(string.lower())
X_seq = char_str_idx([sequence], word2idx, 2)
X_char_seq = generate_char_seq(X_seq, idx2word, char2idx)

In [30]:
[n.name for n in g.as_graph_def().node]

['import/story',
 'import/char_ids',
 'import/question',
 'import/Shape',
 'import/strided_slice/stack',
 'import/strided_slice/stack_1',
 'import/strided_slice/stack_2',
 'import/strided_slice',
 'import/Shape_1',
 'import/strided_slice_1/stack',
 'import/strided_slice_1/stack_1',
 'import/strided_slice_1/stack_2',
 'import/strided_slice_1',
 'import/count_nonzero/zeros',
 'import/count_nonzero/NotEqual',
 'import/count_nonzero/ToInt64',
 'import/count_nonzero/Sum/reduction_indices',
 'import/count_nonzero/Sum',
 'import/dynamic_memory/U',
 'import/dynamic_memory/U/read',
 'import/dynamic_memory/V',
 'import/dynamic_memory/V/read',
 'import/dynamic_memory/W',
 'import/dynamic_memory/W/read',
 'import/dynamic_memory/h_bias',
 'import/dynamic_memory/h_bias/read',
 'import/dynamic_memory/h2_bias',
 'import/dynamic_memory/h2_bias/read',
 'import/embedding_projection/Embedding',
 'import/embedding_projection/Embedding/read',
 'import/embedding_lookup/axis',
 'import/embedding_lookup',
 'im

In [31]:
story = g.get_tensor_by_name('import/story:0')
char_ids = g.get_tensor_by_name('import/char_ids:0')
question = g.get_tensor_by_name('import/question:0')
tags_seq = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
batch_x_expand = np.expand_dims(X_seq,axis = 1)
predicted = test_sess.run(tags_seq,
            feed_dict = {
                question: X_seq,
                char_ids: X_char_seq,
                story: batch_x_expand
            })[0]

for i in range(len(predicted)):
    print(sequence[i],idx2tag[predicted[i]])



kuala location
lumpur location
sempena OTHER
sambutan OTHER
aidilfitri event
minggu event
depan OTHER
perdana person
menteri person
tun person
dr person
mahathir person
mohamad OTHER
dan OTHER
menteri OTHER
pengangkutan OTHER
anthony person
loke person
siew person
fook person
menitipkan OTHER
pesanan OTHER
khas OTHER
kepada OTHER
orang event
ramai OTHER
yang OTHER
mahu OTHER
pulang OTHER
ke OTHER
kampung OTHER
halaman OTHER
masing-masing OTHER
dalam OTHER
video OTHER
pendek OTHER
terbitan OTHER
jabatan organization
keselamatan organization
jalan organization
raya organization
jkjr organization
itu OTHER
dr person
mahathir person
menasihati OTHER
mereka OTHER
supaya OTHER
berhenti OTHER
berehat OTHER
dan OTHER
tidur OTHER
sebentar OTHER
sekiranya OTHER
mengantuk OTHER
ketika OTHER
memandu OTHER
