In [1]:
import tensorflow as tf
from tqdm import tqdm
import numpy as np
import re

In [2]:
with open('id_gsd-ud-train.conllu.txt') as fopen:
    corpus = fopen.read().split('\n')
    
with open('id_gsd-ud-test.conllu.txt') as fopen:
    corpus.extend(fopen.read().split('\n'))
    
with open('id_gsd-ud-dev.conllu.txt') as fopen:
    corpus.extend(fopen.read().split('\n'))

In [3]:
word2idx = {'PAD': 0,'NUM':1,'UNK':2}
tag2idx = {'PAD': 0}
char2idx = {'PAD': 0,'NUM':1,'UNK':2}
word_idx = 3
tag_idx = 1
char_idx = 3

def process_string(string):
    string = re.sub('[^A-Za-z0-9\-\/ ]+', ' ', string).split()
    return [to_title(y.strip()) for y in string]

def to_title(string):
    if string.isupper():
        string = string.title()
    return string

def process_corpus(corpus, until = None):
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    sentences, words, depends, labels, pos = [], [], [], [], []
    temp_sentence, temp_word, temp_depend, temp_label, temp_pos = [], [], [], [], []
    for sentence in corpus:
        if len(sentence):
            if sentence[0] == '#':
                continue
            sentence = sentence.split('\t')
            temp = process_string(sentence[1])
            if not len(temp):
                sentence[1] = 'EMPTY'
            sentence[1] = process_string(sentence[1])[0]
            for c in sentence[1]:
                if c not in char2idx:
                    char2idx[c] = char_idx
                    char_idx += 1
            if sentence[7] not in tag2idx:
                tag2idx[sentence[7]] = tag_idx
                tag_idx += 1
            if sentence[1] not in word2idx:
                word2idx[sentence[1]] = word_idx
                word_idx += 1
            temp_word.append(word2idx[sentence[1]])
            temp_depend.append(int(sentence[6]) + 1)
            temp_label.append(tag2idx[sentence[7]])
            temp_sentence.append(sentence[1])
            temp_pos.append(sentence[3])
        else:
            words.append(temp_word)
            depends.append(temp_depend)
            labels.append(temp_label)
            sentences.append(temp_sentence)
            pos.append(temp_pos)
            temp_word = []
            temp_depend = []
            temp_label = []
            temp_sentence = []
            temp_pos = []
    return sentences[:-1], words[:-1], depends[:-1], labels[:-1], pos[:-1]
        
sentences, words, depends, labels, pos = process_corpus(corpus)

In [4]:
import json

with open('augmented.json') as fopen:
    augmented = json.load(fopen)

In [5]:
def parse_XY(texts):
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    outside, sentences = [], []
    for no, text in enumerate(texts):
        s = process_string(text)
        sentences.append(s)
        inside = []
        for w in s:
            for c in w:
                if c not in char2idx:
                    char2idx[c] = char_idx
                    char_idx += 1
            
            if w not in word2idx:
                word2idx[w] = word_idx
                word_idx += 1
                
            inside.append(word2idx[w])
        outside.append(inside)
    return outside, sentences

In [6]:
text_augmented = []
for a in augmented:
    text_augmented.extend(a[0])
    depends.extend(a[1])
    labels.extend(a[2])

In [7]:
outside, new_sentences = parse_XY(text_augmented)

In [8]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [9]:
words.extend(outside)
sentences.extend(new_sentences)

In [10]:
len(words), len(depends), len(labels), len(sentences)

(50365, 50365, 50365, 50365)

In [11]:
def generate_char_seq(batch, UNK = 2):
    maxlen_c = max([len(k) for k in batch])
    x = [[len(i) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((len(batch),maxlen_c,maxlen),dtype=np.int32)
    for i in range(len(batch)):
        for k in range(len(batch[i])):
            for no, c in enumerate(batch[i][k][:maxlen][::-1]):
                temp[i,k,-1-no] = char2idx.get(c, UNK)
    return temp

In [12]:
idx2word = {idx: tag for tag, idx in word2idx.items()}
idx2tag = {i: w for w, i in tag2idx.items()}
char = generate_char_seq(sentences)

In [13]:
words = pad_sequences(words,padding='post')
depends = pad_sequences(depends,padding='post')
labels = pad_sequences(labels,padding='post')
words.shape

(50365, 189)

In [14]:
from sklearn.cross_validation import train_test_split
train_X, test_X, train_Y, test_Y, train_depends, test_depends, train_char, test_char = train_test_split(
                                                                           words,
                                                                           labels,
                                                                           depends,
                                                                           char,
                                                                           test_size=0.1)



In [15]:
class Model:
    def __init__(
        self,
        dim_word,
        dim_char,
        dropout,
        learning_rate,
        hidden_size_char,
        hidden_size_word,
        num_layers,
        maxlen
    ):
        def cells(size, reuse = False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(
                    size,
                    initializer = tf.orthogonal_initializer(),
                    reuse = reuse,
                ),
                output_keep_prob = dropout,
            )

        def bahdanau(embedded, size):
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units = hidden_size_word, memory = embedded
            )
            return tf.contrib.seq2seq.AttentionWrapper(
                cell = cells(hidden_size_word),
                attention_mechanism = attention_mechanism,
                attention_layer_size = hidden_size_word,
            )
        self.word_ids = tf.placeholder(tf.int32, shape = [None, None])
        self.char_ids = tf.placeholder(tf.int32, shape = [None, None, None])
        self.labels = tf.placeholder(tf.int32, shape = [None, None])
        self.depends = tf.placeholder(tf.int32, shape = [None, None])
        self.maxlen = tf.shape(self.word_ids)[1]
        self.lengths = tf.count_nonzero(self.word_ids, 1)

        self.word_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(word2idx), dim_word], stddev = 1.0 / np.sqrt(dim_word)
            )
        )
        self.char_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(char2idx), dim_char], stddev = 1.0 / np.sqrt(dim_char)
            )
        )

        word_embedded = tf.nn.embedding_lookup(
            self.word_embeddings, self.word_ids
        )
        char_embedded = tf.nn.embedding_lookup(
            self.char_embeddings, self.char_ids
        )
        s = tf.shape(char_embedded)
        char_embedded = tf.reshape(
            char_embedded, shape = [s[0] * s[1], s[-2], dim_char]
        )
        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_char),
                cell_bw = cells(hidden_size_char),
                inputs = char_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_char_%d' % (n),
            )
            char_embedded = tf.concat((out_fw, out_bw), 2)
        output = tf.reshape(
            char_embedded[:, -1], shape = [s[0], s[1], 2 * hidden_size_char]
        )
        word_embedded = tf.concat([word_embedded, output], axis = -1)

        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = bahdanau(word_embedded, hidden_size_word),
                cell_bw = bahdanau(word_embedded, hidden_size_word),
                inputs = word_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_word_%d' % (n),
            )
            word_embedded = tf.concat((out_fw, out_bw), 2)

        logits = tf.layers.dense(word_embedded, len(idx2tag))
        
        tag_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(idx2tag), dim_word], stddev = 1.0 / np.sqrt(dim_word)
            )
        )
        logits_max = tf.argmax(logits,axis=2,output_type=tf.int32)
        lookup_logits = tf.nn.embedding_lookup(
            tag_embeddings, logits_max
        )
        (out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_word),
                cell_bw = cells(hidden_size_word),
                inputs = word_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_word_%d' % (n),
            )
        
        cast_mask = tf.cast(tf.sequence_mask(self.lengths + 1, maxlen = maxlen), dtype = tf.float32)
        cast_mask = tf.tile(tf.expand_dims(cast_mask,axis=1),[1,self.maxlen,1]) * 10
        
        lookup_logits = tf.concat((out_fw, out_bw), 2)
        logits_depends = tf.layers.dense(lookup_logits, maxlen)
        
        logits_depends = tf.multiply(logits_depends, cast_mask)
        
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            logits, self.labels, self.lengths
        )
        with tf.variable_scope("depends"):
            log_likelihood_depends, transition_params_depends = tf.contrib.crf.crf_log_likelihood(
                logits_depends, self.depends, self.lengths
            )
        self.cost = tf.reduce_mean(-log_likelihood) + tf.reduce_mean(-log_likelihood_depends)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        
        mask = tf.sequence_mask(self.lengths, maxlen = self.maxlen)
        
        self.tags_seq, _ = tf.contrib.crf.crf_decode(
            logits, transition_params, self.lengths
        )
        self.tags_seq = tf.identity(self.tags_seq, name = 'logits')
        
        self.tags_seq_depends, _ = tf.contrib.crf.crf_decode(
            logits_depends, transition_params_depends, self.lengths
        )
        self.tags_seq_depends = tf.identity(self.tags_seq_depends, name = 'logits_depends')

        self.prediction = tf.boolean_mask(self.tags_seq, mask)
        mask_label = tf.boolean_mask(self.labels, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        self.prediction = tf.boolean_mask(self.tags_seq_depends, mask)
        mask_label = tf.boolean_mask(self.depends, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy_depends = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [16]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

dim_word = 128
dim_char = 256
dropout = 0.85
learning_rate = 1e-3
hidden_size_char = 128
hidden_size_word = 64
num_layers = 2
batch_size = 16

model = Model(dim_word,dim_char,dropout,learning_rate,hidden_size_char,hidden_size_word,num_layers,
             words.shape[1])
sess.run(tf.global_variables_initializer())


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [17]:
import time

for e in range(15):
    lasttime = time.time()
    train_acc, train_loss, test_acc, test_loss, train_acc_depends, test_acc_depends = 0, 0, 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_char = train_char[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        batch_depends = train_depends[i : min(i + batch_size, train_X.shape[0])]
        acc_depends, acc, cost, _ = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y,
                model.depends: batch_depends
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        train_acc_depends += acc_depends
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        batch_depends = test_depends[i : min(i + batch_size, test_X.shape[0])]
        acc_depends, acc, cost = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y,
                model.depends: batch_depends
            },
        )
        assert not np.isnan(cost)
        test_loss += cost
        test_acc += acc
        test_acc_depends += acc_depends
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
    
    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    train_acc_depends /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size
    test_acc_depends /= len(test_X) / batch_size

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, training depends: %f, valid loss: %f, valid acc: %f, valid depends: %f\n'
        % (e, train_loss, train_acc, train_acc_depends, test_loss, test_acc, test_acc_depends)
    )

train minibatch loop: 100%|██████████| 2833/2833 [1:20:12<00:00,  1.70s/it, accuracy=0.812, accuracy_depends=0.547, cost=46]  
test minibatch loop: 100%|██████████| 315/315 [03:16<00:00,  1.63it/s, accuracy=0.896, accuracy_depends=0.656, cost=28.5]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5008.040819406509
epoch: 0, training loss: 62.484219, training acc: 0.687616, training depends: 0.412115, valid loss: 35.875297, valid acc: 0.836436, valid depends: 0.645364



train minibatch loop: 100%|██████████| 2833/2833 [1:20:11<00:00,  1.68s/it, accuracy=0.888, accuracy_depends=0.781, cost=25.5]
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.63it/s, accuracy=0.948, accuracy_depends=0.793, cost=16.7]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5006.738508939743
epoch: 1, training loss: 27.094490, training acc: 0.875416, training depends: 0.729413, valid loss: 21.630555, valid acc: 0.889475, valid depends: 0.800033



train minibatch loop: 100%|██████████| 2833/2833 [1:20:13<00:00,  1.69s/it, accuracy=0.924, accuracy_depends=0.878, cost=16.5]
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.64it/s, accuracy=0.967, accuracy_depends=0.896, cost=9.21]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5008.577131271362
epoch: 2, training loss: 15.925178, training acc: 0.922652, training depends: 0.847503, valid loss: 15.807395, valid acc: 0.915586, valid depends: 0.866298



train minibatch loop: 100%|██████████| 2833/2833 [1:20:14<00:00,  1.69s/it, accuracy=0.956, accuracy_depends=0.911, cost=11.3]
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.63it/s, accuracy=0.959, accuracy_depends=0.926, cost=8.56]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5010.133793115616
epoch: 3, training loss: 10.878052, training acc: 0.946691, training depends: 0.894082, valid loss: 13.006711, valid acc: 0.929738, valid depends: 0.899279



train minibatch loop: 100%|██████████| 2833/2833 [1:20:13<00:00,  1.69s/it, accuracy=0.938, accuracy_depends=0.935, cost=8.81]
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.63it/s, accuracy=0.978, accuracy_depends=0.948, cost=4.36]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5008.703083515167
epoch: 4, training loss: 7.498419, training acc: 0.962283, training depends: 0.928191, valid loss: 11.665039, valid acc: 0.938365, valid depends: 0.914650



train minibatch loop: 100%|██████████| 2833/2833 [1:20:14<00:00,  1.69s/it, accuracy=0.938, accuracy_depends=0.823, cost=17.7]
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.63it/s, accuracy=0.967, accuracy_depends=0.844, cost=12.6]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5010.282833337784
epoch: 5, training loss: 10.502579, training acc: 0.964749, training depends: 0.887862, valid loss: 17.299690, valid acc: 0.936364, valid depends: 0.823247



train minibatch loop: 100%|██████████| 2833/2833 [1:20:15<00:00,  1.69s/it, accuracy=0.961, accuracy_depends=0.958, cost=6.84]
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.63it/s, accuracy=0.978, accuracy_depends=0.948, cost=5.51]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5010.7648820877075
epoch: 6, training loss: 8.172327, training acc: 0.972956, training depends: 0.908020, valid loss: 10.859837, valid acc: 0.947560, valid depends: 0.921681



train minibatch loop: 100%|██████████| 2833/2833 [1:20:14<00:00,  1.69s/it, accuracy=0.982, accuracy_depends=0.956, cost=4.74] 
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.64it/s, accuracy=0.985, accuracy_depends=0.974, cost=2.53]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5010.066545963287
epoch: 7, training loss: 5.122587, training acc: 0.979184, training depends: 0.947194, valid loss: 10.091347, valid acc: 0.951014, valid depends: 0.936373



train minibatch loop: 100%|██████████| 2833/2833 [1:20:16<00:00,  1.69s/it, accuracy=0.966, accuracy_depends=0.932, cost=7.67] 
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.64it/s, accuracy=0.974, accuracy_depends=0.919, cost=7.52]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5011.766835451126
epoch: 8, training loss: 5.387102, training acc: 0.979654, training depends: 0.941862, valid loss: 10.987624, valid acc: 0.946941, valid depends: 0.924877



train minibatch loop: 100%|██████████| 2833/2833 [1:20:17<00:00,  1.69s/it, accuracy=0.971, accuracy_depends=0.956, cost=5.2] 
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.62it/s, accuracy=0.993, accuracy_depends=0.97, cost=2.84] 
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5013.079018354416
epoch: 9, training loss: 9.745852, training acc: 0.976012, training depends: 0.874312, valid loss: 11.059560, valid acc: 0.950731, valid depends: 0.925793



train minibatch loop: 100%|██████████| 2833/2833 [1:20:16<00:00,  1.70s/it, accuracy=0.964, accuracy_depends=0.94, cost=5.8]   
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.64it/s, accuracy=0.981, accuracy_depends=0.97, cost=3.35] 
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5012.365065097809
epoch: 10, training loss: 8.140059, training acc: 0.971539, training depends: 0.907471, valid loss: 10.344279, valid acc: 0.951060, valid depends: 0.935012



train minibatch loop: 100%|██████████| 2833/2833 [1:20:15<00:00,  1.68s/it, accuracy=0.99, accuracy_depends=0.969, cost=3.05]  
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.63it/s, accuracy=0.981, accuracy_depends=0.959, cost=4.11]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5011.202510595322
epoch: 11, training loss: 3.299452, training acc: 0.985352, training depends: 0.966909, valid loss: 9.951744, valid acc: 0.954282, valid depends: 0.944745



train minibatch loop: 100%|██████████| 2833/2833 [1:20:16<00:00,  1.70s/it, accuracy=0.99, accuracy_depends=0.969, cost=3.83]  
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.63it/s, accuracy=0.981, accuracy_depends=0.978, cost=2.13]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5011.455784082413
epoch: 12, training loss: 2.711230, training acc: 0.987114, training depends: 0.974351, valid loss: 9.891155, valid acc: 0.954302, valid depends: 0.948340



train minibatch loop: 100%|██████████| 2833/2833 [1:20:16<00:00,  1.69s/it, accuracy=0.99, accuracy_depends=0.977, cost=2.45]  
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.64it/s, accuracy=0.993, accuracy_depends=0.974, cost=1.97]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5012.48108124733
epoch: 13, training loss: 2.359227, training acc: 0.988185, training depends: 0.978197, valid loss: 9.463627, valid acc: 0.955425, valid depends: 0.952417



train minibatch loop: 100%|██████████| 2833/2833 [1:20:18<00:00,  1.68s/it, accuracy=0.964, accuracy_depends=0.987, cost=4.21] 
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.63it/s, accuracy=0.985, accuracy_depends=0.974, cost=2.12]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5013.777709722519
epoch: 14, training loss: 2.100721, training acc: 0.989465, training depends: 0.980502, valid loss: 9.556043, valid acc: 0.956996, valid depends: 0.954261



train minibatch loop: 100%|██████████| 2833/2833 [1:20:18<00:00,  1.68s/it, accuracy=0.987, accuracy_depends=0.984, cost=1.9]  
test minibatch loop: 100%|██████████| 315/315 [03:15<00:00,  1.64it/s, accuracy=0.981, accuracy_depends=0.974, cost=4.66]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 5014.499285459518
epoch: 15, training loss: 1.833721, training acc: 0.990503, training depends: 0.983315, valid loss: 9.180412, valid acc: 0.959398, valid depends: 0.957512



train minibatch loop:   0%|          | 5/2833 [00:08<1:20:31,  1.71s/it, accuracy=0.997, accuracy_depends=0.988, cost=1.06]

KeyboardInterrupt: 

In [36]:
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(idx2tag[p])
        out.append(out_i)
    return out

In [19]:
seq, deps = sess.run([model.tags_seq, model.tags_seq_depends],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
            },
)

In [45]:
real_Y, predict_Y, real_depends, predict_depends = [], [], [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    batch_depends = test_depends[i : min(i + batch_size, test_X.shape[0])]
    seq, deps = sess.run([model.tags_seq, model.tags_seq_depends],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
            },
    )
    predicted = pred2label(seq)
    real = pred2label(batch_y)
    predict_Y.extend(predicted)
    real_Y.extend(real)
    
    real_depends.extend(batch_depends.tolist())
    predict_depends.extend(deps.tolist())


validation minibatch loop:   0%|          | 0/315 [00:00<?, ?it/s][A
validation minibatch loop:   0%|          | 1/315 [00:00<03:12,  1.63it/s][A
validation minibatch loop:   1%|          | 2/315 [00:01<03:10,  1.65it/s][A
validation minibatch loop:   1%|          | 3/315 [00:01<03:09,  1.64it/s][A
validation minibatch loop:   1%|▏         | 4/315 [00:02<03:08,  1.65it/s][A
validation minibatch loop:   2%|▏         | 5/315 [00:03<03:09,  1.64it/s][A
validation minibatch loop:   2%|▏         | 6/315 [00:03<03:10,  1.62it/s][A
validation minibatch loop:   2%|▏         | 7/315 [00:04<03:08,  1.64it/s][A
validation minibatch loop:   3%|▎         | 8/315 [00:04<03:06,  1.64it/s][A
validation minibatch loop:   3%|▎         | 9/315 [00:05<03:05,  1.65it/s][A
validation minibatch loop:   3%|▎         | 10/315 [00:06<03:03,  1.66it/s][A
validation minibatch loop:   3%|▎         | 11/315 [00:06<03:03,  1.66it/s][A
validation minibatch loop:   4%|▍         | 12/315 [00:07<03:01,  1.6

validation minibatch loop:  65%|██████▌   | 205/315 [02:04<01:07,  1.62it/s][A
validation minibatch loop:  65%|██████▌   | 206/315 [02:05<01:06,  1.64it/s][A
validation minibatch loop:  66%|██████▌   | 207/315 [02:05<01:07,  1.61it/s][A
validation minibatch loop:  66%|██████▌   | 208/315 [02:06<01:05,  1.62it/s][A
validation minibatch loop:  66%|██████▋   | 209/315 [02:06<01:04,  1.64it/s][A
validation minibatch loop:  67%|██████▋   | 210/315 [02:07<01:03,  1.65it/s][A
validation minibatch loop:  67%|██████▋   | 211/315 [02:08<01:02,  1.66it/s][A
validation minibatch loop:  67%|██████▋   | 212/315 [02:08<01:01,  1.66it/s][A
validation minibatch loop:  68%|██████▊   | 213/315 [02:09<01:02,  1.64it/s][A
validation minibatch loop:  68%|██████▊   | 214/315 [02:09<01:01,  1.65it/s][A
validation minibatch loop:  68%|██████▊   | 215/315 [02:10<01:00,  1.65it/s][A
validation minibatch loop:  69%|██████▊   | 216/315 [02:11<00:59,  1.66it/s][A
validation minibatch loop:  69%|██████▉ 

In [47]:
from sklearn.metrics import classification_report
print(classification_report(np.array(real_Y).ravel(), np.array(predict_Y).ravel(), digits = 4))

               precision    recall  f1-score   support

          PAD     1.0000    1.0000    1.0000    843055
          acl     0.9406    0.9296    0.9351      2983
        advcl     0.8924    0.8613    0.8766      1175
       advmod     0.9549    0.9482    0.9515      4712
         amod     0.9296    0.9100    0.9197      4135
        appos     0.9312    0.9570    0.9439      2488
          aux     1.0000    1.0000    1.0000         5
         case     0.9809    0.9823    0.9816     10557
           cc     0.9676    0.9795    0.9735      3170
        ccomp     0.8598    0.8045    0.8312       404
     compound     0.9201    0.9464    0.9331      6605
compound:plur     0.9597    0.9630    0.9613       594
         conj     0.9600    0.9572    0.9586      4158
          cop     0.9670    0.9720    0.9695       966
        csubj     0.8929    0.8333    0.8621        30
   csubj:pass     0.8000    0.6667    0.7273        12
          dep     0.8189    0.9259    0.8691       459
         

In [48]:
from sklearn.metrics import classification_report
print(classification_report(np.array(real_depends).ravel(), 
                            np.array(predict_depends).ravel(), digits = 4))

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000    843055
          1     0.9718    0.9633    0.9675      5037
          2     0.9604    0.9459    0.9531      4285
          3     0.9474    0.9557    0.9515      4971
          4     0.9575    0.9647    0.9611      6594
          5     0.9534    0.9665    0.9599      5880
          6     0.9648    0.9632    0.9640      6037
          7     0.9512    0.9654    0.9582      5548
          8     0.9611    0.9623    0.9617      5542
          9     0.9729    0.9498    0.9612      4877
         10     0.9614    0.9621    0.9617      4559
         11     0.9495    0.9588    0.9541      4316
         12     0.9547    0.9573    0.9560      3698
         13     0.9664    0.9506    0.9584      3600
         14     0.9652    0.9590    0.9621      3294
         15     0.9619    0.9541    0.9580      3179
         16     0.9604    0.9573    0.9589      3117
         17     0.9634    0.9587    0.9610   

  'recall', 'true', average, warn_for)


In [94]:
string = 'tolong tangkap gambar kami'

def char_str_idx(corpus, dic, UNK = 0):
    maxlen = max([len(i) for i in corpus])
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen]):
            val = dic[k] if k in dic else UNK
            X[i, no] = val
    return X

def generate_char_seq(batch, UNK = 2):
    maxlen_c = max([len(k) for k in batch])
    x = [[len(i) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((len(batch),maxlen_c,maxlen),dtype=np.int32)
    for i in range(len(batch)):
        for k in range(len(batch[i])):
            for no, c in enumerate(batch[i][k][::-1]):
                temp[i,k,-1-no] = char2idx.get(c, UNK)
    return temp

sequence = process_string(string)
sequence

['tolong', 'tangkap', 'gambar', 'kami']

In [95]:
X_seq = char_str_idx([sequence], word2idx, 2)
X_char_seq = generate_char_seq([sequence])

In [96]:
X_char_seq.shape

(1, 4, 7)

In [107]:
seq, deps = sess.run([model.tags_seq, model.tags_seq_depends],
        feed_dict={model.word_ids:X_seq,
                  model.char_ids:X_char_seq})

In [108]:
deps[0] - 1

array([2, 2, 2, 3], dtype=int32)

In [109]:
[idx2tag[i] for i in seq[0]]

['advmod', 'obj', 'compound', 'det']

In [110]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'bahdanau-dependency/model.ckpt')

strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'logits_depends' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'OptimizeLoss' not in n.name
        and 'Global_Step' not in n.name
        and 'Epoch_Step' not in n.name
        and 'learning_rate' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Placeholder_3',
 'Variable',
 'Variable_1',
 'bidirectional_rnn_char_0/fw/lstm_cell/kernel',
 'bidirectional_rnn_char_0/fw/lstm_cell/bias',
 'bidirectional_rnn_char_0/bw/lstm_cell/kernel',
 'bidirectional_rnn_char_0/bw/lstm_cell/bias',
 'bidirectional_rnn_char_1/fw/lstm_cell/kernel',
 'bidirectional_rnn_char_1/fw/lstm_cell/bias',
 'bidirectional_rnn_char_1/bw/lstm_cell/kernel',
 'bidirectional_rnn_char_1/bw/lstm_cell/bias',
 'memory_layer/kernel',
 'memory_layer_1/kernel',
 'bidirectional_rnn_word_0/fw/attention_wrapper/lstm_cell/kernel',
 'bidirectional_rnn_word_0/fw/attention_wrapper/lstm_cell/bias',
 'bidirectional_rnn_word_0/fw/attention_wrapper/bahdanau_attention/query_layer/kernel',
 'bidirectional_rnn_word_0/fw/attention_wrapper/bahdanau_attention/attention_v',
 'bidirectional_rnn_word_0/fw/attention_wrapper/attention_layer/kernel',
 'bidirectional_rnn_word_0/bw/attention_wrapper/lstm_cell/kernel',
 'bidirectional_rnn_word_0/

In [111]:
import json
with open('bahdanau-dependency.json','w') as fopen:
    fopen.write(json.dumps({'idx2tag':idx2tag,'idx2word':idx2word,
           'word2idx':word2idx,'tag2idx':tag2idx,'char2idx':char2idx}))

In [112]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))
        
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [113]:
freeze_graph('bahdanau-dependency', strings)

INFO:tensorflow:Restoring parameters from bahdanau-dependency/model.ckpt
INFO:tensorflow:Froze 45 variables.
INFO:tensorflow:Converted 45 variables to const ops.
2531 ops in the final graph.


In [114]:
g = load_graph('bahdanau-dependency/frozen_model.pb')

In [115]:
word_ids = g.get_tensor_by_name('import/Placeholder:0')
char_ids = g.get_tensor_by_name('import/Placeholder_1:0')
tags_seq = g.get_tensor_by_name('import/logits:0')
depends_seq = g.get_tensor_by_name('import/logits_depends:0')
test_sess = tf.InteractiveSession(graph = g)
seq, deps = test_sess.run([tags_seq, depends_seq],
            feed_dict = {
                word_ids: X_seq,
                char_ids: X_char_seq,
            })

print(seq,deps)



[[14 16 13  3]] [[3 1 3 3]]
