In [1]:
import tensorflow as tf
from tqdm import tqdm
import numpy as np
import re

In [2]:
with open('id_gsd-ud-train.conllu.txt') as fopen:
    corpus = fopen.read().split('\n')
    
with open('id_gsd-ud-test.conllu.txt') as fopen:
    corpus.extend(fopen.read().split('\n'))
    
with open('id_gsd-ud-dev.conllu.txt') as fopen:
    corpus.extend(fopen.read().split('\n'))

In [3]:
word2idx = {'PAD': 0,'NUM':1,'UNK':2}
tag2idx = {'PAD': 0}
char2idx = {'PAD': 0,'NUM':1,'UNK':2}
word_idx = 3
tag_idx = 1
char_idx = 3

def process_string(string):
    string = re.sub('[^A-Za-z0-9\-\/ ]+', ' ', string).split()
    return [to_title(y.strip()) for y in string]

def to_title(string):
    if string.isupper():
        string = string.title()
    return string

def process_corpus(corpus, until = None):
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    sentences, words, depends, labels = [], [], [], []
    temp_sentence, temp_word, temp_depend, temp_label = [], [], [], []
    for sentence in corpus:
        if len(sentence):
            if sentence[0] == '#':
                continue
            sentence = sentence.split('\t')
            temp = process_string(sentence[1])
            if not len(temp):
                sentence[1] = 'EMPTY'
            sentence[1] = process_string(sentence[1])[0]
            for c in sentence[1]:
                if c not in char2idx:
                    char2idx[c] = char_idx
                    char_idx += 1
            if sentence[7] not in tag2idx:
                tag2idx[sentence[7]] = tag_idx
                tag_idx += 1
            if sentence[1] not in word2idx:
                word2idx[sentence[1]] = word_idx
                word_idx += 1
            temp_word.append(word2idx[sentence[1]])
            temp_depend.append(int(sentence[6]) + 1)
            temp_label.append(tag2idx[sentence[7]])
            temp_sentence.append(sentence[1])
        else:
            words.append(temp_word)
            depends.append(temp_depend)
            labels.append(temp_label)
            sentences.append(temp_sentence)
            temp_word = []
            temp_depend = []
            temp_label = []
            temp_sentence = []
    return sentences[:-1], words[:-1], depends[:-1], labels[:-1]
        
sentences, words, depends, labels = process_corpus(corpus)

In [4]:
import json

with open('augmented.json') as fopen:
    augmented = json.load(fopen)

In [5]:
def parse_XY(texts):
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    outside, sentences = [], []
    for no, text in enumerate(texts):
        s = process_string(text)
        sentences.append(s)
        inside = []
        for w in s:
            for c in w:
                if c not in char2idx:
                    char2idx[c] = char_idx
                    char_idx += 1
            
            if w not in word2idx:
                word2idx[w] = word_idx
                word_idx += 1
                
            inside.append(word2idx[w])
        outside.append(inside)
    return outside, sentences

In [6]:
text_augmented = []
for a in augmented:
    text_augmented.extend(a[0])
    depends.extend(a[1])
    labels.extend(a[2])

In [7]:
outside, new_sentences = parse_XY(text_augmented)

In [8]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [9]:
words.extend(outside)
sentences.extend(new_sentences)

In [10]:
len(words), len(depends), len(labels), len(sentences)

(50365, 50365, 50365, 50365)

In [12]:
def generate_char_seq(batch, UNK = 2):
    maxlen_c = max([len(k) for k in batch])
    x = [[len(i) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((len(batch),maxlen_c,maxlen),dtype=np.int32)
    for i in range(len(batch)):
        for k in range(len(batch[i])):
            for no, c in enumerate(batch[i][k][:maxlen][::-1]):
                temp[i,k,-1-no] = char2idx.get(c, UNK)
    return temp

In [13]:
idx2word = {idx: tag for tag, idx in word2idx.items()}
idx2tag = {i: w for w, i in tag2idx.items()}
char = generate_char_seq(sentences)

In [14]:
words = pad_sequences(words,padding='post')
depends = pad_sequences(depends,padding='post')
labels = pad_sequences(labels,padding='post')
words.shape

(50365, 189)

In [15]:
from sklearn.cross_validation import train_test_split
train_X, test_X, train_Y, test_Y, train_depends, test_depends, train_char, test_char = train_test_split(
                                                                           words,
                                                                           labels,
                                                                           depends,
                                                                           char,
                                                                           test_size=0.1)



In [16]:
class Model:
    def __init__(
        self,
        dim_word,
        dim_char,
        dropout,
        learning_rate,
        hidden_size_char,
        hidden_size_word,
        num_layers,
        maxlen
    ):
        def cells(size, reuse = False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(
                    size,
                    initializer = tf.orthogonal_initializer(),
                    reuse = reuse,
                ),
                output_keep_prob = dropout,
            )
        
        self.word_ids = tf.placeholder(tf.int32, shape = [None, None])
        self.char_ids = tf.placeholder(tf.int32, shape = [None, None, None])
        self.labels = tf.placeholder(tf.int32, shape = [None, None])
        self.depends = tf.placeholder(tf.int32, shape = [None, None])
        self.maxlen = tf.shape(self.word_ids)[1]
        self.lengths = tf.count_nonzero(self.word_ids, 1)

        self.word_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(word2idx), dim_word], stddev = 1.0 / np.sqrt(dim_word)
            )
        )
        self.char_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(char2idx), dim_char], stddev = 1.0 / np.sqrt(dim_char)
            )
        )

        word_embedded = tf.nn.embedding_lookup(
            self.word_embeddings, self.word_ids
        )
        char_embedded = tf.nn.embedding_lookup(
            self.char_embeddings, self.char_ids
        )
        s = tf.shape(char_embedded)
        char_embedded = tf.reshape(
            char_embedded, shape = [s[0] * s[1], s[-2], dim_char]
        )

        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_char),
                cell_bw = cells(hidden_size_char),
                inputs = char_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_char_%d' % (n),
            )
            char_embedded = tf.concat((out_fw, out_bw), 2)
        output = tf.reshape(
            char_embedded[:, -1], shape = [s[0], s[1], 2 * hidden_size_char]
        )
        word_embedded = tf.concat([word_embedded, output], axis = -1)

        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_word),
                cell_bw = cells(hidden_size_word),
                inputs = word_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_word_%d' % (n),
            )
            word_embedded = tf.concat((out_fw, out_bw), 2)

        logits = tf.layers.dense(word_embedded, len(idx2tag))
        
        tag_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(idx2tag), dim_word], stddev = 1.0 / np.sqrt(dim_word)
            )
        )
        logits_max = tf.argmax(logits,axis=2,output_type=tf.int32)
        lookup_logits = tf.nn.embedding_lookup(
            tag_embeddings, logits_max
        )
        (out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_word),
                cell_bw = cells(hidden_size_word),
                inputs = word_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_word_%d' % (10),
            )
        
        cast_mask = tf.cast(tf.sequence_mask(self.lengths + 1, maxlen = maxlen), dtype = tf.float32)
        cast_mask = tf.tile(tf.expand_dims(cast_mask,axis=1),[1,self.maxlen,1]) * 10
        
        lookup_logits = tf.concat((out_fw, out_bw), 2)
        logits_depends = tf.layers.dense(lookup_logits, maxlen)
        
        logits_depends = tf.multiply(logits_depends, cast_mask)
        
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            logits, self.labels, self.lengths
        )
        with tf.variable_scope("depends"):
            log_likelihood_depends, transition_params_depends = tf.contrib.crf.crf_log_likelihood(
                logits_depends, self.depends, self.lengths
            )
        self.cost = tf.reduce_mean(-log_likelihood) + tf.reduce_mean(-log_likelihood_depends)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        
        mask = tf.sequence_mask(self.lengths, maxlen = self.maxlen)
        
        self.tags_seq, _ = tf.contrib.crf.crf_decode(
            logits, transition_params, self.lengths
        )
        self.tags_seq = tf.identity(self.tags_seq, name = 'logits')
        
        self.tags_seq_depends, _ = tf.contrib.crf.crf_decode(
            logits_depends, transition_params_depends, self.lengths
        )
        self.tags_seq_depends = tf.identity(self.tags_seq_depends, name = 'logits_depends')

        self.prediction = tf.boolean_mask(self.tags_seq, mask)
        mask_label = tf.boolean_mask(self.labels, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        self.prediction = tf.boolean_mask(self.tags_seq_depends, mask)
        mask_label = tf.boolean_mask(self.depends, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy_depends = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [17]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

dim_word = 128
dim_char = 256
dropout = 0.9
learning_rate = 1e-3
hidden_size_char = 128
hidden_size_word = 64
num_layers = 2
batch_size = 16

model = Model(dim_word,dim_char,dropout,learning_rate,hidden_size_char,hidden_size_word,num_layers,
             words.shape[1])
sess.run(tf.global_variables_initializer())

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [18]:
import time

for e in range(15):
    lasttime = time.time()
    train_acc, train_loss, test_acc, test_loss, train_acc_depends, test_acc_depends = 0, 0, 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_char = train_char[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        batch_depends = train_depends[i : min(i + batch_size, train_X.shape[0])]
        acc_depends, acc, cost, _ = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y,
                model.depends: batch_depends
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        train_acc_depends += acc_depends
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
        
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        batch_depends = test_depends[i : min(i + batch_size, test_X.shape[0])]
        acc_depends, acc, cost = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y,
                model.depends: batch_depends
            },
        )
        assert not np.isnan(cost)
        test_loss += cost
        test_acc += acc
        test_acc_depends += acc_depends
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
    
    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    train_acc_depends /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size
    test_acc_depends /= len(test_X) / batch_size

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, training depends: %f, valid loss: %f, valid acc: %f, valid depends: %f\n'
        % (e, train_loss, train_acc, train_acc_depends, test_loss, test_acc, test_acc_depends)
    )

train minibatch loop: 100%|██████████| 2833/2833 [55:45<00:00,  1.17s/it, accuracy=0.818, accuracy_depends=0.592, cost=32.4]
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.12it/s, accuracy=0.879, accuracy_depends=0.56, cost=37.3] 
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 3499.9400725364685
epoch: 0, training loss: 61.394930, training acc: 0.717017, training depends: 0.385134, valid loss: 40.262435, valid acc: 0.835491, valid depends: 0.558672



train minibatch loop: 100%|██████████| 2833/2833 [55:43<00:00,  1.17s/it, accuracy=0.906, accuracy_depends=0.755, cost=20.1]
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.13it/s, accuracy=0.903, accuracy_depends=0.728, cost=26.1]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 3498.635225534439
epoch: 1, training loss: 32.005057, training acc: 0.867122, training depends: 0.644789, valid loss: 28.865218, valid acc: 0.873998, valid depends: 0.695539



train minibatch loop: 100%|██████████| 2833/2833 [55:40<00:00,  1.16s/it, accuracy=0.922, accuracy_depends=0.843, cost=13.8]
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.12it/s, accuracy=0.896, accuracy_depends=0.762, cost=24.1]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 3495.000473022461
epoch: 2, training loss: 22.238423, training acc: 0.903136, training depends: 0.756233, valid loss: 22.973688, valid acc: 0.893954, valid depends: 0.775838



train minibatch loop: 100%|██████████| 2833/2833 [55:40<00:00,  1.17s/it, accuracy=0.94, accuracy_depends=0.871, cost=9.31] 
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.12it/s, accuracy=0.94, accuracy_depends=0.768, cost=19.1] 
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 3494.9204516410828
epoch: 3, training loss: 16.122365, training acc: 0.926224, training depends: 0.829008, valid loss: 19.624100, valid acc: 0.906612, valid depends: 0.818411



train minibatch loop: 100%|██████████| 2833/2833 [55:39<00:00,  1.17s/it, accuracy=0.969, accuracy_depends=0.922, cost=7.06]
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.11it/s, accuracy=0.926, accuracy_depends=0.815, cost=18.7]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 3494.005749940872
epoch: 4, training loss: 12.054353, training acc: 0.943319, training depends: 0.875051, valid loss: 17.211692, valid acc: 0.917632, valid depends: 0.853567



train minibatch loop: 100%|██████████| 2833/2833 [55:39<00:00,  1.17s/it, accuracy=0.956, accuracy_depends=0.925, cost=5.91]
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.11it/s, accuracy=0.923, accuracy_depends=0.832, cost=16.6]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 3493.91433262825
epoch: 5, training loss: 9.444567, training acc: 0.954951, training depends: 0.902535, valid loss: 15.432670, valid acc: 0.925205, valid depends: 0.874938



train minibatch loop: 100%|██████████| 2833/2833 [55:40<00:00,  1.16s/it, accuracy=0.984, accuracy_depends=0.944, cost=3.98]
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.12it/s, accuracy=0.933, accuracy_depends=0.869, cost=14]  
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 3495.283851146698
epoch: 6, training loss: 7.452240, training acc: 0.965058, training depends: 0.922973, valid loss: 14.489255, valid acc: 0.930936, valid depends: 0.888087



train minibatch loop: 100%|██████████| 2833/2833 [55:40<00:00,  1.17s/it, accuracy=0.994, accuracy_depends=0.94, cost=4.72] 
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.13it/s, accuracy=0.963, accuracy_depends=0.879, cost=10.8]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 3494.487522125244
epoch: 7, training loss: 6.025082, training acc: 0.972536, training depends: 0.937287, valid loss: 13.715300, valid acc: 0.937034, valid depends: 0.900252



train minibatch loop: 100%|██████████| 2833/2833 [55:39<00:00,  1.17s/it, accuracy=0.991, accuracy_depends=0.95, cost=3.05]  
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.12it/s, accuracy=0.966, accuracy_depends=0.906, cost=10.5]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 3494.0114550590515
epoch: 8, training loss: 5.112749, training acc: 0.977126, training depends: 0.946466, valid loss: 12.850036, valid acc: 0.940592, valid depends: 0.909303



train minibatch loop: 100%|██████████| 2833/2833 [55:42<00:00,  1.17s/it, accuracy=0.987, accuracy_depends=0.959, cost=2.69]
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.10it/s, accuracy=0.946, accuracy_depends=0.889, cost=12.9]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 3497.181762933731
epoch: 9, training loss: 4.459866, training acc: 0.980873, training depends: 0.952571, valid loss: 12.666941, valid acc: 0.942265, valid depends: 0.913658



train minibatch loop: 100%|██████████| 2833/2833 [55:43<00:00,  1.16s/it, accuracy=0.994, accuracy_depends=0.969, cost=1.96] 
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.12it/s, accuracy=0.963, accuracy_depends=0.916, cost=12.1]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 3497.9769966602325
epoch: 10, training loss: 3.719997, training acc: 0.984097, training depends: 0.960447, valid loss: 12.421102, valid acc: 0.945554, valid depends: 0.918851



train minibatch loop: 100%|██████████| 2833/2833 [55:43<00:00,  1.17s/it, accuracy=0.987, accuracy_depends=0.984, cost=1.43] 
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.09it/s, accuracy=0.956, accuracy_depends=0.903, cost=12.5]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 3498.7708554267883
epoch: 11, training loss: 3.262921, training acc: 0.986176, training depends: 0.965447, valid loss: 12.296852, valid acc: 0.947707, valid depends: 0.922254



train minibatch loop: 100%|██████████| 2833/2833 [55:45<00:00,  1.17s/it, accuracy=0.997, accuracy_depends=0.978, cost=1.73] 
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.10it/s, accuracy=0.966, accuracy_depends=0.916, cost=12.8]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 3499.6522147655487
epoch: 12, training loss: 3.065945, training acc: 0.987473, training depends: 0.967189, valid loss: 12.074139, valid acc: 0.948292, valid depends: 0.926556



train minibatch loop: 100%|██████████| 2833/2833 [55:44<00:00,  1.16s/it, accuracy=0.997, accuracy_depends=0.984, cost=1.2]  
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.11it/s, accuracy=0.966, accuracy_depends=0.919, cost=12.5]
train minibatch loop:   0%|          | 0/2833 [00:00<?, ?it/s]

time taken: 3499.439937353134
epoch: 13, training loss: 2.730451, training acc: 0.988901, training depends: 0.970606, valid loss: 11.942601, valid acc: 0.950129, valid depends: 0.930428



train minibatch loop: 100%|██████████| 2833/2833 [55:44<00:00,  1.18s/it, accuracy=0.984, accuracy_depends=0.972, cost=2.77] 
test minibatch loop: 100%|██████████| 315/315 [02:34<00:00,  2.11it/s, accuracy=0.977, accuracy_depends=0.899, cost=12]  

time taken: 3498.874011993408
epoch: 14, training loss: 2.494391, training acc: 0.989733, training depends: 0.973406, valid loss: 11.962246, valid acc: 0.950911, valid depends: 0.931331






In [20]:
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(idx2tag[p])
        out.append(out_i)
    return out

In [21]:
real_Y, predict_Y, real_depends, predict_depends = [], [], [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    batch_depends = test_depends[i : min(i + batch_size, test_X.shape[0])]
    seq, deps = sess.run([model.tags_seq, model.tags_seq_depends],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
            },
    )
    predicted = pred2label(seq)
    real = pred2label(batch_y)
    predict_Y.extend(predicted)
    real_Y.extend(real)
    
    real_depends.extend(batch_depends.tolist())
    predict_depends.extend(deps.tolist())


validation minibatch loop:   0%|          | 0/315 [00:00<?, ?it/s][A
validation minibatch loop:   0%|          | 1/315 [00:00<02:28,  2.11it/s][A
validation minibatch loop:   1%|          | 2/315 [00:00<02:28,  2.11it/s][A
validation minibatch loop:   1%|          | 3/315 [00:01<02:27,  2.11it/s][A
validation minibatch loop:   1%|▏         | 4/315 [00:01<02:27,  2.11it/s][A
validation minibatch loop:   2%|▏         | 5/315 [00:02<02:26,  2.11it/s][A
validation minibatch loop:   2%|▏         | 6/315 [00:02<02:25,  2.13it/s][A
validation minibatch loop:   2%|▏         | 7/315 [00:03<02:23,  2.14it/s][A
validation minibatch loop:   3%|▎         | 8/315 [00:03<02:23,  2.14it/s][A
validation minibatch loop:   3%|▎         | 9/315 [00:04<02:25,  2.10it/s][A
validation minibatch loop:   3%|▎         | 10/315 [00:04<02:25,  2.09it/s][A
validation minibatch loop:   3%|▎         | 11/315 [00:05<02:24,  2.11it/s][A
validation minibatch loop:   4%|▍         | 12/315 [00:05<02:23,  2.1

validation minibatch loop:  65%|██████▌   | 205/315 [01:37<00:53,  2.07it/s][A
validation minibatch loop:  65%|██████▌   | 206/315 [01:38<00:52,  2.08it/s][A
validation minibatch loop:  66%|██████▌   | 207/315 [01:38<00:51,  2.10it/s][A
validation minibatch loop:  66%|██████▌   | 208/315 [01:39<00:51,  2.09it/s][A
validation minibatch loop:  66%|██████▋   | 209/315 [01:39<00:51,  2.07it/s][A
validation minibatch loop:  67%|██████▋   | 210/315 [01:40<00:50,  2.08it/s][A
validation minibatch loop:  67%|██████▋   | 211/315 [01:40<00:49,  2.11it/s][A
validation minibatch loop:  67%|██████▋   | 212/315 [01:41<00:48,  2.10it/s][A
validation minibatch loop:  68%|██████▊   | 213/315 [01:41<00:48,  2.09it/s][A
validation minibatch loop:  68%|██████▊   | 214/315 [01:42<00:47,  2.11it/s][A
validation minibatch loop:  68%|██████▊   | 215/315 [01:42<00:47,  2.10it/s][A
validation minibatch loop:  69%|██████▊   | 216/315 [01:42<00:46,  2.12it/s][A
validation minibatch loop:  69%|██████▉ 

In [22]:
from sklearn.metrics import classification_report
print(classification_report(np.array(real_Y).ravel(), np.array(predict_Y).ravel(), digits = 4))

               precision    recall  f1-score   support

          PAD     1.0000    1.0000    1.0000    841717
          acl     0.9501    0.9110    0.9301      2965
        advcl     0.8127    0.8719    0.8413      1249
       advmod     0.9423    0.9329    0.9376      4846
         amod     0.9141    0.9104    0.9123      4208
        appos     0.9282    0.9266    0.9274      2412
         case     0.9757    0.9756    0.9756     10896
           cc     0.9613    0.9726    0.9669      3171
        ccomp     0.8115    0.7094    0.7570       437
     compound     0.9176    0.9350    0.9263      6804
compound:plur     0.9172    0.9767    0.9460       601
         conj     0.9504    0.9493    0.9498      4119
          cop     0.9621    0.9761    0.9690       962
        csubj     0.8095    0.7083    0.7556        24
   csubj:pass     0.7500    0.6000    0.6667        10
          dep     0.8712    0.8333    0.8519       552
          det     0.9288    0.9339    0.9313      4082
        f

In [23]:
from sklearn.metrics import classification_report
print(classification_report(np.array(real_depends).ravel(), 
                            np.array(predict_depends).ravel(), digits = 4))

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000    841717
          1     0.9638    0.9676    0.9657      5037
          2     0.9526    0.9295    0.9409      4367
          3     0.9410    0.9395    0.9403      4942
          4     0.9544    0.9516    0.9530      6440
          5     0.9453    0.9514    0.9484      6035
          6     0.9376    0.9633    0.9503      6024
          7     0.9456    0.9491    0.9473      5398
          8     0.9506    0.9438    0.9472      5482
          9     0.9488    0.9455    0.9472      4977
         10     0.9331    0.9578    0.9453      4430
         11     0.9453    0.9468    0.9460      4583
         12     0.9364    0.9420    0.9392      3673
         13     0.9495    0.9298    0.9395      3719
         14     0.9425    0.9343    0.9384      3316
         15     0.9460    0.9197    0.9327      3065
         16     0.9125    0.9443    0.9281      3071
         17     0.9350    0.9228    0.9289   

  'precision', 'predicted', average, warn_for)


In [24]:
string = 'tolong tangkap gambar kami'

def char_str_idx(corpus, dic, UNK = 0):
    maxlen = max([len(i) for i in corpus])
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen]):
            val = dic[k] if k in dic else UNK
            X[i, no] = val
    return X

def generate_char_seq(batch, UNK = 2):
    maxlen_c = max([len(k) for k in batch])
    x = [[len(i) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((len(batch),maxlen_c,maxlen),dtype=np.int32)
    for i in range(len(batch)):
        for k in range(len(batch[i])):
            for no, c in enumerate(batch[i][k][::-1]):
                temp[i,k,-1-no] = char2idx.get(c, UNK)
    return temp

sequence = process_string(string)
sequence

['tolong', 'tangkap', 'gambar', 'kami']

In [26]:
X_seq = char_str_idx([sequence], word2idx, 2)
X_char_seq = generate_char_seq([sequence])

In [27]:
seq, deps = sess.run([model.tags_seq, model.tags_seq_depends],
        feed_dict={model.word_ids:X_seq,
                  model.char_ids:X_char_seq})

In [28]:
deps - 1

array([[2, 0, 2, 3]], dtype=int32)

In [30]:
[idx2tag[i] for i in seq[0]]

['advmod', 'xcomp', 'obj', 'det']

In [31]:
seq

array([[14, 23, 16,  3]], dtype=int32)

In [32]:
string = []
for i in range(len(seq[0])):
    string.append('%d\t%s\t_\t_\t_\t_\t%d\t%s'%(i+1,sequence[i],deps[0,i],idx2tag[seq[0,i]]))

In [33]:
string

['1\ttolong\t_\t_\t_\t_\t3\tadvmod',
 '2\ttangkap\t_\t_\t_\t_\t1\txcomp',
 '3\tgambar\t_\t_\t_\t_\t3\tobj',
 '4\tkami\t_\t_\t_\t_\t4\tdet']

In [34]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'concat-dependency/model.ckpt')

strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'logits_depends' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'OptimizeLoss' not in n.name
        and 'Global_Step' not in n.name
        and 'Epoch_Step' not in n.name
        and 'learning_rate' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Placeholder_3',
 'Variable',
 'Variable_1',
 'bidirectional_rnn_char_0/fw/lstm_cell/kernel',
 'bidirectional_rnn_char_0/fw/lstm_cell/bias',
 'bidirectional_rnn_char_0/bw/lstm_cell/kernel',
 'bidirectional_rnn_char_0/bw/lstm_cell/bias',
 'bidirectional_rnn_char_1/fw/lstm_cell/kernel',
 'bidirectional_rnn_char_1/fw/lstm_cell/bias',
 'bidirectional_rnn_char_1/bw/lstm_cell/kernel',
 'bidirectional_rnn_char_1/bw/lstm_cell/bias',
 'bidirectional_rnn_word_0/fw/lstm_cell/kernel',
 'bidirectional_rnn_word_0/fw/lstm_cell/bias',
 'bidirectional_rnn_word_0/bw/lstm_cell/kernel',
 'bidirectional_rnn_word_0/bw/lstm_cell/bias',
 'bidirectional_rnn_word_1/fw/lstm_cell/kernel',
 'bidirectional_rnn_word_1/fw/lstm_cell/bias',
 'bidirectional_rnn_word_1/bw/lstm_cell/kernel',
 'bidirectional_rnn_word_1/bw/lstm_cell/bias',
 'dense/kernel',
 'dense/bias',
 'Variable_2',
 'bidirectional_rnn_word_10/fw/lstm_cell/kernel',
 'bidirectional_rnn_word_10/fw/lstm_c

In [35]:
import json
with open('concat-dependency.json','w') as fopen:
    fopen.write(json.dumps({'idx2tag':idx2tag,'idx2word':idx2word,
           'word2idx':word2idx,'tag2idx':tag2idx,'char2idx':char2idx}))

In [36]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))
        
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [37]:
freeze_graph('concat-dependency', strings)

INFO:tensorflow:Restoring parameters from concat-dependency/model.ckpt
INFO:tensorflow:Froze 29 variables.
INFO:tensorflow:Converted 29 variables to const ops.
2135 ops in the final graph.


In [38]:
g = load_graph('concat-dependency/frozen_model.pb')

In [39]:
word_ids = g.get_tensor_by_name('import/Placeholder:0')
char_ids = g.get_tensor_by_name('import/Placeholder_1:0')
tags_seq = g.get_tensor_by_name('import/logits:0')
depends_seq = g.get_tensor_by_name('import/logits_depends:0')
test_sess = tf.InteractiveSession(graph = g)
seq, deps = test_sess.run([tags_seq, depends_seq],
            feed_dict = {
                word_ids: X_seq,
                char_ids: X_char_seq,
            })

print(seq,deps)



[[14 16 13  3]] [[3 1 3 4]]
