In [1]:
import tensorflow as tf
from tqdm import tqdm
import numpy as np
import re

In [2]:
with open('id_gsd-ud-train.conllu.txt') as fopen:
    corpus = fopen.read().split('\n')
    
with open('id_gsd-ud-test.conllu.txt') as fopen:
    corpus.extend(fopen.read().split('\n'))
    
with open('id_gsd-ud-dev.conllu.txt') as fopen:
    corpus.extend(fopen.read().split('\n'))

In [3]:
word2idx = {'PAD': 0,'NUM':1,'UNK':2}
tag2idx = {'PAD': 0}
char2idx = {'PAD': 0,'NUM':1,'UNK':2}
word_idx = 3
tag_idx = 1
char_idx = 3

def process_string(string):
    string = re.sub('[^A-Za-z0-9\-\/ ]+', ' ', string).split()
    return [to_title(y.strip()) for y in string]

def to_title(string):
    if string.isupper():
        string = string.title()
    return string

def process_corpus(corpus, until = None):
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    sentences, words, depends, labels, pos = [], [], [], [], []
    temp_sentence, temp_word, temp_depend, temp_label, temp_pos = [], [], [], [], []
    for sentence in corpus:
        if len(sentence):
            if sentence[0] == '#':
                continue
            sentence = sentence.split('\t')
            temp = process_string(sentence[1])
            if not len(temp):
                sentence[1] = 'EMPTY'
            sentence[1] = process_string(sentence[1])[0]
            for c in sentence[1]:
                if c not in char2idx:
                    char2idx[c] = char_idx
                    char_idx += 1
            if sentence[7] not in tag2idx:
                tag2idx[sentence[7]] = tag_idx
                tag_idx += 1
            if sentence[1] not in word2idx:
                word2idx[sentence[1]] = word_idx
                word_idx += 1
            temp_word.append(word2idx[sentence[1]])
            temp_depend.append(int(sentence[6]) + 1)
            temp_label.append(tag2idx[sentence[7]])
            temp_sentence.append(sentence[1])
            temp_pos.append(sentence[3])
        else:
            words.append(temp_word)
            depends.append(temp_depend)
            labels.append(temp_label)
            sentences.append(temp_sentence)
            pos.append(temp_pos)
            temp_word = []
            temp_depend = []
            temp_label = []
            temp_sentence = []
            temp_pos = []
    return sentences[:-1], words[:-1], depends[:-1], labels[:-1], pos[:-1]
        
sentences, words, depends, labels, pos = process_corpus(corpus)

In [4]:
import json

with open('augmented.json') as fopen:
    augmented = json.load(fopen)

In [5]:
def parse_XY(texts):
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    outside, sentences = [], []
    for no, text in enumerate(texts):
        s = process_string(text)
        sentences.append(s)
        inside = []
        for w in s:
            for c in w:
                if c not in char2idx:
                    char2idx[c] = char_idx
                    char_idx += 1
            
            if w not in word2idx:
                word2idx[w] = word_idx
                word_idx += 1
                
            inside.append(word2idx[w])
        outside.append(inside)
    return outside, sentences

In [6]:
text_augmented = []
for a in augmented:
    text_augmented.extend(a[0])
    depends.extend(a[1])
    labels.extend(a[2])

In [7]:
outside, new_sentences = parse_XY(text_augmented)

In [8]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [9]:
words.extend(outside)
sentences.extend(new_sentences)

In [10]:
len(words), len(depends), len(labels), len(sentences)

(50365, 50365, 50365, 50365)

In [11]:
def generate_char_seq(batch, UNK = 2):
    maxlen_c = max([len(k) for k in batch])
    x = [[len(i) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((len(batch),maxlen_c,maxlen),dtype=np.int32)
    for i in range(len(batch)):
        for k in range(len(batch[i])):
            for no, c in enumerate(batch[i][k][:maxlen][::-1]):
                temp[i,k,-1-no] = char2idx.get(c, UNK)
    return temp

In [12]:
idx2word = {idx: tag for tag, idx in word2idx.items()}
idx2tag = {i: w for w, i in tag2idx.items()}
char = generate_char_seq(sentences)

In [13]:
words = pad_sequences(words,padding='post')
depends = pad_sequences(depends,padding='post')
labels = pad_sequences(labels,padding='post')
words.shape

(50365, 189)

In [14]:
from sklearn.cross_validation import train_test_split
train_X, test_X, train_Y, test_Y, train_depends, test_depends, train_char, test_char = train_test_split(
                                                                           words,
                                                                           labels,
                                                                           depends,
                                                                           char,
                                                                           test_size=0.1)



In [15]:
def layer_norm(inputs, epsilon=1e-8):
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    normalized = (inputs - mean) / (tf.sqrt(variance + epsilon))

    params_shape = inputs.get_shape()[-1:]
    gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer())
    
    outputs = gamma * normalized + beta
    return outputs

def multihead_attn(queries, keys, q_masks, k_masks, future_binding, num_units, num_heads):
    
    T_q = tf.shape(queries)[1]                                      
    T_k = tf.shape(keys)[1]                  

    Q = tf.layers.dense(queries, num_units, name='Q')                              
    K_V = tf.layers.dense(keys, 2*num_units, name='K_V')    
    K, V = tf.split(K_V, 2, -1)        

    Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)                         
    K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)                    
    V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)                      

    align = tf.matmul(Q_, tf.transpose(K_, [0,2,1]))                      
    align = align / np.sqrt(K_.get_shape().as_list()[-1])                 

    paddings = tf.fill(tf.shape(align), 0.0)                   

    key_masks = k_masks                                                 
    key_masks = tf.tile(key_masks, [num_heads, 1])                       
    key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, T_q, 1])            
    align = tf.where(tf.equal(key_masks, 0), paddings, align)       

    if future_binding:
        lower_tri = tf.ones([T_q, T_k])                                          
        lower_tri = tf.linalg.LinearOperatorLowerTriangular(lower_tri).to_dense()  
        masks = tf.tile(tf.expand_dims(lower_tri,0), [tf.shape(align)[0], 1, 1]) 
        align = tf.where(tf.equal(masks, 0), paddings, align)                      
    
    align = tf.nn.softmax(align)                                            
    query_masks = tf.to_float(q_masks)                                             
    query_masks = tf.tile(query_masks, [num_heads, 1])                             
    query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, T_k])            
    align *= query_masks
    outputs = tf.matmul(align, V_)                                                 
    outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)             
    outputs += queries                                                             
    outputs = layer_norm(outputs)                                                 
    return outputs


def pointwise_feedforward(inputs, hidden_units, activation=None):
    outputs = tf.layers.dense(inputs, 4*hidden_units, activation=activation)
    outputs = tf.layers.dense(outputs, hidden_units, activation=None)
    outputs += inputs
    outputs = layer_norm(outputs)
    return outputs


def learned_position_encoding(inputs, mask, embed_dim):
    T = tf.shape(inputs)[1]
    outputs = tf.range(tf.shape(inputs)[1])                # (T_q)
    outputs = tf.expand_dims(outputs, 0)                   # (1, T_q)
    outputs = tf.tile(outputs, [tf.shape(inputs)[0], 1])   # (N, T_q)
    outputs = embed_seq(outputs, T, embed_dim, zero_pad=False, scale=False)
    return tf.expand_dims(tf.to_float(mask), -1) * outputs


def sinusoidal_position_encoding(inputs, mask, repr_dim):
    T = tf.shape(inputs)[1]
    pos = tf.reshape(tf.range(0.0, tf.to_float(T), dtype=tf.float32), [-1, 1])
    i = np.arange(0, repr_dim, 2, np.float32)
    denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])
    enc = tf.expand_dims(tf.concat([tf.sin(pos / denom), tf.cos(pos / denom)], 1), 0)
    return tf.tile(enc, [tf.shape(inputs)[0], 1, 1]) * tf.expand_dims(tf.to_float(mask), -1)

def label_smoothing(inputs, epsilon=0.1):
    C = inputs.get_shape().as_list()[-1]
    return ((1 - epsilon) * inputs) + (epsilon / C)


class CRF:
    def __init__(self,
                 dim_word,
                 dim_char,
                 dropout,
                 learning_rate,
                 hidden_size_char,
                 hidden_size_word,
                 maxlen,
                 num_blocks = 2,
                 num_heads = 8,
                 min_freq = 50):
        
        self.word_ids = tf.placeholder(tf.int32, shape = [None, None])
        self.char_ids = tf.placeholder(tf.int32, shape = [None, None, None])
        self.labels = tf.placeholder(tf.int32, shape = [None, None])
        self.depends = tf.placeholder(tf.int32, shape = [None, None])
        self.maxlen = tf.shape(self.word_ids)[1]
        self.lengths = tf.count_nonzero(self.word_ids, 1)
        batch_size = tf.shape(self.word_ids)[0]
        
        self.word_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(word2idx), dim_word], stddev = 1.0 / np.sqrt(dim_word)
            )
        )
        self.char_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(char2idx), dim_char], stddev = 1.0 / np.sqrt(dim_char)
            )
        )
        
        word_embedded = tf.nn.embedding_lookup(
            self.word_embeddings, self.word_ids
        )
        char_embedded = tf.nn.embedding_lookup(
            self.char_embeddings, self.char_ids
        )
        s = tf.shape(char_embedded)
        char_embedded = tf.reshape(
            char_embedded, shape = [s[0] * s[1], s[-2], dim_char]
        )
        reshape_char = tf.reshape(self.char_ids, shape = [s[0] * s[1], s[-2]])
        char_masked = tf.sign(reshape_char)
        char_embedded += sinusoidal_position_encoding(reshape_char, char_masked, dim_char)
        for i in range(num_blocks):
            with tf.variable_scope('char_%d'%i,reuse=tf.AUTO_REUSE):
                char_embedded = multihead_attn(queries = char_embedded,
                                                 keys = char_embedded,
                                                 q_masks = char_masked,
                                                 k_masks = char_masked,
                                                 future_binding = False,
                                                 num_units = dim_char,
                                                 num_heads = num_heads)
            with tf.variable_scope('char_feedforward_%d'%i,reuse=tf.AUTO_REUSE):
                char_embedded = pointwise_feedforward(char_embedded,
                                                    dim_char,
                                                    activation = tf.nn.relu)
        output = tf.reshape(
            char_embedded[:, -1], shape = [s[0], s[1], 2 * hidden_size_char]
        )
        
        decoder_embedded = tf.concat([word_embedded, output], axis = -1)
        decoder_embedded = tf.layers.dense(word_embedded, dim_char)
        de_masks = tf.sign(self.word_ids)
        
        decoder_embedded += sinusoidal_position_encoding(self.word_ids, de_masks, dim_char)
        
        for i in range(num_blocks):
            with tf.variable_scope('word_char_%d'%i,reuse=tf.AUTO_REUSE):
                decoder_embedded = multihead_attn(queries = decoder_embedded,
                                         keys = decoder_embedded,
                                         q_masks = de_masks,
                                         k_masks = de_masks,
                                         future_binding = True,
                                         num_units = dim_char,
                                         num_heads = num_heads)
                
            with tf.variable_scope('word_char_attention_%d'%i,reuse=tf.AUTO_REUSE):
                decoder_embedded = multihead_attn(queries = decoder_embedded,
                                         keys = output,
                                         q_masks = de_masks,
                                         k_masks = de_masks,
                                         future_binding = False,
                                         num_units = dim_char,
                                         num_heads = num_heads)
            
            with tf.variable_scope('word_feedforward_%d'%i,reuse=tf.AUTO_REUSE):
                decoder_embedded = pointwise_feedforward(decoder_embedded,
                                                    dim_char,
                                            activation = tf.nn.relu)
                
        logits = tf.layers.dense(decoder_embedded, len(idx2tag))
        
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            logits, self.labels, self.lengths
        )
        
        tag_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(idx2tag), dim_char], stddev = 1.0 / np.sqrt(dim_char)
            )
        )
        logits_max = tf.argmax(logits,axis=2,output_type=tf.int32)
        lookup_logits = tf.nn.embedding_lookup(
            tag_embeddings, logits_max
        )
        
        lookup_logits += sinusoidal_position_encoding(logits_max, de_masks, dim_char)
        
        for i in range(num_blocks):
            with tf.variable_scope('depend_%d'%i,reuse=tf.AUTO_REUSE):
                lookup_logits = multihead_attn(queries = lookup_logits,
                                         keys = lookup_logits,
                                         q_masks = de_masks,
                                         k_masks = de_masks,
                                         future_binding = True,
                                         num_units = dim_char,
                                         num_heads = num_heads)
                
            with tf.variable_scope('depend_attention_%d'%i,reuse=tf.AUTO_REUSE):
                lookup_logits = multihead_attn(queries = lookup_logits,
                                         keys = decoder_embedded,
                                         q_masks = de_masks,
                                         k_masks = de_masks,
                                         future_binding = False,
                                         num_units = dim_char,
                                         num_heads = num_heads)
            
            with tf.variable_scope('depend_feedforward_%d'%i,reuse=tf.AUTO_REUSE):
                lookup_logits = pointwise_feedforward(lookup_logits,
                                                    dim_char,
                                            activation = tf.nn.relu)
        
        cast_mask = tf.cast(tf.sequence_mask(self.lengths + 1, maxlen = maxlen), dtype = tf.float32)
        cast_mask = tf.tile(tf.expand_dims(cast_mask,axis=1),[1,self.maxlen,1]) * 10
        
        logits_depends = tf.layers.dense(lookup_logits, maxlen)
        logits_depends = tf.multiply(logits_depends, cast_mask)
        
        with tf.variable_scope("depends"):
            log_likelihood_depends, transition_params_depends = tf.contrib.crf.crf_log_likelihood(
                logits_depends, self.depends, self.lengths
            )
            
        self.cost = tf.reduce_mean(-log_likelihood) + tf.reduce_mean(-log_likelihood_depends)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        
        mask = tf.sequence_mask(self.lengths, maxlen = self.maxlen)
        
        self.tags_seq, _ = tf.contrib.crf.crf_decode(
            logits, transition_params, self.lengths
        )
        self.tags_seq = tf.identity(self.tags_seq, name = 'logits')
        
        self.tags_seq_depends, _ = tf.contrib.crf.crf_decode(
            logits_depends, transition_params_depends, self.lengths
        )
        self.tags_seq_depends = tf.identity(self.tags_seq_depends, name = 'logits_depends')

        self.prediction = tf.boolean_mask(self.tags_seq, mask)
        mask_label = tf.boolean_mask(self.labels, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        self.prediction = tf.boolean_mask(self.tags_seq_depends, mask)
        mask_label = tf.boolean_mask(self.depends, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy_depends = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [16]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

dim_word = 128
dim_char = 256
dropout = 0.8
learning_rate = 1e-3
hidden_size_char = 128
hidden_size_word = 64
batch_size = 8

model = CRF(dim_word = dim_word,
            dim_char = dim_char,
            dropout = dropout,
            learning_rate = learning_rate,
            hidden_size_char = hidden_size_char,
            hidden_size_word = hidden_size_word,
           maxlen = words.shape[1])
sess.run(tf.global_variables_initializer())

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [17]:
import time

for e in range(10):
    lasttime = time.time()
    train_acc, train_loss, test_acc, test_loss, train_acc_depends, test_acc_depends = 0, 0, 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_char = train_char[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        batch_depends = train_depends[i : min(i + batch_size, train_X.shape[0])]
        acc_depends, acc, cost, _ = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y,
                model.depends: batch_depends
            },
        )
        #assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        train_acc_depends += acc_depends
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        batch_depends = test_depends[i : min(i + batch_size, test_X.shape[0])]
        acc_depends, acc, cost = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y,
                model.depends: batch_depends
            },
        )
        #assert not np.isnan(cost)
        test_loss += cost
        test_acc += acc
        test_acc_depends += acc_depends
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
    
    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    train_acc_depends /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size
    test_acc_depends /= len(test_X) / batch_size

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, training depends: %f, valid loss: %f, valid acc: %f, valid depends: %f\n'
        % (e, train_loss, train_acc, train_acc_depends, test_loss, test_acc, test_acc_depends)
    )

train minibatch loop: 100%|██████████| 5666/5666 [1:03:18<00:00,  1.51it/s, accuracy=0.756, accuracy_depends=0.524, cost=51.9]   
test minibatch loop: 100%|██████████| 630/630 [02:52<00:00,  4.17it/s, accuracy=0.707, accuracy_depends=0.515, cost=53.7]
train minibatch loop:   0%|          | 0/5666 [00:00<?, ?it/s]

time taken: 3971.3269250392914
epoch: 0, training loss: 74.662333, training acc: 0.687690, training depends: 0.423968, valid loss: 53.324856, valid acc: 0.745969, valid depends: 0.509276



train minibatch loop: 100%|██████████| 5666/5666 [1:03:24<00:00,  1.50it/s, accuracy=0.815, accuracy_depends=0.673, cost=37.4]
test minibatch loop: 100%|██████████| 630/630 [02:51<00:00,  4.11it/s, accuracy=0.717, accuracy_depends=0.566, cost=44.6]
train minibatch loop:   0%|          | 0/5666 [00:00<?, ?it/s]

time taken: 3975.9911386966705
epoch: 1, training loss: 41.632860, training acc: 0.800347, training depends: 0.614527, valid loss: 38.777110, valid acc: 0.807298, valid depends: 0.642584



train minibatch loop: 100%|██████████| 5666/5666 [1:03:28<00:00,  1.51it/s, accuracy=0.81, accuracy_depends=0.726, cost=32.1] 
test minibatch loop: 100%|██████████| 630/630 [02:52<00:00,  4.12it/s, accuracy=0.838, accuracy_depends=0.677, cost=34.6]
train minibatch loop:   0%|          | 0/5666 [00:00<?, ?it/s]

time taken: 3980.6265354156494
epoch: 2, training loss: 34.020197, training acc: 0.828245, training depends: 0.679861, valid loss: 33.156008, valid acc: 0.823669, valid depends: 0.699404



train minibatch loop: 100%|██████████| 5666/5666 [1:03:29<00:00,  1.50it/s, accuracy=0.839, accuracy_depends=0.75, cost=28.7] 
test minibatch loop: 100%|██████████| 630/630 [02:51<00:00,  4.12it/s, accuracy=0.808, accuracy_depends=0.717, cost=36.1]
train minibatch loop:   0%|          | 0/5666 [00:00<?, ?it/s]

time taken: 3980.9095969200134
epoch: 3, training loss: 28.242658, training acc: 0.845301, training depends: 0.740665, valid loss: 28.623581, valid acc: 0.831968, valid depends: 0.757451



train minibatch loop: 100%|██████████| 5666/5666 [1:03:32<00:00,  1.51it/s, accuracy=0.839, accuracy_depends=0.78, cost=27]   
test minibatch loop: 100%|██████████| 630/630 [02:52<00:00,  4.19it/s, accuracy=0.788, accuracy_depends=0.828, cost=25.5]
train minibatch loop:   0%|          | 0/5666 [00:00<?, ?it/s]

time taken: 3985.202503681183
epoch: 4, training loss: 23.337289, training acc: 0.859588, training depends: 0.795232, valid loss: 25.202329, valid acc: 0.840135, valid depends: 0.799228



train minibatch loop: 100%|██████████| 5666/5666 [1:03:33<00:00,  1.51it/s, accuracy=0.869, accuracy_depends=0.821, cost=21.9]
test minibatch loop: 100%|██████████| 630/630 [02:51<00:00,  4.14it/s, accuracy=0.838, accuracy_depends=0.768, cost=23.9]
train minibatch loop:   0%|          | 0/5666 [00:00<?, ?it/s]

time taken: 3985.2355260849
epoch: 5, training loss: 18.881109, training acc: 0.873684, training depends: 0.846420, valid loss: 22.490008, valid acc: 0.849709, valid depends: 0.828853



train minibatch loop: 100%|██████████| 5666/5666 [1:03:32<00:00,  1.50it/s, accuracy=0.863, accuracy_depends=0.827, cost=21.8]
test minibatch loop: 100%|██████████| 630/630 [02:52<00:00,  4.14it/s, accuracy=0.848, accuracy_depends=0.838, cost=20.6]
train minibatch loop:   0%|          | 0/5666 [00:00<?, ?it/s]

time taken: 3985.4282426834106
epoch: 6, training loss: 15.691809, training acc: 0.885103, training depends: 0.882228, valid loss: 19.544368, valid acc: 0.861741, valid depends: 0.863059



train minibatch loop: 100%|██████████| 5666/5666 [1:03:33<00:00,  1.50it/s, accuracy=0.869, accuracy_depends=0.887, cost=17.2]
test minibatch loop: 100%|██████████| 630/630 [02:52<00:00,  4.10it/s, accuracy=0.778, accuracy_depends=0.879, cost=26.3]
train minibatch loop:   0%|          | 0/5666 [00:00<?, ?it/s]

time taken: 3986.3802030086517
epoch: 7, training loss: 13.322382, training acc: 0.895488, training depends: 0.906931, valid loss: 18.686160, valid acc: 0.859560, valid depends: 0.879505



train minibatch loop: 100%|██████████| 5666/5666 [1:03:32<00:00,  1.50it/s, accuracy=0.893, accuracy_depends=0.893, cost=15.4]
test minibatch loop: 100%|██████████| 630/630 [02:52<00:00,  4.12it/s, accuracy=0.848, accuracy_depends=0.848, cost=21.4]
train minibatch loop:   0%|          | 0/5666 [00:00<?, ?it/s]

time taken: 3984.483060359955
epoch: 8, training loss: 11.466844, training acc: 0.906599, training depends: 0.924221, valid loss: 16.073830, valid acc: 0.877447, valid depends: 0.899887



train minibatch loop: 100%|██████████| 5666/5666 [1:03:33<00:00,  1.50it/s, accuracy=0.899, accuracy_depends=0.905, cost=15.3]
test minibatch loop: 100%|██████████| 630/630 [02:52<00:00,  4.08it/s, accuracy=0.838, accuracy_depends=0.848, cost=20.1]

time taken: 3986.1450822353363
epoch: 9, training loss: 9.785712, training acc: 0.918415, training depends: 0.937485, valid loss: 16.119294, valid acc: 0.880889, valid depends: 0.899497






In [20]:
import time

for e in range(3):
    lasttime = time.time()
    train_acc, train_loss, test_acc, test_loss, train_acc_depends, test_acc_depends = 0, 0, 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_char = train_char[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        batch_depends = train_depends[i : min(i + batch_size, train_X.shape[0])]
        acc_depends, acc, cost, _ = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y,
                model.depends: batch_depends
            },
        )
        #assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        train_acc_depends += acc_depends
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        batch_depends = test_depends[i : min(i + batch_size, test_X.shape[0])]
        acc_depends, acc, cost = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y,
                model.depends: batch_depends
            },
        )
        #assert not np.isnan(cost)
        test_loss += cost
        test_acc += acc
        test_acc_depends += acc_depends
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
    
    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    train_acc_depends /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size
    test_acc_depends /= len(test_X) / batch_size

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, training depends: %f, valid loss: %f, valid acc: %f, valid depends: %f\n'
        % (e, train_loss, train_acc, train_acc_depends, test_loss, test_acc, test_acc_depends)
    )

train minibatch loop: 100%|██████████| 5666/5666 [1:03:34<00:00,  1.50it/s, accuracy=0.946, accuracy_depends=0.923, cost=9.98] 
test minibatch loop: 100%|██████████| 630/630 [02:52<00:00,  4.13it/s, accuracy=0.879, accuracy_depends=0.899, cost=16.5]
train minibatch loop:   0%|          | 0/5666 [00:00<?, ?it/s]

time taken: 3986.8803112506866
epoch: 0, training loss: 4.550854, training acc: 0.962310, training depends: 0.971532, valid loss: 12.594443, valid acc: 0.914797, valid depends: 0.934598



train minibatch loop: 100%|██████████| 5666/5666 [1:03:33<00:00,  1.51it/s, accuracy=0.94, accuracy_depends=0.946, cost=8.71]  
test minibatch loop: 100%|██████████| 630/630 [02:52<00:00,  4.17it/s, accuracy=0.859, accuracy_depends=0.899, cost=21.7]
train minibatch loop:   0%|          | 0/5666 [00:00<?, ?it/s]

time taken: 3985.9828474521637
epoch: 1, training loss: 5.258419, training acc: 0.958980, training depends: 0.965066, valid loss: 13.166835, valid acc: 0.907223, valid depends: 0.925843



train minibatch loop: 100%|██████████| 5666/5666 [1:03:32<00:00,  1.49it/s, accuracy=0.964, accuracy_depends=0.946, cost=6.36] 
test minibatch loop: 100%|██████████| 630/630 [02:52<00:00,  4.13it/s, accuracy=0.899, accuracy_depends=0.909, cost=19.7]

time taken: 3985.3402976989746
epoch: 2, training loss: 3.728318, training acc: 0.968881, training depends: 0.976212, valid loss: 11.758984, valid acc: 0.920203, valid depends: 0.942277






In [21]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'attention/model.ckpt')

strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'logits_depends' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'OptimizeLoss' not in n.name
        and 'Global_Step' not in n.name
        and 'Epoch_Step' not in n.name
        and 'learning_rate' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Placeholder_3',
 'Variable',
 'Variable_1',
 'char_0/Q/kernel',
 'char_0/Q/bias',
 'char_0/K_V/kernel',
 'char_0/K_V/bias',
 'char_0/gamma',
 'char_feedforward_0/dense/kernel',
 'char_feedforward_0/dense/bias',
 'char_feedforward_0/dense_1/kernel',
 'char_feedforward_0/dense_1/bias',
 'char_feedforward_0/gamma',
 'char_1/Q/kernel',
 'char_1/Q/bias',
 'char_1/K_V/kernel',
 'char_1/K_V/bias',
 'char_1/gamma',
 'char_feedforward_1/dense/kernel',
 'char_feedforward_1/dense/bias',
 'char_feedforward_1/dense_1/kernel',
 'char_feedforward_1/dense_1/bias',
 'char_feedforward_1/gamma',
 'dense/kernel',
 'dense/bias',
 'word_char_0/Q/kernel',
 'word_char_0/Q/bias',
 'word_char_0/K_V/kernel',
 'word_char_0/K_V/bias',
 'word_char_0/gamma',
 'word_char_attention_0/Q/kernel',
 'word_char_attention_0/Q/bias',
 'word_char_attention_0/K_V/kernel',
 'word_char_attention_0/K_V/bias',
 'word_char_attention_0/gamma',
 'word_feedforward_0/dense/kernel',


In [22]:
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(idx2tag[p])
        out.append(out_i)
    return out

In [23]:
seq, deps = sess.run([model.tags_seq, model.tags_seq_depends],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
            },
)

In [24]:
real_Y, predict_Y, real_depends, predict_depends = [], [], [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    batch_depends = test_depends[i : min(i + batch_size, test_X.shape[0])]
    seq, deps = sess.run([model.tags_seq, model.tags_seq_depends],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
            },
    )
    predicted = pred2label(seq)
    real = pred2label(batch_y)
    predict_Y.extend(predicted)
    real_Y.extend(real)
    
    real_depends.extend(batch_depends.tolist())
    predict_depends.extend(deps.tolist())

validation minibatch loop: 100%|██████████| 630/630 [02:47<00:00,  4.18it/s]


In [25]:
from sklearn.metrics import classification_report
print(classification_report(np.array(real_Y).ravel(), np.array(predict_Y).ravel(), digits = 4))

               precision    recall  f1-score   support

          PAD     1.0000    1.0000    1.0000    841796
          acl     0.8768    0.8849    0.8809      3016
        advcl     0.8290    0.7943    0.8113      1196
       advmod     0.9043    0.9163    0.9102      4754
         amod     0.9121    0.8773    0.8943      4149
        appos     0.8934    0.8983    0.8958      2547
          aux     1.0000    1.0000    1.0000         6
         case     0.9593    0.9670    0.9631     10888
           cc     0.9523    0.9606    0.9564      3198
        ccomp     0.7984    0.7385    0.7673       413
     compound     0.8677    0.8956    0.8815      6679
compound:plur     0.9073    0.9255    0.9163       550
         conj     0.8625    0.9330    0.8964      4162
          cop     0.9296    0.9679    0.9484       996
        csubj     0.9000    0.4091    0.5625        22
   csubj:pass     0.8462    0.8462    0.8462        13
          dep     0.8274    0.7377    0.7800       507
         

In [26]:
from sklearn.metrics import classification_report
print(classification_report(np.array(real_depends).ravel(), 
                            np.array(predict_depends).ravel(), digits = 4))

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000    841796
          1     0.9486    0.9277    0.9381      5037
          2     0.9157    0.9547    0.9348      4325
          3     0.9505    0.9137    0.9318      4856
          4     0.9439    0.9311    0.9374      6309
          5     0.9422    0.9396    0.9409      6540
          6     0.9314    0.9516    0.9414      5697
          7     0.9468    0.9461    0.9464      5414
          8     0.9524    0.9394    0.9458      5559
          9     0.9432    0.9421    0.9427      5028
         10     0.9308    0.9544    0.9425      4300
         11     0.9623    0.9323    0.9471      4358
         12     0.9449    0.9493    0.9471      3903
         13     0.9338    0.9442    0.9390      3497
         14     0.9444    0.9475    0.9459      3445
         15     0.9445    0.9487    0.9466      3177
         16     0.9411    0.9589    0.9500      3068
         17     0.9350    0.9589    0.9468   

In [27]:
string = 'tolong tangkap gambar kami'

def char_str_idx(corpus, dic, UNK = 0):
    maxlen = max([len(i) for i in corpus])
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen]):
            val = dic[k] if k in dic else UNK
            X[i, no] = val
    return X

def generate_char_seq(batch, UNK = 2):
    maxlen_c = max([len(k) for k in batch])
    x = [[len(i) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((len(batch),maxlen_c,maxlen),dtype=np.int32)
    for i in range(len(batch)):
        for k in range(len(batch[i])):
            for no, c in enumerate(batch[i][k][::-1]):
                temp[i,k,-1-no] = char2idx.get(c, UNK)
    return temp

sequence = process_string(string)
sequence

['tolong', 'tangkap', 'gambar', 'kami']

In [28]:
X_seq = char_str_idx([sequence], word2idx, 2)
X_char_seq = generate_char_seq([sequence])

In [29]:
X_char_seq.shape

(1, 4, 7)

In [30]:
seq, deps = sess.run([model.tags_seq, model.tags_seq_depends],
        feed_dict={model.word_ids:X_seq,
                  model.char_ids:X_char_seq})

In [31]:
deps[0] - 1

array([2, 0, 3, 3], dtype=int32)

In [32]:
[idx2tag[i] for i in seq[0]]

['advmod', 'root', 'case', 'nmod']

In [33]:
import json
with open('attention-is-all-you-need-dependency.json','w') as fopen:
    fopen.write(json.dumps({'idx2tag':idx2tag,'idx2word':idx2word,
           'word2idx':word2idx,'tag2idx':tag2idx,'char2idx':char2idx}))

In [34]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))
        
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [36]:
freeze_graph('attention', strings)

INFO:tensorflow:Restoring parameters from attention/model.ckpt
INFO:tensorflow:Froze 107 variables.
INFO:tensorflow:Converted 107 variables to const ops.
2815 ops in the final graph.


In [37]:
g = load_graph('attention/frozen_model.pb')

In [38]:
word_ids = g.get_tensor_by_name('import/Placeholder:0')
char_ids = g.get_tensor_by_name('import/Placeholder_1:0')
tags_seq = g.get_tensor_by_name('import/logits:0')
depends_seq = g.get_tensor_by_name('import/logits_depends:0')
test_sess = tf.InteractiveSession(graph = g)
seq, deps = test_sess.run([tags_seq, depends_seq],
            feed_dict = {
                word_ids: X_seq,
                char_ids: X_char_seq,
            })

print(seq,deps)



[[14  4  7 20]] [[3 1 4 4]]


In [40]:
idx2tag

{0: 'PAD',
 1: 'nsubj',
 2: 'cop',
 3: 'det',
 4: 'root',
 5: 'nsubj:pass',
 6: 'acl',
 7: 'case',
 8: 'obl',
 9: 'flat',
 10: 'punct',
 11: 'appos',
 12: 'amod',
 13: 'compound',
 14: 'advmod',
 15: 'cc',
 16: 'obj',
 17: 'conj',
 18: 'mark',
 19: 'advcl',
 20: 'nmod',
 21: 'nummod',
 22: 'dep',
 23: 'xcomp',
 24: 'ccomp',
 25: 'parataxis',
 26: 'compound:plur',
 27: 'fixed',
 28: 'aux',
 29: 'csubj',
 30: 'iobj',
 31: 'csubj:pass'}