In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
with open('../Malaya-Dataset/dependency/gsd-ud-train.conllu.txt') as fopen:
    corpus = fopen.read().split('\n')
    
with open('../Malaya-Dataset/dependency/gsd-ud-test.conllu.txt') as fopen:
    corpus.extend(fopen.read().split('\n'))
    
with open('../Malaya-Dataset/dependency/gsd-ud-dev.conllu.txt') as fopen:
    corpus.extend(fopen.read().split('\n'))

In [3]:
import xlnet
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import model_utils
import pickle
import json
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences




In [4]:
import sentencepiece as spm
from prepro_utils import preprocess_text, encode_ids

sp_model = spm.SentencePieceProcessor()
sp_model.Load('xlnet-base/sp10m.cased.v9.model')

def tokenize_fn(text):
    text = preprocess_text(text, lower= False)
    return encode_ids(sp_model, text)

In [5]:
SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

special_symbols = {
    "<unk>"  : 0,
    "<s>"    : 1,
    "</s>"   : 2,
    "<cls>"  : 3,
    "<sep>"  : 4,
    "<pad>"  : 5,
    "<mask>" : 6,
    "<eod>"  : 7,
    "<eop>"  : 8,
}

VOCAB_SIZE = 32000
UNK_ID = special_symbols["<unk>"]
CLS_ID = special_symbols["<cls>"]
SEP_ID = special_symbols["<sep>"]
MASK_ID = special_symbols["<mask>"]
EOD_ID = special_symbols["<eod>"]

In [6]:
tag2idx = {'PAD': 0, 'X': 1}
tag_idx = 2

def process_corpus(corpus, until = None):
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    sentences, words, depends, labels, pos, sequences = [], [], [], [], [], []
    temp_sentence, temp_word, temp_depend, temp_label, temp_pos = [], [], [], [], []
    segments, masks = [], []
    first_time = True
    for sentence in corpus:
        try:
            if len(sentence):
                if sentence[0] == '#':
                    continue
                if first_time:
                    print(sentence)
                    first_time = False
                sentence = sentence.split('\t')
                if sentence[7] not in tag2idx:
                    tag2idx[sentence[7]] = tag_idx
                    tag_idx += 1
                temp_word.append(sentence[1])
                temp_depend.append(int(sentence[6]) + 1)
                temp_label.append(tag2idx[sentence[7]])
                temp_sentence.append(sentence[1])
                temp_pos.append(sentence[3])
            else:
                if len(temp_sentence) < 2 or len(temp_word) != len(temp_label):
                    temp_word = []
                    temp_depend = []
                    temp_label = []
                    temp_sentence = []
                    temp_pos = []
                    continue
                bert_tokens = []
                labels_ = []
                depends_ = []
                seq_ = []
                for no, orig_token in enumerate(temp_word):
                    labels_.append(temp_label[no])
                    depends_.append(temp_depend[no])
                    t = tokenize_fn(orig_token)
                    bert_tokens.extend(t)
                    labels_.extend([1] * (len(t) - 1))
                    depends_.extend([0] * (len(t) - 1))
                    seq_.append(no + 1)
                bert_tokens.extend([4, 3])
                labels_.extend([0, 0])
                depends_.extend([0, 0])
                segment = [0] * (len(bert_tokens) - 1) + [SEG_ID_CLS]
                input_mask = [0] * len(segment)
                words.append(bert_tokens)
                depends.append(depends_)
                labels.append(labels_)
                sentences.append(bert_tokens)
                pos.append(temp_pos)
                sequences.append(seq_)
                segments.append(segment)
                masks.append(input_mask)
                temp_word = []
                temp_depend = []
                temp_label = []
                temp_sentence = []
                temp_pos = []
        except Exception as e:
            print(e, sentence)
    return sentences[:-1], words[:-1], depends[:-1], labels[:-1], pos[:-1], sequences[:-1], segments[:-1], masks[:-1]

In [7]:
sentences, words, depends, labels, _, _, segments, masks = process_corpus(corpus)

1	Sembungan	sembungan	PROPN	X--	_	4	nsubj	_	MorphInd=^sembungan<x>_X--$


In [8]:
len(words[0]), len(depends[0]), len(labels[0])

(26, 26, 26)

In [9]:
import json

with open('../Malaya-Dataset/dependency/augmented-dependency.json') as fopen:
    augmented = json.load(fopen)

In [10]:
text_augmented, depends_augmented, labels_augmented = [], [], []

for a in augmented:
    text_augmented.extend(a[0])
    depends_augmented.extend(a[1])
    labels_augmented.extend((np.array(a[2]) + 1).tolist())

In [11]:
def parse_XY(texts, depends, labels):
    outside, sentences, outside_depends, outside_labels = [], [], [], []
    segments, masks = [], []
    for no, text in enumerate(texts):
        temp_depend = depends[no]
        temp_label = labels[no]
        s = text.split()
        sentences.append(s)
        bert_tokens = []
        labels_ = []
        depends_ = []
        for no, orig_token in enumerate(s):
            labels_.append(temp_label[no])
            depends_.append(temp_depend[no])
            t = tokenize_fn(orig_token)
            bert_tokens.extend(t)
            labels_.extend([1] * (len(t) - 1))
            depends_.extend([0] * (len(t) - 1))
        bert_tokens.extend([4, 3])
        labels_.extend([0, 0])
        depends_.extend([0, 0])
        segment = [0] * (len(bert_tokens) - 1) + [SEG_ID_CLS]
        input_mask = [0] * len(segment)
        outside.append(bert_tokens)
        outside_depends.append(depends_)
        outside_labels.append(labels_)
        segments.append(segment)
        masks.append(input_mask)
    return outside, sentences, outside_depends, outside_labels, segments, masks

In [12]:
outside, _, outside_depends, outside_labels, outside_segments, outside_masks = parse_XY(text_augmented, 
                                                       depends_augmented, 
                                                       labels_augmented)

In [13]:
words.extend(outside)
depends.extend(outside_depends)
labels.extend(outside_labels)
segments.extend(outside_segments)
masks.extend(outside_masks)

In [14]:
idx2tag = {v:k for k, v in tag2idx.items()}
idx2tag

{0: 'PAD',
 1: 'X',
 2: 'nsubj',
 3: 'cop',
 4: 'det',
 5: 'root',
 6: 'nsubj:pass',
 7: 'acl',
 8: 'case',
 9: 'obl',
 10: 'flat',
 11: 'punct',
 12: 'appos',
 13: 'amod',
 14: 'compound',
 15: 'advmod',
 16: 'cc',
 17: 'obj',
 18: 'conj',
 19: 'mark',
 20: 'advcl',
 21: 'nmod',
 22: 'nummod',
 23: 'dep',
 24: 'xcomp',
 25: 'ccomp',
 26: 'parataxis',
 27: 'compound:plur',
 28: 'fixed',
 29: 'aux',
 30: 'csubj',
 31: 'iobj',
 32: 'csubj:pass'}

In [15]:
from sklearn.model_selection import train_test_split

words_train, words_test, depends_train, depends_test, labels_train, labels_test, \
segments_train, segments_test, masks_train, masks_test \
= train_test_split(words, depends, labels, segments, masks, test_size = 0.2)

In [16]:
len(words_train), len(words_test)

(40289, 10073)

In [17]:
train_X = words_train
train_Y = labels_train
train_depends = depends_train

test_X = words_test
test_Y = labels_test
test_depends = depends_test

In [18]:
import xlnet
import model_utils
import tensorflow as tf
import numpy as np

kwargs = dict(
      is_training=True,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.1,
      dropatt=0.1,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clamp_len=-1)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(json_path='xlnet-base/config.json')




In [19]:
epoch = 15
batch_size = 16
warmup_proportion = 0.1
num_train_steps = int(len(train_X) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)
print(num_train_steps, num_warmup_steps)

training_parameters = dict(
      decay_method = 'poly',
      train_steps = num_train_steps,
      learning_rate = 2e-5,
      warmup_steps = num_warmup_steps,
      min_lr_ratio = 0.0,
      weight_decay = 0.00,
      adam_epsilon = 1e-8,
      num_core_per_host = 1,
      lr_layer_decay_rate = 1,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.0,
      dropatt=0.0,
      init='normal',
      init_range=0.1,
      init_std=0.02,
      clip = 1.0,
      clamp_len=-1,)

37770 3777


In [20]:
class Parameter:
    def __init__(self, decay_method, warmup_steps, weight_decay, adam_epsilon, 
                num_core_per_host, lr_layer_decay_rate, use_tpu, learning_rate, train_steps,
                min_lr_ratio, clip, **kwargs):
        self.decay_method = decay_method
        self.warmup_steps = warmup_steps
        self.weight_decay = weight_decay
        self.adam_epsilon = adam_epsilon
        self.num_core_per_host = num_core_per_host
        self.lr_layer_decay_rate = lr_layer_decay_rate
        self.use_tpu = use_tpu
        self.learning_rate = learning_rate
        self.train_steps = train_steps
        self.min_lr_ratio = min_lr_ratio
        self.clip = clip
        
training_parameters = Parameter(**training_parameters)

In [21]:
class BiAAttention:
    def __init__(self, input_size_encoder, input_size_decoder, num_labels):
        self.input_size_encoder = input_size_encoder
        self.input_size_decoder = input_size_decoder
        self.num_labels = num_labels
        
        self.W_d = tf.get_variable("W_d", shape=[self.num_labels, self.input_size_decoder],
           initializer=tf.contrib.layers.xavier_initializer())
        self.W_e = tf.get_variable("W_e", shape=[self.num_labels, self.input_size_encoder],
           initializer=tf.contrib.layers.xavier_initializer())
        self.U = tf.get_variable("U", shape=[self.num_labels, self.input_size_decoder, self.input_size_encoder],
           initializer=tf.contrib.layers.xavier_initializer())
        
    def forward(self, input_d, input_e, mask_d=None, mask_e=None):
        batch = tf.shape(input_d)[0]
        length_decoder = tf.shape(input_d)[1]
        length_encoder = tf.shape(input_e)[1]
        out_d = tf.expand_dims(tf.matmul(self.W_d, tf.transpose(input_d, [0, 2, 1])), 3)
        out_e = tf.expand_dims(tf.matmul(self.W_e, tf.transpose(input_e, [0, 2, 1])), 2)
        output = tf.matmul(tf.expand_dims(input_d, 1), self.U)
        output = tf.matmul(output, tf.transpose(tf.expand_dims(input_e, 1), [0, 1, 3, 2]))
        
        output = output + out_d + out_e
        
        if mask_d is not None:
            d = tf.expand_dims(tf.expand_dims(mask_d, 1), 3)
            e = tf.expand_dims(tf.expand_dims(mask_e, 1), 2)
            output = output * d * e
            
        return output
    
class BiLinear:
    def __init__(self, left_features, right_features, out_features):
        self.left_features = left_features
        self.right_features = right_features
        self.out_features = out_features
        
        self.U = tf.get_variable("U-bi", shape=[out_features, left_features, right_features],
           initializer=tf.contrib.layers.xavier_initializer())
        self.W_l = tf.get_variable("Wl", shape=[out_features, left_features],
           initializer=tf.contrib.layers.xavier_initializer())
        self.W_r = tf.get_variable("Wr", shape=[out_features, right_features],
           initializer=tf.contrib.layers.xavier_initializer())
    
    def forward(self, input_left, input_right):
        left_size = tf.shape(input_left)
        output_shape = tf.concat([left_size[:-1], [self.out_features]], axis = 0)
        batch = tf.cast(tf.reduce_prod(left_size[:-1]), tf.int32)
        input_left = tf.reshape(input_left, (batch, self.left_features))
        input_right = tf.reshape(input_right, (batch, self.right_features))
        tiled = tf.tile(tf.expand_dims(input_left, axis = 0), (self.out_features,1,1))
        output = tf.transpose(tf.reduce_sum(tf.matmul(tiled, self.U), axis = 2))
        output = output + tf.matmul(input_left, tf.transpose(self.W_l))\
        + tf.matmul(input_right, tf.transpose(self.W_r))
        
        return tf.reshape(output, output_shape)


    
    def decode(self, input_word, input_char, mask, leading_symbolic=0):
        out_arc, out_type, _ = self.forward(input_word, input_char, mask)
        batch = tf.shape(out_arc)[0]
        max_len = tf.shape(out_arc)[1]
        sec_max_len = tf.shape(out_arc)[2]
        out_arc = out_arc + tf.linalg.diag(tf.fill([max_len], -np.inf))
        minus_mask = tf.expand_dims(tf.cast(1 - mask, tf.bool), axis = 2)
        minus_mask = tf.tile(minus_mask, [1, 1, sec_max_len])
        out_arc = tf.where(minus_mask, tf.fill(tf.shape(out_arc), -np.inf), out_arc)
        heads = tf.argmax(out_arc, axis = 1)
        type_h, type_c = out_type
        batch = tf.shape(type_h)[0]
        max_len = tf.shape(type_h)[1]
        batch_index = tf.range(0, batch)
        t = tf.cast(tf.transpose(heads), tf.int32)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0), 
                                               tf.expand_dims(t, axis = 0)], axis = 0))
        type_h = tf.gather_nd(type_h, concatenated)
        out_type = self.bilinear.forward(type_h, type_c)
        out_type = out_type[:, :, leading_symbolic:]
        types = tf.argmax(out_type, axis = 2)
        return heads, types
    
class Model:
    def __init__(
        self,
        learning_rate,
        hidden_size_word,
        cov = 0.0):
        
        self.words = tf.placeholder(tf.int32, (None, None))
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.float32, [None, None])
        self.heads = tf.placeholder(tf.int32, (None, None))
        self.types = tf.placeholder(tf.int32, (None, None))
        self.switch = tf.placeholder(tf.bool, None)
        self.mask = tf.cast(tf.math.not_equal(self.words, 0), tf.float32)
        self.maxlen = tf.shape(self.words)[1]
        self.lengths = tf.count_nonzero(self.words, 1)
        mask = self.mask
        heads = self.heads
        types = self.types
        
        self.arc_h = tf.layers.Dense(hidden_size_word)
        self.arc_c = tf.layers.Dense(hidden_size_word)
        self.attention = BiAAttention(hidden_size_word, hidden_size_word, 1)

        self.type_h = tf.layers.Dense(hidden_size_word)
        self.type_c = tf.layers.Dense(hidden_size_word)
        self.bilinear = BiLinear(hidden_size_word, hidden_size_word, len(tag2idx))
        
        xlnet_model = xlnet.XLNetModel(
            xlnet_config=xlnet_config,
            run_config=xlnet_parameters,
            input_ids=tf.transpose(self.words, [1, 0]),
            seg_ids=tf.transpose(self.segment_ids, [1, 0]),
            input_mask=tf.transpose(self.input_masks, [1, 0]))
        output_layer = xlnet_model.get_sequence_output()
        output_layer = tf.transpose(output_layer, [1, 0, 2])
        
        arc_h = tf.nn.elu(self.arc_h(output_layer))
        arc_c = tf.nn.elu(self.arc_c(output_layer))
        
        type_h = tf.nn.elu(self.type_h(output_layer))
        type_c = tf.nn.elu(self.type_c(output_layer))
        
        out_arc = tf.squeeze(self.attention.forward(arc_h, arc_c, mask_d=self.mask, 
                                                    mask_e=self.mask), axis = 1)
        
        batch = tf.shape(out_arc)[0]
        max_len = tf.shape(out_arc)[1]
        sec_max_len = tf.shape(out_arc)[2]
        batch_index = tf.range(0, batch)
        
        decode_arc = out_arc + tf.linalg.diag(tf.fill([max_len], -np.inf))
        minus_mask = tf.expand_dims(tf.cast(1 - mask, tf.bool), axis = 2)
        minus_mask = tf.tile(minus_mask, [1, 1, sec_max_len])
        decode_arc = tf.where(minus_mask, tf.fill(tf.shape(decode_arc), -np.inf), decode_arc)
        self.heads_seq = tf.argmax(decode_arc, axis = 1)
        self.heads_seq = tf.identity(self.heads_seq, name = 'heads_seq')
        
        t = tf.cast(tf.transpose(self.heads_seq), tf.int32)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0), 
                                               tf.expand_dims(t, axis = 0)], axis = 0))
        type_h = tf.gather_nd(type_h, concatenated)
        out_type = self.bilinear.forward(type_h, type_c)
        self.tags_seq = tf.argmax(out_type, axis = 2)
        self.tags_seq = tf.identity(self.tags_seq, name = 'tags_seq')
        
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            out_type, self.types, self.lengths
        )
        crf_loss = tf.reduce_mean(-log_likelihood)
        self.logits, _ = tf.contrib.crf.crf_decode(
            out_type, transition_params, self.lengths
        )
        self.logits = tf.identity(self.logits, name = 'logits')
        
        batch = tf.shape(out_arc)[0]
        max_len = tf.shape(out_arc)[1]
        batch_index = tf.range(0, batch)
        t = tf.transpose(heads)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0), 
                                               tf.expand_dims(t, axis = 0)], axis = 0))
        type_h = tf.gather_nd(type_h, concatenated)
        out_type = self.bilinear.forward(type_h, type_c)
        minus_inf = -1e8
        minus_mask = (1 - mask) * minus_inf
        out_arc = out_arc + tf.expand_dims(minus_mask, axis = 2) + tf.expand_dims(minus_mask, axis = 1)
        loss_arc = tf.nn.log_softmax(out_arc, dim=1)
        loss_type = tf.nn.log_softmax(out_type, dim=2)
        loss_arc = loss_arc * tf.expand_dims(mask, axis = 2) * tf.expand_dims(mask, axis = 1)
        loss_type = loss_type * tf.expand_dims(mask, axis = 2)
        num = tf.reduce_sum(mask) - tf.cast(batch, tf.float32)
        child_index = tf.tile(tf.expand_dims(tf.range(0, max_len), 1), [1, batch])
        t = tf.transpose(heads)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0),
                                               tf.expand_dims(t, axis = 0),
                                               tf.expand_dims(child_index, axis = 0)], axis = 0))
        loss_arc = tf.gather_nd(loss_arc, concatenated)
        loss_arc = tf.transpose(loss_arc, [1, 0])
        
        t = tf.transpose(types)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0),
                                               tf.expand_dims(child_index, axis = 0),
                                               tf.expand_dims(t, axis = 0)], axis = 0))
        loss_type = tf.gather_nd(loss_type, concatenated)
        loss_type = tf.transpose(loss_type, [1, 0])
        cost = (tf.reduce_sum(-loss_arc) / num) + (tf.reduce_sum(-loss_type) / num)
        
        self.cost = tf.cond(self.switch, lambda: cost + crf_loss, lambda: cost)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        
        mask = tf.sequence_mask(self.lengths, maxlen = self.maxlen)
        
        self.prediction = tf.boolean_mask(self.logits, mask)
        mask_label = tf.boolean_mask(self.types, mask)
        correct_pred = tf.equal(tf.cast(self.prediction, tf.int32), mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        self.prediction = tf.cast(tf.boolean_mask(self.heads_seq, mask), tf.int32)
        mask_label = tf.boolean_mask(self.heads, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy_depends = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [22]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

learning_rate = 2e-5
hidden_size_word = 128

model = Model(learning_rate, hidden_size_word)
sess.run(tf.global_variables_initializer())

Instructions for updating:
reduction_indices is deprecated, use axis instead
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.




INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>
Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
dim is deprecated, use axis instead


In [23]:
import collections
import re

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Compute the union of the current variables and checkpoint variables."""
    assignment_map = {}
    initialized_variable_names = {}

    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match('^(.*):\\d+$', name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    init_vars = tf.train.list_variables(init_checkpoint)

    assignment_map = collections.OrderedDict()
    for x in init_vars:
        (name, var) = (x[0], x[1])
        if name not in name_to_variable:
            continue
        assignment_map[name] = name_to_variable[name]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [24]:
tvars = tf.trainable_variables()
checkpoint = 'xlnet-base/model.ckpt'
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, 
                                                                                checkpoint)

In [25]:
saver = tf.train.Saver(var_list = assignment_map)
saver.restore(sess, checkpoint)

INFO:tensorflow:Restoring parameters from xlnet-base/model.ckpt


In [26]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

batch_x = train_X[:5]
batch_x = pad_sequences(batch_x,padding='post')
batch_y = train_Y[:5]
batch_y = pad_sequences(batch_y,padding='post')
batch_depends = train_depends[:5]
batch_depends = pad_sequences(batch_depends,padding='post')
batch_segments = segments_train[:5]
batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
batch_masks = masks_train[:5]
batch_masks = pad_sequences(batch_masks, padding='post', value = 1)

In [27]:
sess.run([model.accuracy, model.accuracy_depends, model.cost],
        feed_dict = {model.words: batch_x,
                model.types: batch_y,
                model.heads: batch_depends,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks,
                model.switch: False})

[0.04255319, 0.014184397, 128.52495]

In [28]:
sess.run([model.accuracy, model.accuracy_depends, model.cost],
        feed_dict = {model.words: batch_x,
                model.types: batch_y,
                model.heads: batch_depends,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks,
                model.switch: True})

[0.0070921984, 0.04964539, 544.50476]

In [29]:
tags_seq, heads = sess.run(
    [model.logits, model.heads_seq],
    feed_dict = {
        model.words: batch_x,
        model.segment_ids: batch_segments,
        model.input_masks: batch_masks
    },
)
tags_seq[0], heads[0], batch_depends[0]

(array([26,  6,  6, 28, 26, 18, 19, 18, 28,  6, 26, 32, 27, 28, 27,  6, 28,
        19, 19, 28,  6, 28,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=int32),
 array([21,  3, 15, 18, 10, 21, 10, 21,  7, 10,  1, 21,  1,  7,  3,  8,  7,
         9, 10, 18,  8, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]),
 array([ 3,  1,  3,  3,  3,  6,  3,  0,  3,  0,  9, 13, 11,  9, 15, 13, 15,
         3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=int32))

In [30]:
from tqdm import tqdm

epoch = 20
for e in range(epoch):
    train_acc, train_loss = [], []
    test_acc, test_loss = [], []
    train_acc_depends, test_acc_depends = [], []
    
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        batch_x = train_X[i: index]
        batch_x = pad_sequences(batch_x,padding='post')
        batch_y = train_Y[i: index]
        batch_y = pad_sequences(batch_y,padding='post')
        batch_depends = train_depends[i: index]
        batch_depends = pad_sequences(batch_depends,padding='post')
        batch_segments = segments_train[i: index]
        batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
        batch_masks = masks_train[i: index]
        batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
        
        acc_depends, acc, cost, _ = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.words: batch_x,
                model.types: batch_y,
                model.heads: batch_depends,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks,
                model.switch: False
            },
        )
        train_loss.append(cost)
        train_acc.append(acc)
        train_acc_depends.append(acc_depends)
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
        
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x = test_X[i: index]
        batch_x = pad_sequences(batch_x,padding='post')
        batch_y = test_Y[i: index]
        batch_y = pad_sequences(batch_y,padding='post')
        batch_depends = test_depends[i: index]
        batch_depends = pad_sequences(batch_depends,padding='post')
        batch_segments = segments_test[i: index]
        batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
        batch_masks = masks_test[i: index]
        batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
        
        acc_depends, acc, cost = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost],
            feed_dict = {
                model.words: batch_x,
                model.types: batch_y,
                model.heads: batch_depends,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks,
                model.switch: False
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        test_acc_depends.append(acc_depends)
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
    
    
    print(
    'epoch: %d, training loss: %f, training acc: %f, training depends: %f, valid loss: %f, valid acc: %f, valid depends: %f\n'
    % (e, np.mean(train_loss), 
       np.mean(train_acc), 
       np.mean(train_acc_depends), 
       np.mean(test_loss), 
       np.mean(test_acc), 
       np.mean(test_acc_depends)
    ))

train minibatch loop: 100%|██████████| 2519/2519 [15:04<00:00,  2.78it/s, accuracy=0.8, accuracy_depends=0.5, cost=1.95]     
test minibatch loop: 100%|██████████| 630/630 [02:03<00:00,  5.12it/s, accuracy=0.843, accuracy_depends=0.545, cost=2.06]
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 0, training loss: 3.876894, training acc: 0.650480, training depends: 0.419876, valid loss: 2.197435, valid acc: 0.826289, valid depends: 0.512468



train minibatch loop: 100%|██████████| 2519/2519 [15:22<00:00,  2.73it/s, accuracy=0.9, accuracy_depends=0.55, cost=1.35]   
test minibatch loop: 100%|██████████| 630/630 [02:05<00:00,  5.03it/s, accuracy=0.886, accuracy_depends=0.609, cost=1.59]
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 1, training loss: 1.890330, training acc: 0.857705, training depends: 0.550963, valid loss: 1.625166, valid acc: 0.878244, valid depends: 0.601878



train minibatch loop: 100%|██████████| 2519/2519 [15:23<00:00,  2.73it/s, accuracy=0.85, accuracy_depends=0.7, cost=0.788]   
test minibatch loop: 100%|██████████| 630/630 [02:06<00:00,  4.99it/s, accuracy=0.902, accuracy_depends=0.636, cost=1.37] 
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 2, training loss: 1.410951, training acc: 0.887465, training depends: 0.646661, valid loss: 1.243608, valid acc: 0.898173, valid depends: 0.682270



train minibatch loop: 100%|██████████| 2519/2519 [15:23<00:00,  2.73it/s, accuracy=0.75, accuracy_depends=0.75, cost=0.889]  
test minibatch loop: 100%|██████████| 630/630 [02:05<00:00,  5.02it/s, accuracy=0.911, accuracy_depends=0.718, cost=1.1]  
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 3, training loss: 1.064192, training acc: 0.905402, training depends: 0.724641, valid loss: 0.975716, valid acc: 0.909459, valid depends: 0.749798



train minibatch loop: 100%|██████████| 2519/2519 [15:25<00:00,  2.72it/s, accuracy=0.95, accuracy_depends=0.75, cost=0.542]  
test minibatch loop: 100%|██████████| 630/630 [02:05<00:00,  5.02it/s, accuracy=0.941, accuracy_depends=0.748, cost=0.93] 
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 4, training loss: 0.837911, training acc: 0.922502, training depends: 0.774991, valid loss: 0.811241, valid acc: 0.923954, valid depends: 0.790458



train minibatch loop: 100%|██████████| 2519/2519 [15:24<00:00,  2.72it/s, accuracy=0.9, accuracy_depends=0.8, cost=0.952]    
test minibatch loop: 100%|██████████| 630/630 [02:05<00:00,  5.03it/s, accuracy=0.957, accuracy_depends=0.768, cost=0.798]
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 5, training loss: 0.686040, training acc: 0.935296, training depends: 0.807902, valid loss: 0.716174, valid acc: 0.933297, valid depends: 0.809396



train minibatch loop: 100%|██████████| 2519/2519 [15:23<00:00,  2.73it/s, accuracy=0.9, accuracy_depends=0.85, cost=0.566]   
test minibatch loop: 100%|██████████| 630/630 [02:04<00:00,  5.07it/s, accuracy=0.948, accuracy_depends=0.839, cost=0.688]
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 6, training loss: 0.574526, training acc: 0.945672, training depends: 0.831804, valid loss: 0.622911, valid acc: 0.940280, valid depends: 0.835143



train minibatch loop: 100%|██████████| 2519/2519 [15:23<00:00,  2.73it/s, accuracy=0.95, accuracy_depends=0.8, cost=0.212]   
test minibatch loop: 100%|██████████| 630/630 [01:59<00:00,  5.28it/s, accuracy=0.95, accuracy_depends=0.841, cost=0.594] 
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 7, training loss: 0.491905, training acc: 0.953639, training depends: 0.848994, valid loss: 0.572177, valid acc: 0.944793, valid depends: 0.846656



train minibatch loop: 100%|██████████| 2519/2519 [15:28<00:00,  2.71it/s, accuracy=1, accuracy_depends=0.8, cost=0.237]      
test minibatch loop: 100%|██████████| 630/630 [02:06<00:00,  4.97it/s, accuracy=0.961, accuracy_depends=0.852, cost=0.484]
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 8, training loss: 0.424984, training acc: 0.960825, training depends: 0.862493, valid loss: 0.532216, valid acc: 0.949322, valid depends: 0.856430



train minibatch loop: 100%|██████████| 2519/2519 [15:29<00:00,  2.71it/s, accuracy=1, accuracy_depends=0.85, cost=0.081]     
test minibatch loop: 100%|██████████| 630/630 [02:07<00:00,  4.96it/s, accuracy=0.957, accuracy_depends=0.843, cost=0.57] 
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 9, training loss: 0.372532, training acc: 0.966455, training depends: 0.872990, valid loss: 0.523298, valid acc: 0.953737, valid depends: 0.851046



train minibatch loop: 100%|██████████| 2519/2519 [15:27<00:00,  2.72it/s, accuracy=1, accuracy_depends=0.85, cost=0.0546]     
test minibatch loop: 100%|██████████| 630/630 [02:06<00:00,  4.99it/s, accuracy=0.959, accuracy_depends=0.845, cost=0.579]
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 10, training loss: 0.329999, training acc: 0.971050, training depends: 0.882039, valid loss: 0.489395, valid acc: 0.958892, valid depends: 0.860195



train minibatch loop: 100%|██████████| 2519/2519 [15:26<00:00,  2.72it/s, accuracy=1, accuracy_depends=0.8, cost=0.303]       
test minibatch loop: 100%|██████████| 630/630 [02:06<00:00,  4.97it/s, accuracy=0.957, accuracy_depends=0.882, cost=0.386]
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 11, training loss: 0.296270, training acc: 0.975037, training depends: 0.888372, valid loss: 0.414677, valid acc: 0.960939, valid depends: 0.883841



train minibatch loop: 100%|██████████| 2519/2519 [15:27<00:00,  2.72it/s, accuracy=0.95, accuracy_depends=0.85, cost=0.0516]  
test minibatch loop: 100%|██████████| 630/630 [02:06<00:00,  4.97it/s, accuracy=0.948, accuracy_depends=0.882, cost=0.456]
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 12, training loss: 0.263290, training acc: 0.978965, training depends: 0.895745, valid loss: 0.402342, valid acc: 0.963851, valid depends: 0.887353



train minibatch loop: 100%|██████████| 2519/2519 [15:27<00:00,  2.71it/s, accuracy=1, accuracy_depends=0.85, cost=0.00413]    
test minibatch loop: 100%|██████████| 630/630 [02:07<00:00,  4.95it/s, accuracy=0.975, accuracy_depends=0.902, cost=0.358] 
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 13, training loss: 0.238861, training acc: 0.982122, training depends: 0.901446, valid loss: 0.393664, valid acc: 0.968449, valid depends: 0.882857



train minibatch loop: 100%|██████████| 2519/2519 [15:33<00:00,  2.70it/s, accuracy=1, accuracy_depends=0.85, cost=0.00173]    
test minibatch loop: 100%|██████████| 630/630 [02:06<00:00,  4.96it/s, accuracy=0.97, accuracy_depends=0.914, cost=0.349]  
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 14, training loss: 0.222116, training acc: 0.984062, training depends: 0.904335, valid loss: 0.413641, valid acc: 0.969395, valid depends: 0.878797



train minibatch loop: 100%|██████████| 2519/2519 [15:24<00:00,  2.73it/s, accuracy=1, accuracy_depends=0.85, cost=0.0861]     
test minibatch loop: 100%|██████████| 630/630 [02:02<00:00,  5.16it/s, accuracy=0.977, accuracy_depends=0.907, cost=0.311] 
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 15, training loss: 0.206292, training acc: 0.985854, training depends: 0.908097, valid loss: 0.333951, valid acc: 0.970574, valid depends: 0.901331



train minibatch loop: 100%|██████████| 2519/2519 [15:01<00:00,  2.80it/s, accuracy=0.95, accuracy_depends=0.85, cost=0.169]   
test minibatch loop: 100%|██████████| 630/630 [02:01<00:00,  5.17it/s, accuracy=0.977, accuracy_depends=0.92, cost=0.227]  
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 16, training loss: 0.190376, training acc: 0.986826, training depends: 0.911747, valid loss: 0.326896, valid acc: 0.971972, valid depends: 0.903429



train minibatch loop: 100%|██████████| 2519/2519 [15:00<00:00,  2.80it/s, accuracy=1, accuracy_depends=0.85, cost=0.00131]    
test minibatch loop: 100%|██████████| 630/630 [02:02<00:00,  5.15it/s, accuracy=0.964, accuracy_depends=0.916, cost=0.278] 
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 17, training loss: 0.177621, training acc: 0.987951, training depends: 0.915344, valid loss: 0.332637, valid acc: 0.971900, valid depends: 0.902321



train minibatch loop: 100%|██████████| 2519/2519 [15:00<00:00,  2.80it/s, accuracy=1, accuracy_depends=0.85, cost=0.000426]   
test minibatch loop: 100%|██████████| 630/630 [02:02<00:00,  5.15it/s, accuracy=0.966, accuracy_depends=0.93, cost=0.245]  
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 18, training loss: 0.166710, training acc: 0.989098, training depends: 0.917499, valid loss: 0.322491, valid acc: 0.974446, valid depends: 0.907440



train minibatch loop: 100%|██████████| 2519/2519 [15:01<00:00,  2.79it/s, accuracy=1, accuracy_depends=0.85, cost=0.00151]    
test minibatch loop: 100%|██████████| 630/630 [02:03<00:00,  5.12it/s, accuracy=0.973, accuracy_depends=0.909, cost=0.285] 

epoch: 19, training loss: 0.155884, training acc: 0.989576, training depends: 0.920570, valid loss: 0.300692, valid acc: 0.976090, valid depends: 0.909638






In [31]:
from tqdm import tqdm

epoch = 5
for e in range(epoch):
    train_acc, train_loss = [], []
    test_acc, test_loss = [], []
    train_acc_depends, test_acc_depends = [], []
    
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        batch_x = train_X[i: index]
        batch_x = pad_sequences(batch_x,padding='post')
        batch_y = train_Y[i: index]
        batch_y = pad_sequences(batch_y,padding='post')
        batch_depends = train_depends[i: index]
        batch_depends = pad_sequences(batch_depends,padding='post')
        batch_segments = segments_train[i: index]
        batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
        batch_masks = masks_train[i: index]
        batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
        
        acc_depends, acc, cost, _ = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.words: batch_x,
                model.types: batch_y,
                model.heads: batch_depends,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks,
                model.switch: True
            },
        )
        train_loss.append(cost)
        train_acc.append(acc)
        train_acc_depends.append(acc_depends)
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
        
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x = test_X[i: index]
        batch_x = pad_sequences(batch_x,padding='post')
        batch_y = test_Y[i: index]
        batch_y = pad_sequences(batch_y,padding='post')
        batch_depends = test_depends[i: index]
        batch_depends = pad_sequences(batch_depends,padding='post')
        batch_segments = segments_test[i: index]
        batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
        batch_masks = masks_test[i: index]
        batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
        
        acc_depends, acc, cost = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost],
            feed_dict = {
                model.words: batch_x,
                model.types: batch_y,
                model.heads: batch_depends,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks,
                model.switch: True
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        test_acc_depends.append(acc_depends)
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
    
    
    print(
    'epoch: %d, training loss: %f, training acc: %f, training depends: %f, valid loss: %f, valid acc: %f, valid depends: %f\n'
    % (e, np.mean(train_loss), 
       np.mean(train_acc), 
       np.mean(train_acc_depends), 
       np.mean(test_loss), 
       np.mean(test_acc), 
       np.mean(test_acc_depends)
    ))

train minibatch loop: 100%|██████████| 2519/2519 [15:03<00:00,  2.79it/s, accuracy=0.95, accuracy_depends=0.85, cost=0.854]  
test minibatch loop: 100%|██████████| 630/630 [02:02<00:00,  5.15it/s, accuracy=0.989, accuracy_depends=0.895, cost=2.23] 
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 0, training loss: 1.406487, training acc: 0.990304, training depends: 0.911989, valid loss: 3.639354, valid acc: 0.980902, valid depends: 0.892664



train minibatch loop: 100%|██████████| 2519/2519 [14:55<00:00,  2.81it/s, accuracy=1, accuracy_depends=0.85, cost=0.000302]  
test minibatch loop: 100%|██████████| 630/630 [02:00<00:00,  5.24it/s, accuracy=0.991, accuracy_depends=0.936, cost=2.01] 
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 1, training loss: 0.790412, training acc: 0.994914, training depends: 0.917277, valid loss: 3.557691, valid acc: 0.982399, valid depends: 0.902226



train minibatch loop: 100%|██████████| 2519/2519 [15:08<00:00,  2.77it/s, accuracy=1, accuracy_depends=0.85, cost=0.00217]   
test minibatch loop: 100%|██████████| 630/630 [02:05<00:00,  5.02it/s, accuracy=0.993, accuracy_depends=0.918, cost=2.68] 
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 2, training loss: 0.662908, training acc: 0.995871, training depends: 0.917625, valid loss: 3.628110, valid acc: 0.982800, valid depends: 0.901144



train minibatch loop: 100%|██████████| 2519/2519 [14:50<00:00,  2.83it/s, accuracy=1, accuracy_depends=0.85, cost=0.00313]   
test minibatch loop: 100%|██████████| 630/630 [02:00<00:00,  5.23it/s, accuracy=0.989, accuracy_depends=0.93, cost=3.5]   
train minibatch loop:   0%|          | 0/2519 [00:00<?, ?it/s]

epoch: 3, training loss: 0.599805, training acc: 0.996370, training depends: 0.917015, valid loss: 3.422855, valid acc: 0.983998, valid depends: 0.900396



train minibatch loop: 100%|██████████| 2519/2519 [14:50<00:00,  2.83it/s, accuracy=1, accuracy_depends=0.85, cost=0.00308]   
test minibatch loop: 100%|██████████| 630/630 [02:00<00:00,  5.24it/s, accuracy=0.995, accuracy_depends=0.9, cost=0.604]  

epoch: 4, training loss: 0.545647, training acc: 0.996865, training depends: 0.917185, valid loss: 3.373970, valid acc: 0.984332, valid depends: 0.899482






In [35]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'xlnet-base-dependency/model.ckpt')

'xlnet-base-dependency/model.ckpt'

In [36]:
kwargs = dict(
      is_training=False,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.0,
      dropatt=0.0,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clamp_len=-1)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(json_path='xlnet-base/config.json')

In [37]:
learning_rate = 2e-5
hidden_size_word = 128

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(learning_rate, hidden_size_word)
sess.run(tf.global_variables_initializer())

INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>




In [38]:
saver = tf.train.Saver(tf.trainable_variables())
saver.restore(sess, 'xlnet-base-dependency/model.ckpt')

INFO:tensorflow:Restoring parameters from xlnet-base-dependency/model.ckpt


In [39]:
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(idx2tag[p])
        out.append(out_i)
    return out

In [40]:
def evaluate(heads_pred, types_pred, heads, types, lengths,
             symbolic_root=False, symbolic_end=False):
    batch_size, _ = heads_pred.shape
    ucorr = 0.
    lcorr = 0.
    total = 0.
    ucomplete_match = 0.
    lcomplete_match = 0.

    corr_root = 0.
    total_root = 0.
    start = 1 if symbolic_root else 0
    end = 1 if symbolic_end else 0
    for i in range(batch_size):
        ucm = 1.
        lcm = 1.
        for j in range(start, lengths[i] - end):

            total += 1
            if heads[i, j] == heads_pred[i, j]:
                ucorr += 1
                if types[i, j] == types_pred[i, j]:
                    lcorr += 1
                else:
                    lcm = 0
            else:
                ucm = 0
                lcm = 0

            if heads[i, j] == 0:
                total_root += 1
                corr_root += 1 if heads_pred[i, j] == 0 else 0

        ucomplete_match += ucm
        lcomplete_match += lcm
    
    return ucorr / total, lcorr / total, corr_root / total_root

In [41]:
arcs, types, roots = [], [], []
real_Y, predict_Y = [], []

for i in tqdm(range(0, len(test_X), batch_size)):
    index = min(i + batch_size, len(test_X))
    batch_x = test_X[i: index]
    batch_x = pad_sequences(batch_x,padding='post')
    batch_y = test_Y[i: index]
    batch_y = pad_sequences(batch_y,padding='post')
    batch_depends = test_depends[i: index]
    batch_depends = pad_sequences(batch_depends,padding='post')
    batch_segments = segments_test[i: index]
    batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
    batch_masks = masks_test[i: index]
    batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
    
    tags_seq, heads = sess.run(
        [model.logits, model.heads_seq],
        feed_dict = {
            model.words: batch_x,
            model.segment_ids: batch_segments,
            model.input_masks: batch_masks
        },
    )
    
    arc_accuracy, type_accuracy, root_accuracy = evaluate(heads - 1, tags_seq, batch_depends - 1, batch_y, 
            np.count_nonzero(batch_x, axis = 1))
    arcs.append(arc_accuracy)
    types.append(type_accuracy)
    roots.append(root_accuracy)
    predicted = pred2label(tags_seq)
    real = pred2label(batch_y)
    predict_Y.extend(predicted)
    real_Y.extend(real)

100%|██████████| 630/630 [01:56<00:00,  5.39it/s]


In [42]:
temp_real_Y = []
for r in real_Y:
    temp_real_Y.extend(r)
    
temp_predict_Y = []
for r in predict_Y:
    temp_predict_Y.extend(r)

In [43]:
from sklearn.metrics import classification_report
print(classification_report(temp_real_Y, temp_predict_Y, digits = 5))

               precision    recall  f1-score   support

          PAD    0.99998   1.00000   0.99999    632972
            X    1.00000   0.99997   0.99999    143586
          acl    0.98091   0.98226   0.98158      5806
        advcl    0.97098   0.95161   0.96120      2356
       advmod    0.98802   0.97806   0.98302      9527
         amod    0.95966   0.97100   0.96530      8208
        appos    0.98846   0.98947   0.98896      4936
          aux    1.00000   1.00000   1.00000        10
         case    0.99454   0.99110   0.99282     21128
           cc    0.98704   0.99518   0.99109      6429
        ccomp    0.89091   0.97313   0.93021       856
     compound    0.98091   0.96643   0.97362     13079
compound:plur    0.99068   0.98401   0.98733      1188
         conj    0.98303   0.99214   0.98756      8524
          cop    0.98664   0.99071   0.98867      1938
        csubj    0.96000   0.96000   0.96000        50
   csubj:pass    0.95652   0.91667   0.93617        24
         

In [44]:
print('arc accuracy:', np.mean(arcs))
print('types accuracy:', np.mean(types))
print('root accuracy:', np.mean(roots))

arc accuracy: 0.9310084738376598
types accuracy: 0.9258795751889828
root accuracy: 0.9474206349206349


In [45]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or '_seq' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name
        or 'self/Softmax' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'global_step' not in n.name
        and 'adam' not in n.name
        and 'gradients/bert' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Placeholder_3',
 'Placeholder_4',
 'Placeholder_5',
 'W_d',
 'W_e',
 'U',
 'U-bi',
 'Wl',
 'Wr',
 'model/transformer/r_w_bias',
 'model/transformer/r_r_bias',
 'model/transformer/word_embedding/lookup_table',
 'model/transformer/r_s_bias',
 'model/transformer/seg_embed',
 'model/transformer/layer_0/rel_attn/q/kernel',
 'model/transformer/layer_0/rel_attn/k/kernel',
 'model/transformer/layer_0/rel_attn/v/kernel',
 'model/transformer/layer_0/rel_attn/r/kernel',
 'model/transformer/layer_0/rel_attn/o/kernel',
 'model/transformer/layer_0/rel_attn/LayerNorm/gamma',
 'model/transformer/layer_0/ff/layer_1/kernel',
 'model/transformer/layer_0/ff/layer_1/bias',
 'model/transformer/layer_0/ff/layer_2/kernel',
 'model/transformer/layer_0/ff/layer_2/bias',
 'model/transformer/layer_0/ff/LayerNorm/gamma',
 'model/transformer/layer_1/rel_attn/q/kernel',
 'model/transformer/layer_1/rel_attn/k/kernel',
 'model/transformer/layer_1/rel_attn/v/kernel'

In [46]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [47]:
freeze_graph('xlnet-base-dependency', strings)

INFO:tensorflow:Restoring parameters from xlnet-base-dependency/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 176 variables.
INFO:tensorflow:Converted 176 variables to const ops.
8206 ops in the final graph.


In [48]:
def merge_sentencepiece_tokens_tagging(x, y):
    new_paired_tokens = []
    n_tokens = len(x)
    rejected = ['<cls>', '<sep>']

    i = 0

    while i < n_tokens:

        current_token, current_label = x[i], y[i]
        if not current_token.startswith('▁') and current_token not in rejected:
            previous_token, previous_label = new_paired_tokens.pop()
            merged_token = previous_token
            merged_label = [previous_label]
            while (
                not current_token.startswith('▁')
                and current_token not in rejected
            ):
                merged_token = merged_token + current_token.replace('▁', '')
                merged_label.append(current_label)
                i = i + 1
                current_token, current_label = x[i], y[i]
            merged_label = merged_label[0]
            new_paired_tokens.append((merged_token, merged_label))

        else:
            new_paired_tokens.append((current_token, current_label))
            i = i + 1

    words = [
        i[0].replace('▁', '')
        for i in new_paired_tokens
        if i[0] not in ['<cls>', '<sep>']
    ]
    labels = [i[1] for i in new_paired_tokens if i[0] not in ['<cls>', '<sep>']]
    return words, labels

In [49]:
string = 'KUALA LUMPUR: Sempena sambutan Aidilfitri minggu depan, Perdana Menteri Tun Dr Mahathir Mohamad dan Menteri Pengangkutan Anthony Loke Siew Fook menitipkan pesanan khas kepada orang ramai yang mahu pulang ke kampung halaman masing-masing. Dalam video pendek terbitan Jabatan Keselamatan Jalan Raya (JKJR) itu, Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar  sekiranya mengantuk ketika memandu.'

import re

def entities_textcleaning(string, lowering = False):
    """
    use by entities recognition, pos recognition and dependency parsing
    """
    string = re.sub('[^A-Za-z0-9\-\/():,. ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    original_string = string.split()
    if lowering:
        string = string.lower()
    string = [
        (original_string[no], word.title() if word.isupper() else word)
        for no, word in enumerate(string.split())
        if len(word)
    ]
    return [s[0] for s in string], [s[1] for s in string]

def parse_X(left):
    left = ' '.join(left)
    bert_tokens = tokenize_fn(left)
    bert_tokens.extend([4, 3])
    segment = [0] * (len(bert_tokens) - 1) + [SEG_ID_CLS]
    input_mask = [0] * len(segment)
    s_tokens = [sp_model.IdToPiece(i) for i in bert_tokens]
    return bert_tokens, segment, input_mask, s_tokens

sequence = entities_textcleaning(string)[1]
print(sequence, len(sequence))
parsed_sequence, segment_sequence, mask_sequence, xlnet_sequence = parse_X(sequence)
len(parsed_sequence)

['Kuala', 'Lumpur:', 'Sempena', 'sambutan', 'Aidilfitri', 'minggu', 'depan,', 'Perdana', 'Menteri', 'Tun', 'Dr', 'Mahathir', 'Mohamad', 'dan', 'Menteri', 'Pengangkutan', 'Anthony', 'Loke', 'Siew', 'Fook', 'menitipkan', 'pesanan', 'khas', 'kepada', 'orang', 'ramai', 'yang', 'mahu', 'pulang', 'ke', 'kampung', 'halaman', 'masing-masing.', 'Dalam', 'video', 'pendek', 'terbitan', 'Jabatan', 'Keselamatan', 'Jalan', 'Raya', '(Jkjr)', 'itu,', 'Dr', 'Mahathir', 'menasihati', 'mereka', 'supaya', 'berhenti', 'berehat', 'dan', 'tidur', 'sebentar', 'sekiranya', 'mengantuk', 'ketika', 'memandu.'] 57


73

In [50]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

g = load_graph('xlnet-base-dependency/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
seg = g.get_tensor_by_name('import/Placeholder_1:0')
m = g.get_tensor_by_name('import/Placeholder_2:0')
heads_seq = g.get_tensor_by_name('import/heads_seq:0')
tags_seq = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)



In [51]:
h, t = test_sess.run([heads_seq, tags_seq],
        feed_dict = {
            x: [parsed_sequence],
            seg: [segment_sequence],
            m: [mask_sequence],
        },
)
h = h[0] - 1
t = [idx2tag[d] for d in t[0]]
merged_h = merge_sentencepiece_tokens_tagging(xlnet_sequence, h)
merged_t = merge_sentencepiece_tokens_tagging(xlnet_sequence, t)

In [52]:
print(list(zip(merged_h[0], merged_h[1])))

[('Kuala', 23), ('Lumpur:', 1), ('Sempena', 5), ('sambutan', 23), ('Aidilfitri', 5), ('minggu', 6), ('depan,', 7), ('Perdana', 23), ('Menteri', 10), ('Tun', 11), ('Dr', 12), ('Mahathir', 21), ('Mohamad', 21), ('dan', 16), ('Menteri', 10), ('Pengangkutan', 17), ('Anthony', 24), ('Loke', 19), ('Siew', 21), ('Fook', 21), ('menitipkan', 0), ('pesanan', 23), ('khas', 24), ('kepada', 27), ('orang', 24), ('ramai', 27), ('yang', 31), ('mahu', 31), ('pulang', 27), ('ke', 33), ('kampung', 31), ('halaman', 33), ('masing-masing.', 33), ('Dalam', 38), ('video', 50), ('pendek', 38), ('terbitan', 38), ('Jabatan', 39), ('Keselamatan', 40), ('Jalan', 41), ('Raya', 41), ('(Jkjr)', 50), ('itu,', 38), ('Dr', 50), ('Mahathir', 50), ('menasihati', 23), ('mereka', 50), ('supaya', 52), ('berhenti', 50), ('berehat', 52), ('dan', 54), ('tidur', 52), ('sebentar', 56), ('sekiranya', 58), ('mengantuk', 54), ('ketika', 59), ('memandu.', 58)]


In [53]:
print(list(zip(merged_t[0], merged_t[1])))

[('Kuala', 'nsubj'), ('Lumpur:', 'flat'), ('Sempena', 'case'), ('sambutan', 'obl'), ('Aidilfitri', 'compound'), ('minggu', 'compound'), ('depan,', 'compound'), ('Perdana', 'nsubj'), ('Menteri', 'flat'), ('Tun', 'flat'), ('Dr', 'flat'), ('Mahathir', 'flat'), ('Mohamad', 'flat'), ('dan', 'cc'), ('Menteri', 'conj'), ('Pengangkutan', 'flat'), ('Anthony', 'flat'), ('Loke', 'flat'), ('Siew', 'flat'), ('Fook', 'flat'), ('menitipkan', 'root'), ('pesanan', 'obj'), ('khas', 'amod'), ('kepada', 'case'), ('orang', 'nmod'), ('ramai', 'compound'), ('yang', 'nsubj'), ('mahu', 'advmod'), ('pulang', 'acl'), ('ke', 'case'), ('kampung', 'obl'), ('halaman', 'compound'), ('masing-masing.', 'det'), ('Dalam', 'case'), ('video', 'obl'), ('pendek', 'amod'), ('terbitan', 'compound'), ('Jabatan', 'flat'), ('Keselamatan', 'flat'), ('Jalan', 'flat'), ('Raya', 'flat'), ('(Jkjr)', 'punct'), ('itu,', 'det'), ('Dr', 'nsubj'), ('Mahathir', 'flat'), ('menasihati', 'parataxis'), ('mereka', 'obj'), ('supaya', 'case'), ('b

In [54]:
import boto3

bucketName = 'huseinhouse-storage'
Key = 'xlnet-base-dependency/frozen_model.pb'
outPutname = "v34/dependency/xlnet-base-dependency.pb"

s3 = boto3.client('s3')

s3.upload_file(Key,bucketName,outPutname)