In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [2]:
import xlnet
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import model_utils
import pickle
import json
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences




In [3]:
import sentencepiece as spm
from prepro_utils import preprocess_text, encode_ids

sp_model = spm.SentencePieceProcessor()
sp_model.Load('xlnet-base/sp10m.cased.v9.model')

def tokenize_fn(text):
    text = preprocess_text(text, lower= False)
    return encode_ids(sp_model, text)

In [4]:
with open('../bert/session-pos.pkl', 'rb') as fopen:
    data = pickle.load(fopen)
data.keys()

dict_keys(['train_X', 'test_X', 'train_Y', 'test_Y'])

In [5]:
train_X = data['train_X']
test_X = data['test_X']
train_Y = data['train_Y']
test_Y = data['test_Y']

In [6]:
with open('../bert/dictionary-pos.json') as fopen:
    dictionary = json.load(fopen)
dictionary.keys()

dict_keys(['word2idx', 'idx2word', 'tag2idx', 'idx2tag', 'char2idx'])

In [7]:
word2idx = dictionary['word2idx']
idx2word = {int(k): v for k, v in dictionary['idx2word'].items()}
tag2idx = dictionary['tag2idx']
idx2tag = {int(k): v for k, v in dictionary['idx2tag'].items()}
char2idx = dictionary['char2idx']

In [8]:
from tqdm import tqdm

SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

special_symbols = {
    "<unk>"  : 0,
    "<s>"    : 1,
    "</s>"   : 2,
    "<cls>"  : 3,
    "<sep>"  : 4,
    "<pad>"  : 5,
    "<mask>" : 6,
    "<eod>"  : 7,
    "<eop>"  : 8,
}

VOCAB_SIZE = 32000
UNK_ID = special_symbols["<unk>"]
CLS_ID = special_symbols["<cls>"]
SEP_ID = special_symbols["<sep>"]
MASK_ID = special_symbols["<mask>"]
EOD_ID = special_symbols["<eod>"]

def XY(left_train, right_train):
    X, Y, segments, masks = [], [], [], []
    for i in tqdm(range(len(left_train))):
        left = [idx2word[d] for d in left_train[i]]
        right = [idx2tag[d] for d in right_train[i]]
        bert_tokens = []
        y = []
        for no, orig_token in enumerate(left):
            t = tokenize_fn(orig_token)
            bert_tokens.extend(t)
            if len(t):
                y.append(right[no])
            y.extend(['X'] * (len(t) - 1))
        bert_tokens.extend([4, 3])
        segment = [0] * (len(bert_tokens) - 1) + [SEG_ID_CLS]
        input_mask = [0] * len(segment)
        y.extend(['PAD', 'PAD'])
        y = [tag2idx[i] for i in y]
        if len(bert_tokens) != len(y):
            print(i)
        X.append(bert_tokens)
        Y.append(y)
        segments.append(segment)
        masks.append(input_mask)
    return X, Y, segments, masks

In [9]:
train_X, train_Y, train_segments, train_masks = XY(train_X, train_Y)
test_X, test_Y, test_segments, test_masks = XY(test_X, test_Y)

100%|██████████| 97488/97488 [01:21<00:00, 1199.73it/s]
100%|██████████| 24335/24335 [00:20<00:00, 1163.33it/s]


In [10]:
len(train_X[0]), len(train_X[0]), len(train_segments[0]), len(train_masks[0])

(68, 68, 68, 68)

In [11]:
kwargs = dict(
      is_training=True,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.1,
      dropatt=0.1,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clamp_len=-1)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(json_path='xlnet-base/config.json')




In [12]:
epoch = 5
batch_size = 32
warmup_proportion = 0.1
num_train_steps = int(len(train_X) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)
print(num_train_steps, num_warmup_steps)

training_parameters = dict(
      decay_method = 'poly',
      train_steps = num_train_steps,
      learning_rate = 2e-5,
      warmup_steps = num_warmup_steps,
      min_lr_ratio = 0.0,
      weight_decay = 0.00,
      adam_epsilon = 1e-8,
      num_core_per_host = 1,
      lr_layer_decay_rate = 1,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.0,
      dropatt=0.0,
      init='normal',
      init_range=0.1,
      init_std=0.02,
      clip = 1.0,
      clamp_len=-1,)

15232 1523


In [13]:
class Parameter:
    def __init__(self, decay_method, warmup_steps, weight_decay, adam_epsilon, 
                num_core_per_host, lr_layer_decay_rate, use_tpu, learning_rate, train_steps,
                min_lr_ratio, clip, **kwargs):
        self.decay_method = decay_method
        self.warmup_steps = warmup_steps
        self.weight_decay = weight_decay
        self.adam_epsilon = adam_epsilon
        self.num_core_per_host = num_core_per_host
        self.lr_layer_decay_rate = lr_layer_decay_rate
        self.use_tpu = use_tpu
        self.learning_rate = learning_rate
        self.train_steps = train_steps
        self.min_lr_ratio = min_lr_ratio
        self.clip = clip
        
training_parameters = Parameter(**training_parameters)

In [14]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.float32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.lengths = tf.count_nonzero(self.X, 1)
        self.maxlen = tf.shape(self.X)[1]
        
        xlnet_model = xlnet.XLNetModel(
            xlnet_config=xlnet_config,
            run_config=xlnet_parameters,
            input_ids=tf.transpose(self.X, [1, 0]),
            seg_ids=tf.transpose(self.segment_ids, [1, 0]),
            input_mask=tf.transpose(self.input_masks, [1, 0]))
        output_layer = xlnet_model.get_sequence_output()
        output_layer = tf.transpose(output_layer, [1, 0, 2])
        
        logits = tf.layers.dense(output_layer, dimension_output)
        y_t = self.Y
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            logits, y_t, self.lengths
        )
        self.cost = tf.reduce_mean(-log_likelihood)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        mask = tf.sequence_mask(self.lengths, maxlen = self.maxlen)
        self.tags_seq, tags_score = tf.contrib.crf.crf_decode(
            logits, transition_params, self.lengths
        )
        self.tags_seq = tf.identity(self.tags_seq, name = 'logits')

        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(self.tags_seq, mask)
        mask_label = tf.boolean_mask(y_t, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [15]:
dimension_output = len(tag2idx)
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())

Instructions for updating:
reduction_indices is deprecated, use axis instead



INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>

Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `layer.__call__` method instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


In [16]:
import collections
import re

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Compute the union of the current variables and checkpoint variables."""
    assignment_map = {}
    initialized_variable_names = {}

    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match('^(.*):\\d+$', name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    init_vars = tf.train.list_variables(init_checkpoint)

    assignment_map = collections.OrderedDict()
    for x in init_vars:
        (name, var) = (x[0], x[1])
        if name not in name_to_variable:
            continue
        assignment_map[name] = name_to_variable[name]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [17]:
tvars = tf.trainable_variables()
checkpoint = 'xlnet-base/model.ckpt'
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, 
                                                                                checkpoint)

In [18]:
saver = tf.train.Saver(var_list = assignment_map)
saver.restore(sess, checkpoint)

INFO:tensorflow:Restoring parameters from xlnet-base/model.ckpt


In [19]:
def merge_sentencepiece_tokens_tagging(x, y):
    new_paired_tokens = []
    n_tokens = len(x)
    rejected = ['<cls>', '<sep>']

    i = 0

    while i < n_tokens:

        current_token, current_label = x[i], y[i]
        if not current_token.startswith('▁') and current_token not in rejected:
            previous_token, previous_label = new_paired_tokens.pop()
            merged_token = previous_token
            merged_label = [previous_label]
            while (
                not current_token.startswith('▁')
                and current_token not in rejected
            ):
                merged_token = merged_token + current_token.replace('▁', '')
                merged_label.append(current_label)
                i = i + 1
                current_token, current_label = x[i], y[i]
            merged_label = merged_label[0]
            new_paired_tokens.append((merged_token, merged_label))

        else:
            new_paired_tokens.append((current_token, current_label))
            i = i + 1

    words = [
        i[0].replace('▁', '')
        for i in new_paired_tokens
        if i[0] not in ['<cls>', '<sep>']
    ]
    labels = [i[1] for i in new_paired_tokens if i[0] not in ['<cls>', '<sep>']]
    return words, labels

In [20]:
string = 'KUALA LUMPUR: Sempena sambutan Aidilfitri minggu depan, Perdana Menteri Tun Dr Mahathir Mohamad dan Menteri Pengangkutan Anthony Loke Siew Fook menitipkan pesanan khas kepada orang ramai yang mahu pulang ke kampung halaman masing-masing. Dalam video pendek terbitan Jabatan Keselamatan Jalan Raya (JKJR) itu, Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar  sekiranya mengantuk ketika memandu.'

import re

def entities_textcleaning(string, lowering = False):
    """
    use by entities recognition, pos recognition and dependency parsing
    """
    string = re.sub('[^A-Za-z0-9\-\/() ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    original_string = string.split()
    if lowering:
        string = string.lower()
    string = [
        (original_string[no], word.title() if word.isupper() else word)
        for no, word in enumerate(string.split())
        if len(word)
    ]
    return [s[0] for s in string], [s[1] for s in string]

def parse_X(left):
    bert_tokens = []
    for no, orig_token in enumerate(left):
        t = tokenize_fn(orig_token)
        bert_tokens.extend(t)
    bert_tokens.extend([4, 3])
    segment = [0] * (len(bert_tokens) - 1) + [SEG_ID_CLS]
    input_mask = [0] * len(segment)
    s_tokens = [sp_model.IdToPiece(i) for i in bert_tokens]
    return bert_tokens, segment, input_mask, s_tokens

sequence = entities_textcleaning(string)[1]
parsed_sequence, segment_sequence, mask_sequence, xlnet_sequence = parse_X(sequence)

In [21]:
predicted = sess.run(model.tags_seq,
                feed_dict = {
                    model.X: [parsed_sequence],
                    model.segment_ids: [segment_sequence],
                    model.input_masks: [mask_sequence],
                },
        )[0]
merged = merge_sentencepiece_tokens_tagging(xlnet_sequence, [idx2tag[d] for d in predicted])
list(zip(merged[0], merged[1]))

[('Kuala', 'ADP'),
 ('Lumpur', 'NUM'),
 ('Sempena', 'NUM'),
 ('sambutan', 'NUM'),
 ('Aidilfitri', 'AUX'),
 ('minggu', 'DET'),
 ('depan', 'ADV'),
 ('Perdana', 'NOUN'),
 ('Menteri', 'CCONJ'),
 ('Tun', 'PROPN'),
 ('Dr', 'ADV'),
 ('Mahathir', 'NUM'),
 ('Mohamad', 'CCONJ'),
 ('dan', 'CCONJ'),
 ('Menteri', 'NOUN'),
 ('Pengangkutan', 'VERB'),
 ('Anthony', 'NUM'),
 ('Loke', 'SYM'),
 ('Siew', 'SYM'),
 ('Fook', 'SYM'),
 ('menitipkan', 'VERB'),
 ('pesanan', 'VERB'),
 ('khas', 'VERB'),
 ('kepada', 'VERB'),
 ('orang', 'ADV'),
 ('ramai', 'ADJ'),
 ('yang', 'SYM'),
 ('mahu', 'SYM'),
 ('pulang', 'SCONJ'),
 ('ke', 'ADP'),
 ('kampung', 'ADP'),
 ('halaman', 'VERB'),
 ('masing-masing', 'NUM'),
 ('Dalam', 'SCONJ'),
 ('video', 'NOUN'),
 ('pendek', 'PUNCT'),
 ('terbitan', 'VERB'),
 ('Jabatan', 'SYM'),
 ('Keselamatan', 'VERB'),
 ('Jalan', 'DET'),
 ('Raya', 'DET'),
 ('(Jkjr)', 'PAD'),
 ('itu', 'SCONJ'),
 ('Dr', 'SCONJ'),
 ('Mahathir', 'NOUN'),
 ('menasihati', 'PART'),
 ('mereka', 'ADV'),
 ('supaya', 'SYM'),
 ('

In [22]:
import time

for e in range(5):
    lasttime = time.time()
    train_acc, train_loss, test_acc, test_loss = [], [], [], []
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        batch_x = train_X[i : index]
        batch_y = train_Y[i : index]
        batch_masks = train_masks[i : index]
        batch_segments = train_segments[i : index]
        batch_x = pad_sequences(batch_x, padding='post')
        batch_y = pad_sequences(batch_y, padding='post')
        batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
        batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks,
            },
        )
        assert not np.isnan(cost)
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
    
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x = test_X[i : index]
        batch_y = test_Y[i : index]
        batch_masks = test_masks[i : index]
        batch_segments = test_segments[i : index]
        batch_x = pad_sequences(batch_x, padding='post')
        batch_y = pad_sequences(batch_y, padding='post')
        batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
        batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks,
            },
        )
        assert not np.isnan(cost)
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
    
    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (e, train_loss, train_acc, test_loss, test_acc)
    )
    
    predicted = sess.run(model.tags_seq,
                feed_dict = {
                    model.X: [parsed_sequence],
                    model.segment_ids: [segment_sequence],
                    model.input_masks: [mask_sequence],
                },
        )[0]
    merged = merge_sentencepiece_tokens_tagging(xlnet_sequence, [idx2tag[d] for d in predicted])
    print(list(zip(merged[0], merged[1])))

train minibatch loop: 100%|██████████| 3047/3047 [19:18<00:00,  2.63it/s, accuracy=0.956, cost=10.3]
test minibatch loop: 100%|██████████| 761/761 [02:25<00:00,  5.24it/s, accuracy=0.947, cost=12.5]
train minibatch loop:   0%|          | 0/3047 [00:00<?, ?it/s]

time taken: 1303.7744524478912
epoch: 0, training loss: 16.043856, training acc: 0.936956, valid loss: 14.449385, valid acc: 0.939994

[('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'VERB'), ('sambutan', 'NOUN'), ('Aidilfitri', 'NOUN'), ('minggu', 'NOUN'), ('depan', 'NOUN'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'PROPN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'NOUN'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'NOUN'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'DET'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'PROPN'), ('Keselamatan', 'PROPN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('(Jkjr)', 'PUNCT'), ('i

train minibatch loop: 100%|██████████| 3047/3047 [19:22<00:00,  2.62it/s, accuracy=0.971, cost=5.78] 
test minibatch loop: 100%|██████████| 761/761 [02:24<00:00,  5.26it/s, accuracy=0.918, cost=16.2] 
train minibatch loop:   0%|          | 0/3047 [00:00<?, ?it/s]

time taken: 1307.0042235851288
epoch: 1, training loss: 7.613085, training acc: 0.966161, valid loss: 17.831388, valid acc: 0.940940

[('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'ADP'), ('sambutan', 'NOUN'), ('Aidilfitri', 'NOUN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'PROPN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'NOUN'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'NOUN'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'DET'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'PROPN'), ('Keselamatan', 'PROPN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('(Jkjr)', 'PUNCT'), ('itu'

train minibatch loop: 100%|██████████| 3047/3047 [19:22<00:00,  2.62it/s, accuracy=0.986, cost=2.8]  
test minibatch loop: 100%|██████████| 761/761 [02:24<00:00,  5.27it/s, accuracy=0.933, cost=27.9] 
train minibatch loop:   0%|          | 0/3047 [00:00<?, ?it/s]

time taken: 1307.146255016327
epoch: 2, training loss: 3.466507, training acc: 0.983474, valid loss: 23.406965, valid acc: 0.946970

[('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'ADP'), ('sambutan', 'NOUN'), ('Aidilfitri', 'NOUN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'PROPN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'SYM'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'NOUN'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'DET'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'PROPN'), ('Keselamatan', 'PROPN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('(Jkjr)', 'PUNCT'), ('itu', 'D

train minibatch loop: 100%|██████████| 3047/3047 [19:22<00:00,  2.62it/s, accuracy=0.995, cost=1.09] 
test minibatch loop: 100%|██████████| 761/761 [02:24<00:00,  5.28it/s, accuracy=0.955, cost=13.1] 
train minibatch loop:   0%|          | 0/3047 [00:00<?, ?it/s]

time taken: 1306.8624274730682
epoch: 3, training loss: 1.671926, training acc: 0.992149, valid loss: 27.410393, valid acc: 0.945298

[('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'ADP'), ('sambutan', 'NOUN'), ('Aidilfitri', 'NOUN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'PROPN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'DET'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'PROPN'), ('Keselamatan', 'PROPN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('(Jkjr)', 'PUNCT'), ('itu', 

train minibatch loop: 100%|██████████| 3047/3047 [19:21<00:00,  2.62it/s, accuracy=0.999, cost=0.261] 
test minibatch loop: 100%|██████████| 761/761 [02:23<00:00,  5.29it/s, accuracy=0.941, cost=15.3]


time taken: 1305.7238042354584
epoch: 4, training loss: 0.975642, training acc: 0.995640, valid loss: 29.684076, valid acc: 0.949162

[('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'NOUN'), ('sambutan', 'NOUN'), ('Aidilfitri', 'NOUN'), ('minggu', 'NOUN'), ('depan', 'NOUN'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'NOUN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'DET'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'NOUN'), ('Keselamatan', 'PROPN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('(Jkjr)', 'PUNCT'), ('itu', 

In [23]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'xlnet-base-pos/model.ckpt')

'xlnet-base-pos/model.ckpt'

In [24]:
kwargs = dict(
      is_training=False,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.0,
      dropatt=0.0,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clamp_len=-1)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(json_path='xlnet-base/config.json')

In [25]:
dimension_output = len(tag2idx)
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())

INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>




In [26]:
saver = tf.train.Saver(tf.trainable_variables())
saver.restore(sess, 'xlnet-base-pos/model.ckpt')

INFO:tensorflow:Restoring parameters from xlnet-base-pos/model.ckpt


In [27]:
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(idx2tag[p])
        out.append(out_i)
    return out

In [28]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i + batch_size, len(test_X))
    batch_x = test_X[i : index]
    batch_y = test_Y[i : index]
    batch_masks = test_masks[i : index]
    batch_segments = test_segments[i : index]
    batch_x = pad_sequences(batch_x, padding='post')
    batch_y = pad_sequences(batch_y, padding='post')
    batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
    batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
    predicted = pred2label(sess.run(
        model.tags_seq,
        feed_dict = {
            model.X: batch_x,
            model.segment_ids: batch_segments,
            model.input_masks: batch_masks,
        },
    ))
    real = pred2label(batch_y)
    predict_Y.extend(predicted)
    real_Y.extend(real)

validation minibatch loop: 100%|██████████| 761/761 [02:18<00:00,  5.51it/s]


In [29]:
temp_real_Y = []
for r in real_Y:
    temp_real_Y.extend(r)
    
temp_predict_Y = []
for r in predict_Y:
    temp_predict_Y.extend(r)

In [30]:
from sklearn.metrics import classification_report
print(classification_report(temp_real_Y, temp_predict_Y, digits = 5))

              precision    recall  f1-score   support

         ADJ    0.83194   0.77563   0.80280     45666
         ADP    0.96501   0.95786   0.96142    119589
         ADV    0.85073   0.84144   0.84606     47760
         AUX    0.99502   0.99950   0.99726     10000
       CCONJ    0.96564   0.92473   0.94474     37171
         DET    0.94985   0.93192   0.94080     38839
        NOUN    0.89484   0.92123   0.90784    268329
         NUM    0.94009   0.94511   0.94260     41211
         PAD    0.99816   1.00000   0.99908    146373
        PART    0.91259   0.94345   0.92777      5500
        PRON    0.96988   0.94223   0.95586     48835
       PROPN    0.93581   0.92557   0.93066    227608
       PUNCT    0.99831   0.99933   0.99882    182824
       SCONJ    0.73907   0.82376   0.77912     15150
         SYM    0.96944   0.96917   0.96930      3600
        VERB    0.94517   0.94727   0.94622    124518
           X    0.99992   0.99957   0.99975    410749

    accuracy              

In [31]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name
        or 'self/Softmax' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'global_step' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Placeholder_3',
 'model/transformer/r_w_bias',
 'model/transformer/r_r_bias',
 'model/transformer/word_embedding/lookup_table',
 'model/transformer/r_s_bias',
 'model/transformer/seg_embed',
 'model/transformer/layer_0/rel_attn/q/kernel',
 'model/transformer/layer_0/rel_attn/k/kernel',
 'model/transformer/layer_0/rel_attn/v/kernel',
 'model/transformer/layer_0/rel_attn/r/kernel',
 'model/transformer/layer_0/rel_attn/o/kernel',
 'model/transformer/layer_0/rel_attn/LayerNorm/gamma',
 'model/transformer/layer_0/ff/layer_1/kernel',
 'model/transformer/layer_0/ff/layer_1/bias',
 'model/transformer/layer_0/ff/layer_2/kernel',
 'model/transformer/layer_0/ff/layer_2/bias',
 'model/transformer/layer_0/ff/LayerNorm/gamma',
 'model/transformer/layer_1/rel_attn/q/kernel',
 'model/transformer/layer_1/rel_attn/k/kernel',
 'model/transformer/layer_1/rel_attn/v/kernel',
 'model/transformer/layer_1/rel_attn/r/kernel',
 'model/transformer/layer_1/rel

In [32]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [33]:
freeze_graph('xlnet-base-pos', strings)

INFO:tensorflow:Restoring parameters from xlnet-base-pos/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 164 variables.
INFO:tensorflow:Converted 164 variables to const ops.
7970 ops in the final graph.


In [34]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

g = load_graph('xlnet-base-pos/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
seg = g.get_tensor_by_name('import/Placeholder_1:0')
m = g.get_tensor_by_name('import/Placeholder_2:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)



In [35]:
predicted = test_sess.run(logits,
            feed_dict = {
                x: [parsed_sequence],
                seg: [segment_sequence],
                m: [mask_sequence],
            },
    )[0]
merged = merge_sentencepiece_tokens_tagging(xlnet_sequence, [idx2tag[d] for d in predicted])
print(list(zip(merged[0], merged[1])))

[('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'ADP'), ('sambutan', 'NOUN'), ('Aidilfitri', 'NOUN'), ('minggu', 'NOUN'), ('depan', 'NOUN'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'PROPN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADV'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'DET'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'NOUN'), ('Keselamatan', 'PROPN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('(Jkjr)', 'PUNCT'), ('itu', 'DET'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'),

In [36]:
import boto3

bucketName = 'huseinhouse-storage'
Key = 'xlnet-base-pos/frozen_model.pb'
outPutname = "v34/pos/xlnet-base-pos.pb"

s3 = boto3.client('s3')

s3.upload_file(Key,bucketName,outPutname)