In [1]:
# !wget https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
# !unzip multi_cased_L-12_H-768_A-12.zip

In [2]:
import os
# Expose only GPU 0 to CUDA. This is set in a cell that runs before
# TensorFlow is imported further down.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import tensorflow as tf
import pandas as pd
from tqdm import tqdm




In [4]:
import json

# Load the pre-split parallel corpus. JSON text is UTF-8, so the encoding is
# passed explicitly instead of relying on the platform's locale default
# (which would mis-decode non-ASCII text on e.g. Windows).
with open('dataset.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

# Raw source (X) and target (Y) entries for the train and test splits; they
# are tokenised later by get_inputs.
train_X = data['train_X']
train_Y = data['train_Y']
test_X = data['test_X']
test_Y = data['test_Y']

In [5]:
# Files of the pretrained BERT release unpacked by the (commented-out)
# download cell at the top of the notebook.
BERT_VOCAB = 'multi_cased_L-12_H-768_A-12/vocab.txt'
BERT_INIT_CHKPNT = 'multi_cased_L-12_H-768_A-12/bert_model.ckpt'
BERT_CONFIG = 'multi_cased_L-12_H-768_A-12/bert_config.json'

# WordPiece tokenizer built from that vocabulary. Lower-casing is disabled,
# which matches the "cased" checkpoint directory used above.
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_VOCAB, do_lower_case=False)




In [6]:
# Id fed to the decoder as its first input token.
# NOTE(review): presumably the vocabulary id of "[CLS]" - confirm against vocab.txt.
GO = 101
# Id that terminates greedy decoding.
# NOTE(review): presumably the vocabulary id of "[SEP]" - confirm against vocab.txt.
EOS = 102

In [7]:
from unidecode import unidecode

def get_inputs(x, y):
    """Convert parallel source/target strings into encoder inputs and decoder targets.

    Both sides are passed through ``unidecode`` and then WordPiece-tokenised
    with the module-level ``tokenizer``.

    Args:
        x: sequence of source strings.
        y: sequence of target strings; ``y[i]`` is the target for ``x[i]``.

    Returns:
        Tuple ``(input_ids, input_masks, segment_ids, ys)`` of equally long
        lists with one entry per converted pair:

        * ``input_ids``   - ids of ``[CLS] + source tokens + [SEP]``.
        * ``input_masks`` - a ``1`` for every id in the matching ``input_ids`` row.
        * ``segment_ids`` - a ``0`` for every id in the matching ``input_ids`` row.
        * ``ys``          - ids of ``target tokens + [SEP]``.

    Conversion stops at the first pair whose target ids contain ``0``; that
    pair is printed together with its index and is NOT added to any of the
    returned lists, so the four lists always stay aligned.
    """
    input_ids, input_masks, segment_ids, ys = [], [], [], []
    for i in tqdm(range(len(x))):
        tokens_a = tokenizer.tokenize(unidecode(x[i]))
        tokens_b = tokenizer.tokenize(unidecode(y[i]))

        # Validate the target BEFORE touching the output lists. Id 0 is what
        # batches are padded with and the model derives sequence lengths by
        # counting non-zero ids, so a 0 inside a target would corrupt its length.
        r = tokenizer.convert_tokens_to_ids(tokens_b + ["[SEP]"])
        if 0 in r:
            print(y[i], i)
            break

        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        input_id = tokenizer.convert_tokens_to_ids(tokens)

        input_ids.append(input_id)
        input_masks.append([1] * len(input_id))
        segment_ids.append([0] * len(tokens))
        ys.append(r)

    return input_ids, input_masks, segment_ids, ys

In [8]:
# NOTE: this rebinds train_Y from the raw target strings to lists of token
# ids, so the cell is not idempotent - reload the dataset before re-running it.
train_input_ids, train_input_masks, train_segment_ids, train_Y = get_inputs(train_X, train_Y)

100%|██████████| 200000/200000 [02:39<00:00, 1255.89it/s]


In [9]:
# NOTE: this rebinds test_Y from the raw target strings to lists of token
# ids, so the cell is not idempotent - reload the dataset before re-running it.
test_input_ids, test_input_masks, test_segment_ids, test_Y = get_inputs(test_X, test_Y)

100%|██████████| 5000/5000 [00:04<00:00, 1089.86it/s]


In [10]:
# Architecture hyper-parameters of the pretrained encoder.
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
epoch = 20
batch_size = 16
warmup_proportion = 0.1
# Total number of optimizer updates over the whole run: batches per epoch
# times number of epochs (previously this was the number of examples, which
# ignored both batch_size and epoch).
num_train_steps = int(len(train_input_ids) / batch_size * epoch)
# Number of initial updates reserved for learning-rate warm-up.
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [11]:
class Model:
    """Sequence-to-sequence graph: BERT encoder feeding a stacked LSTM decoder.

    Building an instance adds all ops to the default TensorFlow graph and
    exposes the following attributes:

    * ``X``, ``segment_ids``, ``input_masks`` - int32 ``[None, None]``
      placeholders passed to ``modeling.BertModel``.
    * ``Y`` - int32 ``[None, None]`` placeholder with the target ids.
    * ``X_seq_len`` / ``Y_seq_len`` - per-row count of non-zero ids.
    * ``training_logits`` - decoder outputs when fed the shifted targets.
    * ``fast_result`` - ids produced by greedy decoding.
    * ``cost``, ``optimizer``, ``prediction``, ``accuracy`` - training ops.
    """

    def __init__(
        self,
        size_layer,
        num_layers,
        learning_rate = 2e-5,
        training = True,
    ):
        """Build the graph.

        Args:
            size_layer: number of units of every LSTM layer. The BERT pooled
                output is used directly as the LSTM state, so this presumably
                has to equal the width of that output - TODO confirm.
            num_layers: number of stacked LSTM layers in the decoder.
            learning_rate: learning rate handed to ``tf.train.AdamOptimizer``.
            training: forwarded to ``modeling.BertModel`` as ``is_training``.
        """
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        # Lengths are the number of non-zero ids per row, i.e. id 0 is treated
        # as padding on both the source and the target side.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        # Dynamic batch size; shadows the notebook-level `batch_size` inside this method.
        batch_size = tf.shape(self.X)[0]
        
        # Decoder input = GO followed by the targets without their last column.
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=training,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)
        
        # `output_layer` is fetched but not used below; only the pooled output
        # and the embedding table feed the decoder.
        output_layer = model.get_sequence_output()
        pooled_output = model.get_pooled_output()
        embedding = model.get_embedding_table()
        
        # Projection from decoder outputs to vocabulary logits; the same layer
        # object is passed to both decoders below.
        dense = tf.layers.Dense(bert_config.vocab_size)
        
        # Factory for one LSTM layer. `reuse` is only ever left at its default here.
        def cells(size_layer=size_layer, reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size_layer,initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        # The pooled BERT output initialises both the cell state and the hidden state.
        lstm_state = tf.nn.rnn_cell.LSTMStateTuple(c=pooled_output, h=pooled_output)
        
        # Every decoder layer starts from that same state.
        encoder_state = tuple([lstm_state] * num_layers)
        decoder_cells = tf.nn.rnn_cell.MultiRNNCell([cells(size_layer) for _ in range(num_layers)])
        
        # Training path: decoder inputs are the embedded, shifted target ids.
        training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs = tf.nn.embedding_lookup(embedding, decoder_input),
                sequence_length = self.Y_seq_len,
                time_major = False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cells,
                helper = training_helper,
                initial_state = encoder_state,
                output_layer = dense)
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(self.Y_seq_len))
        self.training_logits = training_decoder_output.rnn_output
        
        # Inference path: greedy decoding that starts from GO, stops at EOS and
        # is capped at twice the longest source length in the batch.
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding = embedding,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS)
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cells,
                helper = predicting_helper,
                initial_state = encoder_state,
                output_layer = dense)
        predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = True,
                maximum_iterations = 2 * tf.reduce_max(self.X_seq_len))
        self.fast_result = predicting_decoder_output.sample_id
        
        # Weight 1.0 for the first Y_seq_len positions of each row, 0.0 for the rest.
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        # Token-level accuracy over the masked-in positions only.
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        # `correct_index` is not referenced again; the accuracy below recomputes the cast.
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [12]:
# Drop whatever graph is left over from a previous run of this cell, then
# build the model inside a fresh interactive session.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer = bert_config.hidden_size, num_layers = 2)

# Initialise every variable; the pretrained encoder weights are restored
# from the checkpoint in a later cell.
sess.run(tf.global_variables_initializer())

Instructions for updating:
reduction_indices is deprecated, use axis instead



The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `layer.a

In [14]:
import collections
import re

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Match graph variables against the variables stored in a checkpoint.

    Args:
        tvars: iterable of graph variables; each must expose ``.name``.
        init_checkpoint: checkpoint path handed to ``tf.train.list_variables``.

    Returns:
        ``(assignment_map, initialized_variable_names)`` where
        ``assignment_map`` is an ``OrderedDict`` from checkpoint variable name
        to the matching graph variable, and ``initialized_variable_names`` is a
        dict whose keys are every matched checkpoint name, both bare and with a
        ``':0'`` suffix (all values are ``1``).

    A checkpoint entry matches when a graph variable is called either
    ``'bert/' + name`` (checked first) or exactly ``name``.
    """
    suffix_pattern = re.compile('^(.*):\\d+$')

    # Index the graph variables by name with any trailing ":<digits>" removed.
    graph_vars = collections.OrderedDict()
    for variable in tvars:
        matched = suffix_pattern.match(variable.name)
        key = matched.group(1) if matched is not None else variable.name
        graph_vars[key] = variable

    assignment_map = collections.OrderedDict()
    initialized_variable_names = {}
    for entry in tf.train.list_variables(init_checkpoint):
        ckpt_name = entry[0]
        prefixed = 'bert/' + ckpt_name
        if prefixed in graph_vars:
            target = graph_vars[prefixed]
        elif ckpt_name in graph_vars:
            target = graph_vars[ckpt_name]
        else:
            # Checkpoint entry with no counterpart in the graph: ignore it.
            continue
        assignment_map[ckpt_name] = target
        initialized_variable_names[ckpt_name] = 1
        initialized_variable_names[ckpt_name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [15]:
# Work out which trainable graph variables have a counterpart in the
# pretrained checkpoint.
checkpoint = BERT_INIT_CHKPNT
tvars = tf.trainable_variables()
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(
    tvars, checkpoint
)

In [16]:
# Restore only the variables listed in assignment_map; all other variables
# keep the values given to them by the initializer run earlier.
saver = tf.train.Saver(var_list=assignment_map)
saver.restore(sess, checkpoint)

INFO:tensorflow:Restoring parameters from multi_cased_L-12_H-768_A-12/bert_model.ckpt


In [17]:
# Short alias; used below with padding='post' to pad every mini-batch to the
# length of its longest row.
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

In [18]:
from tqdm import tqdm
import time

def _padded_feed(input_ids, input_masks, segment_ids, targets, start):
    """Build the feed dict for the mini-batch that begins at index ``start``.

    Takes up to ``batch_size`` consecutive examples from each list and
    right-pads every field to the longest row of that batch.
    """
    stop = min(start + batch_size, len(input_ids))
    return {
        model.Y: pad_sequences(targets[start: stop], padding='post'),
        model.X: pad_sequences(input_ids[start: stop], padding='post'),
        model.input_masks: pad_sequences(input_masks[start: stop], padding='post'),
        model.segment_ids: pad_sequences(segment_ids[start: stop], padding='post'),
    }


for EPOCH in range(epoch):

    train_acc, train_loss, test_acc, test_loss = [], [], [], []

    # Training pass: one optimizer update per mini-batch.
    pbar = tqdm(
        range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = _padded_feed(
                train_input_ids, train_input_masks, train_segment_ids, train_Y, i
            ),
        )
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)

    # Evaluation pass: same metrics, but the optimizer op is not run.
    pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = _padded_feed(
                test_input_ids, test_input_masks, test_segment_ids, test_Y, i
            ),
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)

    # Per-epoch figures are plain (unweighted) means over the mini-batches.
    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )

train minibatch loop: 100%|██████████| 12500/12500 [1:38:03<00:00,  2.12it/s, accuracy=0.163, cost=5.76] 
test minibatch loop: 100%|██████████| 313/313 [00:45<00:00,  6.95it/s, accuracy=0.163, cost=5.62]
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 0, training loss: 6.346653, training acc: 0.080043, valid loss: 5.756541, valid acc: 0.151906



train minibatch loop: 100%|██████████| 12500/12500 [1:37:35<00:00,  2.13it/s, accuracy=0.231, cost=4.79]
test minibatch loop: 100%|██████████| 313/313 [00:44<00:00,  7.09it/s, accuracy=0.229, cost=4.58]
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 1, training loss: 5.131091, training acc: 0.197364, valid loss: 4.706825, valid acc: 0.234835



train minibatch loop: 100%|██████████| 12500/12500 [1:37:10<00:00,  2.14it/s, accuracy=0.29, cost=4.26] 
test minibatch loop: 100%|██████████| 313/313 [00:44<00:00,  7.10it/s, accuracy=0.331, cost=4.05]
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 2, training loss: 4.417070, training acc: 0.265139, valid loss: 4.205559, valid acc: 0.291394



train minibatch loop: 100%|██████████| 12500/12500 [1:37:08<00:00,  2.14it/s, accuracy=0.326, cost=3.98]
test minibatch loop: 100%|██████████| 313/313 [00:44<00:00,  7.11it/s, accuracy=0.343, cost=3.71]
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 3, training loss: 4.012529, training acc: 0.311701, valid loss: 3.900843, valid acc: 0.325425



train minibatch loop:  60%|██████    | 7519/12500 [58:18<41:03,  2.02it/s, accuracy=0.301, cost=3.79]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop:  90%|█████████ | 11261/12500 [1:27:24<09:18,  2.22it/s, accuracy=0.337, cost=3.68]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop:  14%|█▍        | 1725/12500 [13:24<1:33:25,  1.92it/s, accuracy=0.355, cost=3.42]IOPub message rate exceeded.
The noteboo

epoch: 5, training loss: 3.518369, training acc: 0.372840, valid loss: 3.498968, valid acc: 0.376734



train minibatch loop: 100%|██████████| 12500/12500 [1:37:07<00:00,  2.15it/s, accuracy=0.39, cost=3.43] 
test minibatch loop: 100%|██████████| 313/313 [00:44<00:00,  7.07it/s, accuracy=0.424, cost=3.16]
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 6, training loss: 3.337285, training acc: 0.395921, valid loss: 3.367668, valid acc: 0.393659



train minibatch loop: 100%|██████████| 12500/12500 [1:37:42<00:00,  2.13it/s, accuracy=0.426, cost=3.3] 
test minibatch loop: 100%|██████████| 313/313 [00:44<00:00,  7.09it/s, accuracy=0.449, cost=3.09]
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 7, training loss: 3.183454, training acc: 0.415893, valid loss: 3.250831, valid acc: 0.408419



train minibatch loop: 100%|██████████| 12500/12500 [1:38:27<00:00,  2.12it/s, accuracy=0.415, cost=3.18]
test minibatch loop: 100%|██████████| 313/313 [00:44<00:00,  7.07it/s, accuracy=0.457, cost=3]   
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 8, training loss: 3.049351, training acc: 0.433559, valid loss: 3.166048, valid acc: 0.420588



train minibatch loop: 100%|██████████| 12500/12500 [1:38:07<00:00,  2.12it/s, accuracy=0.422, cost=3.09]
test minibatch loop: 100%|██████████| 313/313 [00:44<00:00,  7.09it/s, accuracy=0.473, cost=2.9] 
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 9, training loss: 2.930445, training acc: 0.449600, valid loss: 3.079593, valid acc: 0.432120



train minibatch loop: 100%|██████████| 12500/12500 [1:37:11<00:00,  2.14it/s, accuracy=0.449, cost=2.97]
test minibatch loop: 100%|██████████| 313/313 [00:44<00:00,  7.11it/s, accuracy=0.465, cost=2.82]
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 10, training loss: 2.823860, training acc: 0.464009, valid loss: 3.006260, valid acc: 0.441914



train minibatch loop: 100%|██████████| 12500/12500 [1:37:08<00:00,  2.14it/s, accuracy=0.475, cost=2.88]
test minibatch loop: 100%|██████████| 313/313 [00:44<00:00,  7.11it/s, accuracy=0.49, cost=2.84] 
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 11, training loss: 2.727184, training acc: 0.477504, valid loss: 2.951655, valid acc: 0.450500



train minibatch loop: 100%|██████████| 12500/12500 [1:37:04<00:00,  2.15it/s, accuracy=0.466, cost=2.8] 
test minibatch loop: 100%|██████████| 313/313 [00:43<00:00,  7.14it/s, accuracy=0.506, cost=2.73]
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 12, training loss: 2.637947, training acc: 0.490476, valid loss: 2.894938, valid acc: 0.458457



train minibatch loop: 100%|██████████| 12500/12500 [1:37:00<00:00,  2.15it/s, accuracy=0.466, cost=2.74]
test minibatch loop: 100%|██████████| 313/313 [00:43<00:00,  7.14it/s, accuracy=0.494, cost=2.72]
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 13, training loss: 2.555691, training acc: 0.502384, valid loss: 2.853183, valid acc: 0.465069



train minibatch loop: 100%|██████████| 12500/12500 [1:37:10<00:00,  2.14it/s, accuracy=0.458, cost=2.64]
test minibatch loop: 100%|██████████| 313/313 [00:44<00:00,  7.05it/s, accuracy=0.49, cost=2.73] 
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 14, training loss: 2.479316, training acc: 0.513608, valid loss: 2.815553, valid acc: 0.469557



train minibatch loop: 100%|██████████| 12500/12500 [1:37:28<00:00,  2.14it/s, accuracy=0.481, cost=2.61]
test minibatch loop: 100%|██████████| 313/313 [00:44<00:00,  7.04it/s, accuracy=0.498, cost=2.61]
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 15, training loss: 2.407951, training acc: 0.524401, valid loss: 2.787772, valid acc: 0.475006



train minibatch loop: 100%|██████████| 12500/12500 [1:37:16<00:00,  2.14it/s, accuracy=0.506, cost=2.51]
test minibatch loop: 100%|██████████| 313/313 [00:44<00:00,  7.10it/s, accuracy=0.498, cost=2.69]
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 16, training loss: 2.340384, training acc: 0.534496, valid loss: 2.757216, valid acc: 0.480557



train minibatch loop: 100%|██████████| 12500/12500 [1:36:58<00:00,  2.15it/s, accuracy=0.508, cost=2.44]
test minibatch loop: 100%|██████████| 313/313 [00:43<00:00,  7.12it/s, accuracy=0.486, cost=2.64]
train minibatch loop:   0%|          | 0/12500 [00:00<?, ?it/s]

epoch: 17, training loss: 2.275915, training acc: 0.544488, valid loss: 2.735755, valid acc: 0.483841



train minibatch loop:  97%|█████████▋| 12166/12500 [1:34:17<02:39,  2.10it/s, accuracy=0.565, cost=2.19]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop:  25%|██▍       | 3111/12500 [24:08<1:11:48,  2.18it/s, accuracy=0.574, cost=2.05]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop: 100%|██████████| 12500/12500 [1:37:16<00:00,  2.14it/s, accuracy=0.54, cost=2.33]  
test minibatch loop: 100%|██████████|

epoch: 19, training loss: 2.158188, training acc: 0.562544, valid loss: 2.700762, valid acc: 0.489972






In [19]:
from tensor2tensor.utils import bleu_hook

In [21]:
# Greedy-decode the whole test split, one mini-batch at a time.
results = []
# Iterate over the encoded examples that are actually sliced below (the same
# bound the evaluation loop uses) rather than over the raw test_X list.
for i in tqdm(range(0, len(test_input_ids), batch_size)):
    index = min(i + batch_size, len(test_input_ids))
    batch_x = pad_sequences(test_input_ids[i: index], padding='post')
    batch_mask = pad_sequences(test_input_masks[i: index], padding='post')
    batch_segment = pad_sequences(test_segment_ids[i: index], padding='post')
    # Only the encoder inputs are fed: model.fast_result does not depend on
    # the target placeholder, so no padded target batch is built here.
    feed = {
        model.X: batch_x,
        model.input_masks: batch_mask,
        model.segment_ids: batch_segment
    }
    p = sess.run(model.fast_result,feed_dict = feed)
    # Keep only ids above 3 that are neither 101 nor 102 (the values of GO and
    # EOS); this also removes the zeros that pad rows of the decoded batch.
    results.extend(
        [token for token in row if token > 3 and token not in [101, 102]]
        for row in p
    )

100%|██████████| 313/313 [04:26<00:00,  1.18it/s]


In [22]:
# Reference id sequences, filtered with the same rule that is applied to the
# decoded outputs: keep ids above 3 that are neither 101 nor 102.
rights = [
    [token for token in reference if token > 3 and token not in [101, 102]]
    for reference in test_Y
]

In [23]:
# Corpus-level BLEU of the decoded outputs against the references. Both
# corpora are lists of vocabulary ids, so n-grams are counted over ids
# rather than over words.
bleu_hook.compute_bleu(reference_corpus = rights,
                       translation_corpus = results)

0.11384286