In [1]:
# !wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
# !unzip uncased_L-12_H-768_A-12.zip

In [2]:
# Paths into the unpacked `uncased_L-12_H-768_A-12` BERT release (the
# download/unzip commands are kept, commented out, in the first cell).
BERT_VOCAB = 'uncased_L-12_H-768_A-12/vocab.txt'  # vocab file handed to FullTokenizer
BERT_INIT_CHKPNT = 'uncased_L-12_H-768_A-12/bert_model.ckpt'  # checkpoint restored into the 'bert' scope
BERT_CONFIG = 'uncased_L-12_H-768_A-12/bert_config.json'  # loaded with BertConfig.from_json_file

In [3]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import tensorflow as tf

In [4]:
# The literal `True` is the lower-casing flag and must stay in sync with
# `do_lower_case=True` below.
# NOTE(review): presumably this raises when the flag disagrees with the
# "uncased" checkpoint name -- confirm against bert.tokenization.
tokenization.validate_case_matches_checkpoint(True,BERT_INIT_CHKPNT)
# Tokenizer built from the BERT vocab file; it is used further down to
# tokenize the English source sentences and to map tokens to ids.
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_VOCAB, do_lower_case=True)

In [5]:
# !wget https://raw.githubusercontent.com/huseinzol05/NLP-Models-Tensorflow/master/neural-machine-translation/english-train
# !wget https://raw.githubusercontent.com/huseinzol05/NLP-Models-Tensorflow/master/neural-machine-translation/vietnam-train

In [6]:
import collections

def build_dataset(words, n_words, atleast=1):
    """Build an integer vocabulary from a flat token list and encode the tokens.

    Ids 0-3 are reserved, in this order, for 'PAD', 'GO', 'EOS' and 'UNK'.
    The remaining ids are assigned by descending token frequency.

    Args:
        words: sequence of tokens (it is iterated twice).
        n_words: maximum number of distinct tokens taken from the frequency
            table.
        atleast: minimum number of occurrences a token needs to be kept.

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary)``:
        ``data`` is ``words`` encoded as ids, with every token that has no
        vocabulary entry mapped to the 'UNK' id; ``count`` is the list of
        ``[token, n]`` entries the vocabulary was built from, whose 'UNK'
        entry carries the number of out-of-vocabulary tokens; ``dictionary``
        maps token -> id and ``reversed_dictionary`` maps id -> token.
    """
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    frequent = collections.Counter(words).most_common(n_words)
    count.extend(pair for pair in frequent if pair[1] >= atleast)

    dictionary = {}
    for word, _ in count:
        # setdefault keeps the reserved ids stable even when the corpus itself
        # contains a token spelled like one of the reserved names; a plain
        # assignment would overwrite the reserved id and hand the same id to
        # the next word as well.
        dictionary.setdefault(word, len(dictionary))

    unk_index = dictionary['UNK']
    data = []
    unk_count = 0
    for word in words:
        if word in dictionary:
            data.append(dictionary[word])
        else:
            # Out-of-vocabulary tokens must become UNK; falling back to 0
            # would silently turn them into padding.
            data.append(unk_index)
            unk_count += 1
    # Record the OOV tally on the UNK row (index 3 of `count`), not on PAD.
    count[3][1] = unk_count

    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [7]:
# Load the parallel corpus: one lower-cased sentence per line. The files are
# decoded explicitly as UTF-8 so the Vietnamese text does not depend on the
# platform's default encoding. `[:-1]` drops the element that follows the
# final newline of each file.
with open('english-train', 'r', encoding='utf-8') as fopen:
    text_from = fopen.read().lower().split('\n')[:-1]
with open('vietnam-train', 'r', encoding='utf-8') as fopen:
    text_to = fopen.read().lower().split('\n')[:-1]
print('len from: %d, len to: %d'%(len(text_from), len(text_to)))
# Source and target sentences are paired by position further down, so a
# length mismatch would silently misalign the training pairs.
assert len(text_from) == len(text_to), 'parallel corpus sides differ in length'

len from: 500, len to: 500


In [8]:
# Flatten the target sentences into one token stream, then build the target
# vocabulary over every distinct token (nothing is cut off by n_words).
concat_to = [token for sentence in text_to for token in sentence.split()]
vocabulary_size_to = len(set(concat_to))
data_to, count_to, dictionary_to, rev_dictionary_to = build_dataset(
    concat_to, vocabulary_size_to
)
print('vocab to size: %d'%(vocabulary_size_to))
# Entries 0-3 of count_to are the reserved tokens, so the real words start at 4.
print('Most common words', count_to[4:10])
print('Sample data', data_to[:10], [rev_dictionary_to[idx] for idx in data_to[:10]])

vocab to size: 1461
Most common words [(',', 472), ('.', 430), ('tôi', 283), ('và', 230), ('có', 199), ('chúng', 196)]
Sample data [84, 22, 668, 73, 10, 389, 110, 34, 81, 299] ['khoa', 'học', 'đằng', 'sau', 'một', 'tiêu', 'đề', 'về', 'khí', 'hậu']


In [9]:
# Ids of the four reserved tokens, looked up from the target vocabulary.
GO, PAD, EOS, UNK = (
    dictionary_to[token] for token in ('GO', 'PAD', 'EOS', 'UNK')
)

In [10]:
# Mark the end of every target sentence. The guard makes the cell idempotent:
# text_to was lower-cased when it was loaded, so an upper-case ' EOS' suffix
# can only have come from a previous run of this cell, and re-running must
# not append a second marker.
for i in range(len(text_to)):
    if not text_to[i].endswith(' EOS'):
        text_to[i] += ' EOS'

In [11]:
MAX_SEQ_LENGTH = 200

from tqdm import tqdm

# One fixed-length row per source sentence, kept in three parallel lists.
input_ids, input_masks, segment_ids = [], [], []

for text in tqdm(text_from):
    # Keep at most MAX_SEQ_LENGTH - 2 tokens so that the [CLS] and [SEP]
    # markers still fit.
    tokens_a = tokenizer.tokenize(text)[:MAX_SEQ_LENGTH - 2]
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    input_id = tokenizer.convert_tokens_to_ids(tokens)

    # Zero-fill all three rows up to MAX_SEQ_LENGTH.
    padding = [0] * (MAX_SEQ_LENGTH - len(input_id))
    input_mask = [1] * len(input_id) + padding
    segment_id = [0] * len(tokens) + padding
    input_id = input_id + padding

    input_ids.append(input_id)
    input_masks.append(input_mask)
    segment_ids.append(segment_id)

100%|██████████| 500/500 [00:00<00:00, 2828.55it/s]


In [12]:
# Training schedule. These module-level names are read later by the Chatbot
# constructor (bert_config, num_train_steps, num_warmup_steps) and by the
# training loop (epoch, batch_size).
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
epoch = 20  # number of passes over the training pairs
batch_size = 10  # examples per optimisation step
warmup_proportion = 0.1  # fraction of num_train_steps used as warm-up steps
# True division followed by int(): a fractional step count is truncated.
num_train_steps = int(len(input_ids) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [13]:
class Chatbot:
    """Sequence-to-sequence graph: a BERT encoder feeding an attention GRU decoder.

    Constructing an instance adds every op to the current default TensorFlow
    graph. Besides its arguments, the constructor reads the module-level
    names `bert_config`, `GO`, `EOS`, `num_train_steps` and
    `num_warmup_steps`.

    Args:
        size_layer: unit count of each GRU cell, of the Luong attention, of
            the attention layer, and of the dense projection applied to
            BERT's pooled output.
        num_layers: number of GRU cells stacked in the decoder.
        embedded_size: width of the decoder embedding table.
        to_dict_size: number of rows of the decoder embedding table and
            number of units of the output projection.
        learning_rate: forwarded to `optimization.create_optimizer`.
        dropout: accepted but not referenced anywhere in this constructor.

    Attributes set on the instance:
        X, segment_ids, input_masks, Y: int32 placeholders with two unknown
            dimensions.
        X_seq_len, Y_seq_len: per-row count of non-zero entries of X / Y.
        encoder_out, encoder_state: BERT sequence output and the per-layer
            initial decoder state derived from the pooled output.
        training_logits, predicting_ids: outputs of the teacher-forced and
            the greedy decoder.
        cost, optimizer, prediction, accuracy: loss, the value returned by
            `create_optimizer`, masked arg-max predictions and their mean
            agreement with the masked targets.
    """
    def __init__(self, size_layer, num_layers, embedded_size, 
                 to_dict_size, learning_rate, dropout = 0.5):
        
        def gru_cell(reuse=False):
            """Return one GRU cell with `size_layer` units."""
            return tf.nn.rnn_cell.GRUCell(size_layer, reuse=reuse)
        
        def attention(encoder_out, seq_len, reuse=False):
            """Wrap `num_layers` stacked GRU cells with Luong attention over `encoder_out`."""
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(num_units = size_layer, 
                                                                    memory = encoder_out,
                                                                    memory_sequence_length = seq_len)
            return tf.contrib.seq2seq.AttentionWrapper(
            cell = tf.nn.rnn_cell.MultiRNNCell([gru_cell(reuse) for _ in range(num_layers)]), 
                attention_mechanism = attention_mechanism,
                attention_layer_size = size_layer)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        # Sequence lengths are the number of non-zero ids in each row, i.e.
        # id 0 is treated as padding on both sides.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        # Dynamic batch size; this local shadows the module-level `batch_size`.
        batch_size = tf.shape(self.X)[0]
        
        # NOTE(review): is_training is hard-coded to True and this single
        # BertModel feeds both the training and the predicting decoder --
        # confirm that this is intended for the prediction path.
        model = modeling.BertModel(
            config=bert_config,
            is_training=True,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)
        
        self.encoder_out = model.get_sequence_output()
        # Project the pooled output to `size_layer` units and reuse that same
        # tensor as the initial state of every stacked GRU layer.
        self.encoder_state = tf.layers.dense(model.get_pooled_output(), size_layer)
        self.encoder_state = tuple(self.encoder_state for _ in range(num_layers))
        
        # Decoder input = Y without its last column, with GO prepended.
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        decoder_embeddings = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))
        # The same cell object and output layer are shared by both decoders below.
        decoder_cell = attention(self.encoder_out, self.X_seq_len)
        dense_layer = tf.layers.Dense(to_dict_size)
        # Teacher-forced decoder, used for the loss.
        training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs = tf.nn.embedding_lookup(decoder_embeddings, decoder_input),
                sequence_length = self.Y_seq_len,
                time_major = False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cell,
                helper = training_helper,
                initial_state = decoder_cell.zero_state(batch_size, tf.float32).clone(cell_state=self.encoder_state),
                output_layer = dense_layer)
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(self.Y_seq_len))
        # Greedy decoder: starts from GO, stops at EOS, and runs for at most
        # twice the longest source length in the batch.
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding = decoder_embeddings,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS)
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cell,
                helper = predicting_helper,
                initial_state = decoder_cell.zero_state(batch_size, tf.float32).clone(cell_state=self.encoder_state),
                output_layer = dense_layer)
        predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = True,
                maximum_iterations = 2 * tf.reduce_max(self.X_seq_len))
        self.training_logits = training_decoder_output.rnn_output
        self.predicting_ids = predicting_decoder_output.sample_id
        # float32 mask built from Y_seq_len; used as the per-position loss weights.
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        
        # NOTE(review): the trailing False is presumably create_optimizer's
        # use_tpu flag and the returned value a train op -- confirm against
        # bert.optimization.
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        
        # Accuracy is computed only over the positions selected by `masks`.
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        # NOTE(review): `masks` is float32 yet is passed as the mask argument
        # here -- confirm the installed TensorFlow accepts a non-bool mask.
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        # `correct_index` is not read anywhere else in this constructor.
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [14]:
# Hyper-parameters handed to the Chatbot constructor in the next cell.
size_layer = 256  # units per GRU cell / attention / pooled-output projection
num_layers = 2  # stacked GRU cells in the decoder
embedded_size = 128  # width of the decoder embedding table
learning_rate = 2e-5  # forwarded to optimization.create_optimizer

In [15]:
# Reset the default graph before building the model, then open an
# interactive session on it.
tf.reset_default_graph()
sess = tf.InteractiveSession()
# The target vocabulary size is the number of entries in dictionary_to,
# reserved tokens included.
model = Chatbot(size_layer, num_layers, embedded_size, 
                len(dictionary_to), learning_rate)
sess.run(tf.global_variables_initializer())

In [16]:
# Initialise every variable, then overwrite the BERT weights with the
# pretrained checkpoint.
# NOTE(review): the previous cell already runs this same initializer; the
# repeat looks redundant unless this cell is meant to be re-run on its own
# to reset the model -- confirm.
sess.run(tf.global_variables_initializer())
# Only trainable variables under the 'bert' scope are handed to the Saver, so
# the restore leaves all other variables at their initialised values.
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)

INFO:tensorflow:Restoring parameters from uncased_L-12_H-768_A-12/bert_model.ckpt


In [17]:
def str_idx(corpus, dic, unk=None):
    """Encode whitespace-tokenised sentences as lists of vocabulary ids.

    Args:
        corpus: iterable of strings; each one is split on whitespace.
        dic: mapping token -> id.
        unk: id used for tokens that are missing from ``dic``. When omitted
            it is taken from ``dic['UNK']``, falling back to 3 (the 'UNK'
            slot in the reserved-token layout used by ``build_dataset``)
            if ``dic`` has no such entry.

    Returns:
        A list with one list of ids per input sentence, in input order.
    """
    if unk is None:
        # The previous hard-coded fallback of 2 is the 'EOS' id in this
        # vocabulary layout, which would end a sentence at every unknown word.
        unk = dic.get('UNK', 3)
    return [
        [dic.get(token, unk) for token in sentence.split()]
        for sentence in corpus
    ]

def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sequence of a batch to the length of the longest one.

    Args:
        sentence_batch: non-empty sequence of lists.
        pad_int: value appended to the shorter lists.

    Returns:
        ``(padded_seqs, seq_lens)``: new padded lists (the inputs are not
        modified) and the original length of each sequence.

    Raises:
        ValueError: if ``sentence_batch`` is empty.
    """
    seq_lens = [len(sentence) for sentence in sentence_batch]
    longest = max(seq_lens)
    padded_seqs = [
        sentence + [pad_int] * (longest - size)
        for sentence, size in zip(sentence_batch, seq_lens)
    ]
    return padded_seqs, seq_lens

# Target sentences encoded as id lists; they are padded per batch in the
# training loop. text_to had ' EOS' appended in an earlier cell, so each
# list ends with the EOS id.
Y = str_idx(text_to, dictionary_to)

In [18]:
from tqdm import tqdm
import time
import numpy as np

# Training loop: one pass over the data per epoch, in fixed order, reporting
# the mean per-batch loss and accuracy at the end of each epoch.
for e in range(epoch):

    accuracy, loss = 0, 0
    batch_starts = range(0, len(input_ids), batch_size)
    pbar = tqdm(batch_starts, desc = 'train minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(input_ids))
        batch_x = input_ids[i: index]
        batch_masks = input_masks[i: index]
        batch_segment = segment_ids[i: index]
        # Targets are padded per batch, to the longest target in that batch.
        batch_y, seq_y = pad_sentence_batch(Y[i: index], PAD)
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        # Stop immediately if training diverged.
        assert not np.isnan(cost)
        loss += cost
        accuracy += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    # Average over the batches that were actually run. Dividing by
    # len(input_ids) / batch_size is only right when the data size is an exact
    # multiple of the batch size. max(..., 1) avoids a division by zero when
    # there is no data at all.
    num_batches = max(len(batch_starts), 1)
    loss /= num_batches
    accuracy /= num_batches

    print(
        'epoch: %d, training loss: %f, training acc: %f\n'
        % (e, loss, accuracy)
    )

train minibatch loop: 100%|██████████| 50/50 [01:01<00:00,  1.20s/it, accuracy=0.0389, cost=6.67] 
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, training loss: 7.054604, training acc: 0.029876



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.20s/it, accuracy=0.07, cost=6.14]  
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 1, training loss: 6.310021, training acc: 0.056042



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.21s/it, accuracy=0.07, cost=6.02]  
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 2, training loss: 5.983596, training acc: 0.071740



train minibatch loop: 100%|██████████| 50/50 [01:00<00:00,  1.21s/it, accuracy=0.0739, cost=5.94]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 3, training loss: 5.815576, training acc: 0.080142



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.20s/it, accuracy=0.0856, cost=5.84]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 4, training loss: 5.675362, training acc: 0.087450



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.20s/it, accuracy=0.0739, cost=5.73]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 5, training loss: 5.566076, training acc: 0.091284



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.21s/it, accuracy=0.0661, cost=5.63]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 6, training loss: 5.478631, training acc: 0.094310



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.20s/it, accuracy=0.0817, cost=5.55]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 7, training loss: 5.395147, training acc: 0.098053



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.21s/it, accuracy=0.0934, cost=5.46]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 8, training loss: 5.329400, training acc: 0.098470



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.22s/it, accuracy=0.0856, cost=5.41]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 9, training loss: 5.262462, training acc: 0.097364



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.21s/it, accuracy=0.0739, cost=5.33]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 10, training loss: 5.192742, training acc: 0.106581



train minibatch loop: 100%|██████████| 50/50 [01:00<00:00,  1.22s/it, accuracy=0.0895, cost=5.28]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 11, training loss: 5.136653, training acc: 0.105985



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.21s/it, accuracy=0.101, cost=5.22] 
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 12, training loss: 5.095239, training acc: 0.107460



train minibatch loop: 100%|██████████| 50/50 [01:00<00:00,  1.21s/it, accuracy=0.0895, cost=5.19]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 13, training loss: 5.053156, training acc: 0.109944



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.21s/it, accuracy=0.0934, cost=5.17]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 14, training loss: 5.021431, training acc: 0.112993



train minibatch loop: 100%|██████████| 50/50 [01:00<00:00,  1.20s/it, accuracy=0.0973, cost=5.12]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 15, training loss: 4.991793, training acc: 0.112515



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.21s/it, accuracy=0.0973, cost=5.14]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 16, training loss: 4.982308, training acc: 0.113534



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.20s/it, accuracy=0.0895, cost=5.09]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 17, training loss: 4.958596, training acc: 0.114056



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.22s/it, accuracy=0.0934, cost=5.07]
train minibatch loop:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 18, training loss: 4.939974, training acc: 0.117923



train minibatch loop: 100%|██████████| 50/50 [00:59<00:00,  1.21s/it, accuracy=0.0895, cost=5.07]

epoch: 19, training loss: 4.931262, training acc: 0.115937




