In [1]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
from unidecode import unidecode
import re
import sentencepiece as spm

In [2]:
# !git clone https://github.com/huseinzol05/Malaya-Dataset.git

In [3]:
# Change to your local Malaya-Dataset
import glob

left, right, label = [], [], []
for file in glob.glob('../../Malaya-Dataset/text-similarity/quora/*.json'):
    with open(file) as fopen:
        x = json.load(fopen)
    for i in x:
        splitted = i[0].split(' <> ')
        if len(splitted) != 2:
            continue
        left.append(splitted[0])
        right.append(splitted[1])
        label.append(i[1])
        
np.unique(label, return_counts = True)

(array([0, 1]), array([254659, 149172]))

In [4]:
from prepro_utils import preprocess_text, encode_ids, encode_pieces

sp_model = spm.SentencePieceProcessor()
sp_model.Load('sp10m.cased.v4.model')

with open('sp10m.cased.v4.vocab') as fopen:
    v = fopen.read().split('\n')[:-1]
v = [i.split('\t') for i in v]
v = {i[0]: i[1] for i in v}

class Tokenizer:
    def __init__(self, v):
        self.vocab = v
        pass
    
    def tokenize(self, string):
        return encode_pieces(sp_model, string, return_unicode=False, sample=False)
    
    def convert_tokens_to_ids(self, tokens):
        return [sp_model.PieceToId(piece) for piece in tokens]
    
    def convert_ids_to_tokens(self, ids):
        return [sp_model.IdToPiece(i) for i in ids]
    
tokenizer = Tokenizer(v)

In [5]:
BERT_INIT_CHKPNT = 'pretraining_output2/model.ckpt-1000000'
BERT_CONFIG = 'checkpoint/bert_config.json'

In [6]:
MAX_SEQ_LENGTH = 100
tokenizer.tokenize(left[1])

['▁Apa', '▁yang', '▁membuat', '▁seseorang', '▁marah', '?']

In [7]:
list(v.keys())[:10]

['<unk>',
 '<s>',
 '</s>',
 '<cls>',
 '<sep>',
 '<pad>',
 '<mask>',
 '<eod>',
 '<eop>',
 '.']

In [8]:
from tqdm import tqdm

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
              tokens_a.pop()
        else:
              tokens_b.pop()
                
def get_inputs(left, right):

    input_ids, input_masks, segment_ids = [], [], []

    for i in tqdm(range(len(left))):
        tokens_a = tokenizer.tokenize(' '.join(left[i]))
        tokens_b = tokenizer.tokenize(' '.join(right[i]))
        _truncate_seq_pair(tokens_a, tokens_b, MAX_SEQ_LENGTH - 3)

        tokens = []
        segment_id = []
        tokens.append("<cls>")
        segment_id.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_id.append(0)
        tokens.append("<sep>")
        segment_id.append(0)
        for token in tokens_b:
            tokens.append(token)
            segment_id.append(1)
        tokens.append("<sep>")
        segment_id.append(1)
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_id)
        
        while len(input_id) < MAX_SEQ_LENGTH:
            input_id.append(0)
            input_mask.append(0)
            segment_id.append(0)

        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
    
    return input_ids, input_masks, segment_ids

In [9]:
input_ids, input_masks, segment_ids = get_inputs(left, right)

100%|██████████| 403831/403831 [02:52<00:00, 2341.68it/s]


In [10]:
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [11]:
epoch = 10
batch_size = 60
warmup_proportion = 0.1
num_train_steps = int(len(left) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [12]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=True,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)
        
        output_layer = model.get_pooled_output()
        self.logits = tf.layers.dense(output_layer, dimension_output)
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [13]:
dimension_output = 2
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)

Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from pretraining_output2/model.ckpt-1000000


In [14]:
from sklearn.cross_validation import train_test_split

train_input_ids, test_input_ids, train_input_masks, test_input_masks, train_segment_ids, test_segment_ids, train_Y, test_Y = train_test_split(
    input_ids, input_masks, segment_ids, label, test_size = 0.2
)



In [15]:
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_input_ids))
        batch_x = train_input_ids[i: index]
        batch_masks = train_input_masks[i: index]
        batch_segment = train_segment_ids[i: index]
        batch_y = train_Y[i: index]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
    pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_input_ids))
        batch_x = test_input_ids[i: index]
        batch_masks = test_input_masks[i: index]
        batch_segment = test_segment_ids[i: index]
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_input_ids) / batch_size
    train_acc /= len(train_input_ids) / batch_size
    test_loss /= len(test_input_ids) / batch_size
    test_acc /= len(test_input_ids) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop:  75%|███████▍  | 4030/5385 [27:04<09:06,  2.48it/s, accuracy=0.75, cost=0.55]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

test minibatch loop: 100%|██████████| 1347/1347 [03:05<00:00,  7.24it/s, accuracy=1, cost=0.137]    
train minibatch loop:   0%|          | 0/5385 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.721433
time taken: 2356.2040345668793
epoch: 0, training loss: 0.593182, training acc: 0.677103, valid loss: 0.538423, valid acc: 0.721433



train minibatch loop:  13%|█▎        | 716/5385 [04:48<31:19,  2.48it/s, accuracy=0.717, cost=0.563]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop:  78%|███████▊  | 4217/5385 [28:19<07:50,  2.48it/s, accuracy=0.617, cost=0.622]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

test minibatch loop: 100%|██████████| 1347/1347 [03:05<00:00,  7.26it/s, accuracy=1, cost=0.162]    
train minibatch loop:   0%|          | 0/5385 [0

epoch: 1, pass acc: 0.721433, current acc: 0.745899
time taken: 2355.265573978424
epoch: 1, training loss: 0.521362, training acc: 0.731132, valid loss: 0.500190, valid acc: 0.745899



train minibatch loop:  13%|█▎        | 679/5385 [04:33<31:35,  2.48it/s, accuracy=0.717, cost=0.593]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop:  78%|███████▊  | 4191/5385 [28:08<08:00,  2.49it/s, accuracy=0.817, cost=0.436]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

test minibatch loop: 100%|██████████| 1347/1347 [03:05<00:00,  7.26it/s, accuracy=1, cost=0.185]    
train minibatch loop:   0%|          | 0/5385 [0

epoch: 2, pass acc: 0.745899, current acc: 0.759493
time taken: 2354.9915947914124
epoch: 2, training loss: 0.480434, training acc: 0.758876, valid loss: 0.483184, valid acc: 0.759493



train minibatch loop:  21%|██        | 1110/5385 [07:27<28:42,  2.48it/s, accuracy=0.8, cost=0.451]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop:  85%|████████▍ | 4558/5385 [30:37<05:33,  2.48it/s, accuracy=0.75, cost=0.455] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

test minibatch loop: 100%|██████████| 1347/1347 [03:05<00:00,  7.26it/s, accuracy=0.857, cost=0.24] 
train minibatch loop:   0%|          | 0/5385 [

epoch: 3, pass acc: 0.759493, current acc: 0.767138
time taken: 2355.3217313289642
epoch: 3, training loss: 0.448302, training acc: 0.779561, valid loss: 0.474211, valid acc: 0.767138



train minibatch loop:  28%|██▊       | 1492/5385 [10:01<26:09,  2.48it/s, accuracy=0.883, cost=0.352]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop:  95%|█████████▌| 5137/5385 [34:29<01:39,  2.48it/s, accuracy=0.9, cost=0.281]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop:  34%|███▍      | 1821/5385 [12:14<23:57,  2.48it/s, accuracy=0.833, cost=0.346]IOPub message rate exceeded.
The notebook serve

time taken: 2355.039293527603
epoch: 8, training loss: 0.301134, training acc: 0.864188, valid loss: 0.542045, valid acc: 0.774040



train minibatch loop:   1%|          | 38/5385 [00:15<35:57,  2.48it/s, accuracy=0.8, cost=0.459]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop:  68%|██████▊   | 3671/5385 [24:39<11:29,  2.48it/s, accuracy=0.8, cost=0.461]   IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

test minibatch loop: 100%|██████████| 1347/1347 [03:05<00:00,  7.26it/s, accuracy=0.714, cost=0.74] 
train minibatch loop:   0%|          | 0/5385 [0

epoch: 9, pass acc: 0.774988, current acc: 0.776652
time taken: 2355.1706018447876
epoch: 9, training loss: 0.275650, training acc: 0.877623, valid loss: 0.567738, valid acc: 0.776652



train minibatch loop:  11%|█         | 569/5385 [03:49<32:20,  2.48it/s, accuracy=0.867, cost=0.401] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop: 100%|██████████| 5385/5385 [36:09<00:00,  2.93it/s, accuracy=0.958, cost=0.0879]
test minibatch loop: 100%|██████████| 1347/1347 [03:05<00:00,  7.25it/s, accuracy=0.714, cost=1.1]  
train minibatch loop:   0%|          | 0/5385 [00:00<?, ?it/s]

epoch: 10, pass acc: 0.776652, current acc: 0.780032
time taken: 2355.6139965057373
epoch: 10, training loss: 0.255923, training acc: 0.887073, valid loss: 0.572282, valid acc: 0.780032



train minibatch loop: 100%|██████████| 5385/5385 [36:10<00:00,  2.94it/s, accuracy=1, cost=0.0458]    
test minibatch loop: 100%|██████████| 1347/1347 [03:05<00:00,  7.25it/s, accuracy=0.714, cost=1.49] 
train minibatch loop:   0%|          | 0/5385 [00:00<?, ?it/s]

epoch: 11, pass acc: 0.780032, current acc: 0.783412
time taken: 2356.574961423874
epoch: 11, training loss: 0.241479, training acc: 0.894606, valid loss: 0.579975, valid acc: 0.783412



train minibatch loop: 100%|██████████| 5385/5385 [36:09<00:00,  2.93it/s, accuracy=0.958, cost=0.0711]
test minibatch loop: 100%|██████████| 1347/1347 [03:05<00:00,  7.25it/s, accuracy=0.714, cost=0.924]
train minibatch loop:   0%|          | 0/5385 [00:00<?, ?it/s]

epoch: 12, pass acc: 0.783412, current acc: 0.784601
time taken: 2355.606646299362
epoch: 12, training loss: 0.231818, training acc: 0.899297, valid loss: 0.577474, valid acc: 0.784601



train minibatch loop: 100%|██████████| 5385/5385 [36:11<00:00,  2.93it/s, accuracy=1, cost=0.0567]    
test minibatch loop: 100%|██████████| 1347/1347 [03:05<00:00,  7.25it/s, accuracy=0.857, cost=0.383]
train minibatch loop:   0%|          | 0/5385 [00:00<?, ?it/s]

time taken: 2356.9761004447937
epoch: 13, training loss: 0.228900, training acc: 0.901004, valid loss: 0.579955, valid acc: 0.783568



train minibatch loop: 100%|██████████| 5385/5385 [36:11<00:00,  2.94it/s, accuracy=1, cost=0.0496]    
test minibatch loop: 100%|██████████| 1347/1347 [03:05<00:00,  7.25it/s, accuracy=0.857, cost=0.864]
train minibatch loop:   0%|          | 0/5385 [00:00<?, ?it/s]

epoch: 14, pass acc: 0.784601, current acc: 0.785264
time taken: 2356.7622389793396
epoch: 14, training loss: 0.228709, training acc: 0.900472, valid loss: 0.580626, valid acc: 0.785264



train minibatch loop: 100%|██████████| 5385/5385 [36:11<00:00,  2.93it/s, accuracy=1, cost=0.064]     
test minibatch loop: 100%|██████████| 1347/1347 [03:05<00:00,  7.25it/s, accuracy=0.857, cost=0.889]
train minibatch loop:   0%|          | 0/5385 [00:00<?, ?it/s]

epoch: 15, pass acc: 0.785264, current acc: 0.785561
time taken: 2356.996570825577
epoch: 15, training loss: 0.228210, training acc: 0.901270, valid loss: 0.576234, valid acc: 0.785561



train minibatch loop: 100%|██████████| 5385/5385 [36:11<00:00,  2.93it/s, accuracy=0.958, cost=0.0824]
test minibatch loop: 100%|██████████| 1347/1347 [03:05<00:00,  7.25it/s, accuracy=0.857, cost=0.727]
train minibatch loop:   0%|          | 0/5385 [00:00<?, ?it/s]

time taken: 2356.6890137195587
epoch: 16, training loss: 0.227874, training acc: 0.900975, valid loss: 0.578875, valid acc: 0.785239



train minibatch loop: 100%|██████████| 5385/5385 [36:10<00:00,  2.94it/s, accuracy=1, cost=0.0637]    
test minibatch loop: 100%|██████████| 1347/1347 [03:05<00:00,  7.25it/s, accuracy=0.714, cost=1.06] 
train minibatch loop:   0%|          | 0/5385 [00:00<?, ?it/s]

time taken: 2356.642462015152
epoch: 17, training loss: 0.228326, training acc: 0.901527, valid loss: 0.577638, valid acc: 0.785121



train minibatch loop: 100%|██████████| 5385/5385 [36:10<00:00,  2.93it/s, accuracy=1, cost=0.0894]    
test minibatch loop: 100%|██████████| 1347/1347 [03:05<00:00,  7.25it/s, accuracy=0.857, cost=0.435]

time taken: 2356.3395166397095
epoch: 18, training loss: 0.228625, training acc: 0.901688, valid loss: 0.579267, valid acc: 0.784843

break epoch:19






In [16]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_input_ids), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i + batch_size, len(test_input_ids))
    batch_x = test_input_ids[i: index]
    batch_masks = test_input_masks[i: index]
    batch_segment = test_segment_ids[i: index]
    batch_y = test_Y[i: index]
    predict_Y += np.argmax(sess.run(model.logits,
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
    ), 1, ).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 1347/1347 [03:04<00:00,  7.29it/s]


In [24]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['not similar', 'similar'],digits=5
    )
)

             precision    recall  f1-score   support

not similar    0.81908   0.83530   0.82711     51007
    similar    0.70779   0.68377   0.69557     29760

avg / total    0.77807   0.77946   0.77864     80767

