In [1]:
import json
import re
import sentencepiece as spm

In [2]:
import os
# Select GPU index 3 for this kernel. This is set here, before TensorFlow is
# imported in a later cell.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [3]:
from prepro_utils import preprocess_text, encode_ids, encode_pieces

# SentencePiece model used by the Tokenizer class below.
sp_model = spm.SentencePieceProcessor()
sp_model.Load('albert-base/sp10m.cased.v6.model')

# The vocab file is tab-separated, one entry per line. Build a mapping from the
# first field (the piece) to the second field (kept as a string).
# The file contains non-ASCII pieces, so decode it explicitly as UTF-8 instead
# of relying on the locale default. Empty lines are skipped rather than blindly
# dropping the last line, so a file without a trailing newline keeps its final
# entry.
with open('albert-base/sp10m.cased.v6.vocab', encoding='utf-8') as fopen:
    v = fopen.read().split('\n')
v = [i.split('\t') for i in v if i]
v = {i[0]: i[1] for i in v}

class Tokenizer:
    """BERT-style tokenizer facade over the module-level SentencePiece model.

    All three methods delegate to the global ``sp_model``; the vocabulary
    passed to the constructor is only stored on the instance as ``vocab``.
    """

    def __init__(self, v):
        # Keep a reference to the vocabulary mapping supplied by the caller.
        self.vocab = v

    def tokenize(self, string):
        """Split ``string`` into pieces via ``encode_pieces`` (no sampling)."""
        pieces = encode_pieces(
            sp_model, string, return_unicode=False, sample=False
        )
        return pieces

    def convert_tokens_to_ids(self, tokens):
        """Return the list of ids obtained by ``sp_model.PieceToId`` per token."""
        return list(map(sp_model.PieceToId, tokens))

    def convert_ids_to_tokens(self, ids):
        """Return the list of pieces obtained by ``sp_model.IdToPiece`` per id."""
        return list(map(sp_model.IdToPiece, ids))
    
# Shared tokenizer instance used by the preprocessing code in later cells.
tokenizer = Tokenizer(v)

In [4]:
import optimization
import tokenization
import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
from unidecode import unidecode
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])





  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
# Checkpoint prefix restored into the 'bert' scope before fine-tuning, and the
# JSON file the model configuration is read from.
ALBERT_INIT_CHKPNT = 'albert-base/model.ckpt'
ALBERT_CONFIG = 'albert-base/albert_config_base.json'

In [6]:
import json
import glob

# Parallel lists: left sentence, right sentence and label of every usable pair.
left, right, label = [], [], []
# glob returns files in file-system order; sort so the sample order is the same
# on every run and every machine.
for file in sorted(glob.glob('../Malaya-Dataset/text-similarity/quora/*.json')):
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with open(file, encoding='utf-8') as fopen:
        x = json.load(fopen)
    for i in x:
        # Each record is indexed as (text, label); the text holds both
        # sentences joined by ' <> '.
        splitted = i[0].split(' <> ')
        if len(splitted) != 2:
            # Skip records that do not split into exactly two sentences.
            continue
        left.append(splitted[0])
        right.append(splitted[1])
        label.append(i[1])

In [7]:
# Sanity check: display the pieces produced for the first left-hand sentence.
tokenizer.tokenize(left[0])

['▁Bagaimana',
 'kah',
 '▁saya',
 '▁boleh',
 '▁menjadi',
 '▁ahli',
 '▁ge',
 'ologi',
 '▁yang',
 '▁baik',
 '?']

In [8]:
from tqdm import tqdm
# Length every encoded pair is padded to; the budget includes the <cls> marker
# and the two <sep> markers added by get_inputs.
MAX_SEQ_LENGTH = 100

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
              tokens_a.pop()
        else:
              tokens_b.pop()
                
def get_inputs(left, right):
    """Encode sentence pairs into fixed-length id, mask and segment lists.

    For every index of ``left`` the two sentences are tokenized, jointly
    truncated, and laid out as ``<cls> A <sep> B <sep>``. Segment ids are 0
    up to and including the first ``<sep>`` and 1 for the rest. All three
    sequences are then zero-padded up to ``MAX_SEQ_LENGTH``.

    Returns a tuple ``(input_ids, input_masks, segment_ids)`` of lists of lists.
    """
    all_ids, all_masks, all_segments = [], [], []

    for idx in tqdm(range(len(left))):
        first = tokenizer.tokenize(left[idx])
        second = tokenizer.tokenize(right[idx])
        # Three positions are reserved for <cls> and the two <sep> markers.
        _truncate_seq_pair(first, second, MAX_SEQ_LENGTH - 3)

        pieces = ["<cls>", *first, "<sep>", *second, "<sep>"]
        segment_id = [0] * (len(first) + 2) + [1] * (len(second) + 1)

        input_id = tokenizer.convert_tokens_to_ids(pieces)
        input_mask = [1] * len(input_id)

        # A non-positive count yields an empty list, i.e. no padding.
        padding = [0] * (MAX_SEQ_LENGTH - len(input_id))
        input_id.extend(padding)
        input_mask.extend(padding)
        segment_id.extend(padding)

        all_ids.append(input_id)
        all_masks.append(input_mask)
        all_segments.append(segment_id)

    return all_ids, all_masks, all_segments

In [9]:
# Encode the whole dataset once; the three lists are index-aligned with `label`.
input_ids, input_masks, segment_ids = get_inputs(left, right)

100%|██████████| 403831/403831 [01:06<00:00, 6084.31it/s]


In [10]:
# Model configuration read from ALBERT_CONFIG; consumed by the Model class.
albert_config = modeling.BertConfig.from_json_file(ALBERT_CONFIG)




In [11]:
# Hyper-parameters for the optimizer schedule. `epoch` is only used here to size
# the schedule; the training loop itself runs until early stopping triggers.
epoch = 20
batch_size = 60
warmup_proportion = 0.1
# NOTE(review): the step count is derived from len(left), i.e. the full dataset,
# while training iterates only over the 80% training split created in a later
# cell -- confirm this is intended.
num_train_steps = int(len(left) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [12]:
class Model:
    """Sentence-pair classifier: encoder pooled output followed by a dense layer.

    Building an instance adds placeholders, the encoder, a classification
    head, the loss, the training op and an accuracy metric to the default
    TensorFlow graph. Reads the module-level `albert_config`,
    `num_train_steps` and `num_warmup_steps`.
    """

    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5
    ):
        """Build the graph.

        Args:
            dimension_output: number of units of the final dense layer
                (number of classes).
            learning_rate: learning rate handed to
                `optimization.create_optimizer`.
        """
        # The placeholders are unnamed. Later cells address them in the frozen
        # graph as 'Placeholder', 'Placeholder_1' and 'Placeholder_2', so
        # presumably their creation order here must not change -- verify
        # before reordering.
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        # NOTE(review): the encoder is built with is_training=False although
        # the same graph is used for the training step; in BERT-style
        # `modeling` code this flag usually disables dropout -- confirm intent.
        model = modeling.BertModel(
            config=albert_config,
            is_training=False,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)
        
        output_layer = model.get_pooled_output()
        self.logits = tf.layers.dense(output_layer, dimension_output)
        # Give the output a fixed name so it can be fetched as 'logits' after
        # the graph is frozen.
        self.logits = tf.identity(self.logits, name = 'logits')
        
        # Mean sparse softmax cross-entropy against the integer labels.
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        # The last positional argument is presumably the use_tpu flag -- TODO
        # confirm against optimization.create_optimizer.
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        # Fraction of examples whose arg-max class equals the label.
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [13]:
# Two output classes; the classification report later labels them
# 'not similar' and 'similar'.
dimension_output = 2
learning_rate = 2e-5

# Start from an empty default graph so re-running this cell does not stack a
# second copy of the model onto the first.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

# Initialise every variable first, then overwrite the trainable variables under
# the 'bert' scope with the pretrained checkpoint. Variables outside that scope
# (e.g. the dense classification layer) keep their initial values.
sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, ALBERT_INIT_CHKPNT)


embedding_lookup_factorized. factorized embedding parameterization is used.


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.


Instructions for updating:
Use keras.layers.dense instead.
















Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from albert-base/model.ckpt


In [14]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split, and therefore every metric reported below, can
# be reproduced after a kernel restart.
SPLIT_SEED = 42

# All four sequences are split with the same shuffled indices, so ids, masks,
# segment ids and labels stay aligned.
train_input_ids, test_input_ids, train_input_masks, test_input_masks, train_segment_ids, test_segment_ids, train_Y, test_Y = train_test_split(
    input_ids, input_masks, segment_ids, label, test_size = 0.2, random_state = SPLIT_SEED)

In [15]:
from tqdm import tqdm
import time

# Early-stopping state:
#   EARLY_STOPPING     - number of consecutive non-improving epochs tolerated
#   CURRENT_CHECKPOINT - consecutive epochs without a new best test accuracy
#   CURRENT_ACC        - best mean test accuracy seen so far
#   EPOCH              - index of the epoch about to run
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 2, 0, 0, 0

# NOTE(review): no weights are saved inside this loop when accuracy improves, so
# the session ends up holding the weights of the last epoch run, not those of
# the best epoch -- confirm this is acceptable for the export cells below.
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = [], [], [], []
    # Training pass: batches are taken in the same order every epoch (the data
    # is not reshuffled here).
    pbar = tqdm(
        range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_input_ids))
        batch_x = train_input_ids[i: index]
        batch_masks = train_input_masks[i: index]
        batch_segment = train_segment_ids[i: index]
        batch_y = train_Y[i: index]
        # Fetching model.optimizer is what applies the update step.
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        # Abort immediately if the loss has become NaN.
        assert not np.isnan(cost)
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    # Evaluation pass over the held-out split; the optimizer op is not fetched.
    pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_input_ids))
        batch_x = test_input_ids[i: index]
        batch_masks = test_input_masks[i: index]
        batch_segment = test_segment_ids[i: index]
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)

    # Unweighted means of the per-batch values: a shorter final batch counts as
    # much as a full one.
    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)

    # A new best accuracy resets the patience counter; otherwise it grows.
    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 5385/5385 [32:35<00:00,  2.75it/s, accuracy=0.75, cost=0.508] 
test minibatch loop: 100%|██████████| 1347/1347 [02:54<00:00,  7.70it/s, accuracy=1, cost=0.127]    
train minibatch loop:   0%|          | 0/5385 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.789285
time taken: 2130.5657935142517
epoch: 0, training loss: 0.461285, training acc: 0.767923, valid loss: 0.427728, valid acc: 0.789285



train minibatch loop: 100%|██████████| 5385/5385 [32:34<00:00,  2.75it/s, accuracy=0.792, cost=0.456]
test minibatch loop: 100%|██████████| 1347/1347 [02:54<00:00,  7.71it/s, accuracy=1, cost=0.0975]   
train minibatch loop:   0%|          | 0/5385 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.789285, current acc: 0.812138
time taken: 2129.443473339081
epoch: 1, training loss: 0.380663, training acc: 0.821778, valid loss: 0.394985, valid acc: 0.812138



train minibatch loop: 100%|██████████| 5385/5385 [32:34<00:00,  2.76it/s, accuracy=0.792, cost=0.334]
test minibatch loop: 100%|██████████| 1347/1347 [02:54<00:00,  7.71it/s, accuracy=1, cost=0.0874]   
train minibatch loop:   0%|          | 0/5385 [00:00<?, ?it/s]

epoch: 2, pass acc: 0.812138, current acc: 0.824969
time taken: 2129.2231068611145
epoch: 2, training loss: 0.331307, training acc: 0.850237, valid loss: 0.378222, valid acc: 0.824969



train minibatch loop: 100%|██████████| 5385/5385 [32:34<00:00,  2.76it/s, accuracy=0.958, cost=0.155] 
test minibatch loop: 100%|██████████| 1347/1347 [02:54<00:00,  7.71it/s, accuracy=1, cost=0.0917]   
train minibatch loop:   0%|          | 0/5385 [00:00<?, ?it/s]

epoch: 3, pass acc: 0.824969, current acc: 0.839062
time taken: 2129.1830151081085
epoch: 3, training loss: 0.263914, training acc: 0.886173, valid loss: 0.383554, valid acc: 0.839062



train minibatch loop: 100%|██████████| 5385/5385 [32:34<00:00,  2.75it/s, accuracy=1, cost=0.0459]    
test minibatch loop: 100%|██████████| 1347/1347 [02:54<00:00,  7.71it/s, accuracy=0.857, cost=0.176]
train minibatch loop:   0%|          | 0/5385 [00:00<?, ?it/s]

time taken: 2129.500275850296
epoch: 4, training loss: 0.202765, training acc: 0.916599, valid loss: 0.481462, valid acc: 0.836110



train minibatch loop: 100%|██████████| 5385/5385 [32:40<00:00,  2.75it/s, accuracy=1, cost=0.0661]    
test minibatch loop: 100%|██████████| 1347/1347 [02:54<00:00,  7.70it/s, accuracy=0.857, cost=0.236]

time taken: 2135.572097301483
epoch: 5, training loss: 0.155870, training acc: 0.938487, valid loss: 0.543280, valid acc: 0.838102

break epoch:6






In [16]:
# Collect the true and predicted class of every held-out example.
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_input_ids), batch_size), desc = 'validation minibatch loop'
)
# NOTE: batch_x / batch_masks / batch_segment keep the values of the final batch
# after this loop and are reused by the frozen-graph check in a later cell.
for i in pbar:
    index = min(i + batch_size, len(test_input_ids))
    batch_x = test_input_ids[i: index]
    batch_masks = test_input_masks[i: index]
    batch_segment = test_segment_ids[i: index]
    batch_y = test_Y[i: index]
    # Predicted class is the arg-max over the logits. model.Y is fed as well,
    # although the logits look independent of the labels.
    predict_Y += np.argmax(sess.run(model.logits,
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
    ), 1, ).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 1347/1347 [02:52<00:00,  7.81it/s]


In [17]:
from sklearn import metrics

# Per-class precision / recall / F1 on the held-out split.
# target_names presumably maps label 0 -> 'not similar' and 1 -> 'similar';
# verify against the dataset's label convention.
print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['not similar', 'similar'],
        digits = 5
    )
)

              precision    recall  f1-score   support

 not similar    0.88273   0.85781   0.87009     51052
     similar    0.76701   0.80421   0.78517     29715

    accuracy                        0.83809     80767
   macro avg    0.82487   0.83101   0.82763     80767
weighted avg    0.84015   0.83809   0.83885     80767



In [18]:
# Rebinds `saver` to one that covers all trainable variables and writes the
# fine-tuned weights under 'bert-base-similarity/'.
# NOTE(review): the directory is named 'bert-base' although the checkpoint and
# config loaded earlier come from 'albert-base' -- confirm the naming.
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'bert-base-similarity/model.ckpt')

'bert-base-similarity/model.ckpt'

In [19]:
def freeze_graph(model_dir, output_node_names):
    """Freeze the latest checkpoint in ``model_dir`` into ``frozen_model.pb``.

    The meta graph of the checkpoint is imported into a fresh graph, its
    variables are restored and converted to constants, and the result is
    written next to the checkpoint as ``frozen_model.pb``.

    Args:
        model_dir: directory containing the checkpoint state.
        output_node_names: comma-separated names of the nodes to keep as
            outputs of the frozen graph; empty entries are ignored.

    Raises:
        AssertionError: if ``model_dir`` does not exist or holds no
            checkpoint state.
    """
    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    # get_checkpoint_state returns None when the directory has no valid
    # checkpoint state; fail with a clear message instead of an AttributeError.
    if checkpoint is None or not checkpoint.model_checkpoint_path:
        raise AssertionError(
            'No checkpoint state found in directory: %s' % model_dir
        )
    input_checkpoint = checkpoint.model_checkpoint_path

    # os.path handles both a bare checkpoint name (no directory part) and
    # platform-specific separators, unlike splitting on '/'.
    output_graph = os.path.join(
        os.path.dirname(input_checkpoint), 'frozen_model.pb'
    )
    node_names = [name for name in output_node_names.split(',') if name]

    with tf.Session(graph = tf.Graph()) as sess:
        # clear_devices drops device placements recorded in the meta graph.
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = True
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            node_names,
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [20]:
# Choose the graph nodes to keep as outputs when freezing. A node is kept when
# its op mentions 'Variable' or its name contains one of the wanted fragments,
# unless its name contains one of the excluded fragments.
_WANTED_FRAGMENTS = ('Placeholder', 'logits', 'alphas', 'self/Softmax')
_EXCLUDED_FRAGMENTS = ('adam', 'beta', 'global_step')

selected_names = []
for node in tf.get_default_graph().as_graph_def().node:
    wanted = 'Variable' in node.op or any(
        fragment in node.name for fragment in _WANTED_FRAGMENTS
    )
    excluded = any(fragment in node.name for fragment in _EXCLUDED_FRAGMENTS)
    if wanted and not excluded:
        selected_names.append(node.name)

strings = ','.join(selected_names)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Placeholder_3',
 'bert/embeddings/word_embeddings',
 'bert/embeddings/word_embeddings_2',
 'bert/embeddings/token_type_embeddings',
 'bert/embeddings/position_embeddings',
 'bert/embeddings/LayerNorm/gamma',
 'bert/encoder/layer_shared/attention/output/LayerNorm/gamma',
 'bert/encoder/layer_shared/attention/self/query/kernel',
 'bert/encoder/layer_shared/attention/self/query/bias',
 'bert/encoder/layer_shared/attention/self/key/kernel',
 'bert/encoder/layer_shared/attention/self/key/bias',
 'bert/encoder/layer_shared/attention/self/value/kernel',
 'bert/encoder/layer_shared/attention/self/value/bias',
 'bert/encoder/layer_shared/attention/self/Softmax',
 'bert/encoder/layer_shared/attention/output/dense/kernel',
 'bert/encoder/layer_shared/attention/output/dense/bias',
 'bert/encoder/layer_shared/output/LayerNorm/gamma',
 'bert/encoder/layer_shared/intermediate/dense/kernel',
 'bert/encoder/layer_shared/intermediate/dense/bias',
 'b

In [21]:
# Freeze the checkpoint saved above, keeping the selected nodes as outputs.
freeze_graph('bert-base-similarity', strings)

INFO:tensorflow:Restoring parameters from bert-base-similarity/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 26 variables.
INFO:tensorflow:Converted 26 variables to const ops.
5389 ops in the final graph.


In [22]:
def load_graph(frozen_graph_filename):
    """Read a serialized GraphDef from disk and import it into a new graph.

    The import uses the default name scope, so tensors of the returned graph
    are addressed with the 'import/' prefix.
    """
    frozen_def = tf.GraphDef()
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as pb_file:
        frozen_def.ParseFromString(pb_file.read())

    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(frozen_def)
    return graph

In [23]:
# Smoke-test the frozen graph on the last batch left over from the prediction
# loop (batch_x / batch_segment / batch_masks).
# The tensors get their own `frozen_*` names so this cell no longer overwrites
# the dataset-level `x`, `segment_ids` and `input_masks` that earlier cells
# (e.g. the train/test split) depend on when re-run.
g = load_graph('bert-base-similarity/frozen_model.pb')
frozen_x = g.get_tensor_by_name('import/Placeholder:0')
frozen_segment_ids = g.get_tensor_by_name('import/Placeholder_1:0')
frozen_input_masks = g.get_tensor_by_name('import/Placeholder_2:0')
frozen_logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
# Softmax turns the logits into per-class probabilities.
result = test_sess.run(tf.nn.softmax(frozen_logits), feed_dict = {frozen_x: batch_x,
                                           frozen_segment_ids: batch_segment,
                                           frozen_input_masks: batch_masks})
result



array([[9.9936980e-01, 6.3020526e-04],
       [8.0295616e-01, 1.9704384e-01],
       [9.9999726e-01, 2.6890016e-06],
       [3.3331758e-03, 9.9666685e-01],
       [9.9999762e-01, 2.3260109e-06],
       [9.9951994e-01, 4.8006378e-04],
       [2.1933889e-02, 9.7806609e-01]], dtype=float32)

In [24]:
import boto3

# Upload the frozen model to S3.
# `Key` is the path of the frozen graph written above; `outPutname` is passed
# as the third argument of upload_file.
bucketName = 'huseinhouse-storage'
Key = 'bert-base-similarity/frozen_model.pb'
outPutname = "v30/similarity/albert-base-similarity.pb"

# NOTE(review): the credentials are blank string literals here. Do not paste
# real keys into the notebook; supply them from the environment or a
# credentials file instead.
s3 = boto3.client('s3',
                 aws_access_key_id='',
                 aws_secret_access_key='')

# Argument order is presumably (local filename, bucket, object key) -- confirm
# against the boto3 documentation.
s3.upload_file(Key,bucketName,outPutname)