In [1]:
import json
import re
import sentencepiece as spm

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
from prepro_utils import preprocess_text, encode_ids, encode_pieces

sp_model = spm.SentencePieceProcessor()
sp_model.Load('bert-small/sp10m.cased.v4.model')

with open('bert-small/sp10m.cased.v4.vocab') as fopen:
    v = fopen.read().split('\n')[:-1]
v = [i.split('\t') for i in v]
v = {i[0]: i[1] for i in v}

class Tokenizer:
    def __init__(self, v):
        self.vocab = v
        pass
    
    def tokenize(self, string):
        return encode_pieces(sp_model, string, return_unicode=False, sample=False)
    
    def convert_tokens_to_ids(self, tokens):
        return [sp_model.PieceToId(piece) for piece in tokens]
    
    def convert_ids_to_tokens(self, ids):
        return [sp_model.IdToPiece(i) for i in ids]
    
tokenizer = Tokenizer(v)

In [4]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
from unidecode import unidecode
import re

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])





  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
BERT_INIT_CHKPNT = 'bert-small/model.ckpt'
BERT_CONFIG = 'bert-small/bert_config.json'

In [6]:
# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/subjectivity/subjectivity-negative-bm.txt
# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/subjectivity/subjectivity-positive-bm.txt

In [7]:
with open('subjectivity-negative-bm.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('subjectivity-positive-bm.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts

assert len(labels) == len(texts)

In [8]:

tokenizer.tokenize(texts[1])

['▁yang',
 '▁muncul',
 '▁dari',
 '▁jiwa',
 '▁manusia',
 '▁dan',
 '▁menunjukkan',
 '▁ciri',
 '-',
 'ciri',
 '▁',
 'abstrak',
 '▁express',
 'ion',
 'ism',
 '▁',
 'abstrak',
 '▁dan',
 '▁penyingkiran',
 '▁',
 'grafi',
 'ti',
 '▁kon',
 's',
 'truk',
 'tiv',
 'isme',
 '▁russian',
 '▁telah',
 '▁menguatkan',
 '▁tempatnya',
 '▁dalam',
 '▁sejarah',
 '▁seni',
 '▁moden',
 '▁ketika',
 '▁dicipta',
 '▁oleh',
 '▁artis',
 '▁yang',
 '▁tidak',
 '▁sedar',
 'kan',
 '▁diri',
 '▁dengan',
 '▁pencapaian',
 '▁kesenian',
 '▁mereka']

In [9]:
list(v.keys())[:10]

['<unk>',
 '<s>',
 '</s>',
 '<cls>',
 '<sep>',
 '<pad>',
 '<mask>',
 '<eod>',
 '<eop>',
 '.']

In [10]:
from tqdm import tqdm

input_ids = []

for text in tqdm(texts):
    tokens_a = tokenizer.tokenize(text)
    tokens = ["<cls>"] + tokens_a + ["<sep>"]
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    
    input_ids.append(input_id)

100%|██████████| 9962/9962 [00:01<00:00, 8948.91it/s]


In [11]:
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)




In [12]:
epoch = 10
batch_size = 60
warmup_proportion = 0.1
num_train_steps = int(len(texts) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [13]:
bert_config.hidden_size

512

In [14]:
def create_initializer(initializer_range=0.02):
    return tf.truncated_normal_initializer(stddev=initializer_range)

class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
        training = True,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=training,
            input_ids=self.X,
            use_one_hot_embeddings=False)
        
        output_layer = model.get_sequence_output()
        output_layer = tf.layers.dense(
            output_layer,
            bert_config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=create_initializer())
        self.logits_seq = tf.layers.dense(output_layer, dimension_output,
                                         kernel_initializer=create_initializer())
        self.logits_seq = tf.identity(self.logits_seq, name = 'logits_seq')
        self.logits = self.logits_seq[:, 0]
        self.logits = tf.identity(self.logits, name = 'logits')
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [15]:
dimension_output = 2
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)




The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dense instead.










Instructions for updating:
Deprecated in favor of operator or tf.math.divide.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from bert-small/model.ckpt


In [16]:
from sklearn.model_selection import train_test_split

train_input_ids, test_input_ids, train_Y, test_Y = train_test_split(
    input_ids, labels, test_size = 0.2
)

In [17]:
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

In [18]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = [], [], [], []
    pbar = tqdm(
        range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_input_ids))
        batch_x = train_input_ids[i: index]
        batch_x = pad_sequences(batch_x, padding='post')
        batch_y = train_Y[i: index]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
            },
        )
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_input_ids))
        batch_x = test_input_ids[i: index]
        batch_x = pad_sequences(batch_x, padding='post')
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)
        
    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 133/133 [00:15<00:00,  8.60it/s, accuracy=0.857, cost=0.326]
test minibatch loop: 100%|██████████| 34/34 [00:01<00:00, 25.73it/s, accuracy=1, cost=0.116]    
train minibatch loop:   1%|          | 1/133 [00:00<00:22,  5.75it/s, accuracy=0.917, cost=0.257]

epoch: 0, pass acc: 0.000000, current acc: 0.900490
time taken: 16.786338090896606
epoch: 0, training loss: 0.403547, training acc: 0.811833, valid loss: 0.256257, valid acc: 0.900490



train minibatch loop: 100%|██████████| 133/133 [00:13<00:00,  9.65it/s, accuracy=0.939, cost=0.156] 
test minibatch loop: 100%|██████████| 34/34 [00:01<00:00, 30.91it/s, accuracy=0.692, cost=0.935]
train minibatch loop:   1%|          | 1/133 [00:00<00:23,  5.72it/s, accuracy=0.917, cost=0.197]

time taken: 14.887073040008545
epoch: 1, training loss: 0.221262, training acc: 0.913073, valid loss: 0.392999, valid acc: 0.846342



train minibatch loop: 100%|██████████| 133/133 [00:13<00:00,  9.69it/s, accuracy=0.98, cost=0.0631] 
test minibatch loop: 100%|██████████| 34/34 [00:01<00:00, 30.91it/s, accuracy=0.846, cost=0.727] 
train minibatch loop:   1%|          | 1/133 [00:00<00:23,  5.72it/s, accuracy=0.983, cost=0.0652]

time taken: 14.831328868865967
epoch: 2, training loss: 0.137727, training acc: 0.951100, valid loss: 0.314562, valid acc: 0.894495



train minibatch loop: 100%|██████████| 133/133 [00:13<00:00,  9.66it/s, accuracy=1, cost=0.00649]   
test minibatch loop: 100%|██████████| 34/34 [00:01<00:00, 30.95it/s, accuracy=0.769, cost=0.883] 
train minibatch loop:   1%|          | 1/133 [00:00<00:23,  5.72it/s, accuracy=1, cost=0.00886]

epoch: 3, pass acc: 0.900490, current acc: 0.911350
time taken: 14.87791919708252
epoch: 3, training loss: 0.082044, training acc: 0.972807, valid loss: 0.319659, valid acc: 0.911350



train minibatch loop: 100%|██████████| 133/133 [00:13<00:00,  9.66it/s, accuracy=1, cost=0.00592]   
test minibatch loop: 100%|██████████| 34/34 [00:01<00:00, 30.98it/s, accuracy=0.846, cost=0.865] 
train minibatch loop:   1%|          | 1/133 [00:00<00:22,  5.78it/s, accuracy=1, cost=0.00334]

time taken: 14.876336574554443
epoch: 4, training loss: 0.054280, training acc: 0.982957, valid loss: 0.362106, valid acc: 0.906750



train minibatch loop: 100%|██████████| 133/133 [00:13<00:00,  9.67it/s, accuracy=1, cost=0.00283]   
test minibatch loop: 100%|██████████| 34/34 [00:01<00:00, 31.07it/s, accuracy=0.846, cost=0.854]
train minibatch loop:   1%|          | 1/133 [00:00<00:23,  5.72it/s, accuracy=1, cost=0.00199]

epoch: 5, pass acc: 0.911350, current acc: 0.914593
time taken: 14.852880001068115
epoch: 5, training loss: 0.035213, training acc: 0.989724, valid loss: 0.361177, valid acc: 0.914593



train minibatch loop: 100%|██████████| 133/133 [00:13<00:00,  9.65it/s, accuracy=1, cost=0.00619]   
test minibatch loop: 100%|██████████| 34/34 [00:01<00:00, 31.20it/s, accuracy=0.846, cost=0.744]
train minibatch loop:   1%|          | 1/133 [00:00<00:22,  5.79it/s, accuracy=1, cost=0.0015]

time taken: 14.872130155563354
epoch: 6, training loss: 0.022985, training acc: 0.992857, valid loss: 0.485655, valid acc: 0.897436



train minibatch loop: 100%|██████████| 133/133 [00:13<00:00,  9.68it/s, accuracy=0.98, cost=0.0211] 
test minibatch loop: 100%|██████████| 34/34 [00:01<00:00, 30.82it/s, accuracy=0.769, cost=1.25] 
train minibatch loop:   1%|          | 1/133 [00:00<00:22,  5.74it/s, accuracy=1, cost=0.000903]

time taken: 14.853012084960938
epoch: 7, training loss: 0.015787, training acc: 0.995962, valid loss: 0.521133, valid acc: 0.896644



train minibatch loop: 100%|██████████| 133/133 [00:13<00:00,  9.66it/s, accuracy=1, cost=0.000702]  
test minibatch loop: 100%|██████████| 34/34 [00:01<00:00, 31.13it/s, accuracy=0.769, cost=1.5]  

time taken: 14.864527463912964
epoch: 8, training loss: 0.012661, training acc: 0.995865, valid loss: 0.471805, valid acc: 0.905468

break epoch:9






In [19]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'bert-small-subjectivity/model.ckpt')

'bert-small-subjectivity/model.ckpt'

In [20]:
dimension_output = 2
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate,
    training = False
)

sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.trainable_variables())
saver.restore(sess, 'bert-small-subjectivity/model.ckpt')











INFO:tensorflow:Restoring parameters from bert-small-subjectivity/model.ckpt


In [21]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name
        or 'self/Softmax' in n.name)
        and 'adam' not in n.name
        and 'beta' not in n.name
        and 'global_step' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'bert/embeddings/word_embeddings',
 'bert/embeddings/token_type_embeddings',
 'bert/embeddings/position_embeddings',
 'bert/embeddings/LayerNorm/gamma',
 'bert/encoder/layer_0/attention/self/query/kernel',
 'bert/encoder/layer_0/attention/self/query/bias',
 'bert/encoder/layer_0/attention/self/key/kernel',
 'bert/encoder/layer_0/attention/self/key/bias',
 'bert/encoder/layer_0/attention/self/value/kernel',
 'bert/encoder/layer_0/attention/self/value/bias',
 'bert/encoder/layer_0/attention/self/Softmax',
 'bert/encoder/layer_0/attention/output/dense/kernel',
 'bert/encoder/layer_0/attention/output/dense/bias',
 'bert/encoder/layer_0/attention/output/LayerNorm/gamma',
 'bert/encoder/layer_0/intermediate/dense/kernel',
 'bert/encoder/layer_0/intermediate/dense/bias',
 'bert/encoder/layer_0/output/dense/kernel',
 'bert/encoder/layer_0/output/dense/bias',
 'bert/encoder/layer_0/output/LayerNorm/gamma',
 'bert/encoder/layer_1/attention/self/query/kernel',
 

In [22]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_input_ids), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i + batch_size, len(test_input_ids))
    batch_x = test_input_ids[i: index]
    batch_x = pad_sequences(batch_x, padding='post')
    batch_y = test_Y[i: index]
    
    predict_Y += np.argmax(sess.run(model.logits,
            feed_dict = {
            model.X: batch_x,
            },
    ), 1, ).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 34/34 [00:01<00:00, 30.09it/s]


In [23]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['negative', 'positive'],digits = 5
    )
)

              precision    recall  f1-score   support

    negative    0.89731   0.92402   0.91047       974
    positive    0.92525   0.89892   0.91190      1019

    accuracy                        0.91119      1993
   macro avg    0.91128   0.91147   0.91118      1993
weighted avg    0.91160   0.91119   0.91120      1993



In [24]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [25]:
freeze_graph('bert-small-subjectivity', strings)

INFO:tensorflow:Restoring parameters from bert-small-subjectivity/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 107 variables.
INFO:tensorflow:Converted 107 variables to const ops.
3582 ops in the final graph.


In [26]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [27]:
g = load_graph('bert-small-subjectivity/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
result = test_sess.run(tf.nn.softmax(logits), feed_dict = {x: [input_id]})
result



array([[4.2893685e-04, 9.9957103e-01]], dtype=float32)

In [28]:
import boto3

bucketName = 'huseinhouse-storage'
Key = 'bert-small-subjectivity/frozen_model.pb'
outPutname = "v30/subjective/bert-small-subjective.pb"

s3 = boto3.client('s3',
                 aws_access_key_id='',
                 aws_secret_access_key='')

s3.upload_file(Key,bucketName,outPutname)