## Download model

Download any models you prefer.

In [6]:
# !wget https://huseinhouse-storage.s3-ap-southeast-1.amazonaws.com/bert-bahasa/bert-bahasa-9-july-2019.tar.gz
# !tar -zxf bert-bahasa-9-july-2019.tar.gz
# !pip3 install bert-tensorflow --user

In [4]:
!ls bert-bahasa-9-july-2019

config.json				model.ckpt-1000000.meta
model.ckpt-1000000.data-00000-of-00001	sp10m.cased.v4.model
model.ckpt-1000000.index		sp10m.cased.v4.vocab


In [5]:
import json
import re
import sentencepiece as spm

In [8]:
from prepro_utils import preprocess_text, encode_ids, encode_pieces

sp_model = spm.SentencePieceProcessor()
sp_model.Load('bert-bahasa-9-july-2019/sp10m.cased.v4.model')

with open('bert-bahasa-9-july-2019/sp10m.cased.v4.vocab') as fopen:
    v = fopen.read().split('\n')[:-1]
v = [i.split('\t') for i in v]
v = {i[0]: i[1] for i in v}

class Tokenizer:
    def __init__(self, v):
        self.vocab = v
        pass
    
    def tokenize(self, string):
        return encode_pieces(sp_model, string, return_unicode=False, sample=False)
    
    def convert_tokens_to_ids(self, tokens):
        return [sp_model.PieceToId(piece) for piece in tokens]
    
    def convert_ids_to_tokens(self, ids):
        return [sp_model.IdToPiece(i) for i in ids]
    
tokenizer = Tokenizer(v)

In [11]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
from unidecode import unidecode
import re

In [12]:
BERT_INIT_CHKPNT = 'bert-bahasa-9-july-2019/model.ckpt-1000000'
BERT_CONFIG = 'bert-bahasa-9-july-2019/config.json'

In [13]:
# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/subjectivity/subjectivity-negative-bm.txt
# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/subjectivity/subjectivity-positive-bm.txt

In [14]:
with open('subjectivity-negative-bm.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('subjectivity-positive-bm.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts

assert len(labels) == len(texts)

In [15]:
MAX_SEQ_LENGTH = 100
tokenizer.tokenize(texts[1])

['▁yang',
 '▁muncul',
 '▁dari',
 '▁jiwa',
 '▁manusia',
 '▁dan',
 '▁menunjukkan',
 '▁ciri',
 '-',
 'ciri',
 '▁',
 'abstrak',
 '▁express',
 'ion',
 'ism',
 '▁',
 'abstrak',
 '▁dan',
 '▁penyingkiran',
 '▁',
 'grafi',
 'ti',
 '▁kon',
 's',
 'truk',
 'tiv',
 'isme',
 '▁russian',
 '▁telah',
 '▁menguatkan',
 '▁tempatnya',
 '▁dalam',
 '▁sejarah',
 '▁seni',
 '▁moden',
 '▁ketika',
 '▁dicipta',
 '▁oleh',
 '▁artis',
 '▁yang',
 '▁tidak',
 '▁sedar',
 'kan',
 '▁diri',
 '▁dengan',
 '▁pencapaian',
 '▁kesenian',
 '▁mereka']

In [16]:
from tqdm import tqdm

input_ids, input_masks, segment_ids = [], [], []

for text in tqdm(texts):
    tokens_a = tokenizer.tokenize(text)
    if len(tokens_a) > MAX_SEQ_LENGTH - 2:
        tokens_a = tokens_a[:(MAX_SEQ_LENGTH - 2)]
    tokens = ["<cls>"] + tokens_a + ["<sep>"]
    segment_id = [0] * len(tokens)
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    padding = [0] * (MAX_SEQ_LENGTH - len(input_id))
    input_id += padding
    input_mask += padding
    segment_id += padding
    
    input_ids.append(input_id)
    input_masks.append(input_mask)
    segment_ids.append(segment_id)

100%|██████████| 9962/9962 [00:01<00:00, 7403.98it/s]


In [17]:
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [18]:
epoch = 10
batch_size = 60
warmup_proportion = 0.1
num_train_steps = int(len(texts) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [19]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=False,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)
        
        output_layer = model.get_pooled_output()
        self.logits = tf.layers.dense(output_layer, dimension_output)
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [20]:
dimension_output = 2
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)

Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from bert-bahasa-9-july-2019/model.ckpt-1000000


In [21]:
from sklearn.cross_validation import train_test_split

train_input_ids, test_input_ids, train_input_masks, test_input_masks, train_segment_ids, test_segment_ids, train_Y, test_Y = train_test_split(
    input_ids, input_masks, segment_ids, labels, test_size = 0.2
)



In [22]:
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_input_ids))
        batch_x = train_input_ids[i: index]
        batch_masks = train_input_masks[i: index]
        batch_segment = train_segment_ids[i: index]
        batch_y = train_Y[i: index]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
    pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_input_ids))
        batch_x = test_input_ids[i: index]
        batch_masks = test_input_masks[i: index]
        batch_segment = test_segment_ids[i: index]
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_input_ids) / batch_size
    train_acc /= len(train_input_ids) / batch_size
    test_loss /= len(test_input_ids) / batch_size
    test_acc /= len(test_input_ids) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 133/133 [00:52<00:00,  2.74it/s, accuracy=0.918, cost=0.158]
test minibatch loop: 100%|██████████| 34/34 [00:04<00:00,  7.81it/s, accuracy=1, cost=0.0594]   
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.910186
time taken: 56.43599534034729
epoch: 0, training loss: 0.357670, training acc: 0.836630, valid loss: 0.275480, valid acc: 0.910186



train minibatch loop: 100%|██████████| 133/133 [00:50<00:00,  2.74it/s, accuracy=0.959, cost=0.081] 
test minibatch loop: 100%|██████████| 34/34 [00:04<00:00,  8.31it/s, accuracy=1, cost=0.0618]   
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.910186, current acc: 0.926242
time taken: 54.693442583084106
epoch: 1, training loss: 0.197910, training acc: 0.927036, valid loss: 0.275675, valid acc: 0.926242



train minibatch loop: 100%|██████████| 133/133 [00:50<00:00,  2.72it/s, accuracy=0.98, cost=0.0683] 
test minibatch loop: 100%|██████████| 34/34 [00:04<00:00,  8.27it/s, accuracy=0.846, cost=0.824]
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

time taken: 54.85988974571228
epoch: 2, training loss: 0.087594, training acc: 0.972616, valid loss: 0.569989, valid acc: 0.875950



train minibatch loop: 100%|██████████| 133/133 [00:50<00:00,  2.72it/s, accuracy=1, cost=0.00078]   
test minibatch loop: 100%|██████████| 34/34 [00:04<00:00,  8.25it/s, accuracy=0.846, cost=0.53] 
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

time taken: 55.07711744308472
epoch: 3, training loss: 0.035270, training acc: 0.990589, valid loss: 0.486300, valid acc: 0.915087



train minibatch loop: 100%|██████████| 133/133 [00:50<00:00,  2.72it/s, accuracy=1, cost=0.000849]  
test minibatch loop: 100%|██████████| 34/34 [00:04<00:00,  8.24it/s, accuracy=1, cost=0.00546]  
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

epoch: 4, pass acc: 0.926242, current acc: 0.934772
time taken: 54.99451947212219
epoch: 4, training loss: 0.013198, training acc: 0.997992, valid loss: 0.602157, valid acc: 0.934772



train minibatch loop: 100%|██████████| 133/133 [00:50<00:00,  2.72it/s, accuracy=1, cost=1.38e-5]   
test minibatch loop: 100%|██████████| 34/34 [00:04<00:00,  8.23it/s, accuracy=0.923, cost=0.116]
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

time taken: 55.04400396347046
epoch: 5, training loss: 0.003051, training acc: 1.000376, valid loss: 0.753478, valid acc: 0.923926



train minibatch loop: 100%|██████████| 133/133 [00:50<00:00,  2.72it/s, accuracy=1, cost=7.52e-6] 
test minibatch loop: 100%|██████████| 34/34 [00:04<00:00,  8.25it/s, accuracy=0.923, cost=0.109]
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

time taken: 55.0029354095459
epoch: 6, training loss: 0.000175, training acc: 1.001380, valid loss: 0.736715, valid acc: 0.929445



train minibatch loop: 100%|██████████| 133/133 [00:50<00:00,  2.72it/s, accuracy=1, cost=4.75e-6]
test minibatch loop: 100%|██████████| 34/34 [00:04<00:00,  8.26it/s, accuracy=0.923, cost=0.266]

time taken: 54.99624180793762
epoch: 7, training loss: 0.000011, training acc: 1.001380, valid loss: 0.754906, valid acc: 0.926435

break epoch:8






In [23]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_input_ids), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i + batch_size, len(test_input_ids))
    batch_x = test_input_ids[i: index]
    batch_masks = test_input_masks[i: index]
    batch_segment = test_segment_ids[i: index]
    batch_y = test_Y[i: index]
    predict_Y += np.argmax(sess.run(model.logits,
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
    ), 1, ).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 34/34 [00:04<00:00,  7.83it/s]


In [24]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['negative', 'positive'],digits=5
    )
)

             precision    recall  f1-score   support

   negative    0.91466   0.89665   0.90557      1016
   positive    0.89468   0.91300   0.90375       977

avg / total    0.90487   0.90467   0.90468      1993

