In [1]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
from unidecode import unidecode
import re

In [2]:
BERT_INIT_CHKPNT = 'bert-entities/model.ckpt'
BERT_CONFIG = 'bert-bahasa/config.json'

In [3]:
from tqdm import tqdm
import malaya

In [4]:
_tokenizer = malaya.preprocessing._SocialTokenizer().tokenize
rules_normalizer = malaya.texts._tatabahasa.rules_normalizer
rejected = ['wkwk', 'http', 'https', 'lolol', 'hahaha']

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    string = ''.join(''.join(s)[:2] for _, s in itertools.groupby(unidecode(string)))
    tokenized = _tokenizer(string)
    tokenized = [malaya.stem.naive(w) for w in tokenized]
    tokenized = [w.lower() for w in tokenized if len(w) > 1]
    tokenized = [w for w in tokenized if all([r not in w for r in rejected])]
    tokenized = [rules_normalizer.get(w, w) for w in tokenized]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

with open('dictionary.json') as fopen:
    d = json.load(fopen)
dictionary = d['dictionary']
rev_dictionary = d['reverse_dictionary']

class Tokenizer:
    def __init__(self, vocab, rev_dictionary):
        self.vocab = vocab
        self.inv_vocab = rev_dictionary
    
    def tokenize(self, string):
        return preprocessing(string)
    
    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.get(t, 1) for t in tokens]
    
    def convert_ids_to_tokens(self, ids):
        return [self.inv_vocab[i] for i in ids]
    
tokenizer = Tokenizer(dictionary, rev_dictionary)

In [5]:
with open('subjectivity-negative-bm.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('subjectivity-positive-bm.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts

assert len(labels) == len(texts)

In [6]:
MAX_SEQ_LENGTH = 100

In [7]:
tokenizer.tokenize(texts[1])

['yang',
 'muncul',
 'dari',
 'jiwa',
 'manusia',
 'unjuk',
 'ciri',
 'ciri',
 'abstrak',
 'expressionism',
 'abstrak',
 'yingkir',
 'grafiti',
 'konstruktivisme',
 'russi',
 'te',
 'uat',
 'tempat',
 'dalam',
 'jarah',
 'ini',
 'moden',
 'tika',
 'cipta',
 'oleh',
 'artis',
 'yang',
 'tidak',
 'dar',
 'ri',
 'deng',
 'capai',
 'seni',
 'reka']

In [8]:
input_ids, input_masks, segment_ids = [], [], []

for text in tqdm(texts):
    tokens_a = tokenizer.tokenize(text)
    if len(tokens_a) > MAX_SEQ_LENGTH - 2:
        tokens_a = tokens_a[:(MAX_SEQ_LENGTH - 2)]
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_id = [0] * len(tokens)
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    padding = [0] * (MAX_SEQ_LENGTH - len(input_id))
    input_id += padding
    input_mask += padding
    segment_id += padding
    
    input_ids.append(input_id)
    input_masks.append(input_mask)
    segment_ids.append(segment_id)

100%|██████████| 9962/9962 [00:03<00:00, 2572.44it/s]


In [9]:
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [10]:
epoch = 10
batch_size = 60
warmup_proportion = 0.1
num_train_steps = int(len(texts) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [11]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=True,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)
        
        output_layer = model.get_pooled_output()
        self.logits = tf.layers.dense(output_layer, dimension_output)
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [12]:
dimension_output = 2
learning_rate = 1e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from bert-entities/model.ckpt


In [13]:
from sklearn.cross_validation import train_test_split

train_input_ids, test_input_ids, train_input_masks, test_input_masks, train_segment_ids, test_segment_ids, train_Y, test_Y = train_test_split(
    input_ids, input_masks, segment_ids, labels, test_size = 0.2
)

In [None]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_input_ids))
        batch_x = train_input_ids[i: index]
        batch_masks = train_input_masks[i: index]
        batch_segment = train_segment_ids[i: index]
        batch_y = train_Y[i: index]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_input_ids))
        batch_x = test_input_ids[i: index]
        batch_masks = test_input_masks[i: index]
        batch_segment = test_segment_ids[i: index]
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_input_ids) / batch_size
    train_acc /= len(train_input_ids) / batch_size
    test_loss /= len(test_input_ids) / batch_size
    test_acc /= len(test_input_ids) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 133/133 [00:28<00:00,  5.11it/s, accuracy=0.918, cost=0.21] 
test minibatch loop: 100%|██████████| 34/34 [00:02<00:00, 14.01it/s, accuracy=0.923, cost=0.325]
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.915898
time taken: 31.009352922439575
epoch: 0, training loss: 0.467018, training acc: 0.757950, valid loss: 0.279691, valid acc: 0.915898



train minibatch loop: 100%|██████████| 133/133 [00:27<00:00,  5.11it/s, accuracy=0.98, cost=0.0793] 
test minibatch loop: 100%|██████████| 34/34 [00:02<00:00, 15.91it/s, accuracy=0.846, cost=0.328]
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.915898, current acc: 0.933652
time taken: 29.295251607894897
epoch: 1, training loss: 0.234492, training acc: 0.907614, valid loss: 0.231998, valid acc: 0.933652



train minibatch loop: 100%|██████████| 133/133 [00:27<00:00,  5.11it/s, accuracy=1, cost=0.0358]    
test minibatch loop: 100%|██████████| 34/34 [00:02<00:00, 15.90it/s, accuracy=0.923, cost=0.103]
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

epoch: 2, pass acc: 0.933652, current acc: 0.934463
time taken: 29.30099081993103
epoch: 2, training loss: 0.154732, training acc: 0.943155, valid loss: 0.241546, valid acc: 0.934463



train minibatch loop: 100%|██████████| 133/133 [00:27<00:00,  5.11it/s, accuracy=1, cost=0.0128]    
test minibatch loop: 100%|██████████| 34/34 [00:02<00:00, 15.88it/s, accuracy=0.923, cost=0.343] 
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

epoch: 3, pass acc: 0.934463, current acc: 0.936972
time taken: 29.323830366134644
epoch: 3, training loss: 0.116042, training acc: 0.960346, valid loss: 0.255636, valid acc: 0.936972



train minibatch loop: 100%|██████████| 133/133 [00:27<00:00,  5.10it/s, accuracy=1, cost=0.0206]    
test minibatch loop: 100%|██████████| 34/34 [00:02<00:00, 15.89it/s, accuracy=0.923, cost=0.514]
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

epoch: 4, pass acc: 0.936972, current acc: 0.937975
time taken: 29.332242727279663
epoch: 4, training loss: 0.084044, training acc: 0.971515, valid loss: 0.276147, valid acc: 0.937975



train minibatch loop: 100%|██████████| 133/133 [00:27<00:00,  5.12it/s, accuracy=1, cost=0.00727]   
test minibatch loop: 100%|██████████| 34/34 [00:02<00:00, 15.86it/s, accuracy=0.923, cost=0.479]
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

time taken: 29.367876291275024
epoch: 5, training loss: 0.073976, training acc: 0.977412, valid loss: 0.349317, valid acc: 0.924930



train minibatch loop: 100%|██████████| 133/133 [00:27<00:00,  5.12it/s, accuracy=1, cost=0.00174]   
test minibatch loop: 100%|██████████| 34/34 [00:02<00:00, 15.88it/s, accuracy=0.923, cost=0.372]
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

epoch: 6, pass acc: 0.937975, current acc: 0.939480
time taken: 29.362041234970093
epoch: 6, training loss: 0.052359, training acc: 0.985318, valid loss: 0.324266, valid acc: 0.939480



train minibatch loop: 100%|██████████| 133/133 [00:27<00:00,  5.10it/s, accuracy=1, cost=0.00147]   
test minibatch loop: 100%|██████████| 34/34 [00:02<00:00, 15.89it/s, accuracy=0.846, cost=0.396]
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

epoch: 7, pass acc: 0.939480, current acc: 0.945695
time taken: 29.364604711532593
epoch: 7, training loss: 0.048711, training acc: 0.985318, valid loss: 0.314287, valid acc: 0.945695



train minibatch loop: 100%|██████████| 133/133 [00:27<00:00,  5.10it/s, accuracy=1, cost=0.00109]   
test minibatch loop: 100%|██████████| 34/34 [00:02<00:00, 15.89it/s, accuracy=0.923, cost=0.434] 
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

time taken: 29.361509561538696
epoch: 8, training loss: 0.042602, training acc: 0.987200, valid loss: 0.363322, valid acc: 0.943996



train minibatch loop: 100%|██████████| 133/133 [00:27<00:00,  5.10it/s, accuracy=1, cost=0.00055]   
test minibatch loop: 100%|██████████| 34/34 [00:02<00:00, 15.85it/s, accuracy=0.923, cost=0.507]
train minibatch loop:   0%|          | 0/133 [00:00<?, ?it/s]

time taken: 29.36323094367981
epoch: 9, training loss: 0.037917, training acc: 0.989459, valid loss: 0.356021, valid acc: 0.935466



train minibatch loop:  90%|█████████ | 120/133 [00:24<00:02,  4.87it/s, accuracy=1, cost=0.0031]    

In [None]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_input_ids), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i + batch_size, len(test_input_ids))
    batch_x = test_input_ids[i: index]
    batch_masks = test_input_masks[i: index]
    batch_segment = test_segment_ids[i: index]
    batch_y = test_Y[i: index]
    predict_Y += np.argmax(sess.run(model.logits,
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
    ), 1, ).tolist()
    real_Y += batch_y

In [None]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['negative', 'positive']
    )
)