In [1]:
# !pip3 install bert-tensorflow --user
# !wget https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
# !unzip multi_cased_L-12_H-768_A-12.zip

In [2]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
from unidecode import unidecode
import re

In [3]:
# Checkpoint prefix whose weights are restored into the graph before fine-tuning
# (consumed by `saver.restore` further down).
BERT_INIT_CHKPNT = 'bert-sentiment/model.ckpt'
# JSON file passed to `modeling.BertConfig.from_json_file` to describe the encoder.
BERT_CONFIG = 'bert-bahasa/config.json'

In [4]:
from tqdm import tqdm
import malaya

In [6]:
# Bound `tokenize` method used by `preprocessing` to split a string into tokens.
# NOTE(review): `_SocialTokenizer` and `_tatabahasa` are underscore-prefixed
# (private) malaya names - they may move between malaya releases; confirm the
# pinned version.
_tokenizer = malaya.preprocessing._SocialTokenizer().tokenize
# Mapping consulted with `.get(w, w)` in `preprocessing`; presumably maps
# informal spellings to a normalized form - verify against malaya.
rules_normalizer = malaya.texts._tatabahasa.rules_normalizer
# Any token that CONTAINS one of these substrings is dropped by `preprocessing`.
rejected = ['wkwk', 'http', 'https', 'lolol', 'hahaha']

def is_number_regex(s):
    """Return True when `s` looks like an unsigned integer or decimal number.

    A string qualifies if it is made only of digits (`str.isdigit`) or if it
    has the form ``<digits>.<digits>`` with at least one digit on each side
    of the dot. Signs, exponents and thousands separators are not accepted.

    Args:
        s: the string to test.

    Returns:
        bool: True for inputs such as ``'12'`` or ``'3.14'``, False otherwise
        (including the empty string, ``'1.'`` and ``'.5'``).
    """
    # Raw string literal: in a normal literal "\d" and "\." are invalid escape
    # sequences, which newer Python versions warn about and will eventually
    # reject. The pattern itself is unchanged.
    if re.match(r"^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    """Tell whether `word` is the literal prefix 'rm' followed by a number.

    The part after the first two characters is checked with
    `is_number_regex`, so 'rm10' and 'rm2.50' qualify while 'rm' alone or
    'rmabc' do not. The comparison is case-sensitive.
    """
    prefix, amount = word[:2], word[2:]
    return prefix == 'rm' and is_number_regex(amount)

def preprocessing(string):
    """Turn raw text into the list of normalized tokens fed to the model.

    Steps, applied in this order:
      1. transliterate with `unidecode` and cap every run of a repeated
         character at two occurrences;
      2. split with `_tokenizer`;
      3. stem each token with `malaya.stem.naive`;
      4. drop tokens of length <= 1, lowercase the rest;
      5. drop tokens containing any substring listed in `rejected`;
      6. map through `rules_normalizer` (unknown tokens pass through);
      7. replace numbers with '<NUM>' and 'rm<number>' with '<MONEY>'.

    Returns:
        list[str]: the surviving tokens, in their original order.
    """
    squeezed = ''.join(
        ''.join(run)[:2] for _, run in itertools.groupby(unidecode(string))
    )

    cleaned = []
    for raw_token in _tokenizer(squeezed):
        token = malaya.stem.naive(raw_token)
        # The length filter is applied to the stemmed, not yet lowercased token.
        if len(token) <= 1:
            continue
        token = token.lower()
        if any(fragment in token for fragment in rejected):
            continue
        token = rules_normalizer.get(token, token)
        # A token already turned into '<NUM>' can never look like money, so
        # the two placeholder checks are mutually exclusive.
        if is_number_regex(token):
            token = '<NUM>'
        elif detect_money(token):
            token = '<MONEY>'
        cleaned.append(token)
    return cleaned

# Load the vocabulary file; it is expected to hold a 'dictionary' entry and a
# 'reverse_dictionary' entry, which are handed to `Tokenizer` below.
with open('dictionary.json') as fopen:
    d = json.load(fopen)
# Used as token -> id lookup by `Tokenizer.convert_tokens_to_ids`.
dictionary = d['dictionary']
# Used as id -> token lookup by `Tokenizer.convert_ids_to_tokens`.
# NOTE(review): if this is a JSON object its keys are strings, not ints.
rev_dictionary = d['reverse_dictionary']

class Tokenizer:
    """Minimal tokenizer exposing the three methods the BERT pipeline calls.

    Attributes:
        vocab: mapping from token string to integer id.
        inv_vocab: lookup from id back to token string. When it was loaded
            from JSON its keys are strings (e.g. ``'5'``) rather than ints.
    """

    # Id returned for tokens that are missing from `vocab`.
    UNK_ID = 1

    def __init__(self, vocab, rev_dictionary):
        self.vocab = vocab
        self.inv_vocab = rev_dictionary

    def tokenize(self, string):
        """Split `string` into tokens by delegating to `preprocessing`."""
        return preprocessing(string)

    def convert_tokens_to_ids(self, tokens):
        """Map each token to its id, using `UNK_ID` for unknown tokens."""
        return [self.vocab.get(t, self.UNK_ID) for t in tokens]

    def convert_ids_to_tokens(self, ids):
        """Map each id back to its token.

        Raises:
            KeyError: if an id is present neither as given nor as its
                string form.
        """
        return [self._id_to_token(i) for i in ids]

    def _id_to_token(self, i):
        """Look up one id, falling back to its string form for JSON-loaded maps."""
        try:
            return self.inv_vocab[i]
        except KeyError:
            # JSON object keys always decode to str, so an int id such as 5
            # is stored under '5' when the mapping came from json.load.
            return self.inv_vocab[str(i)]
    
# Shared tokenizer instance used by the feature-building loop below.
tokenizer = Tokenizer(dictionary, rev_dictionary)

In [7]:
# Load the emotion dataset; the file is expected to hold parallel entries
# 'x' (inputs) and 'y' (targets).
with open('emotion.json') as fopen:
    x = json.load(fopen)
# Raw input strings, tokenized later by `tokenizer.tokenize`.
texts = x['x']
# Targets fed to `model.Y`; the preview below shows integer class ids.
labels = x['y']

In [8]:
# Peek at the first ten labels to check how they are encoded.
labels[:10]

[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]

In [9]:
# Fixed length of every encoded example, including the [CLS] and [SEP] markers;
# longer inputs are truncated and shorter ones are zero-padded.
MAX_SEQ_LENGTH = 100

In [10]:
# Sanity-check the preprocessing pipeline on a single example.
tokenizer.tokenize(texts[1])

['@izzatianiss', 'ehh', 'si', 'busuk', 'hati', 'level', 'mahadi', 'ini']

In [11]:
# Encode every text into three parallel, fixed-length integer lists:
#   input_ids   - vocabulary ids of [CLS] + tokens + [SEP], zero-padded
#   input_masks - 1 for real positions, 0 for padding
#   segment_ids - all zeros (single-sentence input)
input_ids, input_masks, segment_ids = [], [], []

for text in tqdm(texts):
    # Keep at most MAX_SEQ_LENGTH - 2 tokens so the two markers still fit.
    tokens_a = tokenizer.tokenize(text)[:MAX_SEQ_LENGTH - 2]
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    input_id = tokenizer.convert_tokens_to_ids(tokens)

    padding = [0] * (MAX_SEQ_LENGTH - len(input_id))

    input_ids.append(input_id + padding)
    input_masks.append([1] * len(input_id) + padding)
    segment_ids.append([0] * len(tokens) + padding)

100%|██████████| 420516/420516 [02:06<00:00, 3327.22it/s]


In [12]:
# Encoder hyperparameters read from the JSON config; consumed by `Model`.
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [13]:
# Schedule hyperparameters handed to `optimization.create_optimizer` in `Model`.
# `epoch` only sizes the learning-rate schedule; the training loop itself is
# bounded by early stopping, not by this value.
epoch = 10
batch_size = 60
# Fraction of the schedule passed on as warm-up steps.
warmup_proportion = 0.1
# NOTE(review): this counts steps over ALL of `texts`, but training later runs
# on the 80% split produced by train_test_split(test_size = 0.2), so the
# schedule is longer than `epoch` passes over the training set - confirm intent.
num_train_steps = int(len(texts) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [14]:
class Model:
    """BERT encoder followed by one dense layer, built in the default TF graph.

    Reads the module-level `bert_config`, `num_train_steps` and
    `num_warmup_steps` while constructing the graph.

    Attributes:
        X, segment_ids, input_masks: int32 placeholders of rank 2.
        Y: int32 placeholder of rank 1 holding class ids.
        logits: dense-layer output with `dimension_output` units.
        cost: mean sparse softmax cross-entropy between `logits` and `Y`.
        optimizer: training op returned by `optimization.create_optimizer`.
        accuracy: mean of (argmax(logits) == Y) over the fed batch.
    """

    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        # Feed points.
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])

        # NOTE(review): is_training=False is passed although this graph is
        # also used for the training step; presumably this turns dropout off
        # inside BertModel so one graph serves both phases - confirm.
        encoder = modeling.BertModel(
            config = bert_config,
            is_training = False,
            input_ids = self.X,
            input_mask = self.input_masks,
            token_type_ids = self.segment_ids,
            use_one_hot_embeddings = False,
        )

        # Classification head on top of the pooled encoder output.
        pooled = encoder.get_pooled_output()
        self.logits = tf.layers.dense(pooled, dimension_output)

        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits = self.logits, labels = self.Y
        )
        self.cost = tf.reduce_mean(per_example_loss)

        # The trailing False is the fifth positional argument of
        # create_optimizer (presumably its TPU switch - confirm).
        self.optimizer = optimization.create_optimizer(
            self.cost, learning_rate, num_train_steps, num_warmup_steps, False
        )

        predicted = tf.argmax(self.logits, 1, output_type = tf.int32)
        hits = tf.equal(predicted, self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))

In [16]:
# Number of units in the classification layer (one per class).
dimension_output = 6
learning_rate = 2e-5

# Start from an empty graph so re-running this cell does not duplicate variables.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

# Initialize every variable first, then overwrite the ones under the 'bert'
# scope with the checkpoint values; variables outside that scope keep their
# fresh initialization.
sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
# NOTE(review): this Saver only covers trainable variables under 'bert', so
# anything saved through it omits variables created outside that scope
# (presumably including the dense layer built in Model) - confirm.
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)

INFO:tensorflow:Restoring parameters from bert-sentiment/model.ckpt


In [17]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `sklearn.model_selection` is its replacement. The fallback keeps the
# cell working on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the examples for evaluation; the four parallel lists are
# split with the same shuffling so rows stay aligned.
# NOTE(review): no random_state is given, so the split differs between runs.
train_input_ids, test_input_ids, train_input_masks, test_input_masks, train_segment_ids, test_segment_ids, train_Y, test_Y = train_test_split(
    input_ids, input_masks, segment_ids, labels, test_size = 0.2
)

In [18]:
from tqdm import tqdm
import time

# Early-stopping state:
#   EARLY_STOPPING     - consecutive non-improving epochs tolerated before stopping
#   CURRENT_CHECKPOINT - consecutive non-improving epochs seen so far
#   CURRENT_ACC        - best evaluation accuracy seen so far
#   EPOCH              - number of completed epochs
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0


def _run_minibatches(ids, masks, segments, targets, desc, train_op = None):
    """Run one pass over a split and return its (mean loss, mean accuracy).

    Args:
        ids, masks, segments, targets: parallel sequences for the split.
        desc: label shown on the progress bar.
        train_op: op to run alongside the metrics; None means evaluate only.

    Returns:
        Sample-weighted mean loss and accuracy, so a shorter final batch
        counts in proportion to its size. Both are 0.0 for an empty split.
    """
    fetches = [model.accuracy, model.cost]
    if train_op is not None:
        fetches.append(train_op)

    loss_sum, acc_sum = 0.0, 0.0
    pbar = tqdm(range(0, len(ids), batch_size), desc = desc)
    for i in pbar:
        index = min(i + batch_size, len(ids))
        results = sess.run(
            fetches,
            feed_dict = {
                model.Y: targets[i: index],
                model.X: ids[i: index],
                model.segment_ids: segments[i: index],
                model.input_masks: masks[i: index]
            },
        )
        acc, cost = results[0], results[1]
        if train_op is not None:
            # Abort immediately if training diverged.
            assert not np.isnan(cost)
        n_rows = index - i
        loss_sum += cost * n_rows
        acc_sum += acc * n_rows
        pbar.set_postfix(cost = cost, accuracy = acc)

    n_total = max(len(ids), 1)
    return loss_sum / n_total, acc_sum / n_total


# NOTE(review): the weights left in `sess` when this loop stops are those of the
# LAST epoch, not of the best-scoring one; nothing is checkpointed on improvement.
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_loss, train_acc = _run_minibatches(
        train_input_ids, train_input_masks, train_segment_ids, train_Y,
        desc = 'train minibatch loop', train_op = model.optimizer
    )
    test_loss, test_acc = _run_minibatches(
        test_input_ids, test_input_masks, test_segment_ids, test_Y,
        desc = 'test minibatch loop'
    )

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 5607/5607 [17:45<00:00,  5.39it/s, accuracy=0.904, cost=0.222]
test minibatch loop: 100%|██████████| 1402/1402 [01:14<00:00, 18.71it/s, accuracy=0.841, cost=0.445]
train minibatch loop:   0%|          | 1/5607 [00:00<17:50,  5.24it/s, accuracy=0.867, cost=0.272]

epoch: 0, pass acc: 0.000000, current acc: 0.854864
time taken: 1140.3475568294525
epoch: 0, training loss: 0.502027, training acc: 0.787722, valid loss: 0.305998, valid acc: 0.854864



train minibatch loop: 100%|██████████| 5607/5607 [17:44<00:00,  5.40it/s, accuracy=0.923, cost=0.213] 
test minibatch loop: 100%|██████████| 1402/1402 [01:14<00:00, 18.89it/s, accuracy=0.841, cost=0.403] 
train minibatch loop:   0%|          | 1/5607 [00:00<17:41,  5.28it/s, accuracy=0.883, cost=0.211]

epoch: 1, pass acc: 0.854864, current acc: 0.861332
time taken: 1138.5014758110046
epoch: 1, training loss: 0.279244, training acc: 0.859569, valid loss: 0.276922, valid acc: 0.861332



train minibatch loop: 100%|██████████| 5607/5607 [17:45<00:00,  5.39it/s, accuracy=0.923, cost=0.19]  
test minibatch loop: 100%|██████████| 1402/1402 [01:14<00:00, 18.88it/s, accuracy=0.841, cost=0.387] 
train minibatch loop:   0%|          | 1/5607 [00:00<17:55,  5.21it/s, accuracy=0.917, cost=0.186]

time taken: 1140.176148891449
epoch: 2, training loss: 0.246413, training acc: 0.871272, valid loss: 0.281651, valid acc: 0.860286



train minibatch loop: 100%|██████████| 5607/5607 [17:45<00:00,  5.42it/s, accuracy=0.904, cost=0.177] 
test minibatch loop: 100%|██████████| 1402/1402 [01:14<00:00, 18.86it/s, accuracy=0.795, cost=0.4]  
train minibatch loop:   0%|          | 1/5607 [00:00<17:45,  5.26it/s, accuracy=0.95, cost=0.133]

time taken: 1139.8034629821777
epoch: 3, training loss: 0.222100, training acc: 0.886818, valid loss: 0.301438, valid acc: 0.855878



train minibatch loop: 100%|██████████| 5607/5607 [17:45<00:00,  5.40it/s, accuracy=0.865, cost=0.163] 
test minibatch loop: 100%|██████████| 1402/1402 [01:14<00:00, 18.87it/s, accuracy=0.818, cost=0.394] 

time taken: 1140.224315404892
epoch: 4, training loss: 0.195414, training acc: 0.905241, valid loss: 0.354453, valid acc: 0.850341

break epoch:5






In [19]:
# Save with a Saver over ALL trainable variables. The module-level `saver` was
# built from the 'bert' scope only, so using it here would leave out variables
# created outside that scope (the dense classification layer), and the
# checkpoint could not reproduce the classifier. The result is a superset of
# what was written before, so code restoring only 'bert' variables still works.
tf.train.Saver(var_list = tf.trainable_variables()).save(sess, 'bert-emotion/model.ckpt')

'bert-emotion/model.ckpt'