In [1]:
# !wget https://f000.backblazeb2.com/file/malaya-model/bert-bahasa/bert-base-2020-10-08.tar.gz
# !wget https://raw.githubusercontent.com/huseinzol05/malaya/master/pretrained-model/bert/tokenizer/sp10m.cased.bert.vocab
# !wget https://raw.githubusercontent.com/huseinzol05/malaya/master/pretrained-model/bert/tokenizer/sp10m.cased.bert.model
# !tar -xf bert-base-2020-10-08.tar.gz
# !rm bert-base-2020-10-08.tar.gz

In [2]:
import json
import re
import sentencepiece as spm

In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [4]:
from prepro_utils import preprocess_text, encode_ids, encode_pieces

sp_model = spm.SentencePieceProcessor()
sp_model.Load('sp10m.cased.bert.model')

with open('sp10m.cased.bert.vocab') as fopen:
    v = fopen.read().split('\n')[:-1]
v = [i.split('\t') for i in v]
v = {i[0]: i[1] for i in v}

class Tokenizer:
    def __init__(self, v):
        self.vocab = v
        pass
    
    def tokenize(self, string):
        return encode_pieces(sp_model, string, return_unicode=False, sample=False)
    
    def convert_tokens_to_ids(self, tokens):
        return [sp_model.PieceToId(piece) for piece in tokens]
    
    def convert_ids_to_tokens(self, ids):
        return [sp_model.IdToPiece(i) for i in ids]
    
tokenizer = Tokenizer(v)

In [5]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
from unidecode import unidecode
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re

In [6]:
BERT_INIT_CHKPNT = 'bert-base/model.ckpt-1000000'
BERT_CONFIG = 'bert_base_config.json'

In [7]:
from malaya.text.rules import normalized_chars
import random

laughing = {
    'huhu',
    'haha',
    'gagaga',
    'hihi',
    'wkawka',
    'wkwk',
    'kiki',
    'keke',
    'huehue',
    'hshs',
    'hoho',
    'hewhew',
    'uwu',
    'sksk',
    'ksks',
    'gituu',
    'gitu',
    'mmeeooww',
    'meow',
    'alhamdulillah',
    'muah',
    'mmuahh',
    'hehe',
    'salamramadhan',
    'happywomensday',
    'jahagaha',
    'ahakss',
    'ahksk'
}

def make_cleaning(s, c_dict):
    s = s.translate(c_dict)
    return s

def cleaning(string):
    """
    use by any transformer model before tokenization
    """
    string = unidecode(string)
    
    string = ' '.join(
        [make_cleaning(w, normalized_chars) for w in string.split()]
    )
    string = re.sub('\(dot\)', '.', string)
    string = (
        re.sub(re.findall(r'\<a(.*?)\>', string)[0], '', string)
        if (len(re.findall(r'\<a (.*?)\>', string)) > 0)
        and ('href' in re.findall(r'\<a (.*?)\>', string)[0])
        else string
    )
    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    
    chars = '.,/'
    for c in chars:
        string = string.replace(c, f' {c} ')
        
    string = re.sub(r'[ ]+', ' ', string).strip().split()
    string = [w for w in string if w[0] != '@']
    x = []
    for word in string:
        if any([laugh in word for laugh in laughing]):
            if random.random() >= 0.5:
                x.append(word)
        else:
            x.append(word)
    string = [w.title() if w[0].isupper() else w for w in x]
    return ' '.join(string)

In [8]:
cleaning('Selalu all out lakonan @qalesyakiss ... Kesian Lijah jadi gila. #ulasanedtv #CukupDeritaItu #selanjutnyadiblogedtv https://t.co/VrY4WHZPSM')

'Selalu all out lakonan . . . Kesian Lijah jadi gila . #ulasanedtv #CukupDeritaItu #selanjutnyadiblogedtv'

In [9]:
train = pd.read_csv('train.csv')
labels = sorted(train['sentiment'].unique().tolist())
train = train.values.tolist()
test = pd.read_csv('test.csv')
test = test.values.tolist()

In [10]:
labels

['Negative', 'Neutral', 'Positive']

In [11]:
from tqdm import tqdm

for i in tqdm(range(len(train))):
    train[i][0] = cleaning(train[i][0])
    train[i][1] = labels.index(train[i][1])
    
for i in tqdm(range(len(test))):
    test[i][0] = cleaning(test[i][0])
    test[i][1] = labels.index(test[i][1])

100%|██████████| 2710/2710 [00:00<00:00, 14445.99it/s]
100%|██████████| 302/302 [00:00<00:00, 15262.49it/s]


In [12]:
from tqdm import tqdm

train_ids, train_masks = [], []
train_Y = []

for text in tqdm(train):
    train_Y.append(text[1])
    text = text[0]
    tokens_a = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    
    train_ids.append(input_id)
    train_masks.append(input_mask)

100%|██████████| 2710/2710 [00:00<00:00, 10328.89it/s]


In [13]:
test_ids, test_masks = [], []
test_Y = []

for text in tqdm(test):
    test_Y.append(text[1])
    text = text[0]
    tokens_a = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    
    test_ids.append(input_id)
    test_masks.append(input_mask)

100%|██████████| 302/302 [00:00<00:00, 11045.25it/s]


In [14]:
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [15]:
epoch = 10
batch_size = 8
warmup_proportion = 0.1
num_train_steps = int(len(train) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [16]:
def create_initializer(initializer_range=0.02):
    return tf.truncated_normal_initializer(stddev=initializer_range)

class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
        training = True,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.MASK = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=training,
            input_ids=self.X,
            input_mask=self.MASK,
            use_one_hot_embeddings=False)
        
        output_layer = model.get_sequence_output()
        output_layer = tf.layers.dense(
            output_layer,
            bert_config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=create_initializer())
        self.logits_seq = tf.layers.dense(output_layer, dimension_output,
                                         kernel_initializer=create_initializer())
        self.logits_seq = tf.identity(self.logits_seq, name = 'logits_seq')
        self.logits = self.logits_seq[:, 0]
        self.logits = tf.identity(self.logits, name = 'logits')
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [17]:
dimension_output = len(labels)
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
INFO:tensorflow:Restoring parameters from bert-base/model.ckpt-1000000


In [18]:
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

In [19]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'bert-base-sentiment/model.ckpt')

'bert-base-sentiment/model.ckpt'

In [20]:
from tqdm import tqdm
import time

last_accuracy = 0
patience = 2
EPOCH = 0

while True:
    train_acc, train_loss, test_acc, test_loss = [], [], [], []
    pbar = tqdm(
        range(0, len(train_ids), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_ids))
        batch_x = train_ids[i: index]
        batch_x = pad_sequences(batch_x, padding='post')
        batch_mask = train_masks[i: index]
        batch_mask = pad_sequences(batch_mask, padding='post')
        batch_y = train_Y[i: index]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.MASK: batch_mask
            },
        )
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_ids), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_ids))
        batch_x = test_ids[i: index]
        batch_x = pad_sequences(batch_x, padding='post')
        batch_mask = test_masks[i: index]
        batch_mask = pad_sequences(batch_mask, padding='post')
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.MASK: batch_mask
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)
    
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    
    if test_acc > last_accuracy:
        last_accuracy = test_acc
        saver.save(sess, 'bert-base-sentiment/model.ckpt')
    elif patience > 0:
        print(f'patience: {patience}')
        patience -= 1
    else:
        print('break')
        break
        
        
    EPOCH += 1

train minibatch loop: 100%|██████████| 339/339 [00:37<00:00,  9.02it/s, accuracy=0.333, cost=0.951]
test minibatch loop: 100%|██████████| 38/38 [00:01<00:00, 29.25it/s, accuracy=0.5, cost=1.19]   


epoch: 0, training loss: 1.029999, training acc: 0.486234, valid loss: 0.955126, valid acc: 0.552632



train minibatch loop: 100%|██████████| 339/339 [00:31<00:00, 10.76it/s, accuracy=0.5, cost=0.622]  
test minibatch loop: 100%|██████████| 38/38 [00:00<00:00, 49.13it/s, accuracy=0.667, cost=1.28] 


epoch: 1, training loss: 0.827178, training acc: 0.632006, valid loss: 0.995221, valid acc: 0.573465



train minibatch loop: 100%|██████████| 339/339 [00:31<00:00, 10.66it/s, accuracy=0.833, cost=0.359]
test minibatch loop: 100%|██████████| 38/38 [00:00<00:00, 51.11it/s, accuracy=0.5, cost=1.24]   


epoch: 2, training loss: 0.520822, training acc: 0.798181, valid loss: 1.284773, valid acc: 0.618421



train minibatch loop: 100%|██████████| 339/339 [00:31<00:00, 10.71it/s, accuracy=1, cost=0.0994]    
test minibatch loop: 100%|██████████| 38/38 [00:00<00:00, 48.57it/s, accuracy=0.667, cost=1.53] 
train minibatch loop:   1%|          | 2/339 [00:00<00:31, 10.69it/s, accuracy=1, cost=0.00318]

epoch: 3, training loss: 0.327994, training acc: 0.893068, valid loss: 1.939736, valid acc: 0.593202

patience: 2


train minibatch loop: 100%|██████████| 339/339 [00:31<00:00, 10.66it/s, accuracy=1, cost=0.0428]    
test minibatch loop: 100%|██████████| 38/38 [00:00<00:00, 48.80it/s, accuracy=0.333, cost=2.53] 
train minibatch loop:   1%|          | 2/339 [00:00<00:32, 10.41it/s, accuracy=1, cost=0.00378]

epoch: 4, training loss: 0.252879, training acc: 0.930678, valid loss: 2.373100, valid acc: 0.581140

patience: 1


train minibatch loop: 100%|██████████| 339/339 [00:31<00:00, 10.71it/s, accuracy=1, cost=0.00158]   
test minibatch loop: 100%|██████████| 38/38 [00:00<00:00, 49.50it/s, accuracy=0.333, cost=3.31] 

epoch: 5, training loss: 0.153792, training acc: 0.964602, valid loss: 2.616556, valid acc: 0.581140

break





In [21]:
saver.save(sess, 'bert-base-sentiment/model.ckpt')

'bert-base-sentiment/model.ckpt'

In [23]:
dimension_output = len(labels)
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate,
    training = False,
)

sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.trainable_variables())
saver.restore(sess, 'bert-base-sentiment/model.ckpt')

INFO:tensorflow:Restoring parameters from bert-base-sentiment/model.ckpt


In [24]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name
        or 'self/Softmax' in n.name)
        and 'adam' not in n.name
        and 'beta' not in n.name
        and 'global_step' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'bert/embeddings/word_embeddings',
 'bert/embeddings/token_type_embeddings',
 'bert/embeddings/position_embeddings',
 'bert/embeddings/LayerNorm/gamma',
 'bert/encoder/layer_0/attention/self/query/kernel',
 'bert/encoder/layer_0/attention/self/query/bias',
 'bert/encoder/layer_0/attention/self/key/kernel',
 'bert/encoder/layer_0/attention/self/key/bias',
 'bert/encoder/layer_0/attention/self/value/kernel',
 'bert/encoder/layer_0/attention/self/value/bias',
 'bert/encoder/layer_0/attention/self/Softmax',
 'bert/encoder/layer_0/attention/output/dense/kernel',
 'bert/encoder/layer_0/attention/output/dense/bias',
 'bert/encoder/layer_0/attention/output/LayerNorm/gamma',
 'bert/encoder/layer_0/intermediate/dense/kernel',
 'bert/encoder/layer_0/intermediate/dense/bias',
 'bert/encoder/layer_0/output/dense/kernel',
 'bert/encoder/layer_0/output/dense/bias',
 'bert/encoder/layer_0/output/LayerNorm/gamma',
 'bert/encoder/layer_1/attention/sel

In [25]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [26]:
freeze_graph('bert-base-sentiment', strings)

INFO:tensorflow:Restoring parameters from bert-base-sentiment/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 203 variables.
INFO:tensorflow:Converted 203 variables to const ops.
9917 ops in the final graph.


In [27]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [28]:
g = load_graph('bert-base-sentiment/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
mask = g.get_tensor_by_name('import/Placeholder_1:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
result = test_sess.run(tf.nn.softmax(logits), feed_dict = {x: [input_id], mask: [input_mask]})
result

array([[0.8833457 , 0.08375123, 0.03290293]], dtype=float32)

In [29]:
labels

['Negative', 'Neutral', 'Positive']

In [30]:
tokenizer.convert_ids_to_tokens(input_id)

['[CLS]',
 '▁Assalamualaikum',
 '▁Yb',
 '▁',
 '.',
 '▁Are',
 '▁you',
 '▁coming',
 '▁down',
 '▁to',
 '▁Parit',
 '▁Yusof',
 '?',
 '▁Sebab',
 '▁my',
 '▁grand',
 'parent',
 's',
 '▁kat',
 '▁sana',
 '▁rasanya',
 '▁tak',
 '▁daftar',
 '▁vaksin',
 '▁lagi',
 '▁They',
 '▁don',
 "'",
 't',
 '▁have',
 '▁smartphone',
 's',
 '▁',
 '.',
 '▁Please',
 '▁help',
 '[SEP]']

In [None]:
# !pip3 install minio

In [31]:
# from minio import Minio
# import os

# access_key = os.environ.get('malaysiai-minio-access-key')
# secret_key = os.environ.get('malaysiaai-minio-secret-key')

In [32]:
# client = Minio(
#     'minio.malaysiaai.ml',
#     access_key=access_key,
#     secret_key=secret_key,
# )
# found = client.bucket_exists('model')
# found

In [33]:
# client.fput_object('model', 'bert-base-sentiment/frozen_model.pb', 'bert-base-sentiment/frozen_model.pb')

In [34]:
# client.fput_object('model', 'sp10m.cased.bert.model', 'sp10m.cased.bert.model')

In [35]:
# client.fput_object('model', 'sp10m.cased.bert.vocab', 'sp10m.cased.bert.vocab')