In [1]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
from unidecode import unidecode
import re
import sentencepiece as spm

In [2]:
# !git clone https://github.com/huseinzol05/Malaya-Dataset.git

In [3]:
# Change to your local Malaya-Dataset
import glob

files = glob.glob('Malaya-Dataset/emotion/translated*')
files

['../../Malaya-Dataset/emotion/translated-fear',
 '../../Malaya-Dataset/emotion/translated-love',
 '../../Malaya-Dataset/emotion/translated-anger',
 '../../Malaya-Dataset/emotion/translated-sadness',
 '../../Malaya-Dataset/emotion/translated-joy',
 '../../Malaya-Dataset/emotion/translated-surprise']

In [4]:
texts, labels = [], []
for file in files:
    with open(file) as fopen:
        dataset = fopen.readlines()
    print(len(dataset))
    texts.extend(dataset)
    labels.extend([file.split('/')[-1].split('-')[1]] * len(dataset))

19058
15232
18873
16053
19587
9712


In [5]:
files = glob.glob('Malaya-Dataset/emotion/*malaysia.json')
files

['../../Malaya-Dataset/emotion/sadness-twitter-malaysia.json',
 '../../Malaya-Dataset/emotion/joy-twitter-malaysia.json',
 '../../Malaya-Dataset/emotion/fear-twitter-malaysia.json',
 '../../Malaya-Dataset/emotion/surprise-twitter-malaysia.json',
 '../../Malaya-Dataset/emotion/anger-twitter-malaysia.json',
 '../../Malaya-Dataset/emotion/love-twitter-malaysia.json']

In [6]:
import json

for file in files:
    with open(file) as fopen:
        dataset = json.load(fopen)
    print(len(dataset))
    texts.extend(dataset)
    labels.extend([file.split('/')[-1].split('-')[0]] * len(dataset))
    
len(texts), len(labels)

83264
63234
18895
37778
55723
63107


(420516, 420516)

In [7]:
np.unique(labels)

array(['anger', 'fear', 'joy', 'love', 'sadness', 'surprise'], dtype='<U8')

In [8]:
from sklearn.preprocessing import LabelEncoder

unique_labels = np.unique(labels).tolist()
labels = LabelEncoder().fit_transform(labels)
unique_labels

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [9]:
labels = labels.tolist()

In [10]:
from prepro_utils import preprocess_text, encode_ids, encode_pieces

sp_model = spm.SentencePieceProcessor()
sp_model.Load('sp10m.cased.v4.model')

with open('sp10m.cased.v4.vocab') as fopen:
    v = fopen.read().split('\n')[:-1]
v = [i.split('\t') for i in v]
v = {i[0]: i[1] for i in v}

class Tokenizer:
    def __init__(self, v):
        self.vocab = v
        pass
    
    def tokenize(self, string):
        return encode_pieces(sp_model, string, return_unicode=False, sample=False)
    
    def convert_tokens_to_ids(self, tokens):
        return [sp_model.PieceToId(piece) for piece in tokens]
    
    def convert_ids_to_tokens(self, ids):
        return [sp_model.IdToPiece(i) for i in ids]
    
tokenizer = Tokenizer(v)

In [11]:
BERT_INIT_CHKPNT = 'pretraining_output2/model.ckpt-1000000'
BERT_CONFIG = 'checkpoint/bert_config.json'

In [12]:
MAX_SEQ_LENGTH = 100
tokenizer.tokenize(texts[1])

['▁malam',
 '▁yang',
 '▁gelap',
 '▁apabila',
 '▁saya',
 '▁terpaksa',
 '▁pergi',
 '▁ke',
 '▁bilik',
 '▁mandi',
 '▁luaran']

In [13]:
list(v.keys())[:10]

['<unk>',
 '<s>',
 '</s>',
 '<cls>',
 '<sep>',
 '<pad>',
 '<mask>',
 '<eod>',
 '<eop>',
 '.']

In [14]:
from tqdm import tqdm

input_ids, input_masks, segment_ids = [], [], []

for text in tqdm(texts):
    tokens_a = tokenizer.tokenize(text)
    if len(tokens_a) > MAX_SEQ_LENGTH - 2:
        tokens_a = tokens_a[:(MAX_SEQ_LENGTH - 2)]
    tokens = ["<cls>"] + tokens_a + ["<sep>"]
    segment_id = [0] * len(tokens)
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    padding = [0] * (MAX_SEQ_LENGTH - len(input_id))
    input_id += padding
    input_mask += padding
    segment_id += padding
    
    input_ids.append(input_id)
    input_masks.append(input_mask)
    segment_ids.append(segment_id)

100%|██████████| 420516/420516 [00:49<00:00, 8553.18it/s]


In [15]:
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [18]:
epoch = 10
batch_size = 60
warmup_proportion = 0.1
num_train_steps = int(len(texts) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [19]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=True,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)
        
        output_layer = model.get_pooled_output()
        self.logits = tf.layers.dense(output_layer, dimension_output)
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [20]:
dimension_output = np.unique(labels).shape[0]
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)

Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from pretraining_output2/model.ckpt-1000000


In [21]:
from sklearn.cross_validation import train_test_split

train_input_ids, test_input_ids, train_input_masks, test_input_masks, train_segment_ids, test_segment_ids, train_Y, test_Y = train_test_split(
    input_ids, input_masks, segment_ids, labels, test_size = 0.2
)



In [22]:
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_input_ids))
        batch_x = train_input_ids[i: index]
        batch_masks = train_input_masks[i: index]
        batch_segment = train_segment_ids[i: index]
        batch_y = train_Y[i: index]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
    pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_input_ids))
        batch_x = test_input_ids[i: index]
        batch_masks = test_input_masks[i: index]
        batch_segment = test_segment_ids[i: index]
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_input_ids) / batch_size
    train_acc /= len(train_input_ids) / batch_size
    test_loss /= len(test_input_ids) / batch_size
    test_acc /= len(test_input_ids) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 5607/5607 [37:29<00:00,  2.57it/s, accuracy=0.885, cost=0.264] 
test minibatch loop: 100%|██████████| 1402/1402 [03:13<00:00,  7.79it/s, accuracy=0.818, cost=0.266]
train minibatch loop:   0%|          | 0/5607 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.862540
time taken: 2442.5887212753296
epoch: 0, training loss: 0.447970, training acc: 0.807460, valid loss: 0.273390, valid acc: 0.862540



train minibatch loop: 100%|██████████| 5607/5607 [37:27<00:00,  2.57it/s, accuracy=0.923, cost=0.191] 
test minibatch loop: 100%|██████████| 1402/1402 [03:12<00:00,  7.80it/s, accuracy=0.818, cost=0.277] 
train minibatch loop:   0%|          | 0/5607 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.862540, current acc: 0.867070
time taken: 2440.1557557582855
epoch: 1, training loss: 0.252797, training acc: 0.868900, valid loss: 0.249145, valid acc: 0.867070



train minibatch loop: 100%|██████████| 5607/5607 [37:27<00:00,  2.57it/s, accuracy=0.942, cost=0.147] 
test minibatch loop: 100%|██████████| 1402/1402 [03:12<00:00,  7.79it/s, accuracy=0.795, cost=0.219] 
train minibatch loop:   0%|          | 0/5607 [00:00<?, ?it/s]

epoch: 2, pass acc: 0.867070, current acc: 0.867732
time taken: 2440.4718947410583
epoch: 2, training loss: 0.224579, training acc: 0.877833, valid loss: 0.253093, valid acc: 0.867732



train minibatch loop: 100%|██████████| 5607/5607 [37:27<00:00,  2.57it/s, accuracy=0.962, cost=0.122] 
test minibatch loop: 100%|██████████| 1402/1402 [03:12<00:00,  7.81it/s, accuracy=0.841, cost=0.235] 
train minibatch loop:   0%|          | 0/5607 [00:00<?, ?it/s]

epoch: 3, pass acc: 0.867732, current acc: 0.868335
time taken: 2440.2763261795044
epoch: 3, training loss: 0.207366, training acc: 0.883288, valid loss: 0.265962, valid acc: 0.868335



train minibatch loop: 100%|██████████| 5607/5607 [37:27<00:00,  2.57it/s, accuracy=0.923, cost=0.0955]
test minibatch loop: 100%|██████████| 1402/1402 [03:12<00:00,  7.82it/s, accuracy=0.864, cost=0.235] 
train minibatch loop:   0%|          | 0/5607 [00:00<?, ?it/s]

time taken: 2440.601249933243
epoch: 4, training loss: 0.193580, training acc: 0.889743, valid loss: 0.289179, valid acc: 0.866425



train minibatch loop: 100%|██████████| 5607/5607 [37:27<00:00,  2.57it/s, accuracy=0.923, cost=0.099] 
test minibatch loop: 100%|██████████| 1402/1402 [03:13<00:00,  7.75it/s, accuracy=0.841, cost=0.272] 
train minibatch loop:   0%|          | 0/5607 [00:00<?, ?it/s]

time taken: 2440.6501138210297
epoch: 5, training loss: 0.180380, training acc: 0.899238, valid loss: 0.312310, valid acc: 0.866873



train minibatch loop: 100%|██████████| 5607/5607 [37:27<00:00,  2.56it/s, accuracy=0.962, cost=0.0849]
test minibatch loop: 100%|██████████| 1402/1402 [03:12<00:00,  7.82it/s, accuracy=0.886, cost=0.199] 

time taken: 2440.6755998134613
epoch: 6, training loss: 0.167566, training acc: 0.909102, valid loss: 0.331886, valid acc: 0.864182

break epoch:7






In [23]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_input_ids), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i + batch_size, len(test_input_ids))
    batch_x = test_input_ids[i: index]
    batch_masks = test_input_masks[i: index]
    batch_segment = test_segment_ids[i: index]
    batch_y = test_Y[i: index]
    predict_Y += np.argmax(sess.run(model.logits,
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
    ), 1, ).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 1402/1402 [03:12<00:00,  7.86it/s]


In [24]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = unique_labels,digits=5
    )
)

             precision    recall  f1-score   support

      anger    0.91232   0.93906   0.92550     15080
       fear    0.85787   0.86008   0.85898      7397
        joy    0.91618   0.92719   0.92166     16564
       love    0.91356   0.94468   0.92886     15708
    sadness    0.80322   0.81937   0.81122     19792
   surprise    0.72008   0.60013   0.65465      9563

avg / total    0.86099   0.86412   0.86183     84104

