In [1]:
import xlnet
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import model_utils

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])





  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [3]:
import sentencepiece as spm
from prepro_utils import preprocess_text, encode_ids

sp_model = spm.SentencePieceProcessor()
sp_model.Load('alxlnet-base/sp10m.cased.v5.model')

True

In [4]:
import glob

files = glob.glob('../Malaya-Dataset/emotion/translated*')
files

['../Malaya-Dataset/emotion/translated-love',
 '../Malaya-Dataset/emotion/translated-happy',
 '../Malaya-Dataset/emotion/translated-fear',
 '../Malaya-Dataset/emotion/translated-sadness',
 '../Malaya-Dataset/emotion/translated-surprise',
 '../Malaya-Dataset/emotion/translated-anger']

In [5]:
import re
from unidecode import unidecode

def cleaning(string):
    string = unidecode(string)
    string = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', string)
    string = re.sub(r'[ ]+', ' ', string).strip().split()
    string = [w for w in string if w[0] != '@']
    return ' '.join(string)

In [6]:
texts, labels = [], []

for file in files:
    label = file.split('translated-')[1]
    with open(file) as fopen:
        t = fopen.read().split('\n')[:-1]
    t = [cleaning(s) for s in t]
    t = list(filter(None, t))
    texts.extend(t)
    print(label, len(t))
    labels.extend([label] * len(t))

love 15231
happy 19586
fear 19057
sadness 16052
surprise 9711
anger 18872


In [7]:
files = glob.glob('../Malaya-Dataset/emotion/*malaysia.json')
files

['../Malaya-Dataset/emotion/surprise-twitter-malaysia.json',
 '../Malaya-Dataset/emotion/happy-twitter-malaysia.json',
 '../Malaya-Dataset/emotion/love-twitter-malaysia.json',
 '../Malaya-Dataset/emotion/fear-twitter-malaysia.json',
 '../Malaya-Dataset/emotion/anger-twitter-malaysia.json',
 '../Malaya-Dataset/emotion/sadness-twitter-malaysia.json']

In [8]:
import json

for file in files:
    label = file.split('/')[-1].split('-')[0]
    with open(file) as fopen:
        t = json.load(fopen)
    t = [cleaning(s) for s in t]
    t = list(filter(None, t))
    texts.extend(t)
    print(label, len(t))
    labels.extend([label] * len(t))
    
len(texts), len(labels)

surprise 37778
happy 48924
love 59242
fear 18895
anger 51745
sadness 79233


(394326, 394326)

In [9]:
np.unique(labels, return_counts = True)

(array(['anger', 'fear', 'happy', 'love', 'sadness', 'surprise'],
       dtype='<U8'), array([70617, 37952, 68510, 74473, 95285, 47489]))

In [10]:
from prepro_utils import preprocess_text, encode_ids

def tokenize_fn(text):
    text = preprocess_text(text, lower= False)
    return encode_ids(sp_model, text)

In [11]:
SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

special_symbols = {
    "<unk>"  : 0,
    "<s>"    : 1,
    "</s>"   : 2,
    "<cls>"  : 3,
    "<sep>"  : 4,
    "<pad>"  : 5,
    "<mask>" : 6,
    "<eod>"  : 7,
    "<eop>"  : 8,
}

VOCAB_SIZE = 32000
UNK_ID = special_symbols["<unk>"]
CLS_ID = special_symbols["<cls>"]
SEP_ID = special_symbols["<sep>"]
MASK_ID = special_symbols["<mask>"]
EOD_ID = special_symbols["<eod>"]

def XY(left_train):
    X, segments, masks = [], [], []
    for i in tqdm(range(len(left_train))):
        tokens_a = tokenize_fn(left_train[i])
        segment_id = [SEG_ID_A] * len(tokens_a)
        tokens_a.append(SEP_ID)
        tokens_a.append(CLS_ID)
        segment_id.append(SEG_ID_A)
        segment_id.append(SEG_ID_CLS)
        input_mask = [0] * len(tokens_a)
        X.append(tokens_a)
        segments.append(segment_id)
        masks.append(input_mask)
    return X, segments, masks

In [12]:
X, segments, masks = XY(texts)

100%|██████████| 394326/394326 [00:37<00:00, 10596.11it/s]


In [13]:
from sklearn.preprocessing import LabelEncoder

labels = LabelEncoder().fit_transform(labels)

In [14]:
kwargs = dict(
      is_training=True,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.0,
      dropatt=0.0,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clamp_len=-1)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(json_path='alxlnet-base/config.json')




In [15]:
epoch = 10
batch_size = 60
warmup_proportion = 0.1
num_train_steps = int(len(X) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)
print(num_train_steps, num_warmup_steps)
learning_rate = 2e-5

training_parameters = dict(
      decay_method = 'poly',
      train_steps = num_train_steps,
      learning_rate = learning_rate,
      warmup_steps = num_warmup_steps,
      min_lr_ratio = 0.0,
      weight_decay = 0.00,
      adam_epsilon = 1e-8,
      num_core_per_host = 1,
      lr_layer_decay_rate = 1,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.0,
      dropatt=0.0,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clip = 1.0,
      clamp_len=-1,)

65721 6572


In [16]:
class Parameter:
    def __init__(self, decay_method, warmup_steps, weight_decay, adam_epsilon, 
                num_core_per_host, lr_layer_decay_rate, use_tpu, learning_rate, train_steps,
                min_lr_ratio, clip, **kwargs):
        self.decay_method = decay_method
        self.warmup_steps = warmup_steps
        self.weight_decay = weight_decay
        self.adam_epsilon = adam_epsilon
        self.num_core_per_host = num_core_per_host
        self.lr_layer_decay_rate = lr_layer_decay_rate
        self.use_tpu = use_tpu
        self.learning_rate = learning_rate
        self.train_steps = train_steps
        self.min_lr_ratio = min_lr_ratio
        self.clip = clip
        
training_parameters = Parameter(**training_parameters)

In [17]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.float32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        xlnet_model = xlnet.XLNetModel(
            xlnet_config=xlnet_config,
            run_config=xlnet_parameters,
            input_ids=tf.transpose(self.X, [1, 0]),
            seg_ids=tf.transpose(self.segment_ids, [1, 0]),
            input_mask=tf.transpose(self.input_masks, [1, 0]))
        
        
        summary = xlnet_model.get_pooled_out("last", True)
        print(summary)
        
        self.logits = tf.layers.dense(summary, dimension_output)
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer, self.learning_rate, _ = model_utils.get_train_op(training_parameters, self.cost)
        
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [18]:
dimension_output = 6

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())




INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>

Instructions for updating:
Use keras.layers.dropout instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.
Tensor("model_1/sequnece_summary/summary/Tanh:0", shape=(?, 512), dtype=float32)
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



In [19]:
import collections
import re

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Compute the union of the current variables and checkpoint variables."""
    assignment_map = {}
    initialized_variable_names = {}

    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match('^(.*):\\d+$', name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    init_vars = tf.train.list_variables(init_checkpoint)

    assignment_map = collections.OrderedDict()
    for x in init_vars:
        (name, var) = (x[0], x[1])
        if name not in name_to_variable:
            continue
        assignment_map[name] = name_to_variable[name]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [20]:
tvars = tf.trainable_variables()
checkpoint = 'alxlnet-base/model.ckpt'
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, 
                                                                                checkpoint)

In [21]:
saver = tf.train.Saver(var_list = assignment_map)
saver.restore(sess, checkpoint)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from alxlnet-base/model.ckpt


In [22]:
from sklearn.model_selection import train_test_split
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

train_X, test_X, train_masks, test_masks, train_segments, test_segments, train_Y, test_Y = train_test_split(
    X, segments, masks, labels, test_size = 0.2
)

In [23]:
from tqdm import tqdm
import time


EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = [], [], [], []
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        batch_x = train_X[i : index]
        batch_y = train_Y[i : index]
        batch_masks = train_masks[i : index]
        batch_segments = train_segments[i : index]
        batch_x = pad_sequences(batch_x, padding='post')
        batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
        batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks,
            },
        )
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x = test_X[i : index]
        batch_y = test_Y[i : index]
        batch_masks = test_masks[i : index]
        batch_segments = test_segments[i : index]
        batch_x = pad_sequences(batch_x, padding='post')
        batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
        batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks,
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)
        
    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 5258/5258 [15:28<00:00,  5.66it/s, accuracy=0.675, cost=0.644] 
test minibatch loop: 100%|██████████| 1315/1315 [01:25<00:00, 15.45it/s, accuracy=0.846, cost=0.338]
train minibatch loop:   0%|          | 0/5258 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.853444
time taken: 1013.7399888038635
epoch: 0, training loss: 0.610760, training acc: 0.744017, valid loss: 0.310405, valid acc: 0.853444



train minibatch loop: 100%|██████████| 5258/5258 [15:31<00:00,  5.65it/s, accuracy=0.775, cost=0.44]  
test minibatch loop: 100%|██████████| 1315/1315 [01:24<00:00, 15.63it/s, accuracy=0.885, cost=0.321] 
train minibatch loop:   0%|          | 0/5258 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.853444, current acc: 0.866161
time taken: 1015.197653055191
epoch: 1, training loss: 0.273369, training acc: 0.863815, valid loss: 0.257124, valid acc: 0.866161



train minibatch loop: 100%|██████████| 5258/5258 [15:31<00:00,  5.65it/s, accuracy=0.75, cost=0.366]  
test minibatch loop: 100%|██████████| 1315/1315 [01:24<00:00, 15.64it/s, accuracy=0.885, cost=0.377] 
train minibatch loop:   0%|          | 0/5258 [00:00<?, ?it/s]

epoch: 2, pass acc: 0.866161, current acc: 0.867023
time taken: 1015.4731228351593
epoch: 2, training loss: 0.237116, training acc: 0.874800, valid loss: 0.253414, valid acc: 0.867023



train minibatch loop: 100%|██████████| 5258/5258 [15:31<00:00,  5.65it/s, accuracy=0.775, cost=0.368] 
test minibatch loop: 100%|██████████| 1315/1315 [01:24<00:00, 15.62it/s, accuracy=0.846, cost=0.32]  
train minibatch loop:   0%|          | 0/5258 [00:00<?, ?it/s]

time taken: 1015.3165743350983
epoch: 3, training loss: 0.214357, training acc: 0.883214, valid loss: 0.258724, valid acc: 0.865384



train minibatch loop: 100%|██████████| 5258/5258 [15:30<00:00,  5.65it/s, accuracy=0.8, cost=0.283]   
test minibatch loop: 100%|██████████| 1315/1315 [01:24<00:00, 15.62it/s, accuracy=0.846, cost=0.407] 
train minibatch loop:   0%|          | 0/5258 [00:00<?, ?it/s]

time taken: 1015.1628031730652
epoch: 4, training loss: 0.194514, training acc: 0.892244, valid loss: 0.272158, valid acc: 0.863787



train minibatch loop: 100%|██████████| 5258/5258 [15:31<00:00,  5.65it/s, accuracy=0.775, cost=0.271] 
test minibatch loop: 100%|██████████| 1315/1315 [01:26<00:00, 15.27it/s, accuracy=0.808, cost=0.514] 

time taken: 1017.1645240783691
epoch: 5, training loss: 0.176453, training acc: 0.901475, valid loss: 0.296102, valid acc: 0.861159

break epoch:6






In [25]:
real_Y, predict_Y = [], []

pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
for i in pbar:
    index = min(i + batch_size, len(test_X))
    batch_x = test_X[i : index]
    batch_y = test_Y[i : index]
    batch_masks = test_masks[i : index]
    batch_segments = test_segments[i : index]
    batch_x = pad_sequences(batch_x, padding='post')
    batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
    batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
    predict_Y += np.argmax(sess.run(model.logits,
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks
            },
    ), 1, ).tolist()
    real_Y += batch_y.tolist()

test minibatch loop: 100%|██████████| 1315/1315 [01:21<00:00, 16.05it/s]


In [26]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['anger', 'fear', 'happy', 'love', 'sadness', 'surprise'],
        digits = 5
    )
)

              precision    recall  f1-score   support

       anger    0.91563   0.93195   0.92372     14137
        fear    0.88479   0.84492   0.86440      7635
       happy    0.91837   0.91924   0.91880     13621
        love    0.93131   0.94558   0.93839     14811
     sadness    0.78902   0.82396   0.80611     19177
    surprise    0.70276   0.62889   0.66377      9485

    accuracy                        0.86118     78866
   macro avg    0.85698   0.84909   0.85253     78866
weighted avg    0.85967   0.86118   0.86002     78866

