In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [2]:
import xlnet
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import model_utils




In [3]:
import sentencepiece as spm
from prepro_utils import preprocess_text, encode_ids

sp_model = spm.SentencePieceProcessor()
sp_model.Load('sp10m.cased.v9.model')

True

In [4]:
import json
import random

emotion_label = ['anger', 'fear', 'happy', 'love', 'sadness', 'surprise']

with open('../sentiment/emotion-twitter-lexicon.json') as fopen:
    emotion = json.load(fopen)
    
emotion.keys()

dict_keys(['anger', 'fear', 'happy', 'love', 'sadness', 'surprise'])

In [5]:
texts, labels = [], []

for k, v in emotion.items():
    if len(v) > 30000:
        emotion[k] = random.sample(v, 30000)
    print(k, len(emotion[k]))
    texts.extend(emotion[k])
    labels.extend([emotion_label.index(k)] * len(emotion[k]))

anger 30000
fear 20316
happy 30000
love 20783
sadness 26468
surprise 13107


In [6]:
from rules import normalized_chars
import random
from unidecode import unidecode
import re

laughing = {
    'huhu',
    'haha',
    'gagaga',
    'hihi',
    'wkawka',
    'wkwk',
    'kiki',
    'keke',
    'huehue',
    'hshs',
    'hoho',
    'hewhew',
    'uwu',
    'sksk',
    'ksks',
    'gituu',
    'gitu',
    'mmeeooww',
    'meow',
    'alhamdulillah',
    'muah',
    'mmuahh',
    'hehe',
    'salamramadhan',
    'happywomensday',
    'jahagaha',
    'ahakss',
    'ahksk'
}

def make_cleaning(s, c_dict):
    s = s.translate(c_dict)
    return s

def cleaning(string):
    """
    use by any transformer model before tokenization
    """
    string = unidecode(string)
    
    string = ' '.join(
        [make_cleaning(w, normalized_chars) for w in string.split()]
    )
    string = re.sub('\(dot\)', '.', string)
    string = (
        re.sub(re.findall(r'\<a(.*?)\>', string)[0], '', string)
        if (len(re.findall(r'\<a (.*?)\>', string)) > 0)
        and ('href' in re.findall(r'\<a (.*?)\>', string)[0])
        else string
    )
    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    
    chars = '.,/'
    for c in chars:
        string = string.replace(c, f' {c} ')
        
    string = re.sub(r'[ ]+', ' ', string).strip().split()
    string = [w for w in string if w[0] != '@']
    x = []
    for word in string:
        word = word.lower()
        if any([laugh in word for laugh in laughing]):
            if random.random() >= 0.5:
                x.append(word)
        else:
            x.append(word)
    string = [w.title() if w[0].isupper() else w for w in x]
    return ' '.join(string)

In [7]:
from tqdm import tqdm

for i in tqdm(range(len(texts))):
    texts[i] = cleaning(texts[i])

100%|██████████| 140674/140674 [00:12<00:00, 11078.93it/s]


In [8]:
actual_t, actual_l = [], []

for i in tqdm(range(len(texts))):
    if len(texts[i]) > 2:
        actual_t.append(texts[i])
        actual_l.append(labels[i])

100%|██████████| 140674/140674 [00:00<00:00, 1329509.17it/s]


In [9]:
from prepro_utils import preprocess_text, encode_ids

def tokenize_fn(text):
    text = preprocess_text(text, lower= False)
    return encode_ids(sp_model, text)

In [10]:
SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

special_symbols = {
    "<unk>"  : 0,
    "<s>"    : 1,
    "</s>"   : 2,
    "<cls>"  : 3,
    "<sep>"  : 4,
    "<pad>"  : 5,
    "<mask>" : 6,
    "<eod>"  : 7,
    "<eop>"  : 8,
}

VOCAB_SIZE = 32000
UNK_ID = special_symbols["<unk>"]
CLS_ID = special_symbols["<cls>"]
SEP_ID = special_symbols["<sep>"]
MASK_ID = special_symbols["<mask>"]
EOD_ID = special_symbols["<eod>"]

def XY(left_train):
    X, segments, masks = [], [], []
    for i in tqdm(range(len(left_train))):
        tokens_a = tokenize_fn(left_train[i])
        segment_id = [SEG_ID_A] * len(tokens_a)
        tokens_a.append(SEP_ID)
        tokens_a.append(CLS_ID)
        segment_id.append(SEG_ID_A)
        segment_id.append(SEG_ID_CLS)
        input_mask = [0] * len(tokens_a)
        X.append(tokens_a)
        segments.append(segment_id)
        masks.append(input_mask)
    return X, segments, masks

In [11]:
X, segments, masks = XY(actual_t)

100%|██████████| 140674/140674 [00:19<00:00, 7241.00it/s]


In [12]:
kwargs = dict(
      is_training=True,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.1,
      dropatt=0.1,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clamp_len=-1)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(json_path='alxlnet-base/config.json')




In [13]:
epoch = 3
batch_size = 60
warmup_proportion = 0.1
num_train_steps = int(len(X) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)
print(num_train_steps, num_warmup_steps)
learning_rate = 2e-5

training_parameters = dict(
      decay_method = 'poly',
      train_steps = num_train_steps,
      learning_rate = learning_rate,
      warmup_steps = num_warmup_steps,
      min_lr_ratio = 0.0,
      weight_decay = 0.00,
      adam_epsilon = 1e-8,
      num_core_per_host = 1,
      lr_layer_decay_rate = 1,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.0,
      dropatt=0.0,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clip = 1.0,
      clamp_len=-1,)

7033 703


In [14]:
class Parameter:
    def __init__(self, decay_method, warmup_steps, weight_decay, adam_epsilon, 
                num_core_per_host, lr_layer_decay_rate, use_tpu, learning_rate, train_steps,
                min_lr_ratio, clip, **kwargs):
        self.decay_method = decay_method
        self.warmup_steps = warmup_steps
        self.weight_decay = weight_decay
        self.adam_epsilon = adam_epsilon
        self.num_core_per_host = num_core_per_host
        self.lr_layer_decay_rate = lr_layer_decay_rate
        self.use_tpu = use_tpu
        self.learning_rate = learning_rate
        self.train_steps = train_steps
        self.min_lr_ratio = min_lr_ratio
        self.clip = clip
        
training_parameters = Parameter(**training_parameters)

In [15]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.float32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        xlnet_model = xlnet.XLNetModel(
            xlnet_config=xlnet_config,
            run_config=xlnet_parameters,
            input_ids=tf.transpose(self.X, [1, 0]),
            seg_ids=tf.transpose(self.segment_ids, [1, 0]),
            input_mask=tf.transpose(self.input_masks, [1, 0]))
        
        output_layer = xlnet_model.get_sequence_output()
        output_layer = tf.transpose(output_layer, [1, 0, 2])
        
        self.logits_seq = tf.layers.dense(output_layer, dimension_output)
        self.logits_seq = tf.identity(self.logits_seq, name = 'logits_seq')
        self.logits = self.logits_seq[:, 0]
        self.logits = tf.identity(self.logits, name = 'logits')
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer, self.learning_rate, _ = model_utils.get_train_op(training_parameters, self.cost)
        
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [16]:
dimension_output = 6

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())




INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>

Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `layer.__call__` method instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.Dense instead.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



In [17]:
import collections
import re

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Compute the union of the current variables and checkpoint variables."""
    assignment_map = {}
    initialized_variable_names = {}

    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match('^(.*):\\d+$', name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    init_vars = tf.train.list_variables(init_checkpoint)

    assignment_map = collections.OrderedDict()
    for x in init_vars:
        (name, var) = (x[0], x[1])
        if name not in name_to_variable:
            continue
        assignment_map[name] = name_to_variable[name]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [18]:
tvars = tf.trainable_variables()
checkpoint = 'alxlnet-base/model.ckpt'
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, 
                                                                                checkpoint)

In [19]:
saver = tf.train.Saver(var_list = assignment_map)
saver.restore(sess, checkpoint)

INFO:tensorflow:Restoring parameters from alxlnet-base/model.ckpt


In [20]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_masks, test_masks, train_segments, test_segments, train_Y, test_Y = train_test_split(
    X, segments, masks, actual_l, test_size = 0.2
)

In [21]:
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

i = 0
index = 4
batch_x = train_X[i : index]
batch_y = train_Y[i : index]
batch_masks = train_masks[i : index]
batch_segments = train_segments[i : index]
batch_x = pad_sequences(batch_x, padding='post')
batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
batch_masks = pad_sequences(batch_masks, padding='post', value = 1)

sess.run(
    [model.accuracy, model.cost],
    feed_dict = {
        model.X: batch_x,
        model.Y: batch_y,
        model.segment_ids: batch_segments,
        model.input_masks: batch_masks,
    },
)

[0.0, 2.9896626]

In [22]:
from tqdm import tqdm
import time

for EPOCH in range(epoch):

    train_acc, train_loss, test_acc, test_loss = [], [], [], []
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        batch_x = train_X[i : index]
        batch_y = train_Y[i : index]
        batch_masks = train_masks[i : index]
        batch_segments = train_segments[i : index]
        batch_x = pad_sequences(batch_x, padding='post')
        batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
        batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks,
            },
        )
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x = test_X[i : index]
        batch_y = test_Y[i : index]
        batch_masks = test_masks[i : index]
        batch_segments = test_segments[i : index]
        batch_x = pad_sequences(batch_x, padding='post')
        batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
        batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks,
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)
        
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 1876/1876 [14:20<00:00,  2.18it/s, accuracy=1, cost=0.00718]   
test minibatch loop: 100%|██████████| 469/469 [01:14<00:00,  6.34it/s, accuracy=1, cost=0.000879]  
train minibatch loop:   0%|          | 0/1876 [00:00<?, ?it/s]

epoch: 0, training loss: 0.480753, training acc: 0.848570, valid loss: 0.049923, valid acc: 0.988984



train minibatch loop: 100%|██████████| 1876/1876 [14:13<00:00,  2.20it/s, accuracy=1, cost=0.000222]  
test minibatch loop: 100%|██████████| 469/469 [01:14<00:00,  6.33it/s, accuracy=1, cost=4.3e-5]    
train minibatch loop:   0%|          | 0/1876 [00:00<?, ?it/s]

epoch: 1, training loss: 0.030033, training acc: 0.993293, valid loss: 0.026411, valid acc: 0.994563



train minibatch loop: 100%|██████████| 1876/1876 [14:15<00:00,  2.19it/s, accuracy=1, cost=0.00394]   
test minibatch loop: 100%|██████████| 469/469 [01:14<00:00,  6.30it/s, accuracy=1, cost=2.08e-5]   

epoch: 2, training loss: 0.012450, training acc: 0.997122, valid loss: 0.020494, valid acc: 0.996233






In [23]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'alxlnet-base-emotion/model.ckpt')

'alxlnet-base-emotion/model.ckpt'

In [24]:
kwargs = dict(
      is_training=False,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.0,
      dropatt=0.0,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clamp_len=-1)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(json_path='alxlnet-base/config.json')

In [25]:
dimension_output = 6
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())

INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>




In [26]:
saver = tf.train.Saver(tf.trainable_variables())
saver.restore(sess, 'alxlnet-base-emotion/model.ckpt')

INFO:tensorflow:Restoring parameters from alxlnet-base-emotion/model.ckpt


In [27]:
real_Y, predict_Y = [], []

pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
for i in pbar:
    index = min(i + batch_size, len(test_X))
    batch_x = test_X[i : index]
    batch_y = test_Y[i : index]
    batch_masks = test_masks[i : index]
    batch_segments = test_segments[i : index]
    batch_x = pad_sequences(batch_x, padding='post')
    batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
    batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
    predict_Y += np.argmax(sess.run(model.logits,
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks
            },
    ), 1, ).tolist()
    real_Y += batch_y

test minibatch loop: 100%|██████████| 469/469 [01:04<00:00,  7.31it/s]


In [28]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['anger', 'fear', 'happy', 'love', 'sadness', 'surprise'],
        digits = 5
    )
)

              precision    recall  f1-score   support

       anger    0.99669   0.99439   0.99554      6065
        fear    0.99702   0.99727   0.99714      4027
       happy    0.99764   0.99949   0.99857      5918
        love    0.99554   0.99694   0.99624      4250
     sadness    0.99867   0.99641   0.99754      5286
    surprise    0.99422   0.99730   0.99576      2589

    accuracy                        0.99691     28135
   macro avg    0.99663   0.99697   0.99680     28135
weighted avg    0.99691   0.99691   0.99691     28135



In [29]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name
        or 'self/Softmax' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'global_step' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Placeholder_3',
 'model/transformer/r_w_bias',
 'model/transformer/r_r_bias',
 'model/transformer/word_embedding/lookup_table',
 'model/transformer/word_embedding/lookup_table_2',
 'model/transformer/r_s_bias',
 'model/transformer/seg_embed',
 'model/transformer/layer_shared/rel_attn/q/kernel',
 'model/transformer/layer_shared/rel_attn/k/kernel',
 'model/transformer/layer_shared/rel_attn/v/kernel',
 'model/transformer/layer_shared/rel_attn/r/kernel',
 'model/transformer/layer_shared/rel_attn/o/kernel',
 'model/transformer/layer_shared/rel_attn/LayerNorm/gamma',
 'model/transformer/layer_shared/ff/layer_1/kernel',
 'model/transformer/layer_shared/ff/layer_1/bias',
 'model/transformer/layer_shared/ff/layer_2/kernel',
 'model/transformer/layer_shared/ff/layer_2/bias',
 'model/transformer/layer_shared/ff/LayerNorm/gamma',
 'dense/kernel',
 'dense/bias',
 'logits_seq',
 'logits']

In [30]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [31]:
freeze_graph('alxlnet-base-emotion', strings)

INFO:tensorflow:Restoring parameters from alxlnet-base-emotion/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 21 variables.
INFO:tensorflow:Converted 21 variables to const ops.
7390 ops in the final graph.


In [32]:
import boto3

bucketName = 'huseinhouse-storage'
Key = 'alxlnet-base-emotion/frozen_model.pb'
outPutname = "v34/emotion/alxlnet-base-emotion.pb"

s3 = boto3.client('s3')

s3.upload_file(Key,bucketName,outPutname)