In [1]:
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.cross_validation import train_test_split
from unidecode import unidecode
from nltk.util import ngrams
from tqdm import tqdm
import time



In [2]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

In [3]:
def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 3)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [(naive_stemmer(word), word) for word in string]
    return (
        ' '.join([word[0] for word in string if len(word[0]) > 1]),
        ' '.join([word[1] for word in string if len(word[0]) > 1]),
    )

def str_idx(corpus, dic, maxlen, UNK = 3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            val = dic[k] if k in dic else UNK
            X[i, -1 - no] = val
    return X

In [4]:
import os
emotion_files = [f for f in os.listdir(os.getcwd()) if 'translated-' in f]
emotion_files

['translated-joy',
 'translated-love',
 'translated-fear',
 'translated-sadness',
 'translated-surprise',
 'translated-anger']

In [5]:
texts, labels = [], []
for f in emotion_files:
    with open(f) as fopen:
        dataset = list(filter(None, fopen.read().split('\n')))
        labels.extend([f.split('-')[1]] * len(dataset))
        texts.extend(dataset)

In [6]:
unique_labels = np.unique(labels).tolist()
labels = LabelEncoder().fit_transform(labels)
unique_labels

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [7]:
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])[0]

In [8]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 14652
Most common words [('saya', 165182), ('asa', 50903), ('rasa', 50028), ('tidak', 33044), ('yang', 31373), ('untuk', 15327)]
Sample data [520, 1151, 8, 4, 103, 722, 8, 94, 114, 8] ['buah', 'parti', 'yang', 'saya', 'gi', 'natal', 'yang', 'akhir', 'malam', 'yang']


In [9]:
class Model:
    def __init__(
        self,
        size_layer,
        num_layers,
        dimension_output,
        learning_rate,
        dropout,
        dict_size,
    ):
        def cells(size, reuse = False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(
                    size,
                    initializer = tf.orthogonal_initializer(),
                    reuse = reuse,
                ),
                state_keep_prob = dropout,
                output_keep_prob = dropout,
            )

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(
            tf.random_uniform([dict_size, size_layer], -1, 1)
        )
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(size_layer),
                cell_bw = cells(size_layer),
                inputs = encoder_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d' % (n),
            )
            encoder_embedded = tf.concat((out_fw, out_bw), 2)

        W = tf.get_variable(
            'w',
            shape = (size_layer * 2, dimension_output),
            initializer = tf.orthogonal_initializer(),
        )
        b = tf.get_variable(
            'b', shape = (dimension_output), initializer = tf.zeros_initializer()
        )
        self.logits = tf.add(
            tf.matmul(tf.reduce_mean(encoder_embedded, 1), W),
            b,
            name = 'logits',
        )
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [10]:
size_layer = 256
num_layers = 2
dimension_output = len(unique_labels)
learning_rate = 1e-4
batch_size = 32
dropout = 0.8
maxlen = 80

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    size_layer,
    num_layers,
    dimension_output,
    learning_rate,
    dropout,
    len(dictionary),
)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'bidirectional/model.ckpt')

'bidirectional/model.ckpt'

In [11]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
    ]
)

In [12]:
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Variable',
 'bidirectional_rnn_0/fw/lstm_cell/kernel',
 'bidirectional_rnn_0/fw/lstm_cell/bias',
 'bidirectional_rnn_0/bw/lstm_cell/kernel',
 'bidirectional_rnn_0/bw/lstm_cell/bias',
 'bidirectional_rnn_1/fw/lstm_cell/kernel',
 'bidirectional_rnn_1/fw/lstm_cell/bias',
 'bidirectional_rnn_1/bw/lstm_cell/kernel',
 'bidirectional_rnn_1/bw/lstm_cell/bias',
 'w',
 'b',
 'logits',
 'gradients/logits_grad/Shape',
 'gradients/logits_grad/Shape_1',
 'gradients/logits_grad/BroadcastGradientArgs',
 'gradients/logits_grad/Sum',
 'gradients/logits_grad/Reshape',
 'gradients/logits_grad/Sum_1',
 'gradients/logits_grad/Reshape_1',
 'gradients/logits_grad/tuple/group_deps',
 'gradients/logits_grad/tuple/control_dependency',
 'gradients/logits_grad/tuple/control_dependency_1']

In [13]:
tf.trainable_variables()

[<tf.Variable 'Variable:0' shape=(14656, 256) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_0/fw/lstm_cell/kernel:0' shape=(512, 1024) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_0/fw/lstm_cell/bias:0' shape=(1024,) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_0/bw/lstm_cell/kernel:0' shape=(512, 1024) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_0/bw/lstm_cell/bias:0' shape=(1024,) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_1/fw/lstm_cell/kernel:0' shape=(768, 1024) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_1/fw/lstm_cell/bias:0' shape=(1024,) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_1/bw/lstm_cell/kernel:0' shape=(768, 1024) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_1/bw/lstm_cell/bias:0' shape=(1024,) dtype=float32_ref>,
 <tf.Variable 'w:0' shape=(512, 6) dtype=float32_ref>,
 <tf.Variable 'b:0' shape=(6,) dtype=float32_ref>]

In [14]:
vectors = str_idx(texts, dictionary, maxlen)
train_X, test_X, train_Y, test_Y = train_test_split(
    vectors, labels, test_size = 0.2
)

In [15]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1


train minibatch loop: 100%|██████████| 2463/2463 [19:10<00:00,  2.07it/s, accuracy=0.75, cost=0.828] 
test minibatch loop: 100%|██████████| 616/616 [01:56<00:00,  5.04it/s, accuracy=0.609, cost=0.955]
train minibatch loop:   0%|          | 0/2463 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.696822
time taken: 1267.327508687973
epoch: 0, training loss: 1.243084, training acc: 0.504098, valid loss: 0.855778, valid acc: 0.696822



train minibatch loop: 100%|██████████| 2463/2463 [19:29<00:00,  2.05it/s, accuracy=0.714, cost=0.713]
test minibatch loop: 100%|██████████| 616/616 [01:56<00:00,  5.22it/s, accuracy=0.739, cost=0.596]
train minibatch loop:   0%|          | 0/2463 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.696822, current acc: 0.760729
time taken: 1286.3706641197205
epoch: 1, training loss: 0.734632, training acc: 0.741675, valid loss: 0.680388, valid acc: 0.760729



train minibatch loop: 100%|██████████| 2463/2463 [10:02<00:00,  5.60it/s, accuracy=0.679, cost=0.717]
test minibatch loop: 100%|██████████| 616/616 [00:37<00:00, 16.41it/s, accuracy=0.826, cost=0.46] 
train minibatch loop:   0%|          | 1/2463 [00:00<07:19,  5.60it/s, accuracy=0.781, cost=0.743]

epoch: 2, pass acc: 0.760729, current acc: 0.775285
time taken: 639.7767624855042
epoch: 2, training loss: 0.602859, training acc: 0.787415, valid loss: 0.634992, valid acc: 0.775285



train minibatch loop: 100%|██████████| 2463/2463 [07:20<00:00,  5.60it/s, accuracy=0.643, cost=0.611]
test minibatch loop: 100%|██████████| 616/616 [00:37<00:00, 16.38it/s, accuracy=0.826, cost=0.486]
train minibatch loop:   0%|          | 1/2463 [00:00<07:16,  5.64it/s, accuracy=0.812, cost=0.697]

epoch: 3, pass acc: 0.775285, current acc: 0.789750
time taken: 478.50358533859253
epoch: 3, training loss: 0.535529, training acc: 0.809440, valid loss: 0.586219, valid acc: 0.789750



train minibatch loop: 100%|██████████| 2463/2463 [07:21<00:00,  5.60it/s, accuracy=0.75, cost=0.533] 
test minibatch loop: 100%|██████████| 616/616 [00:37<00:00, 16.49it/s, accuracy=0.783, cost=0.528]
train minibatch loop:   0%|          | 1/2463 [00:00<07:21,  5.58it/s, accuracy=0.812, cost=0.665]

epoch: 4, pass acc: 0.789750, current acc: 0.790999
time taken: 478.6171419620514
epoch: 4, training loss: 0.487955, training acc: 0.825483, valid loss: 0.585549, valid acc: 0.790999



train minibatch loop: 100%|██████████| 2463/2463 [07:21<00:00,  5.60it/s, accuracy=0.75, cost=0.513] 
test minibatch loop: 100%|██████████| 616/616 [00:37<00:00, 15.83it/s, accuracy=0.826, cost=0.388]
train minibatch loop:   0%|          | 1/2463 [00:00<07:18,  5.62it/s, accuracy=0.844, cost=0.592]

epoch: 5, pass acc: 0.790999, current acc: 0.799291
time taken: 478.58524656295776
epoch: 5, training loss: 0.452828, training acc: 0.835621, valid loss: 0.565911, valid acc: 0.799291



train minibatch loop: 100%|██████████| 2463/2463 [07:20<00:00,  5.58it/s, accuracy=0.786, cost=0.425]
test minibatch loop: 100%|██████████| 616/616 [00:37<00:00, 16.44it/s, accuracy=0.783, cost=0.474]
train minibatch loop:   0%|          | 1/2463 [00:00<07:18,  5.61it/s, accuracy=0.844, cost=0.611]

epoch: 6, pass acc: 0.799291, current acc: 0.800388
time taken: 478.58138489723206
epoch: 6, training loss: 0.422172, training acc: 0.844721, valid loss: 0.563996, valid acc: 0.800388



train minibatch loop: 100%|██████████| 2463/2463 [10:17<00:00,  2.66it/s, accuracy=0.786, cost=0.425]
test minibatch loop: 100%|██████████| 616/616 [01:30<00:00,  6.30it/s, accuracy=0.783, cost=0.552]
train minibatch loop:   0%|          | 0/2463 [00:00<?, ?it/s]

time taken: 707.4652082920074
epoch: 7, training loss: 0.395075, training acc: 0.853451, valid loss: 0.572655, valid acc: 0.798409



train minibatch loop: 100%|██████████| 2463/2463 [12:45<00:00,  5.62it/s, accuracy=0.786, cost=0.365] 
test minibatch loop: 100%|██████████| 616/616 [00:37<00:00, 16.39it/s, accuracy=0.783, cost=0.451]
train minibatch loop:   0%|          | 1/2463 [00:00<07:17,  5.63it/s, accuracy=0.844, cost=0.523]

epoch: 8, pass acc: 0.800388, current acc: 0.801657
time taken: 803.4346408843994
epoch: 8, training loss: 0.372697, training acc: 0.860658, valid loss: 0.572492, valid acc: 0.801657



train minibatch loop: 100%|██████████| 2463/2463 [10:28<00:00,  2.68it/s, accuracy=0.821, cost=0.341] 
test minibatch loop: 100%|██████████| 616/616 [01:27<00:00,  7.15it/s, accuracy=0.783, cost=0.486]
train minibatch loop:   0%|          | 0/2463 [00:00<?, ?it/s]

time taken: 715.8854594230652
epoch: 9, training loss: 0.350294, training acc: 0.868336, valid loss: 0.588634, valid acc: 0.798916



train minibatch loop: 100%|██████████| 2463/2463 [14:58<00:00,  2.72it/s, accuracy=0.786, cost=0.358] 
test minibatch loop: 100%|██████████| 616/616 [01:28<00:00,  7.11it/s, accuracy=0.783, cost=0.633]
train minibatch loop:   0%|          | 0/2463 [00:00<?, ?it/s]

time taken: 986.6287086009979
epoch: 10, training loss: 0.331505, training acc: 0.874450, valid loss: 0.592429, valid acc: 0.800997



train minibatch loop: 100%|██████████| 2463/2463 [14:58<00:00,  2.65it/s, accuracy=0.821, cost=0.304] 
test minibatch loop: 100%|██████████| 616/616 [01:27<00:00,  6.40it/s, accuracy=0.826, cost=0.596] 

time taken: 986.4895989894867
epoch: 11, training loss: 0.313824, training acc: 0.880783, valid loss: 0.603280, valid acc: 0.799139

break epoch:12






In [16]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    predict_Y += np.argmax(
        sess.run(
            model.logits, feed_dict = {model.X: batch_x, model.Y: batch_y}
        ),
        1,
    ).tolist()
    real_Y += batch_y.tolist()

validation minibatch loop: 100%|██████████| 616/616 [01:26<00:00,  6.98it/s]


In [17]:
from sklearn import metrics
print(metrics.classification_report(real_Y, predict_Y, target_names = unique_labels))

             precision    recall  f1-score   support

      anger       0.81      0.80      0.81      3726
       fear       0.77      0.78      0.77      3806
        joy       0.83      0.81      0.82      3975
       love       0.86      0.83      0.85      2992
    sadness       0.75      0.78      0.77      3293
   surprise       0.77      0.79      0.78      1911

avg / total       0.80      0.80      0.80     19703



In [18]:
text = classification_textcleaning('kerajaan sebenarnya sangat sayangkan rakyatnya, tetapi sebenarnya benci')
new_vector = str_idx([text[0]], dictionary, len(text[0].split()))
sess.run(tf.nn.softmax(model.logits), feed_dict={model.X:new_vector})

array([[0.46793234, 0.07585412, 0.0049634 , 0.31592074, 0.03232513,
        0.10300431]], dtype=float32)

In [19]:
import json
with open('bidirectional-emotion.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))

In [20]:
saver.save(sess, 'bidirectional/model.ckpt')

'bidirectional/model.ckpt'

In [21]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [22]:
freeze_graph('bidirectional', strings)

INFO:tensorflow:Restoring parameters from bidirectional/model.ckpt
INFO:tensorflow:Froze 11 variables.
INFO:tensorflow:Converted 11 variables to const ops.
656 ops in the final graph.


In [23]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [26]:
g = load_graph('bidirectional/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
result = test_sess.run(tf.nn.softmax(logits), feed_dict = {x: new_vector})
result



array([[0.54999757, 0.06570744, 0.00491361, 0.22659159, 0.04161717,
        0.11117265]], dtype=float32)