In [1]:
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.cross_validation import train_test_split
from unidecode import unidecode
from nltk.util import ngrams
from tqdm import tqdm
import time



In [26]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = [e for e in hujung if word.endswith(e)]
    if len(hujung_result):
        hujung_result = max(hujung_result, key = len)
        if len(hujung_result):
            word = word[: -len(hujung_result)]
    permulaan_result = [e for e in permulaan if word.startswith(e)]
    if len(permulaan_result):
        permulaan_result = max(permulaan_result, key = len)
        if len(permulaan_result):
            word = word[len(permulaan_result) :]
    return word

In [33]:
def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 3)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [naive_stemmer(word) for word in string]
    return ' '.join([word for word in string if len(word) > 1])


def str_idx(corpus, dic, maxlen, UNK = 3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            X[i, -1 - no] = dic.get(k, UNK)
    return X

In [34]:
classification_textcleaning('kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya')

'raja benar sangat benci rakyat minyak naik gala'

In [15]:
import os
emotion_files = [f for f in os.listdir(os.getcwd()) if 'translated-' in f]
emotion_files

['translated-joy',
 'translated-love',
 'translated-fear',
 'translated-sadness',
 'translated-surprise',
 'translated-anger']

In [16]:
texts, labels = [], []
for f in emotion_files:
    with open(f) as fopen:
        dataset = list(filter(None, fopen.read().split('\n')))
        labels.extend([f.split('-')[1]] * len(dataset))
        texts.extend(dataset)

In [17]:
unique_labels = np.unique(labels).tolist()
labels = LabelEncoder().fit_transform(labels)
unique_labels

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [14]:
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])

In [9]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 14728
Most common words [('saya', 165182), ('asa', 50903), ('rasa', 50028), ('tidak', 33044), ('yang', 31373), ('untuk', 15327)]
Sample data [516, 1128, 8, 4, 103, 722, 8, 93, 115, 8] ['buah', 'parti', 'yang', 'saya', 'gi', 'natal', 'yang', 'akhir', 'malam', 'yang']


In [10]:
max_features = len(dictionary)
maxlen = 100
batch_size = 32
embedded_size = 256

In [11]:
X = str_idx(texts, dictionary, maxlen)

In [12]:
train_X, test_X, train_Y, test_Y = train_test_split(X, 
                                                    labels,
                                                    test_size = 0.2)

In [13]:
class Model:
    def __init__(
        self, embedded_size, dict_size, dimension_output, learning_rate
    ):

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(
            tf.random_uniform([dict_size, embedded_size], -1, 1)
        )
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        self.logits = tf.identity(
            tf.layers.dense(
                tf.reduce_mean(encoder_embedded, 1), dimension_output
            ),
            name = 'logits',
        )
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(
            self.cost
        )
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [14]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(embedded_size, max_features, len(unique_labels), 5e-4)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'fast-text/model.ckpt')

'fast-text/model.ckpt'

In [15]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
    ]
)

In [16]:
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Variable',
 'dense/kernel',
 'dense/bias',
 'logits']

In [17]:
tf.trainable_variables()

[<tf.Variable 'Variable:0' shape=(14732, 256) dtype=float32_ref>,
 <tf.Variable 'dense/kernel:0' shape=(256, 6) dtype=float32_ref>,
 <tf.Variable 'dense/bias:0' shape=(6,) dtype=float32_ref>]

In [18]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

saver.save(sess, "fast-text/model.ckpt")

train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 82.29it/s, accuracy=0.321, cost=1.72] 
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 225.72it/s, accuracy=0.609, cost=1.58]
train minibatch loop:   0%|          | 10/2463 [00:00<00:26, 91.85it/s, accuracy=0.469, cost=1.61]

epoch: 0, pass acc: 0.000000, current acc: 0.383418
time taken: 32.66115999221802
epoch: 0, training loss: 1.725041, training acc: 0.286889, valid loss: 1.654089, valid acc: 0.383418



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 82.71it/s, accuracy=0.5, cost=1.44]  
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 221.76it/s, accuracy=0.696, cost=1.33]
train minibatch loop:   0%|          | 8/2463 [00:00<00:32, 76.10it/s, accuracy=0.594, cost=1.33]

epoch: 1, pass acc: 0.383418, current acc: 0.544854
time taken: 32.557716846466064
epoch: 1, training loss: 1.523483, training acc: 0.504238, valid loss: 1.394425, valid acc: 0.544854



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 82.75it/s, accuracy=0.571, cost=1.2]  
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 226.10it/s, accuracy=0.826, cost=1.09] 
train minibatch loop:   0%|          | 8/2463 [00:00<00:33, 72.50it/s, accuracy=0.531, cost=1.32] 

epoch: 2, pass acc: 0.544854, current acc: 0.664743
time taken: 32.49084973335266
epoch: 2, training loss: 1.241095, training acc: 0.648192, valid loss: 1.130963, valid acc: 0.664743



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 80.18it/s, accuracy=0.643, cost=1.06] 
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 223.45it/s, accuracy=0.826, cost=0.949]
train minibatch loop:   0%|          | 10/2463 [00:00<00:28, 86.76it/s, accuracy=0.719, cost=0.912]

epoch: 3, pass acc: 0.664743, current acc: 0.710726
time taken: 32.686481952667236
epoch: 3, training loss: 1.009701, training acc: 0.715228, valid loss: 0.950811, valid acc: 0.710726



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 82.81it/s, accuracy=0.714, cost=0.995]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 218.47it/s, accuracy=0.826, cost=0.866]
train minibatch loop:   0%|          | 9/2463 [00:00<00:27, 89.57it/s, accuracy=0.75, cost=0.827] 

epoch: 4, pass acc: 0.710726, current acc: 0.735139
time taken: 32.563164472579956
epoch: 4, training loss: 0.860664, training acc: 0.746255, valid loss: 0.842707, valid acc: 0.735139



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 82.99it/s, accuracy=0.643, cost=0.966]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 223.71it/s, accuracy=0.826, cost=0.815]
train minibatch loop:   0%|          | 8/2463 [00:00<00:35, 68.79it/s, accuracy=0.844, cost=0.542]

epoch: 5, pass acc: 0.735139, current acc: 0.746660
time taken: 32.43339252471924
epoch: 5, training loss: 0.770188, training acc: 0.762074, valid loss: 0.778998, valid acc: 0.746660



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 76.69it/s, accuracy=0.643, cost=0.953]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 226.17it/s, accuracy=0.826, cost=0.78] 
train minibatch loop:   0%|          | 11/2463 [00:00<00:23, 104.60it/s, accuracy=0.781, cost=0.708]

epoch: 6, pass acc: 0.746660, current acc: 0.753004
time taken: 32.654788970947266
epoch: 6, training loss: 0.714006, training acc: 0.771933, valid loss: 0.740220, valid acc: 0.753004



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 78.53it/s, accuracy=0.643, cost=0.946]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 225.32it/s, accuracy=0.783, cost=0.756]
train minibatch loop:   0%|          | 9/2463 [00:00<00:29, 82.34it/s, accuracy=0.625, cost=0.854]

epoch: 7, pass acc: 0.753004, current acc: 0.756689
time taken: 32.62237882614136
epoch: 7, training loss: 0.677090, training acc: 0.778861, valid loss: 0.715527, valid acc: 0.756689



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 78.99it/s, accuracy=0.643, cost=0.941]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 225.55it/s, accuracy=0.783, cost=0.737]
train minibatch loop:   0%|          | 10/2463 [00:00<00:24, 98.82it/s, accuracy=0.688, cost=0.815]

epoch: 8, pass acc: 0.756689, current acc: 0.759582
time taken: 32.511637449264526
epoch: 8, training loss: 0.651305, training acc: 0.783416, valid loss: 0.699181, valid acc: 0.759582



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 82.81it/s, accuracy=0.643, cost=0.938]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 227.18it/s, accuracy=0.783, cost=0.724]
train minibatch loop:   0%|          | 9/2463 [00:00<00:31, 78.47it/s, accuracy=0.781, cost=0.747]

epoch: 9, pass acc: 0.759582, current acc: 0.761561
time taken: 32.457406520843506
epoch: 9, training loss: 0.632305, training acc: 0.787666, valid loss: 0.688064, valid acc: 0.761561



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 82.98it/s, accuracy=0.643, cost=0.934]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 226.95it/s, accuracy=0.826, cost=0.714]
train minibatch loop:   0%|          | 10/2463 [00:00<00:25, 96.00it/s, accuracy=0.75, cost=0.627] 

epoch: 10, pass acc: 0.761561, current acc: 0.762292
time taken: 32.39967155456543
epoch: 10, training loss: 0.617675, training acc: 0.790889, valid loss: 0.680386, valid acc: 0.762292



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 82.81it/s, accuracy=0.643, cost=0.931]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 226.59it/s, accuracy=0.87, cost=0.706] 
train minibatch loop:   0%|          | 8/2463 [00:00<00:35, 69.43it/s, accuracy=0.938, cost=0.305]

epoch: 11, pass acc: 0.762292, current acc: 0.763987
time taken: 32.4664580821991
epoch: 11, training loss: 0.605989, training acc: 0.793414, valid loss: 0.675059, valid acc: 0.763987



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 82.60it/s, accuracy=0.643, cost=0.927]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 226.83it/s, accuracy=0.87, cost=0.701] 
train minibatch loop:   0%|          | 8/2463 [00:00<00:32, 76.19it/s, accuracy=0.688, cost=0.754]

time taken: 32.53728008270264
epoch: 12, training loss: 0.596366, training acc: 0.796193, valid loss: 0.671394, valid acc: 0.763834



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 83.28it/s, accuracy=0.643, cost=0.923]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 221.53it/s, accuracy=0.87, cost=0.698] 
train minibatch loop:   0%|          | 8/2463 [00:00<00:31, 78.06it/s, accuracy=0.75, cost=0.733] 

time taken: 32.35716152191162
epoch: 13, training loss: 0.588233, training acc: 0.798312, valid loss: 0.668933, valid acc: 0.763022



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 82.84it/s, accuracy=0.643, cost=0.919]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 224.24it/s, accuracy=0.87, cost=0.696] 
train minibatch loop:   0%|          | 9/2463 [00:00<00:32, 76.31it/s, accuracy=0.688, cost=0.735]

time taken: 32.47988224029541
epoch: 14, training loss: 0.581209, training acc: 0.800418, valid loss: 0.667365, valid acc: 0.763225



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 83.29it/s, accuracy=0.643, cost=0.914]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 222.51it/s, accuracy=0.87, cost=0.695] 
train minibatch loop:   0%|          | 8/2463 [00:00<00:34, 72.19it/s, accuracy=0.688, cost=0.8]  

time taken: 32.3430609703064
epoch: 15, training loss: 0.575031, training acc: 0.801954, valid loss: 0.666473, valid acc: 0.763022



train minibatch loop: 100%|██████████| 2463/2463 [00:29<00:00, 83.00it/s, accuracy=0.643, cost=0.91] 
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 227.55it/s, accuracy=0.87, cost=0.696] 


time taken: 32.38255524635315
epoch: 16, training loss: 0.569515, training acc: 0.803844, valid loss: 0.666104, valid acc: 0.762870

break epoch:17



'fast-text/model.ckpt'

In [19]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    predict_Y += np.argmax(
        sess.run(
            model.logits, feed_dict = {model.X: batch_x, model.Y: batch_y}
        ),
        1,
    ).tolist()
    real_Y += batch_y.tolist()

validation minibatch loop: 100%|██████████| 616/616 [00:01<00:00, 532.99it/s]


In [20]:
from sklearn import metrics
print(metrics.classification_report(real_Y, predict_Y, target_names = unique_labels))

             precision    recall  f1-score   support

      anger       0.74      0.81      0.77      3686
       fear       0.75      0.75      0.75      3830
        joy       0.73      0.78      0.75      3896
       love       0.83      0.81      0.82      3041
    sadness       0.77      0.67      0.72      3259
   surprise       0.78      0.74      0.76      1991

avg / total       0.76      0.76      0.76     19703



In [39]:
import json
with open('fast-text-emotion.json') as fopen:
    p = json.load(fopen)

In [40]:
str_idx([classification_textcleaning(text)],p['dictionary'], len(text.split()))

array([[ 984.,   20.,   12.,  173., 1613.,   17.,   20.,   96.]])

In [30]:
text = 'kerajaan sebenarnya sangat sayangkan rakyatnya, tetapi sebenarnya benci'
new_vector = str_idx([classification_textcleaning(text)],x['dictionary'], len(text.split()))
#sess.run(tf.nn.softmax(model.logits), feed_dict={model.X:new_vector})

TypeError: must be str, not int

In [8]:
new_vector

array([[ 984.,   20.,   12.,  173., 1613.,   17.,   20.,   96.]])

In [22]:
import json
with open('fast-text-emotion.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))

In [23]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [24]:
freeze_graph('fast-text', strings)

INFO:tensorflow:Restoring parameters from fast-text/model.ckpt
INFO:tensorflow:Froze 3 variables.
INFO:tensorflow:Converted 3 variables to const ops.
16 ops in the final graph.


In [9]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [41]:
g = load_graph('fast-text/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
test_sess.run(tf.nn.softmax(logits), feed_dict = {x: new_vector})



array([[4.1450025e-05, 3.9089160e-20, 2.2515555e-26, 9.9995852e-01,
        2.5486487e-21, 4.6214880e-22]], dtype=float32)

In [19]:
labels

array([2, 2, 2, ..., 0, 0, 0])

In [18]:
texts[0]

'sebuah parti yang saya pergi ke natal yang terakhir'

In [48]:
text = 'bodoh sial'
new_vector = str_idx([classification_textcleaning(text)],p['dictionary'], len(text.split()))
test_sess.run(tf.nn.softmax(logits), feed_dict = {x: new_vector})

array([[2.285901e-28, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        1.000000e+00, 0.000000e+00]], dtype=float32)

In [49]:
new_vector

array([[178., 874.]])

In [22]:
len(text.split())

9

In [47]:
classification_textcleaning(text)

'bodoh sial'