In [1]:
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.cross_validation import train_test_split
from unidecode import unidecode
from nltk.util import ngrams
from tqdm import tqdm
import time



In [2]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

In [3]:
def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 3)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [naive_stemmer(word) for word in string]
    return ' '.join([word for word in string if len(word) > 1])


def str_idx(corpus, dic, UNK = 3):
    X = []
    for sentence in corpus:
        X.append([dic[w] if w in dic else UNK for w in sentence.split()[:maxlen]])
    return X

In [4]:
def create_ngram_set(input_list, ngram_value):
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def build_ngram(x_train, limit = 100000):
    global max_features
    ngram_set = set()
    for input_list in tqdm(x_train, total = len(x_train), ncols = 70):
        for i in range(2, 3):
            set_of_ngram = create_ngram_set(input_list, ngram_value = i)
            ngram_set.update(set_of_ngram)
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set) if k + max_features < limit}
    indice_token = {token_indice[k]: k for k in token_indice}
    max_features = np.max(list(indice_token.keys())) + 1
    return token_indice


def add_ngram(sequences, token_indice):
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i : i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences

In [5]:
import os
emotion_files = [f for f in os.listdir(os.getcwd()) if 'translated-' in f]
emotion_files

['translated-joy',
 'translated-love',
 'translated-fear',
 'translated-sadness',
 'translated-surprise',
 'translated-anger']

In [6]:
texts, labels = [], []
for f in emotion_files:
    with open(f) as fopen:
        dataset = list(filter(None, fopen.read().split('\n')))
        labels.extend([f.split('-')[1]] * len(dataset))
        texts.extend(dataset)

In [7]:
unique_labels = np.unique(labels).tolist()
labels = LabelEncoder().fit_transform(labels)
unique_labels

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [8]:
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])

In [9]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 14652
Most common words [('saya', 165182), ('asa', 50903), ('rasa', 50028), ('tidak', 33044), ('yang', 31373), ('untuk', 15327)]
Sample data [521, 1144, 8, 4, 103, 723, 8, 94, 114, 8] ['buah', 'parti', 'yang', 'saya', 'gi', 'natal', 'yang', 'akhir', 'malam', 'yang']


In [10]:
ngram_range = 2
max_features = len(dictionary)
maxlen = 80
batch_size = 32
embedded_size = 256

In [11]:
max_features

14656

In [12]:
idx_trainset = str_idx(texts, dictionary, maxlen)
token_indice = build_ngram(idx_trainset)
X = add_ngram(idx_trainset, token_indice)
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen)

100%|███████████████████████| 98515/98515 [00:00<00:00, 226699.54it/s]


In [13]:
train_X, test_X, train_Y, test_Y = train_test_split(X, 
                                                    labels,
                                                    test_size = 0.2)

In [14]:
class Model:
    def __init__(
        self, embedded_size, dict_size, dimension_output, learning_rate
    ):

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(
            tf.random_uniform([dict_size, embedded_size], -1, 1)
        )
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        self.logits = tf.identity(
            tf.layers.dense(
                tf.reduce_mean(encoder_embedded, 1), dimension_output
            ),
            name = 'logits',
        )
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(
            self.cost
        )
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [15]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(embedded_size, max_features, len(unique_labels), 5e-4)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'fast-text/model.ckpt')

'fast-text/model.ckpt'

In [16]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
    ]
)

In [17]:
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Variable',
 'dense/kernel',
 'dense/bias',
 'logits']

In [18]:
tf.trainable_variables()

[<tf.Variable 'Variable:0' shape=(100001, 256) dtype=float32_ref>,
 <tf.Variable 'dense/kernel:0' shape=(256, 6) dtype=float32_ref>,
 <tf.Variable 'dense/bias:0' shape=(6,) dtype=float32_ref>]

In [19]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

saver.save(sess, "fast-text/model.ckpt")

train minibatch loop: 100%|██████████| 2463/2463 [01:32<00:00, 26.68it/s, accuracy=0.393, cost=1.54] 
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 250.83it/s, accuracy=0.522, cost=1.45]
train minibatch loop:   0%|          | 3/2463 [00:00<01:27, 28.02it/s, accuracy=0.438, cost=1.55]

epoch: 0, pass acc: 0.000000, current acc: 0.459305
time taken: 94.68354630470276
epoch: 0, training loss: 1.677034, training acc: 0.352060, valid loss: 1.541595, valid acc: 0.459305



train minibatch loop: 100%|██████████| 2463/2463 [01:32<00:00, 26.73it/s, accuracy=0.571, cost=1.13] 
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 266.92it/s, accuracy=0.565, cost=1.12] 
train minibatch loop:   0%|          | 3/2463 [00:00<01:49, 22.40it/s, accuracy=0.75, cost=1.15] 

epoch: 1, pass acc: 0.459305, current acc: 0.674673
time taken: 94.46491956710815
epoch: 1, training loss: 1.325451, training acc: 0.623462, valid loss: 1.148992, valid acc: 0.674673



train minibatch loop: 100%|██████████| 2463/2463 [01:32<00:00, 26.76it/s, accuracy=0.643, cost=0.924]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 270.85it/s, accuracy=0.696, cost=0.869]
train minibatch loop:   0%|          | 3/2463 [00:00<01:47, 22.78it/s, accuracy=0.781, cost=0.842]

epoch: 2, pass acc: 0.674673, current acc: 0.736500
time taken: 94.32717251777649
epoch: 2, training loss: 0.962563, training acc: 0.735669, valid loss: 0.892813, valid acc: 0.736500



train minibatch loop: 100%|██████████| 2463/2463 [01:32<00:00, 26.74it/s, accuracy=0.643, cost=0.846]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 263.92it/s, accuracy=0.739, cost=0.733]
train minibatch loop:   0%|          | 3/2463 [00:00<01:27, 28.22it/s, accuracy=0.812, cost=0.677]

epoch: 3, pass acc: 0.736500, current acc: 0.758852
time taken: 94.42890787124634
epoch: 3, training loss: 0.749180, training acc: 0.781741, valid loss: 0.766707, valid acc: 0.758852



train minibatch loop: 100%|██████████| 2463/2463 [01:32<00:00, 26.77it/s, accuracy=0.679, cost=0.811]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 269.56it/s, accuracy=0.739, cost=0.659]
train minibatch loop:   0%|          | 3/2463 [00:00<01:46, 22.99it/s, accuracy=0.906, cost=0.55] 

epoch: 4, pass acc: 0.758852, current acc: 0.769002
time taken: 94.28968334197998
epoch: 4, training loss: 0.632378, training acc: 0.805013, valid loss: 0.704613, valid acc: 0.769002



train minibatch loop: 100%|██████████| 2463/2463 [01:32<00:00, 26.18it/s, accuracy=0.679, cost=0.788]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 270.62it/s, accuracy=0.739, cost=0.616]
train minibatch loop:   0%|          | 3/2463 [00:00<01:38, 24.95it/s, accuracy=0.812, cost=0.533]

epoch: 5, pass acc: 0.769002, current acc: 0.773316
time taken: 94.3593077659607
epoch: 5, training loss: 0.561727, training acc: 0.820722, valid loss: 0.671915, valid acc: 0.773316



train minibatch loop: 100%|██████████| 2463/2463 [01:32<00:00, 26.52it/s, accuracy=0.714, cost=0.765]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 265.73it/s, accuracy=0.739, cost=0.588]
train minibatch loop:   0%|          | 3/2463 [00:00<01:22, 29.99it/s, accuracy=0.812, cost=0.492]

epoch: 6, pass acc: 0.773316, current acc: 0.776057
time taken: 94.39602041244507
epoch: 6, training loss: 0.513376, training acc: 0.831255, valid loss: 0.653939, valid acc: 0.776057



train minibatch loop: 100%|██████████| 2463/2463 [01:32<00:00, 26.75it/s, accuracy=0.714, cost=0.742]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 267.82it/s, accuracy=0.783, cost=0.568]
train minibatch loop:   0%|          | 3/2463 [00:00<01:22, 29.86it/s, accuracy=0.812, cost=0.461]

epoch: 7, pass acc: 0.776057, current acc: 0.777193
time taken: 94.37455296516418
epoch: 7, training loss: 0.476937, training acc: 0.840682, valid loss: 0.644282, valid acc: 0.777193



train minibatch loop: 100%|██████████| 2463/2463 [01:26<00:00, 29.86it/s, accuracy=0.714, cost=0.718]
test minibatch loop: 100%|██████████| 616/616 [00:01<00:00, 358.16it/s, accuracy=0.783, cost=0.554]
train minibatch loop:   0%|          | 3/2463 [00:00<01:22, 29.98it/s, accuracy=0.812, cost=0.435]

epoch: 8, pass acc: 0.777193, current acc: 0.777244
time taken: 88.02731561660767
epoch: 8, training loss: 0.447597, training acc: 0.848549, valid loss: 0.639808, valid acc: 0.777244



train minibatch loop: 100%|██████████| 2463/2463 [01:22<00:00, 30.32it/s, accuracy=0.714, cost=0.693]
test minibatch loop: 100%|██████████| 616/616 [00:01<00:00, 355.50it/s, accuracy=0.783, cost=0.544]
train minibatch loop:   0%|          | 4/2463 [00:00<01:21, 30.12it/s, accuracy=0.844, cost=0.464]

time taken: 84.06424927711487
epoch: 9, training loss: 0.422890, training acc: 0.855287, valid loss: 0.638810, valid acc: 0.776889



train minibatch loop: 100%|██████████| 2463/2463 [01:28<00:00, 27.78it/s, accuracy=0.75, cost=0.668] 
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 267.66it/s, accuracy=0.783, cost=0.539]
train minibatch loop:   0%|          | 3/2463 [00:00<01:26, 28.41it/s, accuracy=0.844, cost=0.398]

time taken: 90.96887183189392
epoch: 10, training loss: 0.401439, training acc: 0.861036, valid loss: 0.640286, valid acc: 0.775975



train minibatch loop: 100%|██████████| 2463/2463 [01:32<00:00, 26.77it/s, accuracy=0.786, cost=0.642]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 271.32it/s, accuracy=0.783, cost=0.536]
train minibatch loop:   0%|          | 3/2463 [00:00<01:26, 28.53it/s, accuracy=0.844, cost=0.383]

time taken: 94.28606629371643
epoch: 11, training loss: 0.382415, training acc: 0.866888, valid loss: 0.643615, valid acc: 0.774960



train minibatch loop: 100%|██████████| 2463/2463 [01:32<00:00, 26.51it/s, accuracy=0.786, cost=0.617]
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 275.06it/s, accuracy=0.783, cost=0.537]
train minibatch loop:   0%|          | 3/2463 [00:00<01:49, 22.49it/s, accuracy=0.844, cost=0.326]

time taken: 94.27572536468506
epoch: 12, training loss: 0.365290, training acc: 0.872255, valid loss: 0.648401, valid acc: 0.773742



train minibatch loop: 100%|██████████| 2463/2463 [01:31<00:00, 26.78it/s, accuracy=0.821, cost=0.593] 
test minibatch loop: 100%|██████████| 616/616 [00:02<00:00, 269.65it/s, accuracy=0.783, cost=0.54] 


time taken: 94.26148343086243
epoch: 13, training loss: 0.349714, training acc: 0.877522, valid loss: 0.654378, valid acc: 0.772372

break epoch:14



'fast-text/model.ckpt'

In [20]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    predict_Y += np.argmax(
        sess.run(
            model.logits, feed_dict = {model.X: batch_x, model.Y: batch_y}
        ),
        1,
    ).tolist()
    real_Y += batch_y.tolist()

validation minibatch loop: 100%|██████████| 616/616 [00:01<00:00, 612.23it/s]


In [21]:
from sklearn import metrics
print(metrics.classification_report(real_Y, predict_Y, target_names = unique_labels))

             precision    recall  f1-score   support

      anger       0.82      0.75      0.78      3754
       fear       0.71      0.81      0.75      3837
        joy       0.76      0.79      0.78      3844
       love       0.83      0.83      0.83      3065
    sadness       0.75      0.75      0.75      3241
   surprise       0.79      0.64      0.71      1962

avg / total       0.77      0.77      0.77     19703



In [22]:
text = 'kerajaan sebenarnya sangat sayangkan rakyatnya, tetapi sebenarnya benci'
new_vector = add_ngram(str_idx([classification_textcleaning(text)],dictionary), token_indice)
sess.run(tf.nn.softmax(model.logits), feed_dict={model.X:new_vector})

array([[1.6177664e-08, 5.0652740e-18, 7.5105124e-23, 1.0000000e+00,
        1.7726123e-19, 4.0211566e-18]], dtype=float32)

In [23]:
import json
with open('fast-text-emotion.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))

In [24]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [25]:
freeze_graph('fast-text', strings)

INFO:tensorflow:Restoring parameters from fast-text/model.ckpt
INFO:tensorflow:Froze 3 variables.
INFO:tensorflow:Converted 3 variables to const ops.
16 ops in the final graph.


In [26]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [27]:
g = load_graph('fast-text/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
test_sess.run(tf.nn.softmax(logits), feed_dict = {x: new_vector})



array([[1.6177664e-08, 5.0652740e-18, 7.5105124e-23, 1.0000000e+00,
        1.7726123e-19, 4.0211566e-18]], dtype=float32)

In [28]:
import pickle
with open('token-indice.pkl','wb') as fopen:
    pickle.dump(token_indice, fopen)