In [1]:
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
import tensorflow as tf
from sklearn.model_selection import train_test_split
from unidecode import unidecode
from nltk.util import ngrams
from tqdm import tqdm
import time

In [2]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

In [3]:

def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 3)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [(naive_stemmer(word), word) for word in string]
    return (
        ' '.join([word[0] for word in string if len(word[0]) > 1]),
        ' '.join([word[1] for word in string if len(word[0]) > 1]),
    )


def str_idx(corpus, dic, maxlen, UNK = 3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            X[i, -1 - no] = dic.get(k,UNK)
    return X

In [4]:
with open('subjectivity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('subjectivity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts

assert len(labels) == len(texts)

In [5]:
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])[0]

In [6]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 13222
Most common words [('yang', 11804), ('untuk', 3879), ('tidak', 2898), ('deng', 2827), ('ada', 2294), ('dalam', 2193)]
Sample data [10, 68, 13, 27, 55, 53, 11, 395, 34, 182] ['filem', 'mula', 'pada', 'masa', 'lalu', 'mana', 'orang', 'budak', 'lelaki', 'nama']


In [7]:
class Model:
    def __init__(
        self,
        size_layer,
        num_layers,
        dimension_output,
        learning_rate,
        dropout,
        dict_size,
    ):
        def cells(size, reuse = False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(
                    size,
                    initializer = tf.orthogonal_initializer(),
                    reuse = reuse,
                ),
                state_keep_prob = dropout,
                output_keep_prob = dropout,
            )

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(
            tf.random_uniform([dict_size, size_layer], -1, 1)
        )
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(size_layer),
                cell_bw = cells(size_layer),
                inputs = encoder_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d' % (n),
            )
            encoder_embedded = tf.concat((out_fw, out_bw), 2)

        W = tf.get_variable(
            'w',
            shape = (size_layer * 2, 2),
            initializer = tf.orthogonal_initializer(),
        )
        b = tf.get_variable(
            'b', shape = (2), initializer = tf.zeros_initializer()
        )
        self.logits = tf.add(
            tf.matmul(tf.reduce_mean(encoder_embedded, 1), W),
            b,
            name = 'logits',
        )
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [8]:
size_layer = 256
num_layers = 2
dimension_output = 2
learning_rate = 1e-4
batch_size = 32
dropout = 0.8
maxlen = 80

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    size_layer,
    num_layers,
    dimension_output,
    learning_rate,
    dropout,
    len(dictionary),
)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'bidirectional/model.ckpt')

'bidirectional/model.ckpt'

In [9]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
    ]
)

In [10]:
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Variable',
 'bidirectional_rnn_0/fw/lstm_cell/kernel',
 'bidirectional_rnn_0/fw/lstm_cell/bias',
 'bidirectional_rnn_0/bw/lstm_cell/kernel',
 'bidirectional_rnn_0/bw/lstm_cell/bias',
 'bidirectional_rnn_1/fw/lstm_cell/kernel',
 'bidirectional_rnn_1/fw/lstm_cell/bias',
 'bidirectional_rnn_1/bw/lstm_cell/kernel',
 'bidirectional_rnn_1/bw/lstm_cell/bias',
 'w',
 'b',
 'logits',
 'gradients/logits_grad/Shape',
 'gradients/logits_grad/Shape_1',
 'gradients/logits_grad/BroadcastGradientArgs',
 'gradients/logits_grad/Sum',
 'gradients/logits_grad/Reshape',
 'gradients/logits_grad/Sum_1',
 'gradients/logits_grad/Reshape_1',
 'gradients/logits_grad/tuple/group_deps',
 'gradients/logits_grad/tuple/control_dependency',
 'gradients/logits_grad/tuple/control_dependency_1']

In [11]:
tf.trainable_variables()

[<tf.Variable 'Variable:0' shape=(13226, 256) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_0/fw/lstm_cell/kernel:0' shape=(512, 1024) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_0/fw/lstm_cell/bias:0' shape=(1024,) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_0/bw/lstm_cell/kernel:0' shape=(512, 1024) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_0/bw/lstm_cell/bias:0' shape=(1024,) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_1/fw/lstm_cell/kernel:0' shape=(768, 1024) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_1/fw/lstm_cell/bias:0' shape=(1024,) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_1/bw/lstm_cell/kernel:0' shape=(768, 1024) dtype=float32_ref>,
 <tf.Variable 'bidirectional_rnn_1/bw/lstm_cell/bias:0' shape=(1024,) dtype=float32_ref>,
 <tf.Variable 'w:0' shape=(512, 2) dtype=float32_ref>,
 <tf.Variable 'b:0' shape=(2,) dtype=float32_ref>]

In [12]:
vectors = str_idx(texts, dictionary, maxlen)
train_X, test_X, train_Y, test_Y = train_test_split(
    vectors, labels, test_size = 0.2
)

In [13]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 250/250 [00:46<00:00,  5.58it/s, accuracy=1, cost=0.429]    
test minibatch loop: 100%|██████████| 63/63 [00:04<00:00, 15.47it/s, accuracy=0.667, cost=0.488]
train minibatch loop:   0%|          | 1/250 [00:00<00:45,  5.47it/s, accuracy=0.781, cost=0.429]

epoch: 0, pass acc: 0.000000, current acc: 0.771366
time taken: 50.15620541572571
epoch: 0, training loss: 0.643723, training acc: 0.610114, valid loss: 0.494544, valid acc: 0.771366



train minibatch loop: 100%|██████████| 250/250 [00:45<00:00,  5.57it/s, accuracy=1, cost=0.0993]   
test minibatch loop: 100%|██████████| 63/63 [00:04<00:00, 15.68it/s, accuracy=0.778, cost=0.313]
train minibatch loop:   0%|          | 1/250 [00:00<00:44,  5.53it/s, accuracy=0.781, cost=0.321]

epoch: 1, pass acc: 0.771366, current acc: 0.794726
time taken: 49.83419370651245
epoch: 1, training loss: 0.473966, training acc: 0.779395, valid loss: 0.447352, valid acc: 0.794726



train minibatch loop: 100%|██████████| 250/250 [00:45<00:00,  5.53it/s, accuracy=1, cost=0.0676]   
test minibatch loop: 100%|██████████| 63/63 [00:04<00:00, 15.63it/s, accuracy=0.778, cost=0.439]
train minibatch loop:   0%|          | 1/250 [00:00<00:45,  5.46it/s, accuracy=0.812, cost=0.338]

epoch: 2, pass acc: 0.794726, current acc: 0.807772
time taken: 49.85848927497864
epoch: 2, training loss: 0.456373, training acc: 0.799975, valid loss: 0.449408, valid acc: 0.807772



train minibatch loop: 100%|██████████| 250/250 [00:45<00:00,  5.58it/s, accuracy=1, cost=0.0436]   
test minibatch loop: 100%|██████████| 63/63 [00:04<00:00, 15.60it/s, accuracy=0.889, cost=0.339]
train minibatch loop:   0%|          | 1/250 [00:00<00:45,  5.48it/s, accuracy=0.875, cost=0.335]

epoch: 3, pass acc: 0.807772, current acc: 0.837152
time taken: 49.91290307044983
epoch: 3, training loss: 0.379133, training acc: 0.841260, valid loss: 0.407142, valid acc: 0.837152



train minibatch loop: 100%|██████████| 250/250 [00:45<00:00,  5.59it/s, accuracy=1, cost=0.0145]   
test minibatch loop: 100%|██████████| 63/63 [00:03<00:00, 15.76it/s, accuracy=0.889, cost=0.286]
train minibatch loop:   0%|          | 1/250 [00:00<00:45,  5.47it/s, accuracy=0.844, cost=0.292]

epoch: 4, pass acc: 0.837152, current acc: 0.845682
time taken: 49.81059193611145
epoch: 4, training loss: 0.337318, training acc: 0.861338, valid loss: 0.388685, valid acc: 0.845682



train minibatch loop: 100%|██████████| 250/250 [00:45<00:00,  5.58it/s, accuracy=1, cost=0.00753]  
test minibatch loop: 100%|██████████| 63/63 [00:04<00:00, 15.69it/s, accuracy=0.889, cost=0.304]
train minibatch loop:   0%|          | 1/250 [00:00<00:45,  5.50it/s, accuracy=0.906, cost=0.243]

epoch: 5, pass acc: 0.845682, current acc: 0.853710
time taken: 49.75829839706421
epoch: 5, training loss: 0.300530, training acc: 0.877902, valid loss: 0.378218, valid acc: 0.853710



train minibatch loop: 100%|██████████| 250/250 [00:45<00:00,  5.58it/s, accuracy=1, cost=0.0065]    
test minibatch loop: 100%|██████████| 63/63 [00:04<00:00, 15.70it/s, accuracy=0.889, cost=0.255]
train minibatch loop:   0%|          | 1/250 [00:00<00:48,  5.15it/s, accuracy=0.844, cost=0.271]

time taken: 49.786596059799194
epoch: 6, training loss: 0.267379, training acc: 0.896223, valid loss: 0.390281, valid acc: 0.850700



train minibatch loop: 100%|██████████| 250/250 [00:45<00:00,  5.61it/s, accuracy=1, cost=0.00228]   
test minibatch loop: 100%|██████████| 63/63 [00:04<00:00, 15.74it/s, accuracy=0.889, cost=0.198]
train minibatch loop:   0%|          | 1/250 [00:00<00:45,  5.51it/s, accuracy=0.906, cost=0.203]

epoch: 7, pass acc: 0.853710, current acc: 0.858226
time taken: 49.74743151664734
epoch: 7, training loss: 0.238398, training acc: 0.910026, valid loss: 0.387246, valid acc: 0.858226



train minibatch loop: 100%|██████████| 250/250 [00:45<00:00,  5.59it/s, accuracy=1, cost=0.00124]   
test minibatch loop: 100%|██████████| 63/63 [00:03<00:00, 15.77it/s, accuracy=1, cost=0.0705]   
train minibatch loop:   0%|          | 1/250 [00:00<00:45,  5.51it/s, accuracy=0.969, cost=0.215]

epoch: 8, pass acc: 0.858226, current acc: 0.864526
time taken: 49.72689700126648
epoch: 8, training loss: 0.207464, training acc: 0.927594, valid loss: 0.399662, valid acc: 0.864526



train minibatch loop: 100%|██████████| 250/250 [00:45<00:00,  5.57it/s, accuracy=1, cost=0.000709]  
test minibatch loop: 100%|██████████| 63/63 [00:04<00:00, 15.75it/s, accuracy=1, cost=0.0928]   
train minibatch loop:   0%|          | 1/250 [00:00<00:45,  5.49it/s, accuracy=0.969, cost=0.181]

epoch: 9, pass acc: 0.864526, current acc: 0.869042
time taken: 49.721059799194336
epoch: 9, training loss: 0.176001, training acc: 0.939390, valid loss: 0.407298, valid acc: 0.869042



train minibatch loop: 100%|██████████| 250/250 [00:45<00:00,  5.60it/s, accuracy=1, cost=0.00153]   
test minibatch loop: 100%|██████████| 63/63 [00:04<00:00, 15.75it/s, accuracy=1, cost=0.0661]   
train minibatch loop:   0%|          | 1/250 [00:00<00:45,  5.51it/s, accuracy=0.938, cost=0.154]

time taken: 49.648770332336426
epoch: 10, training loss: 0.154012, training acc: 0.948174, valid loss: 0.448675, valid acc: 0.859508



train minibatch loop: 100%|██████████| 250/250 [00:45<00:00,  5.62it/s, accuracy=1, cost=0.00439]   
test minibatch loop: 100%|██████████| 63/63 [00:03<00:00, 15.77it/s, accuracy=1, cost=0.0517]    
train minibatch loop:   0%|          | 1/250 [00:00<00:45,  5.53it/s, accuracy=0.969, cost=0.145]

time taken: 49.61682486534119
epoch: 11, training loss: 0.129695, training acc: 0.959970, valid loss: 0.460306, valid acc: 0.855996



train minibatch loop: 100%|██████████| 250/250 [00:45<00:00,  5.61it/s, accuracy=1, cost=0.00326]   
test minibatch loop: 100%|██████████| 63/63 [00:03<00:00, 15.78it/s, accuracy=1, cost=0.0602]   

time taken: 49.617290019989014
epoch: 12, training loss: 0.109519, training acc: 0.964613, valid loss: 0.444919, valid acc: 0.849975

break epoch:13






In [14]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    predict_Y += np.argmax(
        sess.run(
            model.logits, feed_dict = {model.X: batch_x, model.Y: batch_y}
        ),
        1,
    ).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 63/63 [00:03<00:00, 16.07it/s]


In [15]:
print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['negative', 'positive']
    )
)

              precision    recall  f1-score   support

    negative       0.82      0.89      0.85       989
    positive       0.88      0.81      0.84      1004

   micro avg       0.85      0.85      0.85      1993
   macro avg       0.85      0.85      0.85      1993
weighted avg       0.85      0.85      0.85      1993



In [16]:
text = classification_textcleaning('kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya')
new_vector = str_idx([text[0]], dictionary, len(text[0].split()))
sess.run(tf.nn.softmax(model.logits), feed_dict={model.X:new_vector})

array([[0.29364473, 0.7063553 ]], dtype=float32)

In [17]:
text = classification_textcleaning('kerajaan sebenarnya sangat sayangkan rakyatnya')
new_vector = str_idx([text[0]], dictionary, len(text[0].split()))
sess.run(tf.nn.softmax(model.logits), feed_dict={model.X:new_vector})

array([[0.5253138, 0.4746862]], dtype=float32)

In [18]:
import json
with open('bidirectional-subjective.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))

In [19]:
saver.save(sess, 'bidirectional/model.ckpt')

'bidirectional/model.ckpt'

In [20]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [21]:
freeze_graph('bidirectional', strings)

INFO:tensorflow:Restoring parameters from bidirectional/model.ckpt
INFO:tensorflow:Froze 11 variables.
INFO:tensorflow:Converted 11 variables to const ops.
656 ops in the final graph.


In [22]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [23]:
g = load_graph('bidirectional/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
result = test_sess.run(tf.nn.softmax(logits), feed_dict = {x: new_vector})

