In [1]:
import pandas as pd
import tensorflow as tf
import re
import numpy as np
from sklearn.utils import shuffle
from utils import *
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from unidecode import unidecode
from tqdm import tqdm
import pickle



In [2]:
df = pd.read_csv('sentiment-news-bahasa-v5.csv')
Y = LabelEncoder().fit_transform(df.label)
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [3]:
def textcleaning(string):
    string = re.sub('http\S+|www.\S+', '',' '.join([i for i in string.split() if i.find('#')<0 and i.find('@')<0]))
    string = unidecode(string).replace('.', '. ').replace(',', ', ')
    string = re.sub('[^\'\"A-Za-z\- ]+', ' ', string)
    return ' '.join([i for i in re.findall("[\\w']+|[;:\-\(\)&.,!?\"]", string) if len(i)>1]).lower()

In [4]:
for i in range(df.shape[0]):
    df.iloc[i,1] = textcleaning(df.iloc[i,1])

In [5]:
with open('polarity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('polarity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts
texts += df.iloc[:,1].tolist()
labels += Y.tolist()

assert len(labels) == len(texts)

In [6]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 18957
Most common words [('yang', 14891), ('dan', 8177), ('tidak', 4578), ('untuk', 4023), ('dengan', 3349), ('filem', 3279)]
Sample data [1631, 204, 5, 161, 218, 106, 303, 4, 78, 202] ['ringkas', 'bodoh', 'dan', 'membosankan', 'kanak-kanak', 'lelaki', 'remaja', 'yang', 'begitu', 'muda']


In [7]:
def str_idx(corpus, dic, maxlen, UNK=3):
    X = np.zeros((len(corpus),maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            try:
                X[i,-1 - no]=dic[k]
            except Exception as e:
                X[i,-1 - no]=UNK
    return X

In [8]:
def attention(inputs, attention_size):
    hidden_size = inputs.shape[2].value
    w_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    with tf.name_scope('v'):
        v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')
    alphas = tf.nn.softmax(vu, name='alphas')
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
    return output, alphas

class Model:
    def __init__(self, size_layer, num_layers, dropout, dimension_output, learning_rate, maxlen,
                dict_size):
        def cells(size, reuse=False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(size,initializer=tf.orthogonal_initializer(),reuse=reuse),
                state_keep_prob=dropout,
                output_keep_prob=dropout)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, size_layer], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(size_layer),
                cell_bw = cells(size_layer),
                inputs = encoder_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d'%(n))
            encoder_embedded = tf.concat((out_fw, out_bw), 2)
        self.outputs, self.attention = attention(encoder_embedded,maxlen)
        W = tf.get_variable('w',shape=(size_layer*2, 2),initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b',shape=(2),initializer=tf.zeros_initializer())
        self.logits = tf.add(tf.matmul(self.outputs, W),b,name='logits')
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.logits, 
                                                                           labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        self.accuracy = tf.reduce_mean(tf.cast(tf.nn.in_top_k(self.logits, self.Y, 1), tf.float32))

In [9]:
size_layer = 256
num_layers = 2
dropout = 0.8
dimension_output = 2
learning_rate = 1e-4
batch_size = 32
maxlen = 100

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,dropout,dimension_output,learning_rate,maxlen,len(dictionary))
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())
saver.save(sess, "hierarchical/model.ckpt")

'hierarchical/model.ckpt'

In [10]:
vectors = str_idx(texts, dictionary, maxlen)

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, 
                                                    labels,
                                                    test_size = 0.2)

In [12]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(range(0, len(train_X), batch_size), desc='train minibatch loop')
    for i in pbar:
        batch_x = train_X[i:min(i+batch_size,train_X.shape[0])]
        batch_y = train_Y[i:min(i+batch_size,train_X.shape[0])]
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x, model.Y : batch_y})
        assert not np.isnan(loss)
        train_loss += loss
        train_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for i in pbar:
        batch_x = test_X[i:min(i+batch_size,test_X.shape[0])]
        batch_y = test_Y[i:min(i+batch_size,test_X.shape[0])]
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x, model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    train_loss /= (len(train_X) / batch_size)
    train_acc /= (len(train_X) / batch_size)
    test_loss /= (len(test_X) / batch_size)
    test_acc /= (len(test_X) / batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1
    saver.save(sess, "hierarchical/model.ckpt")

train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.22it/s, accuracy=0.806, cost=0.572]
test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.70it/s, accuracy=0.625, cost=0.613]


epoch: 0, pass acc: 0.000000, current acc: 0.623950
time taken: 92.23545956611633
epoch: 0, training loss: 0.673939, training acc: 0.574175, valid loss: 0.655924, valid acc: 0.623950



train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.23it/s, accuracy=0.839, cost=0.486]
test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 12.00it/s, accuracy=0.75, cost=0.584] 


epoch: 1, pass acc: 0.623950, current acc: 0.661064
time taken: 91.80484056472778
epoch: 1, training loss: 0.616633, training acc: 0.657256, valid loss: 0.627682, valid acc: 0.661064



train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.24it/s, accuracy=0.806, cost=0.44] 
test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.85it/s, accuracy=0.875, cost=0.616]


epoch: 2, pass acc: 0.661064, current acc: 0.684524
time taken: 91.81486558914185
epoch: 2, training loss: 0.564194, training acc: 0.705840, valid loss: 0.621890, valid acc: 0.684524



train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.23it/s, accuracy=0.839, cost=0.375]
test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.99it/s, accuracy=0.75, cost=0.665] 


epoch: 3, pass acc: 0.684524, current acc: 0.689426
time taken: 91.83554458618164
epoch: 3, training loss: 0.513168, training acc: 0.745062, valid loss: 0.627222, valid acc: 0.689426



train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.21it/s, accuracy=0.903, cost=0.293]
test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.84it/s, accuracy=0.75, cost=0.741] 


epoch: 4, pass acc: 0.689426, current acc: 0.690476
time taken: 92.32510042190552
epoch: 4, training loss: 0.458082, training acc: 0.783936, valid loss: 0.649062, valid acc: 0.690476



train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.20it/s, accuracy=0.871, cost=0.274]
test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.91it/s, accuracy=0.5, cost=0.904]  


epoch: 5, pass acc: 0.690476, current acc: 0.692927
time taken: 92.5005795955658
epoch: 5, training loss: 0.403465, training acc: 0.818688, valid loss: 0.687502, valid acc: 0.692927



train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.21it/s, accuracy=0.935, cost=0.185]
test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.73it/s, accuracy=0.5, cost=1.11]   


epoch: 6, pass acc: 0.692927, current acc: 0.696779
time taken: 92.46164631843567
epoch: 6, training loss: 0.350288, training acc: 0.845744, valid loss: 0.746539, valid acc: 0.696779



train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.23it/s, accuracy=0.968, cost=0.166]
test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.84it/s, accuracy=0.5, cost=1.35]   


epoch: 7, pass acc: 0.696779, current acc: 0.700980
time taken: 92.025390625
epoch: 7, training loss: 0.300940, training acc: 0.872798, valid loss: 0.799389, valid acc: 0.700980



train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.24it/s, accuracy=0.935, cost=0.158] 
test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.77it/s, accuracy=0.375, cost=1.85] 


time taken: 91.94457387924194
epoch: 8, training loss: 0.261112, training acc: 0.889165, valid loss: 0.833415, valid acc: 0.693978



train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.22it/s, accuracy=0.935, cost=0.139] 
test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.71it/s, accuracy=0.375, cost=1.67] 


time taken: 92.27956342697144
epoch: 9, training loss: 0.232353, training acc: 0.903960, valid loss: 0.856125, valid acc: 0.688725



train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.22it/s, accuracy=0.968, cost=0.119] 
test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.82it/s, accuracy=0.5, cost=1.58]   


time taken: 92.19576978683472
epoch: 10, training loss: 0.194886, training acc: 0.920508, valid loss: 0.904724, valid acc: 0.695028



train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.24it/s, accuracy=0.935, cost=0.115] 
test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.85it/s, accuracy=0.375, cost=2.56] 


time taken: 91.8014886379242
epoch: 11, training loss: 0.160236, training acc: 0.938189, valid loss: 1.080001, valid acc: 0.689426



train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.22it/s, accuracy=0.903, cost=0.106] 
test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.87it/s, accuracy=0.375, cost=2.3]  


time taken: 92.0934100151062
epoch: 12, training loss: 0.141676, training acc: 0.944489, valid loss: 1.072974, valid acc: 0.697479

break epoch:13



In [13]:
real_Y, predict_Y = [], []

pbar = tqdm(range(0, len(test_X), batch_size), desc='validation minibatch loop')
for i in pbar:
    batch_x = test_X[i:min(i+batch_size,test_X.shape[0])]
    batch_y = test_Y[i:min(i+batch_size,test_X.shape[0])]
    predict_Y += np.argmax(sess.run(model.logits, feed_dict = {model.X : batch_x, model.Y : batch_y}),1).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.92it/s]


In [14]:
from sklearn import metrics
print(metrics.classification_report(real_Y, predict_Y, target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.65      0.69      0.67      1289
   positive       0.73      0.70      0.71      1567

avg / total       0.70      0.69      0.69      2856



In [15]:
strings=','.join([n.name for n in tf.get_default_graph().as_graph_def().node if "Variable" in n.op or n.name.find('Placeholder') >= 0 or n.name.find('logits') == 0 or n.name.find('alphas') == 0])

In [16]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            "directory: %s" % model_dir)

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path
    
    absolute_model_dir = "/".join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + "/frozen_model.pb"
    clear_devices = True
    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(",")
        ) 
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph." % len(output_graph_def.node))

In [17]:
freeze_graph("hierarchical", strings)

INFO:tensorflow:Restoring parameters from hierarchical/model.ckpt
INFO:tensorflow:Froze 44 variables.
Converted 44 variables to const ops.
793 ops in the final graph.


In [18]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [19]:
g=load_graph('hierarchical/frozen_model.pb')

In [20]:
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
alphas = g.get_tensor_by_name('import/alphas:0')
test_sess = tf.InteractiveSession(graph=g)
test_sess.run([logits,alphas], feed_dict={x:vectors[:1]})[1].shape

(1, 100)

In [21]:
text = 'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})

[array([[0.25555208, 0.7444479 ]], dtype=float32),
 array([[0.00209559, 0.00378773, 0.01576839, 0.02166901, 0.0758793 ,
         0.15169376, 0.29709268, 0.29184714, 0.14016648]], dtype=float32)]

In [22]:
text = 'saya sangat sayangkan kerajaan saya'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})

[array([[0.16736251, 0.8326375 ]], dtype=float32),
 array([[0.0156941 , 0.06104115, 0.11414091, 0.18187664, 0.62724715]],
       dtype=float32)]

In [23]:
text = 'bodoh lah awak ni'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})

[array([[0.5802783 , 0.41972172]], dtype=float32),
 array([[0.16799149, 0.28151396, 0.2692253 , 0.2812692 ]], dtype=float32)]

In [24]:
text = 'kerajaan sebenarnya sangat baik'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})

[array([[0.3700046, 0.6299954]], dtype=float32),
 array([[0.16361861, 0.15003377, 0.34092915, 0.34541845]], dtype=float32)]

In [25]:
import json
with open('hierarchical-sentiment.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))