In [1]:
import pandas as pd
import tensorflow as tf
import re
import numpy as np
from sklearn.utils import shuffle
from utils import *
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from unidecode import unidecode
from tqdm import tqdm
import pickle



In [2]:
df = pd.read_csv('sentiment-news-bahasa-v5.csv')
Y = LabelEncoder().fit_transform(df.label)
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [3]:
def textcleaning(string):
    string = re.sub('http\S+|www.\S+', '',' '.join([i for i in string.split() if i.find('#')<0 and i.find('@')<0]))
    string = unidecode(string).replace('.', '. ').replace(',', ', ')
    string = re.sub('[^\'\"A-Za-z\- ]+', ' ', string)
    return ' '.join([i for i in re.findall("[\\w']+|[;:\-\(\)&.,!?\"]", string) if len(i)>1]).lower()

In [4]:
for i in range(df.shape[0]):
    df.iloc[i,1] = textcleaning(df.iloc[i,1])

In [5]:
with open('polarity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('polarity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts
texts += df.iloc[:,1].tolist()
labels += Y.tolist()

assert len(labels) == len(texts)

In [6]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 18957
Most common words [('yang', 14891), ('dan', 8177), ('tidak', 4578), ('untuk', 4023), ('dengan', 3349), ('filem', 3279)]
Sample data [1627, 204, 5, 161, 218, 106, 300, 4, 78, 202] ['ringkas', 'bodoh', 'dan', 'membosankan', 'kanak-kanak', 'lelaki', 'remaja', 'yang', 'begitu', 'muda']


In [7]:
def str_idx(corpus, dic, maxlen, UNK=3):
    X = np.zeros((len(corpus),maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            try:
                X[i,-1 - no]=dic[k]
            except Exception as e:
                X[i,-1 - no]=UNK
    return X

In [8]:
class Model:
    def __init__(self, size_layer, num_layers, dropout, dimension_output, learning_rate, dict_size):
        def cells(size, reuse=False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(size,initializer=tf.orthogonal_initializer(),reuse=reuse),
                output_keep_prob=dropout)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, size_layer], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(size_layer),
                cell_bw = cells(size_layer),
                inputs = encoder_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d'%(n))
            encoder_embedded = tf.concat((out_fw, out_bw), 2)
            
        W = tf.get_variable('w',shape=(size_layer*2, 2),initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b',shape=(2),initializer=tf.zeros_initializer())
        self.logits = tf.add(tf.matmul(tf.reduce_mean(encoder_embedded,1), W),b,name='logits')
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.logits, 
                                                                           labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        self.accuracy = tf.reduce_mean(tf.cast(tf.nn.in_top_k(self.logits, self.Y, 1), tf.float32))

In [9]:
size_layer = 256
num_layers = 2
dropout = 0.8
dimension_output = 2
learning_rate = 1e-4
batch_size = 32
maxlen = 100

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,dropout,dimension_output,learning_rate,len(dictionary))
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())
saver.save(sess, "bidirectional/model.ckpt")

'bidirectional/model.ckpt'

In [10]:
vectors = str_idx(texts, dictionary, maxlen)

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, 
                                                    labels,
                                                    test_size = 0.2)

In [12]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(range(0, len(train_X), batch_size), desc='train minibatch loop')
    for i in pbar:
        batch_x = train_X[i:min(i+batch_size,train_X.shape[0])]
        batch_y = train_Y[i:min(i+batch_size,train_X.shape[0])]
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x, model.Y : batch_y})
        assert not np.isnan(loss)
        train_loss += loss
        train_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for i in pbar:
        batch_x = test_X[i:min(i+batch_size,test_X.shape[0])]
        batch_y = test_Y[i:min(i+batch_size,test_X.shape[0])]
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x, model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    train_loss /= (len(train_X) / batch_size)
    train_acc /= (len(train_X) / batch_size)
    test_loss /= (len(test_X) / batch_size)
    test_acc /= (len(test_X) / batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1
    saver.save(sess, "bidirectional/model.ckpt")

train minibatch loop: 100%|██████████| 357/357 [01:20<00:00,  4.43it/s, accuracy=0.452, cost=0.696]
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 13.05it/s, accuracy=0.5, cost=0.704]  


epoch: 0, pass acc: 0.000000, current acc: 0.535014
time taken: 87.52816390991211
epoch: 0, training loss: 0.692963, training acc: 0.530636, valid loss: 0.691246, valid acc: 0.535014



train minibatch loop: 100%|██████████| 357/357 [01:16<00:00,  4.69it/s, accuracy=0.516, cost=0.716]
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 13.39it/s, accuracy=0.25, cost=0.785] 


epoch: 1, pass acc: 0.535014, current acc: 0.641106
time taken: 82.79175686836243
epoch: 1, training loss: 0.662950, training acc: 0.603915, valid loss: 0.649223, valid acc: 0.641106



train minibatch loop: 100%|██████████| 357/357 [01:16<00:00,  4.69it/s, accuracy=0.613, cost=0.625]
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 13.22it/s, accuracy=0.625, cost=0.587]


epoch: 2, pass acc: 0.641106, current acc: 0.673319
time taken: 82.89222002029419
epoch: 2, training loss: 0.608860, training acc: 0.678334, valid loss: 0.615162, valid acc: 0.673319



train minibatch loop: 100%|██████████| 357/357 [01:16<00:00,  4.66it/s, accuracy=0.774, cost=0.535]
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 13.33it/s, accuracy=0.75, cost=0.529] 


epoch: 3, pass acc: 0.673319, current acc: 0.683473
time taken: 83.28404688835144
epoch: 3, training loss: 0.552815, training acc: 0.730611, valid loss: 0.618420, valid acc: 0.683473



train minibatch loop: 100%|██████████| 357/357 [01:16<00:00,  4.69it/s, accuracy=0.839, cost=0.44] 
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 13.43it/s, accuracy=0.875, cost=0.42] 


epoch: 4, pass acc: 0.683473, current acc: 0.697829
time taken: 82.84823060035706
epoch: 4, training loss: 0.497861, training acc: 0.773601, valid loss: 0.629559, valid acc: 0.697829



train minibatch loop: 100%|██████████| 357/357 [01:16<00:00,  4.68it/s, accuracy=0.839, cost=0.406]
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 13.44it/s, accuracy=0.75, cost=0.685] 


time taken: 82.98535513877869
epoch: 5, training loss: 0.451428, training acc: 0.803277, valid loss: 0.734149, valid acc: 0.685574



train minibatch loop: 100%|██████████| 357/357 [01:16<00:00,  4.67it/s, accuracy=0.871, cost=0.313]
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 13.51it/s, accuracy=0.75, cost=0.554] 


time taken: 83.15081810951233
epoch: 6, training loss: 0.408890, training acc: 0.825779, valid loss: 0.738041, valid acc: 0.697479



train minibatch loop: 100%|██████████| 357/357 [01:16<00:00,  4.66it/s, accuracy=0.903, cost=0.283]
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 13.29it/s, accuracy=0.75, cost=0.431] 


epoch: 7, pass acc: 0.697829, current acc: 0.702731
time taken: 83.31504368782043
epoch: 7, training loss: 0.362210, training acc: 0.848893, valid loss: 0.751275, valid acc: 0.702731



train minibatch loop: 100%|██████████| 357/357 [01:16<00:00,  4.67it/s, accuracy=0.903, cost=0.234]
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 13.47it/s, accuracy=0.75, cost=0.552] 


time taken: 83.05671238899231
epoch: 8, training loss: 0.327703, training acc: 0.865789, valid loss: 0.784919, valid acc: 0.697479



train minibatch loop: 100%|██████████| 357/357 [01:16<00:00,  4.67it/s, accuracy=0.903, cost=0.224] 
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 13.30it/s, accuracy=0.75, cost=0.677] 


time taken: 83.21823143959045
epoch: 9, training loss: 0.285810, training acc: 0.886011, valid loss: 0.915739, valid acc: 0.669818



train minibatch loop: 100%|██████████| 357/357 [01:16<00:00,  4.64it/s, accuracy=0.935, cost=0.181] 
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 13.37it/s, accuracy=0.875, cost=0.352]


time taken: 83.72120499610901
epoch: 10, training loss: 0.256273, training acc: 0.897482, valid loss: 0.913179, valid acc: 0.695378



train minibatch loop: 100%|██████████| 357/357 [01:17<00:00,  4.61it/s, accuracy=0.903, cost=0.209] 
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 13.31it/s, accuracy=0.875, cost=0.384]


time taken: 84.12972474098206
epoch: 12, training loss: 0.204677, training acc: 0.919015, valid loss: 1.092270, valid acc: 0.697829

break epoch:13



In [13]:
real_Y, predict_Y = [], []

pbar = tqdm(range(0, len(test_X), batch_size), desc='validation minibatch loop')
for i in pbar:
    batch_x = test_X[i:min(i+batch_size,test_X.shape[0])]
    batch_y = test_Y[i:min(i+batch_size,test_X.shape[0])]
    predict_Y += np.argmax(sess.run(model.logits, feed_dict = {model.X : batch_x, model.Y : batch_y}),1).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 13.44it/s]


In [14]:
from sklearn import metrics
print(metrics.classification_report(real_Y, predict_Y, target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.68      0.64      0.66      1350
   positive       0.69      0.74      0.71      1506

avg / total       0.69      0.69      0.69      2856



In [15]:
strings=','.join([n.name for n in tf.get_default_graph().as_graph_def().node if "Variable" in n.op or n.name.find('Placeholder') >= 0 or n.name.find('logits') == 0 or n.name.find('alphas') == 0])

In [16]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            "directory: %s" % model_dir)

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path
    
    absolute_model_dir = "/".join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + "/frozen_model.pb"
    clear_devices = True
    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(",")
        ) 
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph." % len(output_graph_def.node))

In [17]:
freeze_graph("bidirectional", strings)

INFO:tensorflow:Restoring parameters from bidirectional/model.ckpt
INFO:tensorflow:Froze 35 variables.
Converted 35 variables to const ops.
581 ops in the final graph.


In [18]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [19]:
g=load_graph('bidirectional/frozen_model.pb')

In [20]:
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph=g)
test_sess.run(logits, feed_dict={x:vectors[:1]}).shape

(1, 2)

In [21]:
text = 'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run(tf.nn.softmax(logits), feed_dict={x:new_vector})

array([[0.792608  , 0.20739199]], dtype=float32)

In [22]:
text = 'saya sangat sayangkan kerajaan saya'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run(tf.nn.softmax(logits), feed_dict={x:new_vector})

array([[0.35303828, 0.64696175]], dtype=float32)

In [23]:
text = 'bodoh lah awak ni'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run(tf.nn.softmax(logits), feed_dict={x:new_vector})

array([[0.3330668, 0.6669332]], dtype=float32)

In [24]:
text = 'kerajaan sebenarnya sangat baik'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run(tf.nn.softmax(logits), feed_dict={x:new_vector})

array([[0.22721377, 0.7727862 ]], dtype=float32)

In [25]:
import json
with open('bidirectional-sentiment.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))