In [1]:
import pandas as pd
import tensorflow as tf
import re
import numpy as np
from sklearn.utils import shuffle
from utils import *
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from unidecode import unidecode
from tqdm import tqdm
import pickle



In [2]:
df = pd.read_csv('sentiment-news-bahasa-v5.csv')
Y = LabelEncoder().fit_transform(df.label)
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [3]:
def textcleaning(string):
    string = re.sub('http\S+|www.\S+', '',' '.join([i for i in string.split() if i.find('#')<0 and i.find('@')<0]))
    string = unidecode(string).replace('.', '. ').replace(',', ', ')
    string = re.sub('[^\'\"A-Za-z\- ]+', ' ', string)
    return ' '.join([i for i in re.findall("[\\w']+|[;:\-\(\)&.,!?\"]", string) if len(i)>1]).lower()

In [4]:
for i in range(df.shape[0]):
    df.iloc[i,1] = textcleaning(df.iloc[i,1])

In [5]:
with open('polarity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('polarity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts
texts += df.iloc[:,1].tolist()
labels += Y.tolist()

assert len(labels) == len(texts)

In [6]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 18957
Most common words [('yang', 14891), ('dan', 8177), ('tidak', 4578), ('untuk', 4023), ('dengan', 3349), ('filem', 3279)]
Sample data [1643, 206, 5, 160, 217, 106, 306, 4, 79, 202] ['ringkas', 'bodoh', 'dan', 'membosankan', 'kanak-kanak', 'lelaki', 'remaja', 'yang', 'begitu', 'muda']


In [7]:
def str_idx(corpus, dic, maxlen, UNK=3):
    X = np.zeros((len(corpus),maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            try:
                X[i,-1 - no]=dic[k]
            except Exception as e:
                X[i,-1 - no]=UNK
    return X

In [8]:
class Model:
    def __init__(self, size_layer, num_layers, dimension_output, learning_rate, dropout,
                dict_size):
        def cells(size, reuse=False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(
                    size,
                    initializer=tf.orthogonal_initializer(),
                    reuse=reuse),
                state_keep_prob=dropout,
                output_keep_prob=dropout)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, size_layer], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units = size_layer, 
            memory = encoder_embedded)
        rnn_cells = tf.contrib.seq2seq.AttentionWrapper(
            cell = tf.nn.rnn_cell.MultiRNNCell([cells(size_layer) for _ in range(num_layers)]), 
            attention_mechanism = attention_mechanism,
            attention_layer_size = size_layer,
            alignment_history=True)
        outputs, last_state = tf.nn.dynamic_rnn(rnn_cells, encoder_embedded, dtype = tf.float32)
        self.alignments = tf.transpose(last_state.alignment_history.stack(),[1,2,0])
        W = tf.get_variable('w',shape=(size_layer, dimension_output),
                            initializer=tf.glorot_uniform_initializer())
        b = tf.get_variable('b',shape=(dimension_output),
                            initializer=tf.zeros_initializer())
        self.logits = tf.add(tf.matmul(outputs[:,-1], W),b,name='logits')
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1,output_type=tf.int32), self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        self.attention = tf.nn.softmax(tf.reduce_sum(self.alignments[0],1),name='alphas')

In [9]:
size_layer = 256
num_layers = 2
dropout = 0.8
dimension_output = 2
learning_rate = 1e-4
batch_size = 32
maxlen = 100

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,dimension_output,learning_rate,dropout,len(dictionary))
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())
saver.save(sess, "bahdanau/model.ckpt")

'bahdanau/model.ckpt'

In [10]:
vectors = str_idx(texts, dictionary, maxlen)

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, 
                                                    labels,
                                                    test_size = 0.2)

In [12]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(range(0, len(train_X), batch_size), desc='train minibatch loop')
    for i in pbar:
        batch_x = train_X[i:min(i+batch_size,train_X.shape[0])]
        batch_y = train_Y[i:min(i+batch_size,train_X.shape[0])]
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x, model.Y : batch_y})
        assert not np.isnan(loss)
        train_loss += loss
        train_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for i in pbar:
        batch_x = test_X[i:min(i+batch_size,test_X.shape[0])]
        batch_y = test_Y[i:min(i+batch_size,test_X.shape[0])]
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x, model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    train_loss /= (len(train_X) / batch_size)
    train_acc /= (len(train_X) / batch_size)
    test_loss /= (len(test_X) / batch_size)
    test_acc /= (len(test_X) / batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1
    saver.save(sess, "bahdanau/model.ckpt")

train minibatch loop: 100%|██████████| 357/357 [01:48<00:00,  3.28it/s, accuracy=0.742, cost=0.501]
test minibatch loop: 100%|██████████| 90/90 [00:09<00:00,  9.38it/s, accuracy=0.5, cost=0.792]  


epoch: 0, pass acc: 0.000000, current acc: 0.653361
time taken: 118.52346515655518
epoch: 0, training loss: 0.654096, training acc: 0.608574, valid loss: 0.639861, valid acc: 0.653361



train minibatch loop: 100%|██████████| 357/357 [01:44<00:00,  3.43it/s, accuracy=0.806, cost=0.447]
test minibatch loop: 100%|██████████| 90/90 [00:08<00:00, 11.01it/s, accuracy=0.375, cost=0.812]


epoch: 1, pass acc: 0.653361, current acc: 0.661415
time taken: 112.26846075057983
epoch: 1, training loss: 0.599809, training acc: 0.670648, valid loss: 0.625623, valid acc: 0.661415



train minibatch loop: 100%|██████████| 357/357 [01:43<00:00,  3.46it/s, accuracy=0.806, cost=0.418]
test minibatch loop: 100%|██████████| 90/90 [00:08<00:00, 11.08it/s, accuracy=0.375, cost=0.809]


epoch: 2, pass acc: 0.661415, current acc: 0.664916
time taken: 111.42859125137329
epoch: 2, training loss: 0.556458, training acc: 0.708028, valid loss: 0.627689, valid acc: 0.664916



train minibatch loop: 100%|██████████| 357/357 [01:44<00:00,  3.43it/s, accuracy=0.839, cost=0.357]
test minibatch loop: 100%|██████████| 90/90 [00:08<00:00, 11.08it/s, accuracy=0.5, cost=0.909]  


epoch: 3, pass acc: 0.664916, current acc: 0.672969
time taken: 112.3364839553833
epoch: 3, training loss: 0.505106, training acc: 0.748738, valid loss: 0.645605, valid acc: 0.672969



train minibatch loop: 100%|██████████| 357/357 [01:44<00:00,  3.41it/s, accuracy=0.935, cost=0.288]
test minibatch loop: 100%|██████████| 90/90 [00:08<00:00, 10.55it/s, accuracy=0.625, cost=0.954]


time taken: 113.08643627166748
epoch: 4, training loss: 0.444339, training acc: 0.787003, valid loss: 0.708635, valid acc: 0.669118



train minibatch loop: 100%|██████████| 357/357 [01:43<00:00,  3.43it/s, accuracy=0.871, cost=0.212]
test minibatch loop: 100%|██████████| 90/90 [00:08<00:00, 11.09it/s, accuracy=0.625, cost=1.21] 


time taken: 112.11891460418701
epoch: 5, training loss: 0.370059, training acc: 0.831469, valid loss: 0.858506, valid acc: 0.659314



train minibatch loop: 100%|██████████| 357/357 [01:45<00:00,  3.40it/s, accuracy=0.968, cost=0.133] 
test minibatch loop: 100%|██████████| 90/90 [00:08<00:00, 11.03it/s, accuracy=0.75, cost=1.52]  


time taken: 113.19700813293457
epoch: 6, training loss: 0.290849, training acc: 0.869734, valid loss: 1.132611, valid acc: 0.666317



train minibatch loop: 100%|██████████| 357/357 [01:45<00:00,  3.39it/s, accuracy=0.935, cost=0.122] 
test minibatch loop: 100%|██████████| 90/90 [00:08<00:00, 10.60it/s, accuracy=0.75, cost=1.49]  
train minibatch loop:   0%|          | 0/357 [00:00<?, ?it/s]

time taken: 113.71206593513489
epoch: 7, training loss: 0.224863, training acc: 0.907549, valid loss: 1.208689, valid acc: 0.667367



train minibatch loop: 100%|██████████| 357/357 [01:43<00:00,  3.44it/s, accuracy=0.935, cost=0.105] 
test minibatch loop: 100%|██████████| 90/90 [00:09<00:00,  9.55it/s, accuracy=0.75, cost=1.76]  


time taken: 113.27922558784485
epoch: 8, training loss: 0.172068, training acc: 0.933637, valid loss: 1.287527, valid acc: 0.663866

break epoch:9



In [13]:
real_Y, predict_Y = [], []

pbar = tqdm(range(0, len(test_X), batch_size), desc='validation minibatch loop')
for i in pbar:
    batch_x = test_X[i:min(i+batch_size,test_X.shape[0])]
    batch_y = test_Y[i:min(i+batch_size,test_X.shape[0])]
    predict_Y += np.argmax(sess.run(model.logits, feed_dict = {model.X : batch_x, model.Y : batch_y}),1).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 90/90 [00:08<00:00, 11.10it/s]


In [14]:
from sklearn import metrics
print(metrics.classification_report(real_Y, predict_Y, target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.63      0.63      0.63      1320
   positive       0.68      0.68      0.68      1536

avg / total       0.66      0.66      0.66      2856



In [15]:
strings=','.join([n.name for n in tf.get_default_graph().as_graph_def().node if "Variable" in n.op or n.name.find('Placeholder') >= 0 or n.name.find('logits') == 0 or n.name.find('alphas') == 0])

In [16]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            "directory: %s" % model_dir)

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path
    
    absolute_model_dir = "/".join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + "/frozen_model.pb"
    clear_devices = True
    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(",")
        ) 
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph." % len(output_graph_def.node))

In [17]:
freeze_graph("bahdanau", strings)

INFO:tensorflow:Restoring parameters from bahdanau/model.ckpt
INFO:tensorflow:Froze 35 variables.
Converted 35 variables to const ops.
413 ops in the final graph.


In [18]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [19]:
g=load_graph('bahdanau/frozen_model.pb')

In [20]:
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
alphas = g.get_tensor_by_name('import/alphas:0')
test_sess = tf.InteractiveSession(graph=g)
test_sess.run([logits,alphas], feed_dict={x:vectors[:1]})[1].shape

(100,)

In [21]:
text = 'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})

[array([[0.9984276 , 0.00157239]], dtype=float32),
 array([0.04632575, 0.0188179 , 0.01601248, 0.01524496, 0.01524496,
        0.04298494, 0.01821836, 0.01842043, 0.80873024], dtype=float32)]

In [22]:
text = 'saya sangat sayangkan kerajaan saya'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})

[array([[0.549336  , 0.45066395]], dtype=float32),
 array([0.14614522, 0.097147  , 0.15663794, 0.45392463, 0.14614522],
       dtype=float32)]

In [23]:
text = 'bodoh lah awak ni'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})

[array([[0.93802065, 0.06197929]], dtype=float32),
 array([0.5580012 , 0.16381814, 0.11481879, 0.16336185], dtype=float32)]

In [24]:
text = 'kerajaan sebenarnya sangat baik'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})

[array([[0.1507497 , 0.84925026]], dtype=float32),
 array([0.33014774, 0.12414087, 0.10522812, 0.4404832 ], dtype=float32)]

In [25]:
import json
with open('bahdanau-sentiment.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))