In [1]:
import pandas as pd
import tensorflow as tf
import re
import numpy as np
from sklearn.utils import shuffle
from utils import *
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from unidecode import unidecode
from tqdm import tqdm
import pickle



In [2]:
df = pd.read_csv('sentiment-news-bahasa-v5.csv')
Y = LabelEncoder().fit_transform(df.label)
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [3]:
def textcleaning(string):
    string = re.sub('http\S+|www.\S+', '',' '.join([i for i in string.split() if i.find('#')<0 and i.find('@')<0]))
    string = unidecode(string).replace('.', '. ').replace(',', ', ')
    string = re.sub('[^\'\"A-Za-z\- ]+', ' ', string)
    return ' '.join([i for i in re.findall("[\\w']+|[;:\-\(\)&.,!?\"]", string) if len(i)>1]).lower()

In [4]:
for i in range(df.shape[0]):
    df.iloc[i,1] = textcleaning(df.iloc[i,1])

In [5]:
with open('polarity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('polarity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts
texts += df.iloc[:,1].tolist()
labels += Y.tolist()

assert len(labels) == len(texts)

In [6]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 18957
Most common words [('yang', 14891), ('dan', 8177), ('tidak', 4578), ('untuk', 4023), ('dengan', 3349), ('filem', 3279)]
Sample data [1609, 205, 5, 160, 218, 106, 304, 4, 79, 202] ['ringkas', 'bodoh', 'dan', 'membosankan', 'kanak-kanak', 'lelaki', 'remaja', 'yang', 'begitu', 'muda']


In [7]:
def str_idx(corpus, dic, maxlen, UNK=3):
    X = np.zeros((len(corpus),maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            try:
                X[i,-1 - no]=dic[k]
            except Exception as e:
                X[i,-1 - no]=UNK
    return X

In [8]:
class Model:
    def __init__(self, size_layer, num_layers, dimension_output, learning_rate, dropout,
                dict_size):
        def cells(size, reuse=False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(
                    size,
                    initializer=tf.orthogonal_initializer(),
                    reuse=reuse),
                state_keep_prob=dropout,
                output_keep_prob=dropout)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, size_layer], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        attention_mechanism = tf.contrib.seq2seq.LuongAttention(
            num_units = size_layer, 
            memory = encoder_embedded)
        rnn_cells = tf.contrib.seq2seq.AttentionWrapper(
            cell = tf.nn.rnn_cell.MultiRNNCell([cells(size_layer) for _ in range(num_layers)]), 
            attention_mechanism = attention_mechanism,
            attention_layer_size = size_layer,
            alignment_history=True)
        outputs, last_state = tf.nn.dynamic_rnn(rnn_cells, encoder_embedded, dtype = tf.float32)
        self.alignments = tf.transpose(last_state.alignment_history.stack(),[1,2,0])
        W = tf.get_variable('w',shape=(size_layer, dimension_output),
                            initializer=tf.glorot_uniform_initializer())
        b = tf.get_variable('b',shape=(dimension_output),
                            initializer=tf.zeros_initializer())
        self.logits = tf.add(tf.matmul(outputs[:,-1], W),b,name='logits')
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1,output_type=tf.int32), self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        self.attention = tf.nn.softmax(tf.reduce_sum(self.alignments[0],1),name='alphas')

In [9]:
size_layer = 256
num_layers = 2
dropout = 0.8
dimension_output = 2
learning_rate = 1e-4
batch_size = 32
maxlen = 100

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,dimension_output,learning_rate,dropout,len(dictionary))
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())
saver.save(sess, "luong/model.ckpt")

'luong/model.ckpt'

In [10]:
vectors = str_idx(texts, dictionary, maxlen)

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, 
                                                    labels,
                                                    test_size = 0.2)

In [12]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(range(0, len(train_X), batch_size), desc='train minibatch loop')
    for i in pbar:
        batch_x = train_X[i:min(i+batch_size,train_X.shape[0])]
        batch_y = train_Y[i:min(i+batch_size,train_X.shape[0])]
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x, model.Y : batch_y})
        assert not np.isnan(loss)
        train_loss += loss
        train_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for i in pbar:
        batch_x = test_X[i:min(i+batch_size,test_X.shape[0])]
        batch_y = test_Y[i:min(i+batch_size,test_X.shape[0])]
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x, model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    train_loss /= (len(train_X) / batch_size)
    train_acc /= (len(train_X) / batch_size)
    test_loss /= (len(test_X) / batch_size)
    test_acc /= (len(test_X) / batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1
    saver.save(sess, "luong/model.ckpt")

train minibatch loop: 100%|██████████| 357/357 [01:13<00:00,  4.85it/s, accuracy=0.613, cost=0.649]
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 14.40it/s, accuracy=0.75, cost=0.608] 
train minibatch loop:   0%|          | 0/357 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.634454
time taken: 79.88421893119812
epoch: 0, training loss: 0.659646, training acc: 0.606987, valid loss: 0.647072, valid acc: 0.634454



train minibatch loop: 100%|██████████| 357/357 [01:13<00:00,  4.85it/s, accuracy=0.677, cost=0.599]
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 14.45it/s, accuracy=0.625, cost=0.613]


epoch: 1, pass acc: 0.634454, current acc: 0.640756
time taken: 79.77123618125916
epoch: 1, training loss: 0.604024, training acc: 0.670986, valid loss: 0.640838, valid acc: 0.640756



train minibatch loop: 100%|██████████| 357/357 [01:13<00:00,  4.84it/s, accuracy=0.581, cost=0.593]
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 14.47it/s, accuracy=0.75, cost=0.633] 


epoch: 2, pass acc: 0.640756, current acc: 0.652311
time taken: 79.92901086807251
epoch: 2, training loss: 0.550536, training acc: 0.720702, valid loss: 0.656587, valid acc: 0.652311



train minibatch loop: 100%|██████████| 357/357 [01:13<00:00,  4.85it/s, accuracy=0.774, cost=0.474]
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 14.59it/s, accuracy=0.875, cost=0.521]
train minibatch loop:   0%|          | 0/357 [00:00<?, ?it/s]

time taken: 79.7882149219513
epoch: 3, training loss: 0.485508, training acc: 0.764753, valid loss: 0.698474, valid acc: 0.644958



train minibatch loop: 100%|██████████| 357/357 [01:13<00:00,  4.85it/s, accuracy=0.871, cost=0.365]
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 14.38it/s, accuracy=0.5, cost=0.759]  


time taken: 79.82724380493164
epoch: 4, training loss: 0.400258, training acc: 0.820614, valid loss: 0.783973, valid acc: 0.635154



train minibatch loop: 100%|██████████| 357/357 [01:13<00:00,  4.84it/s, accuracy=0.806, cost=0.397]
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 14.49it/s, accuracy=0.625, cost=1.36] 
train minibatch loop:   0%|          | 0/357 [00:00<?, ?it/s]

time taken: 79.90071511268616
epoch: 5, training loss: 0.317195, training acc: 0.865780, valid loss: 0.911790, valid acc: 0.626751



train minibatch loop: 100%|██████████| 357/357 [01:15<00:00,  4.72it/s, accuracy=0.903, cost=0.244] 
test minibatch loop: 100%|██████████| 90/90 [00:06<00:00, 14.35it/s, accuracy=0.625, cost=1.62] 
train minibatch loop:   0%|          | 0/357 [00:00<?, ?it/s]

time taken: 81.97506380081177
epoch: 6, training loss: 0.248816, training acc: 0.901944, valid loss: 1.097798, valid acc: 0.626050



train minibatch loop: 100%|██████████| 357/357 [01:47<00:00,  3.31it/s, accuracy=0.935, cost=0.213] 
test minibatch loop: 100%|██████████| 90/90 [00:11<00:00,  7.57it/s, accuracy=0.5, cost=1.51]   


time taken: 119.74860167503357
epoch: 7, training loss: 0.196715, training acc: 0.924620, valid loss: 1.217249, valid acc: 0.627451

break epoch:8



In [13]:
real_Y, predict_Y = [], []

pbar = tqdm(range(0, len(test_X), batch_size), desc='validation minibatch loop')
for i in pbar:
    batch_x = test_X[i:min(i+batch_size,test_X.shape[0])]
    batch_y = test_Y[i:min(i+batch_size,test_X.shape[0])]
    predict_Y += np.argmax(sess.run(model.logits, feed_dict = {model.X : batch_x, model.Y : batch_y}),1).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 90/90 [00:11<00:00,  7.64it/s]


In [14]:
from sklearn import metrics
print(metrics.classification_report(real_Y, predict_Y, target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.61      0.64      0.63      1351
   positive       0.66      0.63      0.65      1505

avg / total       0.64      0.64      0.64      2856



In [15]:
strings=','.join([n.name for n in tf.get_default_graph().as_graph_def().node if "Variable" in n.op or n.name.find('Placeholder') >= 0 or n.name.find('logits') == 0 or n.name.find('alphas') == 0])

In [16]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            "directory: %s" % model_dir)

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path
    
    absolute_model_dir = "/".join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + "/frozen_model.pb"
    clear_devices = True
    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(",")
        ) 
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph." % len(output_graph_def.node))

In [17]:
freeze_graph("luong", strings)

INFO:tensorflow:Restoring parameters from luong/model.ckpt
INFO:tensorflow:Froze 29 variables.
Converted 29 variables to const ops.
399 ops in the final graph.


In [18]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [19]:
g=load_graph('luong/frozen_model.pb')

In [20]:
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
alphas = g.get_tensor_by_name('import/alphas:0')
test_sess = tf.InteractiveSession(graph=g)
test_sess.run([logits,alphas], feed_dict={x:vectors[:1]})[1].shape

(100,)

In [21]:
text = 'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})

[array([[0.2420753 , 0.75792474]], dtype=float32),
 array([0.07763681, 0.14505728, 0.05765092, 0.0795059 , 0.0795059 ,
        0.09933987, 0.21249782, 0.1525911 , 0.09621443], dtype=float32)]

In [22]:
text = 'saya sangat sayangkan kerajaan saya'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})

[array([[0.1258716, 0.8741284]], dtype=float32),
 array([0.15036678, 0.12758024, 0.30859366, 0.26309252, 0.15036678],
       dtype=float32)]

In [23]:
text = 'bodoh lah awak ni'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})

[array([[0.74957025, 0.25042972]], dtype=float32),
 array([0.2403495 , 0.30058745, 0.24812593, 0.21093708], dtype=float32)]

In [24]:
text = 'kerajaan sebenarnya sangat baik'
new_vector = str_idx([text],dictionary,len(text.split()))
test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})

[array([[0.02914646, 0.97085357]], dtype=float32),
 array([0.21245742, 0.30735135, 0.16115844, 0.31903276], dtype=float32)]

In [25]:
import json
with open('luong-sentiment.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))