In [1]:
import os
import pickle
import re
import time

import numpy as np
import pandas as pd
import tensorflow as tf
# NOTE: `sklearn.cross_validation` only exists in old scikit-learn releases;
# newer releases expose `train_test_split` from `sklearn.model_selection`.
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from unidecode import unidecode



In [2]:
# Load the labelled sentiment corpus and turn the string labels into integer ids.
df = pd.read_csv('sentiment-news-bahasa-v5.csv')
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(df['label'])
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [3]:
def textcleaning(string):
    """Normalise one raw sentence into lower-case, space-separated word tokens.

    Steps, in order:
      1. drop whitespace-separated tokens that contain ``#`` or ``@``;
      2. strip URLs (anything starting with ``http`` or ``www.``);
      3. transliterate to ASCII with ``unidecode`` and insert a space after
         every ``.`` and ``,`` so words glued by punctuation split apart;
      4. delete every character that is not an ASCII letter, a quote, a
         hyphen or a space;
      5. keep only tokens longer than one character and lower-case the result.

    Args:
        string: raw text (``str``).

    Returns:
        The cleaned text as a single ``str`` (may be empty).
    """
    kept = [tok for tok in string.split() if '#' not in tok and '@' not in tok]
    # Raw-string patterns avoid invalid-escape warnings. The dot after "www"
    # is escaped so that ordinary words merely containing "www" survive.
    string = re.sub(r'http\S+|www\.\S+', '', ' '.join(kept))
    string = unidecode(string).replace('.', '. ').replace(',', ', ')
    string = re.sub(r'[^\'"A-Za-z\- ]+', '', string)
    tokens = re.findall(r"[\w']+|[;:\-\(\)&.,!?\"]", string)
    # Single-character tokens (stray punctuation, one-letter words) are dropped.
    return ' '.join(tok for tok in tokens if len(tok) > 1).lower()

In [4]:
# Clean every sentence in one vectorised pass. The column is selected by name
# so the cleaning does not depend on the position of `text` in the frame.
df['text'] = df['text'].apply(textcleaning)

In [5]:
# Hold out 10% of the sentences for validation. A fixed seed keeps the
# train/validation membership identical across kernel restarts.
train_X, test_X, train_Y, test_Y = train_test_split(
    df.text.values, Y, test_size=0.1, random_state=42)

In [6]:
def str_idx(corpus, dic, maxlen, UNK=0):
    """Convert sentences to a right-aligned, zero-padded matrix of word ids.

    Each sentence is split on whitespace and truncated to its first `maxlen`
    tokens; the ids are written into the right-most columns of the row, so
    shorter sentences are padded with zeros on the left.

    Args:
        corpus: sequence of strings.
        dic: mapping from token to integer id.
        maxlen: number of columns of the output (maximum tokens kept).
        UNK: id used for tokens missing from `dic`.

    Returns:
        ``numpy.ndarray`` of shape ``(len(corpus), maxlen)`` with the default
        ``np.zeros`` dtype (float64).
    """
    X = np.zeros((len(corpus), maxlen))
    for row, sentence in enumerate(corpus):
        # Only a missing key falls back to UNK; any other problem (for example
        # a non-numeric id in `dic`) now surfaces instead of being swallowed.
        ids = [dic.get(token, UNK) for token in sentence.split()[:maxlen]]
        if ids:  # an empty slice assignment would fail, so skip empty rows
            X[row, -len(ids):] = ids
    return X

In [7]:
# Load the pre-trained word2vec bundle; later cells read its 'nce_weights'
# and 'dictionary' entries.
# NOTE(review): pickle can execute arbitrary code while loading - only open
# files that come from a trusted source.
with open('word2vec-256.p', 'rb') as pickle_file:
    embedded = pickle.load(pickle_file)

In [8]:
class Model:
    """Stacked bidirectional LSTM sentence classifier on frozen embeddings.

    The constructor builds the whole TensorFlow 1.x graph in the current
    default graph and exposes these attributes:

    * ``X``         - int32 placeholder of word ids, shape ``[batch, time]``
    * ``Y``         - int32 placeholder of class ids, shape ``[batch]``
    * ``logits``    - class scores (graph node named ``logits``)
    * ``cost``      - mean sparse softmax cross-entropy
    * ``optimizer`` - Adam training op minimising ``cost``
    * ``accuracy``  - mean top-1 accuracy of ``logits`` against ``Y``

    The embedding matrix is read from the module-level ``embedded`` dict
    (key ``'nce_weights'``) and is not trainable.

    Args:
        size_layer: number of units of each LSTM cell (per direction).
        num_layers: number of stacked bidirectional layers.
        dropout: keep probability handed to ``DropoutWrapper`` for inputs,
            outputs and states.
        dimension_output: number of output classes.
        learning_rate: Adam learning rate.
        maxlen: accepted for interface compatibility; not used when building
            the graph (the time dimension of ``X`` is dynamic).
    """
    def __init__(self, size_layer, num_layers, dropout, dimension_output, learning_rate, maxlen):
        def cells(size, reuse=False):
            # `dropout` fills the three positional keep-probability slots
            # (input, output, state).
            # NOTE(review): it is a Python constant baked into the graph, so
            # the same dropout is applied on every run of the graph, including
            # evaluation and the exported graph - consider feeding it through
            # a placeholder instead.
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(size,initializer=tf.orthogonal_initializer(),reuse=reuse),
                dropout,dropout,dropout)

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(tf.convert_to_tensor(embedded['nce_weights'],
                                                           dtype=tf.float32),trainable=False)
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)

        # Each layer consumes the concatenated forward/backward outputs of the
        # previous one; a distinct scope per layer keeps the variables separate.
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(size_layer),
                cell_bw = cells(size_layer),
                inputs = encoder_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d'%(n))
            encoder_embedded = tf.concat((out_fw, out_bw), 2)

        # The projection is sized by `dimension_output` rather than a
        # hard-coded 2, so the argument is actually honoured.
        W = tf.get_variable('w',shape=(size_layer*2, dimension_output),initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b',shape=(dimension_output,),initializer=tf.zeros_initializer())
        # Classify from the output at the last time step.
        self.logits = tf.add(tf.matmul(encoder_embedded[:,-1], W),b,name='logits')
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.logits,
                                                                           labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        self.accuracy = tf.reduce_mean(tf.cast(tf.nn.in_top_k(self.logits, self.Y, 1), tf.float32))

In [9]:
# Hyper-parameters.
size_layer = 512        # LSTM units per direction
num_layers = 2          # stacked bidirectional layers
dropout = 0.7           # passed to the model as a keep probability
dimension_output = 2    # number of sentiment classes
learning_rate = 1e-4
maxlen = 100            # tokens kept per sentence by str_idx
batch_size = 32
dictionary = embedded['dictionary']

# Start from an empty default graph, then build the model and initialise it.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    size_layer=size_layer,
    num_layers=num_layers,
    dropout=dropout,
    dimension_output=dimension_output,
    learning_rate=learning_rate,
    maxlen=maxlen,
)
sess.run(tf.global_variables_initializer())

In [10]:
# Early-stopping state: patience, epochs since the last improvement,
# best validation accuracy so far, and the epoch counter.
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 10, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break

    train_X, train_Y = shuffle(train_X, train_Y)
    train_acc = train_loss = test_acc = test_loss = 0
    # Only full batches are used; a trailing partial batch is skipped.
    n_train_batches = len(train_X) // batch_size
    n_test_batches = len(test_X) // batch_size

    # Training pass: runs the optimizer op on every full batch.
    for step in range(n_train_batches):
        start = step * batch_size
        stop = start + batch_size
        batch_x = str_idx(train_X[start:stop], dictionary, maxlen)
        acc, loss, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict={model.X: batch_x, model.Y: train_Y[start:stop]})
        train_loss += loss
        train_acc += acc

    # Validation pass: same metrics, no optimizer op.
    for step in range(n_test_batches):
        start = step * batch_size
        stop = start + batch_size
        batch_x = str_idx(test_X[start:stop], dictionary, maxlen)
        acc, loss = sess.run(
            [model.accuracy, model.cost],
            feed_dict={model.X: batch_x, model.Y: test_Y[start:stop]})
        test_loss += loss
        test_acc += acc

    # Turn the per-batch sums into per-batch means.
    train_loss /= n_train_batches
    train_acc /= n_train_batches
    test_loss /= n_test_batches
    test_acc /= n_test_batches

    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
          % (EPOCH, train_loss, train_acc, test_loss, test_acc))
    EPOCH += 1

epoch: 0, pass acc: 0.000000, current acc: 0.681818
time taken: 101.97703409194946
epoch: 0, training loss: 0.634329, training acc: 0.644721, valid loss: 0.622408, valid acc: 0.681818

epoch: 1, pass acc: 0.681818, current acc: 0.684659
time taken: 93.08757257461548
epoch: 1, training loss: 0.578062, training acc: 0.691141, valid loss: 0.600805, valid acc: 0.684659

time taken: 94.39820551872253
epoch: 2, training loss: 0.542596, training acc: 0.718143, valid loss: 0.581004, valid acc: 0.678977

time taken: 89.37310671806335
epoch: 3, training loss: 0.527841, training acc: 0.737257, valid loss: 0.615988, valid acc: 0.684659

time taken: 73.29732012748718
epoch: 4, training loss: 0.518808, training acc: 0.733920, valid loss: 0.605105, valid acc: 0.650568

epoch: 5, pass acc: 0.684659, current acc: 0.690341
time taken: 48.82748460769653
epoch: 5, training loss: 0.503941, training acc: 0.754854, valid loss: 0.577728, valid acc: 0.690341

time taken: 48.807836055755615
epoch: 6, training l

In [11]:
# Save every global variable of the training graph to ./normal/model.ckpt.
# The directory is created up front so the save does not depend on it
# already existing, and the path is assembled with os.path.join.
checkpoint_dir = os.path.join(os.getcwd(), "normal")
os.makedirs(checkpoint_dir, exist_ok=True)
saver = tf.train.Saver(tf.global_variables())
saver.save(sess, os.path.join(checkpoint_dir, "model.ckpt"))

'/home/barbatos/Desktop/rnn/normal/model.ckpt'

In [12]:
# Comma-separated names of the graph nodes to keep when freezing: variable
# ops, anything with "Placeholder" in its name, and nodes whose name starts
# with "logits" or "alphas".
graph_nodes = tf.get_default_graph().as_graph_def().node
strings = ','.join(
    node.name
    for node in graph_nodes
    if 'Variable' in node.op
    or 'Placeholder' in node.name
    or node.name.startswith('logits')
    or node.name.startswith('alphas')
)

In [13]:
def freeze_graph(model_dir, output_node_names):
    """Freeze the latest checkpoint found in `model_dir` into ``frozen_model.pb``.

    The checkpoint's meta graph is imported into a fresh graph, its variables
    are restored and converted to constants, and the resulting ``GraphDef`` is
    written next to the checkpoint files. The number of ops in the frozen
    graph is printed.

    Args:
        model_dir: directory containing the checkpoint state.
        output_node_names: comma-separated names of the nodes to keep.

    Raises:
        AssertionError: if `model_dir` does not exist or holds no checkpoint.
    """
    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            "directory: %s" % model_dir)

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    # get_checkpoint_state returns None when no checkpoint state is found;
    # fail with a clear message instead of an AttributeError further down.
    if checkpoint is None or not checkpoint.model_checkpoint_path:
        raise AssertionError(
            "No checkpoint found in directory: %s" % model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    # os.path.dirname also copes with a checkpoint path that has no directory
    # part, where splitting on '/' would have produced '/frozen_model.pb'.
    output_graph = os.path.join(os.path.dirname(input_checkpoint), "frozen_model.pb")
    clear_devices = True
    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(",")
        )
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph." % len(output_graph_def.node))

In [14]:
# Export the checkpoint stored under ./normal as a frozen graph.
freeze_graph(model_dir="normal", output_node_names=strings)

INFO:tensorflow:Restoring parameters from /home/barbatos/Desktop/rnn/normal/model.ckpt
INFO:tensorflow:Froze 33 variables.
Converted 33 variables to const ops.
677 ops in the final graph.


In [15]:
def load_graph(frozen_graph_filename):
    """Read a serialized ``GraphDef`` from disk and import it into a new graph.

    The graph is imported without an explicit name, so its nodes are reachable
    under the ``import/`` prefix (e.g. ``import/logits:0``).

    Args:
        frozen_graph_filename: path to the frozen ``.pb`` file.

    Returns:
        A new ``tf.Graph`` containing the imported nodes.
    """
    with tf.gfile.GFile(frozen_graph_filename, "rb") as pb_file:
        serialized = pb_file.read()

    graph_def = tf.GraphDef()
    graph_def.ParseFromString(serialized)

    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(graph_def)
    return graph

In [16]:
# Re-load the frozen model into its own graph for inference.
g = load_graph('normal/frozen_model.pb')

In [17]:
# Smoke-test the frozen graph on one batch of validation sentences.
# The input batch is built here explicitly instead of reusing `batch_x`,
# which only existed as a leftover of the training loop's last iteration.
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph=g)
sample_x = str_idx(test_X[:batch_size], dictionary, maxlen)
results = test_sess.run(logits, feed_dict={x: sample_x})