In [1]:
import numpy as np
from utils import *
import tensorflow as tf
from sklearn.cross_validation import train_test_split
import time
from tqdm import tqdm
import random
from sklearn.preprocessing import LabelEncoder
from unidecode import unidecode
import re
import pandas as pd



In [2]:
df = pd.read_csv('sentiment-news-bahasa-v5.csv')
Y = LabelEncoder().fit_transform(df.label)
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [3]:
def textcleaning(string):
    string = re.sub('http\S+|www.\S+', '',' '.join([i for i in string.split() if i.find('#')<0 and i.find('@')<0]))
    string = unidecode(string).replace('.', '. ').replace(',', ', ')
    string = re.sub('[^\'\"A-Za-z\- ]+', ' ', string)
    return ' '.join([i for i in re.findall("[\\w']+|[;:\-\(\)&.,!?\"]", string) if len(i)>1]).lower()

In [4]:
for i in range(df.shape[0]):
    df.iloc[i,1] = textcleaning(df.iloc[i,1])

In [5]:
with open('polarity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

In [6]:
with open('polarity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts
texts += df.iloc[:,1].tolist()
labels += Y.tolist()

assert len(labels) == len(texts)

In [7]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 18844
Most common words [('yang', 14893), ('dan', 8177), ('tidak', 4579), ('untuk', 4023), ('dengan', 3349), ('filem', 3279)]
Sample data [1614, 204, 5, 161, 218, 106, 301, 4, 78, 203] ['ringkas', 'bodoh', 'dan', 'membosankan', 'kanak-kanak', 'lelaki', 'remaja', 'yang', 'begitu', 'muda']


In [8]:
idx_trainset = []
for text in texts:
    idx = []
    for t in text.split():
        try:
            idx.append(dictionary[t])
        except:
            idx.append(3)
    idx_trainset.append(idx)

In [9]:
def create_ngram_set(input_list, ngram_value):
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))

def build_ngram(x_train):
    global max_features
    ngram_set = set()
    for input_list in tqdm(x_train, total=len(x_train), ncols=70):
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    max_features = np.max(list(indice_token.keys())) + 1
    return token_indice

def add_ngram(sequences, token_indice):
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences

In [10]:
ngram_range = 2
max_features = 20000
maxlen = 80
batch_size = 32
embedded_size = 256

In [11]:
token_indice = build_ngram(idx_trainset)
X = add_ngram(idx_trainset, token_indice)
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen)

100%|███████████████████████| 14279/14279 [00:00<00:00, 214848.30it/s]


In [12]:
train_X, test_X, train_Y, test_Y = train_test_split(X, 
                                                    labels,
                                                    test_size = 0.2)

In [13]:
class Model:
    def __init__(self, embedded_size, dict_size, dimension_output, learning_rate):
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        self.logits = tf.identity(tf.layers.dense(tf.reduce_mean(encoder_embedded, 1), dimension_output),
                                  name="logits")
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits,
            labels=self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1,output_type=tf.int32), self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [14]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(embedded_size,vocabulary_size+4,2,5e-4)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())
saver.save(sess, "fast-text/model.ckpt")

'fast-text/model.ckpt'

In [15]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 10, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : train_X[i:i+batch_size], model.Y : train_Y[i:i+batch_size]})
        train_loss += loss
        train_acc += acc
    
    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : test_X[i:i+batch_size], model.Y : test_Y[i:i+batch_size]})
        test_loss += loss
        test_acc += acc
    
    train_loss /= (len(train_X) // batch_size)
    train_acc /= (len(train_X) // batch_size)
    test_loss /= (len(test_X) // batch_size)
    test_acc /= (len(test_X) // batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1
    saver.save(sess, "fast-text/model.ckpt")

epoch: 0, pass acc: 0.000000, current acc: 0.582514
time taken: 2.2845072746276855
epoch: 0, training loss: 0.687419, training acc: 0.548718, valid loss: 0.680611, valid acc: 0.582514

epoch: 1, pass acc: 0.582514, current acc: 0.630267
time taken: 2.1589436531066895
epoch: 1, training loss: 0.669211, training acc: 0.605513, valid loss: 0.666756, valid acc: 0.630267

epoch: 2, pass acc: 0.630267, current acc: 0.661868
time taken: 2.159257173538208
epoch: 2, training loss: 0.647248, training acc: 0.656162, valid loss: 0.650173, valid acc: 0.661868

epoch: 3, pass acc: 0.661868, current acc: 0.674508
time taken: 2.153167963027954
epoch: 3, training loss: 0.620276, training acc: 0.699263, valid loss: 0.631791, valid acc: 0.674508

epoch: 4, pass acc: 0.674508, current acc: 0.682584
time taken: 2.154153347015381
epoch: 4, training loss: 0.590033, training acc: 0.731039, valid loss: 0.613892, valid acc: 0.682584

epoch: 5, pass acc: 0.682584, current acc: 0.691011
time taken: 2.156793594360

In [16]:
saver.restore(sess, "fast-text/model.ckpt")
logits = sess.run(model.logits, feed_dict={model.X:test_X})
print(metrics.classification_report(test_Y, np.argmax(logits,1), target_names = ['negative','positive']))

INFO:tensorflow:Restoring parameters from fast-text/model.ckpt
             precision    recall  f1-score   support

   negative       0.70      0.68      0.69      1342
   positive       0.72      0.74      0.73      1514

avg / total       0.71      0.71      0.71      2856



In [17]:
def to_idx(texts):
    idx_trainset = []
    for text in texts:
        idx = []
        for t in text.split():
            try:
                idx.append(dictionary[t])
            except:
                pass
        idx_trainset.append(idx)
    return idx_trainset

In [18]:
text = 'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
new_vector = add_ngram(to_idx([text]), token_indice)
sess.run(tf.nn.softmax(model.logits), feed_dict={model.X:new_vector})

array([[0.9678054 , 0.03219458]], dtype=float32)

In [19]:
text = 'kerajaan sebenarnya sangat sayangkan rakyatnya'
new_vector = add_ngram(to_idx([text]), token_indice)
sess.run(tf.nn.softmax(model.logits), feed_dict={model.X:new_vector})

array([[1.7911309e-05, 9.9998212e-01]], dtype=float32)

In [20]:
text = 'kerajaan sebenarnya sangat sayangkan rakyatnya, tetapi sebenarnya benci'
new_vector = add_ngram(to_idx([text]), token_indice)
sess.run(tf.nn.softmax(model.logits), feed_dict={model.X:new_vector})

array([[0.84641653, 0.15358353]], dtype=float32)

In [32]:
import json
with open('fast-text-sentiment.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))

In [22]:
strings=','.join([n.name for n in tf.get_default_graph().as_graph_def().node if "Variable" in n.op or n.name.find('Placeholder') >= 0 or n.name.find('logits') == 0])

In [23]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            "directory: %s" % model_dir)

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path
    
    absolute_model_dir = "/".join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + "/frozen_model.pb"
    clear_devices = True
    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(",")
        ) 
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph." % len(output_graph_def.node))

In [24]:
freeze_graph("fast-text", strings)

INFO:tensorflow:Restoring parameters from fast-text/model.ckpt
INFO:tensorflow:Froze 11 variables.
Converted 11 variables to const ops.
22 ops in the final graph.


In [25]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [26]:
g=load_graph('fast-text/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph=g)
predicted = test_sess.run(logits,feed_dict={x:test_X})
predicted.shape

(2856, 2)

In [34]:
import pickle
with open('token-indice.pkl','wb') as fopen:
    pickle.dump(token_indice, fopen)