In [1]:
from entity_network import EntityNetwork
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.cross_validation import train_test_split
from unidecode import unidecode
from tqdm import tqdm
import time



In [2]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

In [3]:
def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 3)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [(naive_stemmer(word), word) for word in string]
    return (
        ' '.join([word[0] for word in string if len(word[0]) > 1]),
        ' '.join([word[1] for word in string if len(word[0]) > 1]),
    )

def str_idx(corpus, dic, maxlen, UNK = 3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            val = dic[k] if k in dic else UNK
            X[i, -1 - no] = val
    return X

In [4]:
df = pd.read_csv('dataset/sentiment-data-v2.csv')
Y = LabelEncoder().fit_transform(df.label)
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [5]:
with open('dataset/polarity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('dataset/polarity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts
texts += df.iloc[:,1].tolist()
labels += Y.tolist()

assert len(labels) == len(texts)

In [6]:
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])[0]

In [7]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 13325
Most common words [('yang', 14899), ('tidak', 4588), ('untuk', 4038), ('filem', 3698), ('deng', 3350), ('ada', 3190)]
Sample data [1324, 196, 178, 98, 98, 126, 353, 4, 90, 210] ['ringkas', 'bodoh', 'bosan', 'kanak', 'kanak', 'lelaki', 'remaja', 'yang', 'begitu', 'muda']


In [8]:
size_layer = 256
num_layers = 2
embedded_size = 256
dimension_output = len(np.unique(labels))
learning_rate = 1e-3
maxlen = 80
batch_size = 128
decay_step = 1e4
decay_rate = 1.0
story_len = 1

In [9]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

model = EntityNetwork(
    dimension_output,
    learning_rate,
    decay_step,
    decay_rate,
    maxlen,
    story_len,
    len(dictionary),
    embedded_size,
    size_layer
)

sess.run(tf.global_variables_initializer())

In [10]:
vectors = str_idx(texts, dictionary, maxlen)
train_X, test_X, train_Y, test_Y = train_test_split(
    vectors, labels, test_size = 0.2
)

In [29]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'learning_rate' not in n.name
        and 'OptimizeLoss' not in n.name
        and 'Global_Step' not in n.name
        and 'Epoch_Step' not in n.name
    ]
)

In [30]:
strings.split(',')

['Placeholder_story',
 'Placeholder_question',
 'Placeholder_input_y',
 'Placeholder_dropout_keep_prob',
 'output_module/H',
 'output_module/R',
 'output_module/y_bias',
 'output_module/b_projection',
 'output_module/h_u_bias',
 'dynamic_memory/U',
 'dynamic_memory/V',
 'dynamic_memory/W',
 'dynamic_memory/h_bias',
 'dynamic_memory/h2_bias',
 'embedding_projection/Embedding',
 'story_mask',
 'query_mask',
 'hidden_states',
 'keys',
 'h_candidate0/alpha',
 'query_add_hidden/alpha',
 'logits']

In [13]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'entity_network/model.ckpt')

'entity_network/model.ckpt'

In [14]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.answer_single: batch_y,
                model.query: batch_x,
                model.story: batch_x_expand,
                model.dropout_keep_prob: 1.0
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.answer_single: batch_y,
                model.query: batch_x,
                model.story: batch_x_expand,
                model.dropout_keep_prob: 1.0
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 90/90 [00:03<00:00, 26.58it/s, accuracy=0.774, cost=1.16]
test minibatch loop: 100%|██████████| 23/23 [00:00<00:00, 81.40it/s, accuracy=0.575, cost=1.25]
train minibatch loop:   4%|▍         | 4/90 [00:00<00:02, 32.49it/s, accuracy=0.773, cost=1.21]

epoch: 0, pass acc: 0.000000, current acc: 0.681232
time taken: 3.671577215194702
epoch: 0, training loss: 2.188570, training acc: 0.593285, valid loss: 1.332644, valid acc: 0.681232



train minibatch loop: 100%|██████████| 90/90 [00:03<00:00, 28.41it/s, accuracy=0.935, cost=0.718]
test minibatch loop: 100%|██████████| 23/23 [00:00<00:00, 107.75it/s, accuracy=0.775, cost=0.897]
train minibatch loop:   3%|▎         | 3/90 [00:00<00:02, 29.17it/s, accuracy=0.852, cost=0.762]

epoch: 1, pass acc: 0.681232, current acc: 0.712955
time taken: 3.3933229446411133
epoch: 1, training loss: 1.002730, training acc: 0.765100, valid loss: 0.991044, valid acc: 0.712955



train minibatch loop: 100%|██████████| 90/90 [00:03<00:00, 28.71it/s, accuracy=0.968, cost=0.51] 
test minibatch loop: 100%|██████████| 23/23 [00:00<00:00, 113.42it/s, accuracy=0.775, cost=0.768]
train minibatch loop:   3%|▎         | 3/90 [00:00<00:03, 28.60it/s, accuracy=0.906, cost=0.57] 

epoch: 2, pass acc: 0.712955, current acc: 0.721008
time taken: 3.3424582481384277
epoch: 2, training loss: 0.743444, training acc: 0.820264, valid loss: 0.893074, valid acc: 0.721008



train minibatch loop: 100%|██████████| 90/90 [00:03<00:00, 28.10it/s, accuracy=1, cost=0.357]    
test minibatch loop: 100%|██████████| 23/23 [00:00<00:00, 105.85it/s, accuracy=0.75, cost=0.679] 
train minibatch loop:   4%|▍         | 4/90 [00:00<00:02, 31.86it/s, accuracy=0.883, cost=0.529]

epoch: 3, pass acc: 0.721008, current acc: 0.739496
time taken: 3.42254638671875
epoch: 3, training loss: 0.614431, training acc: 0.854942, valid loss: 0.816465, valid acc: 0.739496



train minibatch loop: 100%|██████████| 90/90 [00:03<00:00, 28.02it/s, accuracy=1, cost=0.296]    
test minibatch loop: 100%|██████████| 23/23 [00:00<00:00, 89.96it/s, accuracy=0.725, cost=0.692]
train minibatch loop:   3%|▎         | 3/90 [00:00<00:03, 27.68it/s, accuracy=0.977, cost=0.378]

epoch: 4, pass acc: 0.739496, current acc: 0.742927
time taken: 3.481818914413452
epoch: 4, training loss: 0.525420, training acc: 0.884356, valid loss: 0.808490, valid acc: 0.742927



train minibatch loop: 100%|██████████| 90/90 [00:03<00:00, 28.75it/s, accuracy=1, cost=0.234]    
test minibatch loop: 100%|██████████| 23/23 [00:00<00:00, 96.74it/s, accuracy=0.775, cost=0.708]
train minibatch loop:   3%|▎         | 3/90 [00:00<00:03, 28.78it/s, accuracy=0.961, cost=0.384]

time taken: 3.3705894947052
epoch: 5, training loss: 0.462479, training acc: 0.901164, valid loss: 0.814266, valid acc: 0.738165



train minibatch loop: 100%|██████████| 90/90 [00:03<00:00, 27.89it/s, accuracy=1, cost=0.206]    
test minibatch loop: 100%|██████████| 23/23 [00:00<00:00, 104.33it/s, accuracy=0.725, cost=0.733]
train minibatch loop:   3%|▎         | 3/90 [00:00<00:02, 29.71it/s, accuracy=0.984, cost=0.283]

time taken: 3.2572367191314697
epoch: 6, training loss: 0.391990, training acc: 0.932505, valid loss: 0.845093, valid acc: 0.727521



train minibatch loop: 100%|██████████| 90/90 [00:03<00:00, 28.59it/s, accuracy=1, cost=0.183]    
test minibatch loop: 100%|██████████| 23/23 [00:00<00:00, 112.00it/s, accuracy=0.65, cost=0.775] 

time taken: 3.3656773567199707
epoch: 7, training loss: 0.357746, training acc: 0.937232, valid loss: 0.856754, valid acc: 0.730812

break epoch:8






In [15]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    batch_x_expand = np.expand_dims(batch_x,axis = 1)
    predict_Y += np.argmax(
        sess.run(
            model.logits,
            feed_dict = {
                model.answer_single: batch_y,
                model.query: batch_x,
                model.story: batch_x_expand,
                model.dropout_keep_prob: 1.0
            },
        ),
        1,
    ).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 23/23 [00:00<00:00, 143.60it/s]


In [16]:
from sklearn import metrics
print(metrics.classification_report(real_Y, predict_Y, target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.70      0.66      0.68      1324
   positive       0.72      0.75      0.74      1532

avg / total       0.71      0.71      0.71      2856



In [20]:
text = classification_textcleaning(
    'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
)
new_vector = str_idx([text[0]], dictionary, maxlen)
batch_x_expand = np.expand_dims(new_vector,axis = 1)
sess.run(
            tf.nn.softmax(model.logits),
            feed_dict = {
                model.query: new_vector,
                model.story: batch_x_expand,
                model.dropout_keep_prob: 1.0
            },
        )

array([[0.5061709 , 0.49382904]], dtype=float32)

In [44]:
text = classification_textcleaning(
    'bodoh la mereka ni'
)
new_vector = str_idx([text[0]], dictionary, maxlen)
batch_x_expand = np.expand_dims(new_vector,axis = 1)
sess.run(
            tf.nn.softmax(model.logits),
            feed_dict = {
                model.query: new_vector,
                model.story: batch_x_expand,
                model.dropout_keep_prob: 1.0
            },
        )

array([[0.6623351 , 0.33766493]], dtype=float32)

In [48]:
saver.save(sess, 'entity_network/model.ckpt')

'entity_network/model.ckpt'

In [23]:
import json
with open('entity-network-sentiment.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))

In [24]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [49]:
freeze_graph('entity_network', strings)

INFO:tensorflow:Restoring parameters from entity_network/model.ckpt
INFO:tensorflow:Froze 17 variables.
INFO:tensorflow:Converted 17 variables to const ops.
139 ops in the final graph.


In [33]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [50]:
g = load_graph('entity_network/frozen_model.pb')

In [46]:
[n.name for n in g.as_graph_def().node]

['import/Placeholder_story',
 'import/Placeholder_question',
 'import/Shape',
 'import/strided_slice/stack',
 'import/strided_slice/stack_1',
 'import/strided_slice/stack_2',
 'import/strided_slice',
 'import/Placeholder_input_y',
 'import/Placeholder_dropout_keep_prob',
 'import/output_module/H',
 'import/output_module/H/read',
 'import/output_module/R',
 'import/output_module/R/read',
 'import/output_module/y_bias',
 'import/output_module/y_bias/read',
 'import/output_module/b_projection',
 'import/output_module/h_u_bias',
 'import/output_module/h_u_bias/read',
 'import/dynamic_memory/U',
 'import/dynamic_memory/U/read',
 'import/dynamic_memory/V',
 'import/dynamic_memory/V/read',
 'import/dynamic_memory/W',
 'import/dynamic_memory/W/read',
 'import/dynamic_memory/h_bias',
 'import/dynamic_memory/h_bias/read',
 'import/dynamic_memory/h2_bias',
 'import/dynamic_memory/h2_bias/read',
 'import/embedding_projection/Embedding',
 'import/embedding_projection/Embedding/read',
 'import/embed

In [51]:
Placeholder_story = g.get_tensor_by_name('import/Placeholder_story:0')
Placeholder_question = g.get_tensor_by_name('import/Placeholder_question:0')
Placeholder_dropout_keep_prob = g.get_tensor_by_name(
    'import/Placeholder_dropout_keep_prob:0'
)
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
test_sess.run(
    tf.nn.softmax(logits),
    feed_dict = {
        Placeholder_question: new_vector,
        Placeholder_story: batch_x_expand,
        Placeholder_dropout_keep_prob: 1.0,
    },
)



array([[0.6623351 , 0.33766493]], dtype=float32)