In [1]:
import tensorflow as tf
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import collections
import itertools
from unidecode import unidecode
import malaya
import re
import json

In [2]:
def build_dataset(words, n_words, atleast=2):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words - 10)
    counter = [i for i in counter if i[1] >= atleast]
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

def str_idx(corpus, dic, maxlen, UNK = 3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen]):
            X[i, no] = dic.get(k, UNK)
    return X

tokenizer = malaya.preprocessing._SocialTokenizer().tokenize

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    tokenized = tokenizer(string)
    tokenized = [w.lower() for w in tokenized if len(w) > 2]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

In [3]:
with open('train-similarity.json') as fopen:
    train = json.load(fopen)

In [4]:
left, right, label = train['left'], train['right'], train['label']

In [5]:
with open('test-similarity.json') as fopen:
    test = json.load(fopen)
test_left, test_right, test_label = test['left'], test['right'], test['label']

In [6]:
np.unique(label, return_counts = True)

(array([0, 1]), array([2605321, 1531070]))

In [7]:
concat = list(itertools.chain(*(left + right)))
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size, 1)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 73142
Most common words [('saya', 3584482), ('yang', 3541065), ('untuk', 2110965), ('apakah', 1948962), ('dan', 1556927), ('anda', 1375550)]
Sample data [7, 355, 325, 2415, 43, 9, 7, 355, 4166, 2415] ['apakah', 'maksud', 'cinta', 'sejati', 'kepada', 'anda', 'apakah', 'maksud', 'memuja', 'sejati']


In [8]:
with open('similarity-dictionary.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))

In [9]:
class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, learning_rate, dropout):
        
        def cells(size, reuse=False):
            cell = tf.nn.rnn_cell.LSTMCell(size,initializer=tf.orthogonal_initializer(),reuse=reuse)
            return tf.contrib.rnn.DropoutWrapper(cell,output_keep_prob=dropout)
        
        def rnn(inputs, scope):
            with tf.variable_scope(scope, reuse = tf.AUTO_REUSE):
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    num_units = size_layer, memory = inputs)
                rnn_cells = tf.contrib.seq2seq.AttentionWrapper(
                    cell = tf.nn.rnn_cell.MultiRNNCell(
                        [cells(size_layer) for _ in range(num_layers)]
                    ),
                    attention_mechanism = attention_mechanism,
                    attention_layer_size = size_layer,
                    alignment_history = True,
                )
                outputs, last_state = tf.nn.dynamic_rnn(
                    rnn_cells, inputs, dtype = tf.float32
                )
                return outputs[:,-1]
        
        self.X_left = tf.placeholder(tf.int32, [None, None])
        self.X_right = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None])
        self.batch_size = tf.shape(self.X_left)[0]
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        embedded_left = tf.nn.embedding_lookup(encoder_embeddings, self.X_left)
        embedded_right = tf.nn.embedding_lookup(encoder_embeddings, self.X_right)
        
        def contrastive_loss(y,d):
            tmp= y * tf.square(d)
            tmp2 = (1-y) * tf.square(tf.maximum((1 - d),0))
            return tf.reduce_sum(tmp +tmp2)/tf.cast(self.batch_size,tf.float32)/2
        
        self.output_left = rnn(embedded_left, 'left')
        self.output_right = rnn(embedded_right, 'right')
        self.distance = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(self.output_left,self.output_right)),
                                              1,keep_dims=True))
        self.distance = tf.div(self.distance, tf.add(tf.sqrt(tf.reduce_sum(tf.square(self.output_left),
                                                                           1,keep_dims=True)),
                                                     tf.sqrt(tf.reduce_sum(tf.square(self.output_right),
                                                                           1,keep_dims=True))))
        self.distance = tf.reshape(self.distance, [-1])
        self.logits = tf.identity(self.distance, name = 'logits')
        self.cost = contrastive_loss(self.Y,self.distance)
        
        self.temp_sim = tf.subtract(tf.ones_like(self.distance),
                                    tf.rint(self.distance))
        correct_predictions = tf.equal(self.temp_sim, self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)

In [10]:
size_layer = 256
num_layers = 2
embedded_size = 128
learning_rate = 1e-4
maxlen = 50
batch_size = 128
dropout = 0.8

In [11]:
from sklearn.cross_validation import train_test_split

train_X_left = str_idx(left, dictionary, maxlen)
train_X_right = str_idx(right, dictionary, maxlen)
train_Y = label

test_X_left = str_idx(test_left, dictionary, maxlen)
test_X_right = str_idx(test_right, dictionary, maxlen)
test_Y = test_label

In [12]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(dictionary),learning_rate,dropout)
sess.run(tf.global_variables_initializer())

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.cast instead.


In [13]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'bahdanau/model.ckpt')

'bahdanau/model.ckpt'

In [14]:
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 2, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(range(0, len(train_X_left), batch_size), desc='train minibatch loop')
    for i in pbar:
        batch_x_left = train_X_left[i:min(i+batch_size,train_X_left.shape[0])]
        batch_x_right = train_X_right[i:min(i+batch_size,train_X_left.shape[0])]
        batch_y = train_Y[i:min(i+batch_size,train_X_left.shape[0])]
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X_left : batch_x_left, 
                                        model.X_right: batch_x_right,
                                        model.Y : batch_y})
        assert not np.isnan(loss)
        train_loss += loss
        train_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    pbar = tqdm(range(0, len(test_X_left), batch_size), desc='test minibatch loop')
    for i in pbar:
        batch_x_left = test_X_left[i:min(i+batch_size,test_X_left.shape[0])]
        batch_x_right = test_X_right[i:min(i+batch_size,test_X_left.shape[0])]
        batch_y = test_Y[i:min(i+batch_size,test_X_left.shape[0])]
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X_left : batch_x_left, 
                                        model.X_right: batch_x_right,
                                        model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    train_loss /= (len(train_X_left) / batch_size)
    train_acc /= (len(train_X_left) / batch_size)
    test_loss /= (len(test_X_left) / batch_size)
    test_acc /= (len(test_X_left) / batch_size)
    
    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
    
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))

train minibatch loop: 100%|██████████| 32316/32316 [1:55:21<00:00,  4.74it/s, accuracy=0.718, cost=0.0761]
test minibatch loop: 100%|██████████| 391/391 [00:34<00:00, 11.29it/s, accuracy=0.775, cost=0.0865]
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.746424
time taken: 6955.883935451508
epoch: 0, training loss: 0.092503, training acc: 0.721196, valid loss: 0.086450, valid acc: 0.746424



train minibatch loop: 100%|██████████| 32316/32316 [1:53:11<00:00,  4.74it/s, accuracy=0.732, cost=0.0744]
test minibatch loop: 100%|██████████| 391/391 [00:34<00:00, 11.30it/s, accuracy=0.75, cost=0.0809] 
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.746424, current acc: 0.763280
time taken: 6826.028009176254
epoch: 0, training loss: 0.081873, training acc: 0.761882, valid loss: 0.081624, valid acc: 0.763280



train minibatch loop: 100%|██████████| 32316/32316 [1:53:11<00:00,  4.81it/s, accuracy=0.915, cost=0.0614]
test minibatch loop: 100%|██████████| 391/391 [00:34<00:00, 11.33it/s, accuracy=0.762, cost=0.0785]
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.763280, current acc: 0.771912
time taken: 6826.077198982239
epoch: 0, training loss: 0.077154, training acc: 0.778432, valid loss: 0.079107, valid acc: 0.771912



train minibatch loop: 100%|██████████| 32316/32316 [1:53:13<00:00,  4.83it/s, accuracy=0.944, cost=0.0466]
test minibatch loop: 100%|██████████| 391/391 [00:34<00:00, 11.29it/s, accuracy=0.775, cost=0.0792]
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.771912, current acc: 0.778024
time taken: 6828.589703083038
epoch: 0, training loss: 0.074152, training acc: 0.789472, valid loss: 0.077665, valid acc: 0.778024



test minibatch loop: 100%|██████████| 391/391 [00:34<00:00, 11.28it/s, accuracy=0.775, cost=0.0797]0.0492] 
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.778024, current acc: 0.781104
time taken: 6830.9069492816925
epoch: 0, training loss: 0.071784, training acc: 0.798278, valid loss: 0.076891, valid acc: 0.781104



train minibatch loop:   4%|▍         | 1354/32316 [04:43<1:48:08,  4.77it/s, accuracy=0.719, cost=0.0897]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop: 100%|██████████| 32316/32316 [1:53:16<00:00,  4.74it/s, accuracy=0.972, cost=0.0366] 
test minibatch loop: 100%|██████████| 391/391 [00:34<00:00, 11.40it/s, accuracy=0.75, cost=0.0798] 
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.781104, current acc: 0.783340
time taken: 6830.440257072449
epoch: 0, training loss: 0.069742, training acc: 0.805855, valid loss: 0.076416, valid acc: 0.783340



train minibatch loop: 100%|██████████| 32316/32316 [1:53:10<00:00,  4.72it/s, accuracy=0.944, cost=0.0372] 
test minibatch loop: 100%|██████████| 391/391 [00:34<00:00, 11.33it/s, accuracy=0.762, cost=0.0789]
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.783340, current acc: 0.785572
time taken: 6824.842344999313
epoch: 0, training loss: 0.067793, training acc: 0.812866, valid loss: 0.075852, valid acc: 0.785572



train minibatch loop: 100%|██████████| 32316/32316 [1:53:13<00:00,  4.74it/s, accuracy=0.944, cost=0.0349] 
test minibatch loop: 100%|██████████| 391/391 [00:34<00:00, 11.34it/s, accuracy=0.775, cost=0.0784]
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.785572, current acc: 0.787424
time taken: 6827.9793746471405
epoch: 0, training loss: 0.065874, training acc: 0.819701, valid loss: 0.075570, valid acc: 0.787424



train minibatch loop: 100%|██████████| 32316/32316 [1:53:11<00:00,  4.78it/s, accuracy=0.915, cost=0.0403] 
test minibatch loop: 100%|██████████| 391/391 [00:34<00:00, 11.31it/s, accuracy=0.788, cost=0.0763]
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s]

time taken: 6825.849843502045
epoch: 0, training loss: 0.064019, training acc: 0.825954, valid loss: 0.075614, valid acc: 0.786776



train minibatch loop: 100%|██████████| 32316/32316 [1:53:07<00:00,  4.80it/s, accuracy=0.944, cost=0.0329] 
test minibatch loop: 100%|██████████| 391/391 [00:34<00:00, 11.33it/s, accuracy=0.775, cost=0.0759]

time taken: 6821.773227930069
epoch: 0, training loss: 0.062121, training acc: 0.832607, valid loss: 0.075962, valid acc: 0.786164

break epoch:0






In [15]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and '_power' not in n.name
        and 'gradient' not in n.name
        and 'Initializer' not in n.name
        and 'Assign' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Variable',
 'left/memory_layer/kernel',
 'left/rnn/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/kernel/Read/ReadVariableOp',
 'left/rnn/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/bias/Read/ReadVariableOp',
 'left/rnn/attention_wrapper/multi_rnn_cell/cell_1/lstm_cell/kernel/Read/ReadVariableOp',
 'left/rnn/attention_wrapper/multi_rnn_cell/cell_1/lstm_cell/bias/Read/ReadVariableOp',
 'left/rnn/attention_wrapper/bahdanau_attention/query_layer/kernel',
 'left/rnn/attention_wrapper/bahdanau_attention/attention_v',
 'left/rnn/attention_wrapper/attention_layer/kernel',
 'right/memory_layer/kernel',
 'right/rnn/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/kernel/Read/ReadVariableOp',
 'right/rnn/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/bias/Read/ReadVariableOp',
 'right/rnn/attention_wrapper/multi_rnn_cell/cell_1/lstm_cell/kernel/Read/ReadVariableOp',
 'right/rnn/attention_wrapper/multi_rnn_cell/cell_1/lstm_cell/bi

In [16]:
saver.save(sess, 'bahdanau/model.ckpt')

'bahdanau/model.ckpt'

In [17]:
left = str_idx(['a person is outdoors, on a horse.'], dictionary, maxlen)
right = str_idx(['a person on a horse jumps over a broken down airplane.'], dictionary, maxlen)
sess.run([model.temp_sim,1-model.distance], feed_dict = {model.X_left : left, 
                                        model.X_right: right})

[array([0.], dtype=float32), array([0.11445844], dtype=float32)]

In [18]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X_left), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x_left = test_X_left[i:min(i+batch_size,train_X_left.shape[0])]
    batch_x_right = test_X_right[i:min(i+batch_size,train_X_left.shape[0])]
    batch_y = test_Y[i:min(i+batch_size,train_X_left.shape[0])]
    predict_Y += sess.run(model.temp_sim, feed_dict = {model.X_left : batch_x_left, 
                                        model.X_right: batch_x_right,
                                        model.Y : batch_y}).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


In [19]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['not similar', 'similar']
    )
)

             precision    recall  f1-score   support

not similar       0.83      0.83      0.83     31524
    similar       0.71      0.71      0.71     18476

avg / total       0.79      0.79      0.79     50000



In [20]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [21]:
freeze_graph('bahdanau', strings)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from bahdanau/model.ckpt
Instructions for updating:
Use tf.compat.v1.graph_util.convert_variables_to_constants
Instructions for updating:
Use tf.compat.v1.graph_util.extract_sub_graph
INFO:tensorflow:Froze 17 variables.
INFO:tensorflow:Converted 17 variables to const ops.
647 ops in the final graph.


In [22]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [23]:
g = load_graph('bahdanau/frozen_model.pb')
x1 = g.get_tensor_by_name('import/Placeholder:0')
x2 = g.get_tensor_by_name('import/Placeholder_1:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
test_sess.run(1-logits, feed_dict = {x1 : left, x2: right})

array([0.11765248], dtype=float32)

In [24]:
test_sess.run(1-logits, feed_dict = {x1 : batch_x_left, x2: batch_x_right})

array([0.4636389 , 0.5283668 , 0.43854022, 0.8202803 , 0.64394784,
       0.84979135, 0.745062  , 0.01964164, 0.07101661, 0.02169931,
       0.8392247 , 0.22707516, 0.19469285, 0.4840045 , 0.05370182,
       0.4678564 , 0.4111814 , 0.11001766, 0.20520616, 0.07242185,
       0.7431572 , 0.52817804, 0.4351002 , 0.63338685, 0.52839124,
       0.07311231, 0.1716168 , 0.09279257, 0.02310717, 0.02681172,
       0.2308088 , 0.551746  , 0.8105283 , 0.66022396, 0.739179  ,
       0.38779128, 0.8515695 , 0.7534613 , 0.05358309, 0.05516434,
       0.63869566, 0.7444098 , 0.63428354, 0.49298012, 0.75610924,
       0.54483724, 0.9024776 , 0.05228931, 0.05101156, 0.02496451,
       0.7684243 , 0.37446058, 0.8911811 , 0.39399248, 0.04925126,
       0.89727813, 0.34909683, 0.09850705, 0.04967946, 0.05255091,
       0.58232725, 0.40308565, 0.68486273, 0.41244376, 0.06464297,
       0.07472116, 0.06430554, 0.42752308, 0.10852087, 0.0495699 ,
       0.11905402, 0.26009667, 0.53447616, 0.88553053, 0.04034