In [1]:
import tensorflow as tf
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import collections
import itertools
from unidecode import unidecode
import malaya
import re
import json

In [2]:
def build_dataset(words, n_words, atleast=2):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words - 10)
    counter = [i for i in counter if i[1] >= atleast]
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

def str_idx(corpus, dic, maxlen, UNK = 3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen]):
            X[i, no] = dic.get(k, UNK)
    return X

tokenizer = malaya.preprocessing._SocialTokenizer().tokenize

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    tokenized = tokenizer(string)
    tokenized = [w.lower() for w in tokenized if len(w) > 2]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

In [3]:
with open('train-similarity.json') as fopen:
    train = json.load(fopen)
    
left, right, label = train['left'], train['right'], train['label']

In [4]:
with open('test-similarity.json') as fopen:
    test = json.load(fopen)
test_left, test_right, test_label = test['left'], test['right'], test['label']

In [5]:
np.unique(label, return_counts = True)

(array([0, 1]), array([2605321, 1531070]))

In [6]:
with open('similarity-dictionary.json') as fopen:
    x = json.load(fopen)
dictionary = x['dictionary']
rev_dictionary = x['reverse_dictionary']

In [7]:
def position_encoding(inputs):
    T = tf.shape(inputs)[1]
    repr_dim = inputs.get_shape()[-1].value
    pos = tf.reshape(tf.range(0.0, tf.to_float(T), dtype=tf.float32), [-1, 1])
    i = np.arange(0, repr_dim, 2, np.float32)
    denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])
    enc = tf.expand_dims(tf.concat([tf.sin(pos / denom), tf.cos(pos / denom)], 1), 0)
    return tf.tile(enc, [tf.shape(inputs)[0], 1, 1])

def layer_norm(inputs, epsilon=1e-8):
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    normalized = (inputs - mean) / (tf.sqrt(variance + epsilon))
    params_shape = inputs.get_shape()[-1:]
    gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer())
    return gamma * normalized + beta

def cnn_block(x, dilation_rate, pad_sz, hidden_dim, kernel_size):
    x = layer_norm(x)
    pad = tf.zeros([tf.shape(x)[0], pad_sz, hidden_dim])
    x =  tf.layers.conv1d(inputs = tf.concat([pad, x, pad], 1),
                          filters = hidden_dim,
                          kernel_size = kernel_size,
                          dilation_rate = dilation_rate)
    x = x[:, :-pad_sz, :]
    x = tf.nn.relu(x)
    return x

class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, learning_rate, dropout, kernel_size = 5):
        
        def cnn(x, scope):
            x += position_encoding(x)
            with tf.variable_scope(scope, reuse = tf.AUTO_REUSE):
                for n in range(num_layers):
                    dilation_rate = 2 ** n
                    pad_sz = (kernel_size - 1) * dilation_rate 
                    with tf.variable_scope('block_%d'%n,reuse=tf.AUTO_REUSE):
                        x += cnn_block(x, dilation_rate, pad_sz, size_layer, kernel_size)
                
                with tf.variable_scope('logits', reuse=tf.AUTO_REUSE):
                    return tf.layers.dense(x, size_layer)[:, -1]
        
        self.X_left = tf.placeholder(tf.int32, [None, None])
        self.X_right = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None])
        self.batch_size = tf.shape(self.X_left)[0]
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        embedded_left = tf.nn.embedding_lookup(encoder_embeddings, self.X_left)
        embedded_right = tf.nn.embedding_lookup(encoder_embeddings, self.X_right)
        
        def contrastive_loss(y,d):
            tmp= y * tf.square(d)
            tmp2 = (1-y) * tf.square(tf.maximum((1 - d),0))
            return tf.reduce_sum(tmp +tmp2)/tf.cast(self.batch_size,tf.float32)/2
        
        self.output_left = cnn(embedded_left, 'left')
        self.output_right = cnn(embedded_right, 'right')
        print(self.output_left, self.output_right)
        self.distance = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(self.output_left,self.output_right)),
                                              1,keep_dims=True))
        self.distance = tf.div(self.distance, tf.add(tf.sqrt(tf.reduce_sum(tf.square(self.output_left),
                                                                           1,keep_dims=True)),
                                                     tf.sqrt(tf.reduce_sum(tf.square(self.output_right),
                                                                           1,keep_dims=True))))
        self.distance = tf.reshape(self.distance, [-1])
        self.logits = tf.identity(self.distance, name = 'logits')
        self.cost = contrastive_loss(self.Y,self.distance)
        
        self.temp_sim = tf.subtract(tf.ones_like(self.distance),
                                    tf.rint(self.distance))
        correct_predictions = tf.equal(self.temp_sim, self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)

In [8]:
size_layer = 128
num_layers = 4
embedded_size = 128
learning_rate = 1e-3
maxlen = 50
batch_size = 128
dropout = 0.8

In [9]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(dictionary),learning_rate,dropout)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'dilated-cnn/model.ckpt')

Tensor("left/logits/strided_slice:0", shape=(?, 128), dtype=float32) Tensor("right/logits/strided_slice:0", shape=(?, 128), dtype=float32)
Instructions for updating:
keep_dims is deprecated, use keepdims instead


'dilated-cnn/model.ckpt'

In [10]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and '_power' not in n.name
        and 'gradient' not in n.name
        and 'Initializer' not in n.name
        and 'Assign' not in n.name
    ]
)

In [11]:
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 2, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(range(0, len(left), batch_size), desc='train minibatch loop')
    for i in pbar:
        index = min(i+batch_size,len(left))
        batch_x_left = str_idx(left[i: index], dictionary, maxlen)
        batch_x_right = str_idx(right[i: index], dictionary, maxlen)
        batch_y = label[i:index]
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X_left : batch_x_left, 
                                        model.X_right: batch_x_right,
                                        model.Y : batch_y})
        assert not np.isnan(loss)
        train_loss += loss
        train_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    pbar = tqdm(range(0, len(test_left), batch_size), desc='test minibatch loop')
    for i in pbar:
        index = min(i+batch_size,len(test_left))
        batch_x_left = str_idx(test_left[i: index], dictionary, maxlen)
        batch_x_right = str_idx(test_right[i: index], dictionary, maxlen)
        batch_y = test_label[i: index]
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X_left : batch_x_left, 
                                        model.X_right: batch_x_right,
                                        model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    train_loss /= (len(left) / batch_size)
    train_acc /= (len(left) / batch_size)
    test_loss /= (len(test_left) / batch_size)
    test_acc /= (len(test_left) / batch_size)
    
    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
    
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))

train minibatch loop: 100%|██████████| 32316/32316 [39:10<00:00, 13.75it/s, accuracy=0.549, cost=0.119] 
test minibatch loop: 100%|██████████| 391/391 [00:08<00:00, 44.01it/s, accuracy=0.725, cost=0.0879]
train minibatch loop:   0%|          | 2/32316 [00:00<39:09, 13.76it/s, accuracy=0.852, cost=0.0575]

epoch: 0, pass acc: 0.000000, current acc: 0.754236
time taken: 2359.090886592865
epoch: 0, training loss: 0.088377, training acc: 0.736193, valid loss: 0.083123, valid acc: 0.754236



train minibatch loop: 100%|██████████| 32316/32316 [39:10<00:00, 14.49it/s, accuracy=0.577, cost=0.0901] 
test minibatch loop: 100%|██████████| 391/391 [00:08<00:00, 45.23it/s, accuracy=0.712, cost=0.0807]
train minibatch loop:   0%|          | 2/32316 [00:00<38:55, 13.84it/s, accuracy=0.898, cost=0.0552]

epoch: 0, pass acc: 0.754236, current acc: 0.770444
time taken: 2359.3658940792084
epoch: 0, training loss: 0.075040, training acc: 0.782331, valid loss: 0.078482, valid acc: 0.770444



train minibatch loop: 100%|██████████| 32316/32316 [39:12<00:00, 14.52it/s, accuracy=0.775, cost=0.0608] 
test minibatch loop: 100%|██████████| 391/391 [00:08<00:00, 45.24it/s, accuracy=0.725, cost=0.0816]
train minibatch loop:   0%|          | 2/32316 [00:00<39:00, 13.81it/s, accuracy=0.906, cost=0.0465]

epoch: 0, pass acc: 0.770444, current acc: 0.773316
time taken: 2360.77591919899
epoch: 0, training loss: 0.065331, training acc: 0.815129, valid loss: 0.078364, valid acc: 0.773316



train minibatch loop: 100%|██████████| 32316/32316 [39:12<00:00, 14.52it/s, accuracy=0.831, cost=0.0589] 
test minibatch loop: 100%|██████████| 391/391 [00:08<00:00, 45.04it/s, accuracy=0.712, cost=0.088] 
train minibatch loop:   0%|          | 2/32316 [00:00<39:09, 13.75it/s, accuracy=0.945, cost=0.0312]

time taken: 2361.537866592407
epoch: 0, training loss: 0.055691, training acc: 0.847078, valid loss: 0.078892, valid acc: 0.772364



train minibatch loop: 100%|██████████| 32316/32316 [39:10<00:00, 14.47it/s, accuracy=0.901, cost=0.0346] 
test minibatch loop: 100%|██████████| 391/391 [00:08<00:00, 45.22it/s, accuracy=0.725, cost=0.0924]

time taken: 2359.5909333229065
epoch: 0, training loss: 0.046877, training acc: 0.875735, valid loss: 0.080502, valid acc: 0.771336

break epoch:0






In [12]:
saver.save(sess, 'dilated-cnn/model.ckpt')

'dilated-cnn/model.ckpt'

In [13]:
left = str_idx(['a person is outdoors, on a horse.'], dictionary, maxlen)
right = str_idx(['a person on a horse jumps over a broken down airplane.'], dictionary, maxlen)
sess.run([model.temp_sim,1-model.distance], feed_dict = {model.X_left : left, 
                                        model.X_right: right})

[array([0.], dtype=float32), array([0.0343591], dtype=float32)]

In [14]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_left), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i+batch_size,len(test_left))
    batch_x_left = str_idx(test_left[i: index], dictionary, maxlen)
    batch_x_right = str_idx(test_right[i: index], dictionary, maxlen)
    batch_y = test_label[i: index]
    predict_Y += sess.run(model.temp_sim, feed_dict = {model.X_left : batch_x_left, 
                                        model.X_right: batch_x_right,
                                        model.Y : batch_y}).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 391/391 [00:08<00:00, 47.37it/s]


In [15]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['not similar', 'similar']
    )
)

             precision    recall  f1-score   support

not similar       0.82      0.82      0.82     31524
    similar       0.69      0.69      0.69     18476

avg / total       0.77      0.77      0.77     50000



In [16]:
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Variable',
 'left/block_0/gamma',
 'left/block_0/beta',
 'left/block_0/conv1d/kernel',
 'left/block_0/conv1d/bias',
 'left/block_1/gamma',
 'left/block_1/beta',
 'left/block_1/conv1d/kernel',
 'left/block_1/conv1d/bias',
 'left/block_2/gamma',
 'left/block_2/beta',
 'left/block_2/conv1d/kernel',
 'left/block_2/conv1d/bias',
 'left/block_3/gamma',
 'left/block_3/beta',
 'left/block_3/conv1d/kernel',
 'left/block_3/conv1d/bias',
 'left/logits/dense/kernel',
 'left/logits/dense/kernel/read',
 'left/logits/dense/bias',
 'left/logits/dense/bias/read',
 'left/logits/dense/Tensordot/Shape',
 'left/logits/dense/Tensordot/Rank',
 'left/logits/dense/Tensordot/axes',
 'left/logits/dense/Tensordot/GreaterEqual/y',
 'left/logits/dense/Tensordot/GreaterEqual',
 'left/logits/dense/Tensordot/Cast',
 'left/logits/dense/Tensordot/mul',
 'left/logits/dense/Tensordot/Less/y',
 'left/logits/dense/Tensordot/Less',
 'left/logits/dense/Tensordot/Cast_1',
 

In [17]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [18]:
freeze_graph('dilated-cnn', strings)

INFO:tensorflow:Restoring parameters from dilated-cnn/model.ckpt
INFO:tensorflow:Froze 37 variables.
INFO:tensorflow:Converted 37 variables to const ops.
875 ops in the final graph.


In [19]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [20]:
g = load_graph('dilated-cnn/frozen_model.pb')
x1 = g.get_tensor_by_name('import/Placeholder:0')
x2 = g.get_tensor_by_name('import/Placeholder_1:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
test_sess.run(1-logits, feed_dict = {x1 : left, x2: right})

array([0.0343591], dtype=float32)

In [21]:
test_sess.run(1-logits, feed_dict = {x1 : batch_x_left, x2: batch_x_right})

array([0.21152252, 0.14559478, 0.20776057, 0.70417494, 0.49244803,
       0.33945912, 0.9202117 , 0.02324635, 0.12748677, 0.8314166 ,
       0.975024  , 0.62006444, 0.18129557, 0.6427861 , 0.07265455,
       0.4061333 , 0.18890274, 0.02502632, 0.0484429 , 0.10148406,
       0.8321909 , 0.05768776, 0.55261767, 0.6817114 , 0.11403704,
       0.44246477, 0.4924479 , 0.18728226, 0.07191038, 0.05914503,
       0.0800122 , 0.3046261 , 0.60251844, 0.761145  , 0.95517516,
       0.88605934, 0.814803  , 0.07416344, 0.06447667, 0.03957129,
       0.03240418, 0.75431895, 0.6757686 , 0.76394105, 0.9388763 ,
       0.24763906, 0.98832715, 0.05210805, 0.02429408, 0.12788087,
       0.1121434 , 0.8168456 , 0.9283892 , 0.5351901 , 0.01739019,
       0.9779401 , 0.02959573, 0.07608068, 0.16026843, 0.07550842,
       0.6336924 , 0.23004955, 0.8670918 , 0.68216723, 0.06849951,
       0.02407455, 0.01773602, 0.88574535, 0.06930637, 0.01752573,
       0.02795351, 0.5855931 , 0.1376006 , 0.958021  , 0.00917