In [1]:
import tensorflow as tf
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import collections
import itertools
from unidecode import unidecode
import malaya
import re
import json

In [2]:
def build_dataset(words, n_words, atleast=2):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words - 10)
    counter = [i for i in counter if i[1] >= atleast]
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

def str_idx(corpus, dic, maxlen, UNK = 3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen]):
            X[i, no] = dic.get(k, UNK)
    return X

tokenizer = malaya.preprocessing._SocialTokenizer().tokenize

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    tokenized = tokenizer(string)
    tokenized = [w.lower() for w in tokenized if len(w) > 2]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

In [3]:
with open('train-similarity.json') as fopen:
    train = json.load(fopen)
    
left, right, label = train['left'], train['right'], train['label']

In [4]:
with open('test-similarity.json') as fopen:
    test = json.load(fopen)
test_left, test_right, test_label = test['left'], test['right'], test['label']

In [5]:
np.unique(label, return_counts = True)

(array([0, 1]), array([2605321, 1531070]))

In [6]:
with open('similarity-dictionary.json') as fopen:
    x = json.load(fopen)
dictionary = x['dictionary']
rev_dictionary = x['reverse_dictionary']

In [7]:
def position_encoding(inputs):
    T = tf.shape(inputs)[1]
    repr_dim = inputs.get_shape()[-1].value
    pos = tf.reshape(tf.range(0.0, tf.to_float(T), dtype=tf.float32), [-1, 1])
    i = np.arange(0, repr_dim, 2, np.float32)
    denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])
    enc = tf.expand_dims(tf.concat([tf.sin(pos / denom), tf.cos(pos / denom)], 1), 0)
    return tf.tile(enc, [tf.shape(inputs)[0], 1, 1])

def layer_norm(inputs, epsilon=1e-8):
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    normalized = (inputs - mean) / (tf.sqrt(variance + epsilon))
    params_shape = inputs.get_shape()[-1:]
    gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer())
    return gamma * normalized + beta

def self_attention(inputs, is_training, num_units, num_heads = 8, activation=None):
    T_q = T_k = tf.shape(inputs)[1]
    Q_K_V = tf.layers.dense(inputs, 3*num_units, activation)
    Q, K, V = tf.split(Q_K_V, 3, -1)
    Q_ = tf.concat(tf.split(Q, num_heads, axis=2), 0)
    K_ = tf.concat(tf.split(K, num_heads, axis=2), 0)
    V_ = tf.concat(tf.split(V, num_heads, axis=2), 0)
    align = tf.matmul(Q_, K_, transpose_b=True)
    align *= tf.rsqrt(tf.to_float(K_.get_shape()[-1].value))
    paddings = tf.fill(tf.shape(align), float('-inf'))
    lower_tri = tf.ones([T_q, T_k])
    lower_tri = tf.linalg.LinearOperatorLowerTriangular(lower_tri).to_dense()
    masks = tf.tile(tf.expand_dims(lower_tri,0), [tf.shape(align)[0],1,1])
    align = tf.where(tf.equal(masks, 0), paddings, align)
    align = tf.nn.softmax(align)
    align = tf.layers.dropout(align, 0.1, training=is_training) 
    x = tf.matmul(align, V_)
    x = tf.concat(tf.split(x, num_heads, axis=0), 2)
    x += inputs
    x = layer_norm(x)
    return x

def ffn(inputs, hidden_dim, activation=tf.nn.relu):
    x = tf.layers.conv1d(inputs, 4* hidden_dim, 1, activation=activation) 
    x = tf.layers.conv1d(x, hidden_dim, 1, activation=None)
    x += inputs
    x = layer_norm(x)
    return x

class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, learning_rate, dropout, kernel_size = 5):
        
        def cnn(x, scope):
            x += position_encoding(x)
            with tf.variable_scope(scope, reuse = tf.AUTO_REUSE):
                for n in range(num_layers):
                    with tf.variable_scope('attn_%d'%n,reuse=tf.AUTO_REUSE):
                        x = self_attention(x, True, size_layer)
                    with tf.variable_scope('ffn_%d'%n, reuse=tf.AUTO_REUSE):
                        x = ffn(x, size_layer)
                
                with tf.variable_scope('logits', reuse=tf.AUTO_REUSE):
                    return tf.layers.dense(x, size_layer)[:, -1]
        
        self.X_left = tf.placeholder(tf.int32, [None, None])
        self.X_right = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None])
        self.batch_size = tf.shape(self.X_left)[0]
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        embedded_left = tf.nn.embedding_lookup(encoder_embeddings, self.X_left)
        embedded_right = tf.nn.embedding_lookup(encoder_embeddings, self.X_right)
        
        def contrastive_loss(y,d):
            tmp= y * tf.square(d)
            tmp2 = (1-y) * tf.square(tf.maximum((1 - d),0))
            return tf.reduce_sum(tmp +tmp2)/tf.cast(self.batch_size,tf.float32)/2
        
        self.output_left = cnn(embedded_left, 'left')
        self.output_right = cnn(embedded_right, 'right')
        print(self.output_left, self.output_right)
        self.distance = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(self.output_left,self.output_right)),
                                              1,keep_dims=True))
        self.distance = tf.div(self.distance, tf.add(tf.sqrt(tf.reduce_sum(tf.square(self.output_left),
                                                                           1,keep_dims=True)),
                                                     tf.sqrt(tf.reduce_sum(tf.square(self.output_right),
                                                                           1,keep_dims=True))))
        self.distance = tf.reshape(self.distance, [-1])
        self.logits = tf.identity(self.distance, name = 'logits')
        self.cost = contrastive_loss(self.Y,self.distance)
        
        self.temp_sim = tf.subtract(tf.ones_like(self.distance),
                                    tf.rint(self.distance))
        correct_predictions = tf.equal(self.temp_sim, self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)

In [8]:
size_layer = 128
num_layers = 4
embedded_size = 128
learning_rate = 1e-4
maxlen = 50
batch_size = 128
dropout = 0.8

In [9]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(dictionary),learning_rate,dropout)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'self-attention/model.ckpt')

Tensor("left/logits/strided_slice:0", shape=(?, 128), dtype=float32) Tensor("right/logits/strided_slice:0", shape=(?, 128), dtype=float32)
Instructions for updating:
keep_dims is deprecated, use keepdims instead


'self-attention/model.ckpt'

In [10]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and '_power' not in n.name
        and 'gradient' not in n.name
        and 'Initializer' not in n.name
        and 'Assign' not in n.name
    ]
)

In [11]:
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 2, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(range(0, len(left), batch_size), desc='train minibatch loop')
    for i in pbar:
        index = min(i+batch_size,len(left))
        batch_x_left = str_idx(left[i: index], dictionary, maxlen)
        batch_x_right = str_idx(right[i: index], dictionary, maxlen)
        batch_y = label[i:index]
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X_left : batch_x_left, 
                                        model.X_right: batch_x_right,
                                        model.Y : batch_y})
        assert not np.isnan(loss)
        train_loss += loss
        train_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    pbar = tqdm(range(0, len(test_left), batch_size), desc='test minibatch loop')
    for i in pbar:
        index = min(i+batch_size,len(test_left))
        batch_x_left = str_idx(test_left[i: index], dictionary, maxlen)
        batch_x_right = str_idx(test_right[i: index], dictionary, maxlen)
        batch_y = test_label[i: index]
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X_left : batch_x_left, 
                                        model.X_right: batch_x_right,
                                        model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    train_loss /= (len(left) / batch_size)
    train_acc /= (len(left) / batch_size)
    test_loss /= (len(test_left) / batch_size)
    test_acc /= (len(test_left) / batch_size)
    
    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
    
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))

train minibatch loop: 100%|██████████| 32316/32316 [1:48:08<00:00,  5.65it/s, accuracy=0.549, cost=0.123] 
test minibatch loop: 100%|██████████| 391/391 [00:29<00:00, 13.98it/s, accuracy=0.725, cost=0.089] 
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s, accuracy=0.727, cost=0.1]

epoch: 0, pass acc: 0.000000, current acc: 0.745696
time taken: 6518.2529237270355
epoch: 0, training loss: 0.091813, training acc: 0.719950, valid loss: 0.085040, valid acc: 0.745696



train minibatch loop: 100%|██████████| 32316/32316 [1:48:08<00:00,  5.66it/s, accuracy=0.662, cost=0.108]  
test minibatch loop: 100%|██████████| 391/391 [00:29<00:00, 13.18it/s, accuracy=0.688, cost=0.0854]
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s, accuracy=0.648, cost=0.101]

epoch: 0, pass acc: 0.745696, current acc: 0.762640
time taken: 6518.055644273758
epoch: 0, training loss: 0.079587, training acc: 0.766882, valid loss: 0.080575, valid acc: 0.762640



train minibatch loop:  76%|███████▌  | 24603/32316 [1:22:18<25:46,  4.99it/s, accuracy=0.852, cost=0.0366]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop: 100%|██████████| 32316/32316 [1:48:07<00:00,  5.66it/s, accuracy=0.69, cost=0.0892] 
test minibatch loop: 100%|██████████| 391/391 [00:29<00:00, 13.19it/s, accuracy=0.75, cost=0.077]  
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.762640, current acc: 0.768700
time taken: 6516.856334209442
epoch: 0, training loss: 0.073044, training acc: 0.790378, valid loss: 0.079073, valid acc: 0.768700



train minibatch loop: 100%|██████████| 32316/32316 [1:48:06<00:00,  5.66it/s, accuracy=0.761, cost=0.067]  
test minibatch loop: 100%|██████████| 391/391 [00:29<00:00, 13.23it/s, accuracy=0.762, cost=0.0771]
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s, accuracy=0.672, cost=0.0856]

epoch: 0, pass acc: 0.768700, current acc: 0.769312
time taken: 6516.553435564041
epoch: 0, training loss: 0.067276, training acc: 0.811340, valid loss: 0.079457, valid acc: 0.769312



train minibatch loop: 100%|██████████| 32316/32316 [1:48:07<00:00,  5.66it/s, accuracy=0.887, cost=0.039]  
test minibatch loop: 100%|██████████| 391/391 [00:29<00:00, 13.18it/s, accuracy=0.788, cost=0.0811]
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s, accuracy=0.703, cost=0.0815]

epoch: 0, pass acc: 0.769312, current acc: 0.771576
time taken: 6516.8461327552795
epoch: 0, training loss: 0.061893, training acc: 0.830197, valid loss: 0.079747, valid acc: 0.771576



train minibatch loop: 100%|██████████| 32316/32316 [1:48:07<00:00,  5.66it/s, accuracy=0.972, cost=0.0323] 
test minibatch loop: 100%|██████████| 391/391 [00:29<00:00, 13.17it/s, accuracy=0.775, cost=0.0762]
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s, accuracy=0.758, cost=0.0717]

epoch: 0, pass acc: 0.771576, current acc: 0.773124
time taken: 6517.283529996872
epoch: 0, training loss: 0.056825, training acc: 0.846921, valid loss: 0.080187, valid acc: 0.773124



train minibatch loop: 100%|██████████| 32316/32316 [1:48:07<00:00,  5.66it/s, accuracy=1, cost=0.0204]     
test minibatch loop: 100%|██████████| 391/391 [00:29<00:00, 13.19it/s, accuracy=0.762, cost=0.0719]
train minibatch loop:   0%|          | 0/32316 [00:00<?, ?it/s]

time taken: 6516.9770267009735
epoch: 0, training loss: 0.052224, training acc: 0.861832, valid loss: 0.081519, valid acc: 0.770692



train minibatch loop: 100%|██████████| 32316/32316 [1:48:06<00:00,  5.66it/s, accuracy=1, cost=0.0112]     
test minibatch loop: 100%|██████████| 391/391 [00:29<00:00, 13.18it/s, accuracy=0.75, cost=0.0798] 

time taken: 6516.600977897644
epoch: 0, training loss: 0.048151, training acc: 0.874642, valid loss: 0.081898, valid acc: 0.772720

break epoch:0






In [12]:
saver.save(sess, 'self-attention/model.ckpt')

'self-attention/model.ckpt'

In [13]:
left = str_idx(['a person is outdoors, on a horse.'], dictionary, maxlen)
right = str_idx(['a person on a horse jumps over a broken down airplane.'], dictionary, maxlen)
sess.run([model.temp_sim,1-model.distance], feed_dict = {model.X_left : left, 
                                        model.X_right: right})

[array([0.], dtype=float32), array([0.02327037], dtype=float32)]

In [14]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_left), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i+batch_size,len(test_left))
    batch_x_left = str_idx(test_left[i: index], dictionary, maxlen)
    batch_x_right = str_idx(test_right[i: index], dictionary, maxlen)
    batch_y = test_label[i: index]
    predict_Y += sess.run(model.temp_sim, feed_dict = {model.X_left : batch_x_left, 
                                        model.X_right: batch_x_right,
                                        model.Y : batch_y}).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 391/391 [00:29<00:00, 14.17it/s]


In [15]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['not similar', 'similar']
    )
)

             precision    recall  f1-score   support

not similar       0.81      0.83      0.82     31524
    similar       0.70      0.67      0.68     18476

avg / total       0.77      0.77      0.77     50000



In [16]:
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Variable',
 'left/attn_0/dense/kernel',
 'left/attn_0/dense/bias',
 'left/attn_0/gamma',
 'left/attn_0/beta',
 'left/ffn_0/conv1d/kernel',
 'left/ffn_0/conv1d/bias',
 'left/ffn_0/conv1d_1/kernel',
 'left/ffn_0/conv1d_1/bias',
 'left/ffn_0/gamma',
 'left/ffn_0/beta',
 'left/attn_1/dense/kernel',
 'left/attn_1/dense/bias',
 'left/attn_1/gamma',
 'left/attn_1/beta',
 'left/ffn_1/conv1d/kernel',
 'left/ffn_1/conv1d/bias',
 'left/ffn_1/conv1d_1/kernel',
 'left/ffn_1/conv1d_1/bias',
 'left/ffn_1/gamma',
 'left/ffn_1/beta',
 'left/attn_2/dense/kernel',
 'left/attn_2/dense/bias',
 'left/attn_2/gamma',
 'left/attn_2/beta',
 'left/ffn_2/conv1d/kernel',
 'left/ffn_2/conv1d/bias',
 'left/ffn_2/conv1d_1/kernel',
 'left/ffn_2/conv1d_1/bias',
 'left/ffn_2/gamma',
 'left/ffn_2/beta',
 'left/attn_3/dense/kernel',
 'left/attn_3/dense/bias',
 'left/attn_3/gamma',
 'left/attn_3/beta',
 'left/ffn_3/conv1d/kernel',
 'left/ffn_3/conv1d/bias',
 'left/ffn_3

In [17]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [18]:
freeze_graph('self-attention', strings)

INFO:tensorflow:Restoring parameters from self-attention/model.ckpt
INFO:tensorflow:Froze 85 variables.
INFO:tensorflow:Converted 85 variables to const ops.
1637 ops in the final graph.


In [19]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [20]:
g = load_graph('self-attention/frozen_model.pb')
x1 = g.get_tensor_by_name('import/Placeholder:0')
x2 = g.get_tensor_by_name('import/Placeholder_1:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
test_sess.run(1-logits, feed_dict = {x1 : left, x2: right})

array([0.01998395], dtype=float32)

In [21]:
test_sess.run(1-logits, feed_dict = {x1 : batch_x_left, x2: batch_x_right})

array([0.2318753 , 0.5197979 , 0.2777239 , 0.14316326, 0.8766695 ,
       0.22495192, 0.91102034, 0.0115208 , 0.070916  , 0.07542306,
       0.94589764, 0.04265296, 0.34291208, 0.43791467, 0.13047814,
       0.05099976, 0.04077601, 0.03098774, 0.05358207, 0.09898269,
       0.4222178 , 0.07683033, 0.27565062, 0.18730605, 0.34941596,
       0.08564615, 0.19999826, 0.05309838, 0.04758018, 0.01607895,
       0.13069487, 0.6605412 , 0.9515858 , 0.16830862, 0.5734025 ,
       0.5354396 , 0.749179  , 0.2538219 , 0.0801577 , 0.05013776,
       0.4355023 , 0.45459825, 0.03258169, 0.15339905, 0.9313603 ,
       0.42679828, 0.95682436, 0.07610172, 0.03255141, 0.00740314,
       0.52017945, 0.46709698, 0.74399465, 0.45834607, 0.02888119,
       0.9627122 , 0.1260702 , 0.03194386, 0.11266536, 0.05345899,
       0.5395947 , 0.34424478, 0.73064005, 0.17178106, 0.76854   ,
       0.03258795, 0.06777585, 0.8709656 , 0.09303659, 0.03535146,
       0.07395506, 0.06536621, 0.1412226 , 0.94608825, 0.07875