In [1]:
# !wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv

In [2]:
import tensorflow as tf
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import collections
from unidecode import unidecode
from sklearn.cross_validation import train_test_split



In [3]:
def build_dataset(words, n_words):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

def str_idx(corpus, dic, maxlen, UNK=3):
    X = np.zeros((len(corpus),maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen][::-1]):
            val = dic[k] if k in dic else UNK
            X[i,-1 - no]= val
    return X

def cleaning(string):
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z\- ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string.lower()

In [4]:
df = pd.read_csv('quora_duplicate_questions.tsv', delimiter='\t').dropna()
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
left, right, label = df['question1'].tolist(), df['question2'].tolist(), df['is_duplicate'].tolist()

In [6]:
np.unique(label, return_counts = True)

(array([0, 1]), array([255024, 149263]))

In [7]:
for i in tqdm(range(len(left))):
    left[i] = cleaning(left[i])
    right[i] = cleaning(right[i])

100%|██████████| 404287/404287 [00:07<00:00, 54874.65it/s]


In [8]:
concat = ' '.join(left + right).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 87661
Most common words [('the', 377593), ('what', 324635), ('is', 269934), ('i', 223893), ('how', 220876), ('a', 212757)]
Sample data [5, 6, 4, 1285, 62, 1285, 2501, 10, 564, 11] ['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in']


In [9]:
class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, learning_rate, dropout):
        
        def cells(size, reuse=False):
            cell = tf.nn.rnn_cell.LSTMCell(size,initializer=tf.orthogonal_initializer(),reuse=reuse)
            return tf.contrib.rnn.DropoutWrapper(cell,output_keep_prob=dropout)
        
        def birnn(inputs, scope):
            with tf.variable_scope(scope, reuse = tf.AUTO_REUSE):
                for n in range(num_layers):
                    (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw = cells(size_layer // 2),
                        cell_bw = cells(size_layer // 2),
                        inputs = inputs,
                        dtype = tf.float32,
                        scope = 'bidirectional_rnn_%d'%(n))
                    inputs = tf.concat((out_fw, out_bw), 2)
                return inputs[:,-1]
        
        self.X_left = tf.placeholder(tf.int32, [None, None])
        self.X_right = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None])
        self.batch_size = tf.shape(self.X_left)[0]
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        embedded_left = tf.nn.embedding_lookup(encoder_embeddings, self.X_left)
        embedded_right = tf.nn.embedding_lookup(encoder_embeddings, self.X_right)
        
        def contrastive_loss(y,d):
            tmp= y * tf.square(d)
            tmp2 = (1-y) * tf.square(tf.maximum((1 - d),0))
            return tf.reduce_sum(tmp +tmp2)/tf.cast(self.batch_size,tf.float32)/2
        
        self.output_left = birnn(embedded_left, 'left')
        self.output_right = birnn(embedded_right, 'right')
        self.distance = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(self.output_left,self.output_right)),
                                              1,keep_dims=True))
        self.distance = tf.div(self.distance, tf.add(tf.sqrt(tf.reduce_sum(tf.square(self.output_left),
                                                                           1,keep_dims=True)),
                                                     tf.sqrt(tf.reduce_sum(tf.square(self.output_right),
                                                                           1,keep_dims=True))))
        self.distance = tf.reshape(self.distance, [-1])
        self.cost = contrastive_loss(self.Y,self.distance)
        
        self.temp_sim = tf.subtract(tf.ones_like(self.distance),
                                    tf.rint(self.distance))
        correct_predictions = tf.equal(self.temp_sim, self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)

In [10]:
size_layer = 256
num_layers = 2
embedded_size = 128
learning_rate = 1e-3
maxlen = 50
batch_size = 128
dropout = 0.8

In [11]:
from sklearn.cross_validation import train_test_split

vectors_left = str_idx(left, dictionary, maxlen)
vectors_right = str_idx(right, dictionary, maxlen)
train_X_left, test_X_left, train_X_right, test_X_right, train_Y, test_Y = train_test_split(vectors_left,
                                                                                           vectors_right,
                                                                                           label,
                                                                                           test_size = 0.2)

In [12]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(dictionary),learning_rate,dropout)
sess.run(tf.global_variables_initializer())

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.cast instead.


In [13]:
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(range(0, len(train_X_left), batch_size), desc='train minibatch loop')
    for i in pbar:
        batch_x_left = train_X_left[i:min(i+batch_size,train_X_left.shape[0])]
        batch_x_right = train_X_right[i:min(i+batch_size,train_X_left.shape[0])]
        batch_y = train_Y[i:min(i+batch_size,train_X_left.shape[0])]
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X_left : batch_x_left, 
                                        model.X_right: batch_x_right,
                                        model.Y : batch_y})
        assert not np.isnan(loss)
        train_loss += loss
        train_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    pbar = tqdm(range(0, len(test_X_left), batch_size), desc='test minibatch loop')
    for i in pbar:
        batch_x_left = test_X_left[i:min(i+batch_size,train_X_left.shape[0])]
        batch_x_right = test_X_right[i:min(i+batch_size,train_X_left.shape[0])]
        batch_y = test_Y[i:min(i+batch_size,train_X_left.shape[0])]
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X_left : batch_x_left, 
                                        model.X_right: batch_x_right,
                                        model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    train_loss /= (len(train_X_left) / batch_size)
    train_acc /= (len(train_X_left) / batch_size)
    test_loss /= (len(test_X_left) / batch_size)
    test_acc /= (len(test_X_left) / batch_size)
    
    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
    
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))

train minibatch loop: 100%|██████████| 2527/2527 [12:54<00:00,  3.32it/s, accuracy=0.762, cost=0.0892]
test minibatch loop: 100%|██████████| 632/632 [01:30<00:00,  7.02it/s, accuracy=0.611, cost=0.114] 
train minibatch loop:   0%|          | 0/2527 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.721205
time taken: 865.0403523445129
epoch: 0, training loss: 0.102127, training acc: 0.692444, valid loss: 0.095351, valid acc: 0.721205



train minibatch loop: 100%|██████████| 2527/2527 [12:50<00:00,  3.28it/s, accuracy=0.733, cost=0.0808]
test minibatch loop: 100%|██████████| 632/632 [01:31<00:00,  6.93it/s, accuracy=0.644, cost=0.106] 
train minibatch loop:   0%|          | 0/2527 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.721205, current acc: 0.743804
time taken: 861.1270344257355
epoch: 0, training loss: 0.092396, training acc: 0.733870, valid loss: 0.089960, valid acc: 0.743804



train minibatch loop: 100%|██████████| 2527/2527 [12:51<00:00,  3.32it/s, accuracy=0.802, cost=0.0735]
test minibatch loop: 100%|██████████| 632/632 [01:30<00:00,  6.93it/s, accuracy=0.644, cost=0.105] 
train minibatch loop:   0%|          | 0/2527 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.743804, current acc: 0.754440
time taken: 861.8881492614746
epoch: 0, training loss: 0.088065, training acc: 0.751199, valid loss: 0.087837, valid acc: 0.754440



train minibatch loop: 100%|██████████| 2527/2527 [12:50<00:00,  3.27it/s, accuracy=0.842, cost=0.0697]
test minibatch loop: 100%|██████████| 632/632 [01:30<00:00,  7.03it/s, accuracy=0.667, cost=0.104] 
train minibatch loop:   0%|          | 0/2527 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.754440, current acc: 0.757023
time taken: 861.7144618034363
epoch: 0, training loss: 0.085004, training acc: 0.764099, valid loss: 0.086727, valid acc: 0.757023



train minibatch loop: 100%|██████████| 2527/2527 [12:51<00:00,  3.22it/s, accuracy=0.812, cost=0.0724]
test minibatch loop: 100%|██████████| 632/632 [01:31<00:00,  6.93it/s, accuracy=0.633, cost=0.11]  
train minibatch loop:   0%|          | 0/2527 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.757023, current acc: 0.760754
time taken: 862.4683222770691
epoch: 0, training loss: 0.082544, training acc: 0.773236, valid loss: 0.085892, valid acc: 0.760754



train minibatch loop: 100%|██████████| 2527/2527 [12:50<00:00,  3.32it/s, accuracy=0.782, cost=0.0759]
test minibatch loop: 100%|██████████| 632/632 [01:30<00:00,  6.95it/s, accuracy=0.656, cost=0.108] 
train minibatch loop:   0%|          | 0/2527 [00:00<?, ?it/s]

time taken: 861.5845947265625
epoch: 0, training loss: 0.080261, training acc: 0.781377, valid loss: 0.086369, valid acc: 0.757438



train minibatch loop: 100%|██████████| 2527/2527 [12:50<00:00,  3.31it/s, accuracy=0.832, cost=0.0661]
test minibatch loop: 100%|██████████| 632/632 [01:31<00:00,  6.97it/s, accuracy=0.656, cost=0.102] 
train minibatch loop:   0%|          | 0/2527 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.760754, current acc: 0.763077
time taken: 861.8206684589386
epoch: 0, training loss: 0.078398, training acc: 0.788314, valid loss: 0.084990, valid acc: 0.763077



train minibatch loop: 100%|██████████| 2527/2527 [12:50<00:00,  3.29it/s, accuracy=0.842, cost=0.0661]
test minibatch loop: 100%|██████████| 632/632 [01:30<00:00,  6.96it/s, accuracy=0.656, cost=0.103] 
train minibatch loop:   0%|          | 0/2527 [00:00<?, ?it/s]

time taken: 861.3552474975586
epoch: 0, training loss: 0.076674, training acc: 0.795231, valid loss: 0.085479, valid acc: 0.759256



train minibatch loop: 100%|██████████| 2527/2527 [12:50<00:00,  3.23it/s, accuracy=0.851, cost=0.0647]
test minibatch loop: 100%|██████████| 632/632 [01:31<00:00,  6.93it/s, accuracy=0.656, cost=0.1]   
train minibatch loop:   0%|          | 0/2527 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.763077, current acc: 0.763510
time taken: 861.3164525032043
epoch: 0, training loss: 0.075192, training acc: 0.800024, valid loss: 0.084781, valid acc: 0.763510



train minibatch loop: 100%|██████████| 2527/2527 [12:49<00:00,  3.28it/s, accuracy=0.822, cost=0.0684]
test minibatch loop: 100%|██████████| 632/632 [01:30<00:00,  6.92it/s, accuracy=0.667, cost=0.107] 
train minibatch loop:   0%|          | 0/2527 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.763510, current acc: 0.765012
time taken: 860.8371865749359
epoch: 0, training loss: 0.073777, training acc: 0.805469, valid loss: 0.084846, valid acc: 0.765012



train minibatch loop: 100%|██████████| 2527/2527 [12:51<00:00,  3.27it/s, accuracy=0.842, cost=0.0651]
test minibatch loop: 100%|██████████| 632/632 [01:31<00:00,  6.90it/s, accuracy=0.644, cost=0.104] 
train minibatch loop:   0%|          | 0/2527 [00:00<?, ?it/s]

time taken: 862.1494925022125
epoch: 0, training loss: 0.072904, training acc: 0.808442, valid loss: 0.084983, valid acc: 0.762664



train minibatch loop: 100%|██████████| 2527/2527 [12:50<00:00,  3.29it/s, accuracy=0.802, cost=0.0664]
test minibatch loop: 100%|██████████| 632/632 [01:31<00:00,  6.96it/s, accuracy=0.678, cost=0.0966]
train minibatch loop:   0%|          | 0/2527 [00:00<?, ?it/s]

time taken: 861.4347906112671
epoch: 0, training loss: 0.072015, training acc: 0.811395, valid loss: 0.084607, valid acc: 0.763842



train minibatch loop: 100%|██████████| 2527/2527 [12:48<00:00,  3.33it/s, accuracy=0.851, cost=0.0605]
test minibatch loop: 100%|██████████| 632/632 [01:31<00:00,  6.98it/s, accuracy=0.667, cost=0.0982]

time taken: 859.5523777008057
epoch: 0, training loss: 0.070824, training acc: 0.816009, valid loss: 0.085312, valid acc: 0.761277

break epoch:0






In [14]:
left = str_idx(['a person is outdoors, on a horse.'], dictionary, maxlen)
right = str_idx(['a person on a horse jumps over a broken down airplane.'], dictionary, maxlen)
sess.run([model.temp_sim,1-model.distance], feed_dict = {model.X_left : left, 
                                        model.X_right: right})

[array([0.], dtype=float32), array([0.13218915], dtype=float32)]