In [1]:
# !wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv

In [2]:
import tensorflow as tf
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import collections
from unidecode import unidecode
from sklearn.cross_validation import train_test_split



In [3]:
def build_dataset(words, n_words):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3], ['SEPARATOR', 4]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

def str_idx(corpus, dic, maxlen, UNK=3):
    X = np.zeros((len(corpus),maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen][::-1]):
            val = dic[k] if k in dic else UNK
            X[i,-1 - no]= val
    return X

def cleaning(string):
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z\- ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string.lower()

In [4]:
df = pd.read_csv('quora_duplicate_questions.tsv', delimiter='\t').dropna()
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
left, right, label = df['question1'].tolist(), df['question2'].tolist(), df['is_duplicate'].tolist()

In [6]:
np.unique(label, return_counts = True)

(array([0, 1]), array([255024, 149263]))

In [7]:
for i in tqdm(range(len(left))):
    left[i] = cleaning(left[i])
    right[i] = cleaning(right[i])
    left[i] = left[i] + ' SEPARATOR ' + right[i]

100%|██████████| 404287/404287 [00:07<00:00, 52786.23it/s]


In [8]:
concat = ' '.join(left).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 87662
Most common words [['SEPARATOR', 4], ('SEPARATOR', 404287), ('the', 377593), ('what', 324635), ('is', 269934), ('i', 223893)]
Sample data [6, 7, 5, 1286, 63, 1286, 2502, 11, 565, 12] ['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in']


In [9]:
def position_encoding(inputs):
    T = tf.shape(inputs)[1]
    repr_dim = inputs.get_shape()[-1].value
    pos = tf.reshape(tf.range(0.0, tf.to_float(T), dtype=tf.float32), [-1, 1])
    i = np.arange(0, repr_dim, 2, np.float32)
    denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])
    enc = tf.expand_dims(tf.concat([tf.sin(pos / denom), tf.cos(pos / denom)], 1), 0)
    return tf.tile(enc, [tf.shape(inputs)[0], 1, 1])

def layer_norm(inputs, epsilon=1e-8):
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    normalized = (inputs - mean) / (tf.sqrt(variance + epsilon))
    params_shape = inputs.get_shape()[-1:]
    gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer())
    return gamma * normalized + beta

def self_attention(inputs, is_training, num_units, num_heads = 8, activation=None):
    T_q = T_k = tf.shape(inputs)[1]
    Q_K_V = tf.layers.dense(inputs, 3*num_units, activation)
    Q, K, V = tf.split(Q_K_V, 3, -1)
    Q_ = tf.concat(tf.split(Q, num_heads, axis=2), 0)
    K_ = tf.concat(tf.split(K, num_heads, axis=2), 0)
    V_ = tf.concat(tf.split(V, num_heads, axis=2), 0)
    align = tf.matmul(Q_, K_, transpose_b=True)
    align *= tf.rsqrt(tf.to_float(K_.get_shape()[-1].value))
    paddings = tf.fill(tf.shape(align), float('-inf'))
    lower_tri = tf.ones([T_q, T_k])
    lower_tri = tf.linalg.LinearOperatorLowerTriangular(lower_tri).to_dense()
    masks = tf.tile(tf.expand_dims(lower_tri,0), [tf.shape(align)[0],1,1])
    align = tf.where(tf.equal(masks, 0), paddings, align)
    align = tf.nn.softmax(align)
    align = tf.layers.dropout(align, 0.1, training=is_training) 
    x = tf.matmul(align, V_)
    x = tf.concat(tf.split(x, num_heads, axis=0), 2)
    x += inputs
    x = layer_norm(x)
    return x

def ffn(inputs, hidden_dim, activation=tf.nn.relu):
    x = tf.layers.conv1d(inputs, 4* hidden_dim, 1, activation=activation) 
    x = tf.layers.conv1d(x, hidden_dim, 1, activation=None)
    x += inputs
    x = layer_norm(x)
    return x

class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, learning_rate, dropout, kernel_size = 5):
        
        def cnn(x, scope):
            x += position_encoding(x)
            with tf.variable_scope(scope, reuse = tf.AUTO_REUSE):
                for n in range(num_layers):
                    with tf.variable_scope('attn_%d'%i,reuse=tf.AUTO_REUSE):
                        x = self_attention(x, True, size_layer)
                    with tf.variable_scope('ffn_%d'%i, reuse=tf.AUTO_REUSE):
                        x = ffn(x, size_layer)
                
                with tf.variable_scope('logits', reuse=tf.AUTO_REUSE):
                    return tf.layers.dense(x, 2)[:, -1]
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        embedded_left = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        
        self.logits = cnn(embedded_left, 'left')
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [10]:
size_layer = 128
num_layers = 4
embedded_size = 128
learning_rate = 1e-4
maxlen = 50
batch_size = 128
dropout = 0.8

In [11]:
from sklearn.cross_validation import train_test_split

vectors = str_idx(left, dictionary, maxlen)
train_X, test_X, train_Y, test_Y = train_test_split(vectors, label, test_size = 0.2)

In [12]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(dictionary),learning_rate,dropout)
sess.run(tf.global_variables_initializer())

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.conv1d instead.
Instructions for updating:
Use tf.cast instead.


In [13]:
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(range(0, len(train_X), batch_size), desc='train minibatch loop')
    for i in pbar:
        batch_x = train_X[i:min(i+batch_size,train_X.shape[0])]
        batch_y = train_Y[i:min(i+batch_size,train_X.shape[0])]
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x,
                                        model.Y : batch_y})
        assert not np.isnan(loss)
        train_loss += loss
        train_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for i in pbar:
        batch_x = test_X[i:min(i+batch_size,test_X.shape[0])]
        batch_y = test_Y[i:min(i+batch_size,test_X.shape[0])]
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x,
                                        model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    train_loss /= (len(train_X) / batch_size)
    train_acc /= (len(train_X) / batch_size)
    test_loss /= (len(test_X) / batch_size)
    test_acc /= (len(test_X) / batch_size)
    
    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
    
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))

train minibatch loop: 100%|██████████| 2527/2527 [00:54<00:00, 46.20it/s, accuracy=0.663, cost=0.652]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 110.07it/s, accuracy=0.644, cost=0.674]
train minibatch loop:   0%|          | 5/2527 [00:00<00:54, 46.61it/s, accuracy=0.648, cost=0.617]

epoch: 0, pass acc: 0.000000, current acc: 0.654326
time taken: 60.44020199775696
epoch: 0, training loss: 0.639404, training acc: 0.640978, valid loss: 0.628099, valid acc: 0.654326



train minibatch loop: 100%|██████████| 2527/2527 [00:54<00:00, 46.62it/s, accuracy=0.663, cost=0.619]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 112.44it/s, accuracy=0.622, cost=0.669]
train minibatch loop:   0%|          | 5/2527 [00:00<00:53, 47.00it/s, accuracy=0.68, cost=0.62]  

epoch: 0, pass acc: 0.654326, current acc: 0.667128
time taken: 59.827545404434204
epoch: 0, training loss: 0.621935, training acc: 0.659585, valid loss: 0.614735, valid acc: 0.667128



train minibatch loop: 100%|██████████| 2527/2527 [00:54<00:00, 46.69it/s, accuracy=0.683, cost=0.577]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 112.01it/s, accuracy=0.6, cost=0.683]  
train minibatch loop:   0%|          | 5/2527 [00:00<00:54, 46.61it/s, accuracy=0.68, cost=0.621] 

epoch: 0, pass acc: 0.667128, current acc: 0.672164
time taken: 59.77066659927368
epoch: 0, training loss: 0.610259, training acc: 0.670584, valid loss: 0.608394, valid acc: 0.672164



train minibatch loop: 100%|██████████| 2527/2527 [00:54<00:00, 46.65it/s, accuracy=0.713, cost=0.564]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 111.70it/s, accuracy=0.656, cost=0.666]
train minibatch loop:   0%|          | 5/2527 [00:00<00:53, 46.84it/s, accuracy=0.711, cost=0.604]

epoch: 0, pass acc: 0.672164, current acc: 0.679227
time taken: 59.83059549331665
epoch: 0, training loss: 0.601291, training acc: 0.679090, valid loss: 0.602495, valid acc: 0.679227



train minibatch loop: 100%|██████████| 2527/2527 [00:54<00:00, 46.56it/s, accuracy=0.703, cost=0.556]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 112.42it/s, accuracy=0.6, cost=0.659]  
train minibatch loop:   0%|          | 5/2527 [00:00<00:53, 46.75it/s, accuracy=0.695, cost=0.601]

epoch: 0, pass acc: 0.679227, current acc: 0.685867
time taken: 59.903602838516235
epoch: 0, training loss: 0.592938, training acc: 0.687245, valid loss: 0.597082, valid acc: 0.685867



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 46.87it/s, accuracy=0.743, cost=0.548]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 111.96it/s, accuracy=0.633, cost=0.672]
train minibatch loop:   0%|          | 5/2527 [00:00<00:53, 46.98it/s, accuracy=0.695, cost=0.585]

epoch: 0, pass acc: 0.685867, current acc: 0.688751
time taken: 59.562599897384644
epoch: 0, training loss: 0.585165, training acc: 0.693349, valid loss: 0.592944, valid acc: 0.688751



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 46.82it/s, accuracy=0.752, cost=0.529]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 112.44it/s, accuracy=0.622, cost=0.704]
train minibatch loop:   0%|          | 5/2527 [00:00<00:53, 46.92it/s, accuracy=0.719, cost=0.585]

epoch: 0, pass acc: 0.688751, current acc: 0.692926
time taken: 59.60137748718262
epoch: 0, training loss: 0.577756, training acc: 0.700359, valid loss: 0.590633, valid acc: 0.692926



train minibatch loop: 100%|██████████| 2527/2527 [00:54<00:00, 46.72it/s, accuracy=0.733, cost=0.524]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 112.66it/s, accuracy=0.622, cost=0.695]
train minibatch loop:   0%|          | 5/2527 [00:00<00:53, 46.71it/s, accuracy=0.719, cost=0.597]

epoch: 0, pass acc: 0.692926, current acc: 0.694126
time taken: 59.701225996017456
epoch: 0, training loss: 0.570621, training acc: 0.705953, valid loss: 0.587987, valid acc: 0.694126



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 47.07it/s, accuracy=0.743, cost=0.517]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 112.60it/s, accuracy=0.667, cost=0.664]
train minibatch loop:   0%|          | 5/2527 [00:00<00:53, 47.16it/s, accuracy=0.75, cost=0.59]  

epoch: 0, pass acc: 0.694126, current acc: 0.697845
time taken: 59.29985284805298
epoch: 0, training loss: 0.563849, training acc: 0.711581, valid loss: 0.585073, valid acc: 0.697845



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 46.92it/s, accuracy=0.752, cost=0.49] 
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 112.64it/s, accuracy=0.689, cost=0.684]
train minibatch loop:   0%|          | 5/2527 [00:00<00:53, 47.25it/s, accuracy=0.734, cost=0.591]

epoch: 0, pass acc: 0.697845, current acc: 0.699698
time taken: 59.466017723083496
epoch: 0, training loss: 0.557104, training acc: 0.716393, valid loss: 0.583814, valid acc: 0.699698



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 46.91it/s, accuracy=0.733, cost=0.527]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 113.03it/s, accuracy=0.644, cost=0.68] 
train minibatch loop:   0%|          | 5/2527 [00:00<00:54, 46.28it/s, accuracy=0.75, cost=0.56]  

epoch: 0, pass acc: 0.699698, current acc: 0.700679
time taken: 59.46453809738159
epoch: 0, training loss: 0.551015, training acc: 0.721082, valid loss: 0.580544, valid acc: 0.700679



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 47.04it/s, accuracy=0.762, cost=0.522]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 113.48it/s, accuracy=0.678, cost=0.651]
train minibatch loop:   0%|          | 5/2527 [00:00<00:53, 47.44it/s, accuracy=0.758, cost=0.556]

epoch: 0, pass acc: 0.700679, current acc: 0.702092
time taken: 59.29327607154846
epoch: 0, training loss: 0.545043, training acc: 0.725462, valid loss: 0.581033, valid acc: 0.702092



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 47.21it/s, accuracy=0.762, cost=0.516]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 113.11it/s, accuracy=0.7, cost=0.654]  
train minibatch loop:   0%|          | 5/2527 [00:00<00:54, 46.67it/s, accuracy=0.727, cost=0.55] 

epoch: 0, pass acc: 0.702092, current acc: 0.702943
time taken: 59.11387062072754
epoch: 0, training loss: 0.539628, training acc: 0.729723, valid loss: 0.581183, valid acc: 0.702943



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 47.15it/s, accuracy=0.762, cost=0.502]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 112.97it/s, accuracy=0.633, cost=0.693]
train minibatch loop:   0%|          | 5/2527 [00:00<00:52, 47.68it/s, accuracy=0.758, cost=0.545]

epoch: 0, pass acc: 0.702943, current acc: 0.705497
time taken: 59.19653916358948
epoch: 0, training loss: 0.533567, training acc: 0.734188, valid loss: 0.578577, valid acc: 0.705497



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 47.06it/s, accuracy=0.743, cost=0.483]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 112.83it/s, accuracy=0.644, cost=0.721]
train minibatch loop:   0%|          | 5/2527 [00:00<00:53, 47.13it/s, accuracy=0.727, cost=0.544]

epoch: 0, pass acc: 0.705497, current acc: 0.709658
time taken: 59.30323553085327
epoch: 0, training loss: 0.528961, training acc: 0.737278, valid loss: 0.575870, valid acc: 0.709658



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 47.01it/s, accuracy=0.782, cost=0.481]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 113.54it/s, accuracy=0.7, cost=0.699]  
train minibatch loop:   0%|          | 5/2527 [00:00<00:52, 47.92it/s, accuracy=0.805, cost=0.487]

time taken: 59.32865643501282
epoch: 0, training loss: 0.522808, training acc: 0.741622, valid loss: 0.579368, valid acc: 0.706827



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 47.29it/s, accuracy=0.733, cost=0.481]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 113.10it/s, accuracy=0.622, cost=0.675]
train minibatch loop:   0%|          | 5/2527 [00:00<00:53, 47.33it/s, accuracy=0.789, cost=0.505]

time taken: 59.023605823516846
epoch: 0, training loss: 0.517364, training acc: 0.744728, valid loss: 0.578737, valid acc: 0.709103



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 47.16it/s, accuracy=0.792, cost=0.454]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 113.06it/s, accuracy=0.567, cost=0.64] 
train minibatch loop:   0%|          | 5/2527 [00:00<00:52, 47.79it/s, accuracy=0.789, cost=0.486]

epoch: 0, pass acc: 0.709658, current acc: 0.711080
time taken: 59.17823839187622
epoch: 0, training loss: 0.512706, training acc: 0.748938, valid loss: 0.575415, valid acc: 0.711080



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 47.10it/s, accuracy=0.782, cost=0.43] 
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 112.75it/s, accuracy=0.656, cost=0.655]
train minibatch loop:   0%|          | 5/2527 [00:00<00:54, 46.70it/s, accuracy=0.766, cost=0.531]

time taken: 59.26551961898804
epoch: 0, training loss: 0.507218, training acc: 0.751649, valid loss: 0.579230, valid acc: 0.709997



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 47.01it/s, accuracy=0.832, cost=0.41] 
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 113.18it/s, accuracy=0.622, cost=0.669]
train minibatch loop:   0%|          | 5/2527 [00:00<00:53, 47.25it/s, accuracy=0.734, cost=0.526]

time taken: 59.346855878829956
epoch: 0, training loss: 0.502882, training acc: 0.755138, valid loss: 0.583503, valid acc: 0.707928



train minibatch loop: 100%|██████████| 2527/2527 [00:53<00:00, 47.27it/s, accuracy=0.802, cost=0.441]
test minibatch loop: 100%|██████████| 632/632 [00:05<00:00, 113.33it/s, accuracy=0.622, cost=0.659]

time taken: 59.0352988243103
epoch: 0, training loss: 0.498010, training acc: 0.757788, valid loss: 0.579649, valid acc: 0.709758

break epoch:0




