In [1]:
# !wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv

In [2]:
import tensorflow as tf
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import collections
from unidecode import unidecode
from sklearn.cross_validation import train_test_split



In [3]:
def build_dataset(words, n_words):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3], ['SEPARATOR', 4]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

def str_idx(corpus, dic, maxlen, UNK=3):
    X = np.zeros((len(corpus),maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen][::-1]):
            val = dic[k] if k in dic else UNK
            X[i,-1 - no]= val
    return X

def cleaning(string):
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z\- ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string.lower()

In [4]:
df = pd.read_csv('quora_duplicate_questions.tsv', delimiter='\t').dropna()
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
left, right, label = df['question1'].tolist(), df['question2'].tolist(), df['is_duplicate'].tolist()

In [6]:
np.unique(label, return_counts = True)

(array([0, 1]), array([255024, 149263]))

In [7]:
for i in tqdm(range(len(left))):
    left[i] = cleaning(left[i])
    right[i] = cleaning(right[i])
    left[i] = left[i] + ' SEPARATOR ' + right[i]

100%|██████████| 404287/404287 [00:07<00:00, 51783.93it/s]


In [8]:
concat = ' '.join(left).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 87662
Most common words [['SEPARATOR', 4], ('SEPARATOR', 404287), ('the', 377593), ('what', 324635), ('is', 269934), ('i', 223893)]
Sample data [6, 7, 5, 1286, 63, 1286, 2502, 11, 565, 12] ['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in']


In [13]:
def position_encoding(inputs):
    T = tf.shape(inputs)[1]
    repr_dim = inputs.get_shape()[-1].value
    pos = tf.reshape(tf.range(0.0, tf.to_float(T), dtype=tf.float32), [-1, 1])
    i = np.arange(0, repr_dim, 2, np.float32)
    denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])
    enc = tf.expand_dims(tf.concat([tf.sin(pos / denom), tf.cos(pos / denom)], 1), 0)
    return tf.tile(enc, [tf.shape(inputs)[0], 1, 1])

def layer_norm(inputs, epsilon=1e-8):
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    normalized = (inputs - mean) / (tf.sqrt(variance + epsilon))
    params_shape = inputs.get_shape()[-1:]
    gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer())
    return gamma * normalized + beta

def cnn_block(x, dilation_rate, pad_sz, hidden_dim, kernel_size):
    x = layer_norm(x)
    pad = tf.zeros([tf.shape(x)[0], pad_sz, hidden_dim])
    x =  tf.layers.conv1d(inputs = tf.concat([pad, x, pad], 1),
                          filters = hidden_dim,
                          kernel_size = kernel_size,
                          dilation_rate = dilation_rate)
    x = x[:, :-pad_sz, :]
    x = tf.nn.relu(x)
    return x

class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, learning_rate, dropout, kernel_size = 5):
        
        def cnn(x, scope):
            x += position_encoding(x)
            with tf.variable_scope(scope, reuse = tf.AUTO_REUSE):
                for n in range(num_layers):
                    dilation_rate = 2 ** n
                    pad_sz = (kernel_size - 1) * dilation_rate 
                    with tf.variable_scope('block_%d'%i,reuse=tf.AUTO_REUSE):
                        x += cnn_block(x, dilation_rate, pad_sz, size_layer, kernel_size)
                
                with tf.variable_scope('logits', reuse=tf.AUTO_REUSE):
                    return tf.layers.dense(x, size_layer)[:, -1]
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        embedded_left = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        
        self.logits = cnn(embedded_left, 'left')
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [10]:
size_layer = 128
num_layers = 4
embedded_size = 128
learning_rate = 1e-3
maxlen = 50
batch_size = 128
dropout = 0.8

In [16]:
from sklearn.cross_validation import train_test_split

vectors = str_idx(left, dictionary, maxlen)
train_X, test_X, train_Y, test_Y = train_test_split(vectors, label, test_size = 0.2)

In [14]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(dictionary),learning_rate,dropout)
sess.run(tf.global_variables_initializer())



Instructions for updating:
Use tf.cast instead.


In [17]:
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(range(0, len(train_X), batch_size), desc='train minibatch loop')
    for i in pbar:
        batch_x = train_X[i:min(i+batch_size,train_X.shape[0])]
        batch_y = train_Y[i:min(i+batch_size,train_X.shape[0])]
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x,
                                        model.Y : batch_y})
        assert not np.isnan(loss)
        train_loss += loss
        train_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for i in pbar:
        batch_x = test_X[i:min(i+batch_size,test_X.shape[0])]
        batch_y = test_Y[i:min(i+batch_size,test_X.shape[0])]
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x,
                                        model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    train_loss /= (len(train_X) / batch_size)
    train_acc /= (len(train_X) / batch_size)
    test_loss /= (len(test_X) / batch_size)
    test_acc /= (len(test_X) / batch_size)
    
    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
    
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))

train minibatch loop: 100%|██████████| 2527/2527 [00:32<00:00, 77.42it/s, accuracy=0.584, cost=0.645]
test minibatch loop: 100%|██████████| 632/632 [00:02<00:00, 241.53it/s, accuracy=0.678, cost=0.624]
train minibatch loop:   0%|          | 9/2527 [00:00<00:30, 82.72it/s, accuracy=0.664, cost=0.638]

epoch: 0, pass acc: 0.000000, current acc: 0.649024
time taken: 35.25988554954529
epoch: 0, training loss: 0.639172, training acc: 0.645532, valid loss: 0.625583, valid acc: 0.649024



train minibatch loop: 100%|██████████| 2527/2527 [00:30<00:00, 82.78it/s, accuracy=0.653, cost=0.605]
test minibatch loop: 100%|██████████| 632/632 [00:02<00:00, 249.91it/s, accuracy=0.756, cost=0.568]
train minibatch loop:   0%|          | 9/2527 [00:00<00:30, 82.57it/s, accuracy=0.648, cost=0.625]

epoch: 0, pass acc: 0.649024, current acc: 0.686088
time taken: 33.05776762962341
epoch: 0, training loss: 0.599088, training acc: 0.681601, valid loss: 0.593228, valid acc: 0.686088



train minibatch loop: 100%|██████████| 2527/2527 [00:30<00:00, 82.65it/s, accuracy=0.703, cost=0.568]
test minibatch loop: 100%|██████████| 632/632 [00:02<00:00, 250.40it/s, accuracy=0.7, cost=0.548]  
train minibatch loop:   0%|          | 9/2527 [00:00<00:30, 83.52it/s, accuracy=0.672, cost=0.615]

epoch: 0, pass acc: 0.686088, current acc: 0.700928
time taken: 33.1018283367157
epoch: 0, training loss: 0.572584, training acc: 0.705614, valid loss: 0.578908, valid acc: 0.700928



train minibatch loop: 100%|██████████| 2527/2527 [00:30<00:00, 82.66it/s, accuracy=0.723, cost=0.556]
test minibatch loop: 100%|██████████| 632/632 [00:02<00:00, 251.01it/s, accuracy=0.778, cost=0.521]
train minibatch loop:   0%|          | 9/2527 [00:00<00:30, 83.45it/s, accuracy=0.703, cost=0.604]

epoch: 0, pass acc: 0.700928, current acc: 0.705392
time taken: 33.0923171043396
epoch: 0, training loss: 0.550349, training acc: 0.723289, valid loss: 0.573883, valid acc: 0.705392



train minibatch loop: 100%|██████████| 2527/2527 [00:30<00:00, 82.63it/s, accuracy=0.733, cost=0.545]
test minibatch loop: 100%|██████████| 632/632 [00:02<00:00, 249.63it/s, accuracy=0.767, cost=0.526]
train minibatch loop:   0%|          | 9/2527 [00:00<00:30, 83.85it/s, accuracy=0.727, cost=0.582]

epoch: 0, pass acc: 0.705392, current acc: 0.706215
time taken: 33.11521649360657
epoch: 0, training loss: 0.530263, training acc: 0.737710, valid loss: 0.574223, valid acc: 0.706215



train minibatch loop: 100%|██████████| 2527/2527 [00:30<00:00, 82.78it/s, accuracy=0.703, cost=0.507]
test minibatch loop: 100%|██████████| 632/632 [00:02<00:00, 249.87it/s, accuracy=0.722, cost=0.548]
train minibatch loop:   0%|          | 9/2527 [00:00<00:30, 82.88it/s, accuracy=0.68, cost=0.566] 

epoch: 0, pass acc: 0.706215, current acc: 0.712823
time taken: 33.06076192855835
epoch: 0, training loss: 0.512012, training acc: 0.749806, valid loss: 0.572262, valid acc: 0.712823



train minibatch loop: 100%|██████████| 2527/2527 [00:30<00:00, 82.65it/s, accuracy=0.713, cost=0.539]
test minibatch loop: 100%|██████████| 632/632 [00:02<00:00, 249.05it/s, accuracy=0.711, cost=0.54] 
train minibatch loop:   0%|          | 9/2527 [00:00<00:30, 82.51it/s, accuracy=0.672, cost=0.576]

epoch: 0, pass acc: 0.712823, current acc: 0.715365
time taken: 33.11697006225586
epoch: 0, training loss: 0.495308, training acc: 0.760959, valid loss: 0.575378, valid acc: 0.715365



train minibatch loop: 100%|██████████| 2527/2527 [00:30<00:00, 82.82it/s, accuracy=0.713, cost=0.55] 
test minibatch loop: 100%|██████████| 632/632 [00:02<00:00, 250.17it/s, accuracy=0.689, cost=0.578]
train minibatch loop:   0%|          | 9/2527 [00:00<00:30, 82.39it/s, accuracy=0.719, cost=0.558]

epoch: 0, pass acc: 0.715365, current acc: 0.718076
time taken: 33.03995633125305
epoch: 0, training loss: 0.480132, training acc: 0.770668, valid loss: 0.576161, valid acc: 0.718076



train minibatch loop: 100%|██████████| 2527/2527 [00:30<00:00, 82.96it/s, accuracy=0.723, cost=0.532]
test minibatch loop: 100%|██████████| 632/632 [00:02<00:00, 250.37it/s, accuracy=0.689, cost=0.571]
train minibatch loop:   0%|          | 9/2527 [00:00<00:30, 83.53it/s, accuracy=0.734, cost=0.556]

epoch: 0, pass acc: 0.718076, current acc: 0.718397
time taken: 32.98867201805115
epoch: 0, training loss: 0.466953, training acc: 0.778197, valid loss: 0.585377, valid acc: 0.718397



train minibatch loop: 100%|██████████| 2527/2527 [00:30<00:00, 82.74it/s, accuracy=0.693, cost=0.579]
test minibatch loop: 100%|██████████| 632/632 [00:02<00:00, 250.47it/s, accuracy=0.722, cost=0.579]
train minibatch loop:   0%|          | 9/2527 [00:00<00:30, 83.07it/s, accuracy=0.727, cost=0.532]

epoch: 0, pass acc: 0.718397, current acc: 0.719860
time taken: 33.069570541381836
epoch: 0, training loss: 0.454996, training acc: 0.786085, valid loss: 0.589913, valid acc: 0.719860



train minibatch loop: 100%|██████████| 2527/2527 [00:30<00:00, 82.83it/s, accuracy=0.703, cost=0.545]
test minibatch loop: 100%|██████████| 632/632 [00:02<00:00, 250.12it/s, accuracy=0.744, cost=0.56] 
train minibatch loop:   0%|          | 9/2527 [00:00<00:30, 82.83it/s, accuracy=0.711, cost=0.518]

epoch: 0, pass acc: 0.719860, current acc: 0.722752
time taken: 33.03630042076111
epoch: 0, training loss: 0.443845, training acc: 0.792981, valid loss: 0.597150, valid acc: 0.722752



train minibatch loop: 100%|██████████| 2527/2527 [00:30<00:00, 82.84it/s, accuracy=0.743, cost=0.536]
test minibatch loop: 100%|██████████| 632/632 [00:02<00:00, 249.00it/s, accuracy=0.744, cost=0.57] 
train minibatch loop:   0%|          | 9/2527 [00:00<00:30, 82.01it/s, accuracy=0.75, cost=0.504] 

time taken: 33.04733848571777
epoch: 0, training loss: 0.433595, training acc: 0.798370, valid loss: 0.605825, valid acc: 0.720378



train minibatch loop: 100%|██████████| 2527/2527 [00:30<00:00, 82.71it/s, accuracy=0.762, cost=0.505]
test minibatch loop: 100%|██████████| 632/632 [00:02<00:00, 250.76it/s, accuracy=0.756, cost=0.567]
train minibatch loop:   0%|          | 9/2527 [00:00<00:30, 82.74it/s, accuracy=0.75, cost=0.51]  

time taken: 33.075902462005615
epoch: 0, training loss: 0.423926, training acc: 0.803343, valid loss: 0.617053, valid acc: 0.721669



train minibatch loop: 100%|██████████| 2527/2527 [00:30<00:00, 82.80it/s, accuracy=0.723, cost=0.501]
test minibatch loop: 100%|██████████| 632/632 [00:02<00:00, 251.00it/s, accuracy=0.778, cost=0.559]

time taken: 33.04087018966675
epoch: 0, training loss: 0.415806, training acc: 0.808235, valid loss: 0.627675, valid acc: 0.719070

break epoch:0




