In [1]:
import numpy as np
import collections
import random
import tensorflow as tf

In [2]:
def build_dataset(words, n_words):
    """Build a token vocabulary from `words` and encode the sequence as ids.

    The first four ids are reserved: 'GO' = 0, 'PAD' = 1, 'EOS' = 2, 'UNK' = 3.
    They are followed by the `n_words - 1` most frequent tokens of `words`
    in decreasing frequency order.

    Args:
        words: iterable of hashable tokens. It is traversed twice, so it must
            be re-iterable (a list, or a string to get a character vocabulary).
        n_words: vocabulary budget; `n_words - 1` tokens are kept.

    Returns:
        data: list with the id of every token in `words`; tokens outside the
            kept vocabulary are encoded with the 'UNK' id.
        count: list of the four reserved entries followed by
            `(token, frequency)` tuples. The 'UNK' entry's second field is
            overwritten with the number of out-of-vocabulary tokens; the other
            reserved entries keep their placeholder value.
        dictionary: token -> id mapping.
        reversed_dictionary: id -> token mapping.
    """
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    unk_index = 3  # position of 'UNK' in `count`, and therefore its id
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        # setdefault keeps the reserved ids stable even if a reserved token
        # string (e.g. 'UNK') also appears in `words`.
        dictionary.setdefault(word, len(dictionary))
    data = list()
    unk_count = 0
    for word in words:
        # Out-of-vocabulary tokens map to UNK (3), not to id 0 which is 'GO'.
        index = dictionary.get(word, unk_index)
        if index == unk_index:
            unk_count += 1
        data.append(index)
    count[unk_index][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

def str_idx(corpus, dic, maxlen, UNK=3):
    """Encode every sequence in `corpus` as a fixed-width row of ids.

    Each sequence is truncated to its first `maxlen` items, mapped through
    `dic` (items missing from `dic` become `UNK`) and written right-aligned
    into its row, so shorter sequences are left-padded with 0.

    Args:
        corpus: sized, iterable collection of sequences (e.g. strings).
        dic: item -> id mapping.
        maxlen: number of columns of the result.
        UNK: id used for items not found in `dic`.

    Returns:
        A float array (the default dtype of `np.zeros`) of shape
        `(len(corpus), maxlen)`.
    """
    encoded = np.zeros((len(corpus), maxlen))
    for row, sequence in enumerate(corpus):
        ids = [dic.get(item, UNK) for item in sequence[:maxlen]]
        if not ids:
            # Nothing to write: the row stays all zeros.
            continue
        # Right-align: the last kept item lands in the last column.
        encoded[row, maxlen - len(ids):] = ids
    return encoded

def load_data(filepath):
    """Read tab-separated pairs and build positive and negative examples.

    Every line with at least two tab-separated fields yields one positive
    example (label 1) from its first two fields, lower-cased; which field goes
    left and which goes right is chosen by a coin flip. Lines with fewer than
    two fields are skipped.

    Negative examples (label 0) are then appended: every string collected so
    far (left column followed by right column) is paired with the string at
    the same position of a random permutation of that same collection.

    Args:
        filepath: path of the text file to read.

    Returns:
        Tuple `(x1, x2, y)` of numpy arrays of equal length: left strings,
        right strings and labels. For `n` usable lines the length is `3 * n`
        (`n` positives followed by `2 * n` negatives).
    """
    x1=[]
    x2=[]
    y=[]
    # `with` closes the file handle deterministically; the bare
    # `for line in open(...)` form left that to the garbage collector.
    with open(filepath) as handle:
        for line in handle:
            fields = line.strip().split("\t")
            if len(fields) < 2:
                continue
            # Randomise the side so neither column is systematically "first".
            if random.random() > 0.5:
                x1.append(fields[0].lower())
                x2.append(fields[1].lower())
            else:
                x1.append(fields[1].lower())
                x2.append(fields[0].lower())
            y.append(1)
    combined = np.asarray(x1+x2)
    shuffle_indices = np.random.permutation(np.arange(len(combined)))
    combined_shuff = combined[shuffle_indices]
    # NOTE(review): the permutation can pair a string with itself or with its
    # true match, and such pairs are still labelled 0 — confirm this label
    # noise is acceptable.
    x1.extend(combined)
    x2.extend(combined_shuff)
    y.extend([0] * len(combined))
    return np.array(x1),np.array(x2),np.array(y)

In [3]:
# load_data draws from both `random` (left/right swap) and `np.random`
# (negative-pair permutation). Seed both so that a fresh
# Restart & Run All rebuilds exactly the same pairs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
X1_text, X2_text, Y = load_data('person_match.train')

In [4]:
# Join every left and right string into one long string; iterating a string
# yields characters, so build_dataset produces a character-level vocabulary.
all_names = X1_text.tolist() + X2_text.tolist()
concat = ' '.join(all_names)
vocabulary_size = len(set(concat))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d' % vocabulary_size)
# The first four entries of `count` are the reserved tokens, so skip them.
print('Most common words', count[4:10])
sample_ids = data[:10]
print('Sample data', sample_ids, [rev_dictionary[i] for i in sample_ids])

vocab from size: 101
Most common words [(' ', 2076683), ('a', 1345908), ('e', 1246119), ('r', 1019184), ('n', 940224), ('i', 880143)]
Sample data [5, 16, 7, 9, 5, 8, 5, 4, 6, 26] ['a', 'd', 'r', 'i', 'a', 'n', 'a', ' ', 'e', 'v']


In [5]:
class Model:
    """Pair of bidirectional-LSTM encoders trained with a contrastive loss.

    Both inputs are embedded with one shared embedding table, then encoded by
    two stacks of bidirectional LSTMs built under the separate variable scopes
    'left' and 'right' (so the recurrent weights of the two sides are separate
    variables). The two encodings are compared with a Euclidean distance
    normalised by the sum of their L2 norms.

    Attributes:
        X_left, X_right: int32 placeholders of shape [batch, time] holding ids.
        Y: float32 placeholder of shape [batch]; the loss pulls pairs with
            Y = 1 together and pushes pairs with Y = 0 apart.
        batch_size: scalar tensor, first dimension of `X_left`.
        output_left, output_right: last-time-step encodings of each side.
        distance: tensor of shape [batch] with the normalised distance.
        cost: scalar contrastive loss.
        temp_sim: `1 - rint(distance)`, i.e. 1 where the distance rounds to 0.
        accuracy: fraction of the batch where `temp_sim` equals `Y`.
        optimizer: Adam training op minimising `cost`.

    Args:
        size_layer: each direction of every layer gets `size_layer // 2` units.
        num_layers: number of stacked bidirectional layers per side.
        embedded_size: embedding dimension.
        dict_size: number of rows of the embedding table.
        learning_rate: Adam learning rate.
        dropout: value passed to DropoutWrapper as `output_keep_prob`
            (a keep probability).
    """

    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, learning_rate, dropout):

        def cells(size, reuse=False):
            # NOTE(review): the keep probability is a fixed Python value, so
            # the dropout wrapper is part of every sess.run of this graph,
            # including evaluation and inference.
            cell = tf.nn.rnn_cell.LSTMCell(size,initializer=tf.orthogonal_initializer(),reuse=reuse)
            return tf.contrib.rnn.DropoutWrapper(cell,output_keep_prob=dropout)

        def birnn(inputs, scope):
            # Each layer consumes the concatenated forward/backward outputs of
            # the previous one; the encoding is the last time step of the top layer.
            with tf.variable_scope(scope):
                for n in range(num_layers):
                    (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw = cells(size_layer // 2),
                        cell_bw = cells(size_layer // 2),
                        inputs = inputs,
                        dtype = tf.float32,
                        scope = 'bidirectional_rnn_%d'%(n))
                    inputs = tf.concat((out_fw, out_bw), 2)
                return inputs[:,-1]

        def l2_norm(tensor):
            # Row-wise Euclidean norm, kept as a column so it broadcasts.
            # `keepdims` replaces the deprecated `keep_dims` spelling.
            return tf.sqrt(tf.reduce_sum(tf.square(tensor), 1, keepdims=True))

        def contrastive_loss(y,d):
            # y = 1: penalise the squared distance.
            # y = 0: penalise the squared shortfall from a margin of 1.
            similar_term = y * tf.square(d)
            dissimilar_term = (1-y) * tf.square(tf.maximum((1 - d),0))
            return tf.reduce_sum(similar_term + dissimilar_term)/tf.cast(self.batch_size,tf.float32)/2

        self.X_left = tf.placeholder(tf.int32, [None, None])
        self.X_right = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None])
        self.batch_size = tf.shape(self.X_left)[0]
        # One embedding table is looked up by both sides.
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        embedded_left = tf.nn.embedding_lookup(encoder_embeddings, self.X_left)
        embedded_right = tf.nn.embedding_lookup(encoder_embeddings, self.X_right)

        self.output_left = birnn(embedded_left, 'left')
        self.output_right = birnn(embedded_right, 'right')
        # ||l - r|| / (||l|| + ||r||): by the triangle inequality this is at most 1.
        self.distance = l2_norm(tf.subtract(self.output_left, self.output_right))
        self.distance = tf.div(self.distance,
                               tf.add(l2_norm(self.output_left), l2_norm(self.output_right)))
        self.distance = tf.reshape(self.distance, [-1])
        self.cost = contrastive_loss(self.Y,self.distance)

        # Predicted label: 1 where the distance rounds to 0, else 0.
        self.temp_sim = tf.subtract(tf.ones_like(self.distance),
                                    tf.rint(self.distance))
        correct_predictions = tf.equal(self.temp_sim, self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)

In [6]:
# Hyperparameters consumed by Model, str_idx and the training loop.
size_layer = 256       # Model gives each LSTM direction size_layer // 2 units
num_layers = 2         # stacked bidirectional layers per side
embedded_size = 128    # width of the embedding table
learning_rate = 1e-3   # Adam learning rate
maxlen = 30            # number of columns str_idx truncates / left-pads to
batch_size = 128       # rows per sess.run call in the training / test loops
dropout = 0.8          # passed to DropoutWrapper as output_keep_prob (a keep probability)

In [7]:
# Start from an empty default graph so re-running this cell does not stack a
# second copy of the model onto the first one.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    size_layer=size_layer,
    num_layers=num_layers,
    embedded_size=embedded_size,
    dict_size=len(dictionary),
    learning_rate=learning_rate,
    dropout=dropout,
)
init_op = tf.global_variables_initializer()
sess.run(init_op)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [8]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` lives in `sklearn.model_selection`. Fall back
# to the old module only for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Encode both sides as fixed-width id matrices, then split all three arrays
# with the same row assignment (80% train / 20% test).
vectors_left = str_idx(X1_text, dictionary, maxlen)
vectors_right = str_idx(X2_text, dictionary, maxlen)
(train_X_left, test_X_left,
 train_X_right, test_X_right,
 train_Y, test_Y) = train_test_split(vectors_left,
                                     vectors_right,
                                     Y,
                                     test_size = 0.2)



In [9]:
from tqdm import tqdm
import time


def _run_epoch(x_left, x_right, y, desc, train):
    """Run one pass over the given arrays in mini-batches of `batch_size`.

    Uses the notebook-level `model`, `sess` and `batch_size`.

    Args:
        x_left, x_right: id matrices fed to `model.X_left` / `model.X_right`.
        y: labels fed to `model.Y`.
        desc: label shown on the tqdm progress bar.
        train: when True the optimizer op is run as well and a NaN loss
            aborts the run; when False only accuracy and cost are fetched.

    Returns:
        `(mean_loss, mean_accuracy)` over the pass, weighting every batch by
        its number of rows so a short final batch is not over- or
        under-counted. Returns `(0.0, 0.0)` for empty input.
    """
    fetches = [model.accuracy, model.cost]
    if train:
        fetches.append(model.optimizer)
    n_samples = len(x_left)
    loss_sum, acc_sum = 0.0, 0.0
    pbar = tqdm(range(0, n_samples, batch_size), desc=desc)
    for start in pbar:
        # Clamp against the array actually being sliced.
        stop = min(start + batch_size, n_samples)
        results = sess.run(fetches,
                           feed_dict = {model.X_left : x_left[start:stop],
                                        model.X_right: x_right[start:stop],
                                        model.Y : y[start:stop]})
        acc, loss = results[0], results[1]
        if train:
            assert not np.isnan(loss)
        rows = stop - start
        loss_sum += loss * rows
        acc_sum += acc * rows
        pbar.set_postfix(cost=loss, accuracy = acc)
    if n_samples == 0:
        return 0.0, 0.0
    return loss_sum / n_samples, acc_sum / n_samples


for EPOCH in range(5):
    lasttime = time.time()

    train_loss, train_acc = _run_epoch(train_X_left, train_X_right, train_Y,
                                       'train minibatch loop', True)
    test_loss, test_acc = _run_epoch(test_X_left, test_X_right, test_Y,
                                     'test minibatch loop', False)

    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))

train minibatch loop: 100%|██████████| 3337/3337 [06:42<00:00,  8.42it/s, accuracy=1, cost=0.0317]    
test minibatch loop: 100%|██████████| 835/835 [00:43<00:00, 19.32it/s, accuracy=1, cost=0.0234]    
train minibatch loop:   0%|          | 1/3337 [00:00<06:24,  8.67it/s, accuracy=0.945, cost=0.0468]

time taken: 446.1067855358124
epoch: 0, training loss: 0.047054, training acc: 0.936334, valid loss: 0.044065, valid acc: 0.943738



train minibatch loop: 100%|██████████| 3337/3337 [06:40<00:00,  8.67it/s, accuracy=1, cost=0.0305]    
test minibatch loop: 100%|██████████| 835/835 [00:42<00:00, 19.71it/s, accuracy=1, cost=0.0329]    
train minibatch loop:   0%|          | 1/3337 [00:00<06:31,  8.51it/s, accuracy=0.961, cost=0.0434]

time taken: 442.806120634079
epoch: 1, training loss: 0.043424, training acc: 0.943691, valid loss: 0.043407, valid acc: 0.943963



train minibatch loop: 100%|██████████| 3337/3337 [06:41<00:00,  8.32it/s, accuracy=1, cost=0.0299]    
test minibatch loop: 100%|██████████| 835/835 [00:42<00:00, 19.59it/s, accuracy=1, cost=0.0296]    
train minibatch loop:   0%|          | 1/3337 [00:00<06:22,  8.72it/s, accuracy=0.93, cost=0.0451]

time taken: 443.7129006385803
epoch: 2, training loss: 0.042537, training acc: 0.945597, valid loss: 0.042411, valid acc: 0.946839



train minibatch loop: 100%|██████████| 3337/3337 [06:36<00:00,  8.84it/s, accuracy=1, cost=0.0261]    
test minibatch loop: 100%|██████████| 835/835 [00:42<00:00, 19.37it/s, accuracy=1, cost=0.0269]    
train minibatch loop:   0%|          | 1/3337 [00:00<06:26,  8.63it/s, accuracy=0.953, cost=0.0426]

time taken: 438.77990889549255
epoch: 3, training loss: 0.041973, training acc: 0.946717, valid loss: 0.041931, valid acc: 0.947616



train minibatch loop: 100%|██████████| 3337/3337 [06:36<00:00,  8.33it/s, accuracy=1, cost=0.0281]    
test minibatch loop: 100%|██████████| 835/835 [00:42<00:00, 19.76it/s, accuracy=1, cost=0.0286]    

time taken: 438.77926087379456
epoch: 4, training loss: 0.041583, training acc: 0.947766, valid loss: 0.041881, valid acc: 0.948243






In [39]:
# Encode one pair and fetch the predicted label together with a similarity
# score (1 - distance).
left = str_idx(['adriana evans'], dictionary, maxlen)
right = str_idx(['adriana'], dictionary, maxlen)
pair_feed = {model.X_left: left, model.X_right: right}
sess.run([model.temp_sim, 1 - model.distance], feed_dict=pair_feed)

[array([0.], dtype=float32), array([0.31725764], dtype=float32)]

In [41]:
# Encode one pair and fetch the predicted label together with a similarity
# score (1 - distance).
left = str_idx(['husein zolkepli'], dictionary, maxlen)
right = str_idx(['zolkepli'], dictionary, maxlen)
pair_feed = {model.X_left: left, model.X_right: right}
sess.run([model.temp_sim, 1 - model.distance], feed_dict=pair_feed)

[array([1.], dtype=float32), array([0.631173], dtype=float32)]