In [1]:
import re

from collections import Counter
import tensorflow as tf
import numpy as np
import pandas as pd
import gensim
from scipy.spatial.distance import cosine

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [2]:
train_df = pd.read_csv('data/dev_train.csv')

In [3]:
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [13]:
train_df = train_df[train_df['question1'].notnull()]
train_df = train_df[train_df['question2'].notnull()]

question1_sentences = []
question2_sentences = []
labels = []
for index, row in train_df.iterrows():
    question1_sentences.append(text_to_wordlist(row['question1']))
    question2_sentences.append(text_to_wordlist(row['question2']))
    labels.append(int(row['is_duplicate']))

assert (len(question1_sentences) == len(question2_sentences)), "Num of q1 and q2 are not equal"

In [14]:
wordsList = []

for row in question1_sentences:
    wordsList.extend(row.split())
    
for row in question2_sentences:
    wordsList.extend(row.split())
        
counts = Counter(wordsList)

vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

In [15]:
len(vocab)

15032

In [16]:
question1_int = []
question2_int = []
labels_int = []

for index, row in enumerate(question1_sentences):
    question1 = []
    question2 = []
    
    question1_words = question1_sentences[index].split()
    question2_words = question2_sentences[index].split()
    
    if (len(question1_words) > 3) and (len(question2_words) > 3):
        labels_int.append(labels[index])
        for word in question1_words:
            question1.append(vocab_to_int.get(word))

        question1_int.append(question1)

        for word in question2_words:
            question2.append(vocab_to_int.get(word))

        question2_int.append(question2)
        
assert (len(question1_int) == len(question2_int)), "Num of q1 and q2 ints are not equal"

In [17]:
seq_len = 200

In [18]:
question1_features = np.zeros((len(question1_int), seq_len), dtype=int)
for i, row in enumerate(question1_int):
    if len(row) == 0:
        print('something is wrong here - {}', i)
        
    question1_features[i, -len(row):] = np.array(row)[:seq_len]
    
question2_features = np.zeros((len(question2_int), seq_len), dtype=int)
for i, row in enumerate(question2_int):
    question2_features[i, -len(row):] = np.array(row)[:seq_len]
    
assert (len(question1_features) == len(question2_features)), "Num of q1 and q2 features are not equal"
assert (len(question1_features) == len(labels_int)), "Num of questions and labels are not equal"

In [19]:
word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

In [20]:
embedding_matrix = np.zeros((len(vocab_to_int)+1, 300), dtype=float)

for word, i in vocab_to_int.items():
    if word in word_embedding_model.vocab:
        embedding_matrix[i] = word_embedding_model.word_vec(word)

In [21]:
from sklearn.model_selection import StratifiedShuffleSplit

ss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)

labels = np.asarray(labels_int)
train_idx, val_idx = next(ss.split(question1_features, labels))

half_val_len = int(len(val_idx)/2)
val_idx, test_idx = val_idx[:half_val_len], val_idx[half_val_len:]

question1_train, question2_train, label_train = question1_features[train_idx], question2_features[train_idx], labels[train_idx]
question1_val, question2_val, label_val = question1_features[val_idx], question2_features[val_idx], labels[val_idx]
question1_test, question2_test, label_test = question1_features[test_idx], question2_features[test_idx], labels[test_idx]

In [31]:
len(label_train[label_train==1])

2945

In [166]:
lstm_size = 64
lstm_layers = 1
batch_size = 128
learning_rate = 0.001

In [167]:
print(embedding_matrix.shape)

(15033, 300)


In [182]:
graph = tf.Graph()

with graph.as_default():
    embedding_placeholder = tf.placeholder(tf.float32, shape=embedding_matrix.shape, name='embedding_placeholder')
    with tf.variable_scope('siamese_network') as scope:
        labels = tf.placeholder(tf.int32, [batch_size, None], name='labels')
        keep_prob = tf.placeholder(tf.float32, name='question1_keep_prob')
        
        with tf.name_scope('question1') as question1_scope:
            question1_inputs = tf.placeholder(tf.int32, [batch_size, seq_len], name='question1_inputs')
            
            question1_embedding = tf.get_variable(name='embedding', initializer=embedding_placeholder, trainable=False)
            question1_embed = tf.nn.embedding_lookup(question1_embedding, question1_inputs)

            question1_lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
            question1_drop = tf.contrib.rnn.DropoutWrapper(question1_lstm, output_keep_prob=keep_prob)
            question1_multi_lstm = tf.contrib.rnn.MultiRNNCell([question1_drop] * lstm_layers)

            q1_initial_state = question1_multi_lstm.zero_state(batch_size, tf.float32)

            question1_outputs, question1_final_state = tf.nn.dynamic_rnn(question1_multi_lstm, question1_embed, initial_state=q1_initial_state)
            question1_predictions = tf.contrib.layers.fully_connected(question1_outputs[:, -1], 1, activation_fn=tf.sigmoid)

        scope.reuse_variables()
        
        with tf.name_scope('question2') as question2_scope:
            question2_inputs = tf.placeholder(tf.int32, [batch_size, seq_len], name='question2_inputs')

            question2_embedding = question1_embedding
            question2_embed = tf.nn.embedding_lookup(question2_embedding, question2_inputs)

            question2_lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
            question2_drop = tf.contrib.rnn.DropoutWrapper(question2_lstm, output_keep_prob=keep_prob)
            question2_multi_lstm = tf.contrib.rnn.MultiRNNCell([question2_drop] * lstm_layers)

            q2_initial_state = question2_multi_lstm.zero_state(batch_size, tf.float32)
            
            question2_outputs, question2_final_state = tf.nn.dynamic_rnn(question2_multi_lstm, question2_embed, initial_state=q2_initial_state)
            question2_predictions = tf.contrib.layers.fully_connected(question2_outputs[:, -1], 1, activation_fn=tf.sigmoid)


In [183]:
with graph.as_default():
    diff = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(question1_outputs[:, -1, :], question2_outputs[:, -1, :])), reduction_indices=1))

    margin = tf.constant(1.) 
    labels = tf.to_float(labels)
    match_loss = tf.expand_dims(tf.square(diff, 'match_term'), 0)
    mismatch_loss = tf.expand_dims(tf.maximum(0., tf.subtract(margin, tf.square(diff)), 'mismatch_term'), 0)

    loss = tf.add(tf.matmul(labels, match_loss), tf.matmul((1 - labels), mismatch_loss), 'loss_add')
    distance = tf.reduce_mean(loss)

    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(distance)

In [184]:
def get_batches(x1_input, x2_input, y, batch_size=100):
    n_batches = len(x1_input)//batch_size
    x1, x2, y = x1_input[:n_batches*batch_size], x2_input[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x1), batch_size):
        yield x1[ii:ii+batch_size], x2[ii:ii+batch_size], y[ii:ii+batch_size]

In [185]:
def get_test_batches(x1_input, x2_input, test_ids, batch_size=100):
    n_batches = len(x1_input)//batch_size
    x1, x2= x1_input[:n_batches*batch_size], x2_input[:n_batches*batch_size]
    for ii in range(0, len(x1), batch_size):
        yield x1[ii:ii+batch_size], x2[ii:ii+batch_size], test_ids[ii:ii+batch_size]

**Validation Accuracy**

In [186]:
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(distance), tf.float32), labels)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [187]:
epochs = 10

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer(), feed_dict={embedding_placeholder: embedding_matrix})
    
    iteration = 1
    for e in range(epochs):
        # q1_int_state = sess.run(q1_initial_state)
        # q2_int_state = sess.run(q2_initial_state)
        summary_writer = tf.summary.FileWriter('/Users/mithun/projects/kaggle/quora_question_pairs/logs', sess.graph)
        summary_writer.add_graph(sess.graph)
        
        for ii, (x1, x2, y) in enumerate(get_batches(question1_train, question2_train, label_train, batch_size), 1):
            feed = {question1_inputs: x1,
                    question2_inputs: x2,
                    labels: y[:, None],
                    keep_prob: 0.9,
                    # q1_initial_state: q1_int_state,
                    # q2_initial_state: q2_int_state
                   }
            loss1, q2_out, q1_int_state, q2_int_state = sess.run([distance, question2_outputs, question1_final_state, question2_final_state], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss1))

            if iteration%50==0:
                val_acc = []
                # q1_val_int_state = sess.run(q1_initial_state)
                # q2_val_int_state = sess.run(q2_initial_state)
                for x1, x2, y in get_batches(question1_val, question2_val, label_val, batch_size):
                    feed = {question1_inputs: x1,
                            question2_inputs: x2,
                            labels: y[:, None],
                            keep_prob: 1,
                            # q1_initial_state: q1_val_int_state,
                            # q2_initial_state: q2_val_int_state
                           }
                    batch_acc, q1_val_int_state, q2_val_int_state = sess.run([accuracy, question1_final_state, question2_final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
            
    saver.save(sess, "checkpoints/quora_pairs.ckpt")

Epoch: 0/10 Iteration: 5 Train loss: 0.581
Epoch: 0/10 Iteration: 10 Train loss: 0.535
Epoch: 0/10 Iteration: 15 Train loss: 0.600
Epoch: 0/10 Iteration: 20 Train loss: 0.534
Epoch: 0/10 Iteration: 25 Train loss: 0.586
Epoch: 0/10 Iteration: 30 Train loss: 0.557
Epoch: 0/10 Iteration: 35 Train loss: 0.559
Epoch: 0/10 Iteration: 40 Train loss: 0.576
Epoch: 0/10 Iteration: 45 Train loss: 0.580
Epoch: 0/10 Iteration: 50 Train loss: 0.563
Val acc: 0.398
Epoch: 0/10 Iteration: 55 Train loss: 0.566
Epoch: 0/10 Iteration: 60 Train loss: 0.550
Epoch: 1/10 Iteration: 65 Train loss: 0.558
Epoch: 1/10 Iteration: 70 Train loss: 0.591
Epoch: 1/10 Iteration: 75 Train loss: 0.567
Epoch: 1/10 Iteration: 80 Train loss: 0.568
Epoch: 1/10 Iteration: 85 Train loss: 0.573
Epoch: 1/10 Iteration: 90 Train loss: 0.534
Epoch: 1/10 Iteration: 95 Train loss: 0.539
Epoch: 1/10 Iteration: 100 Train loss: 0.611
Val acc: 0.398
Epoch: 1/10 Iteration: 105 Train loss: 0.578
Epoch: 1/10 Iteration: 110 Train loss: 0.558


KeyboardInterrupt: 

**Testing**

In [168]:
test_df = pd.read_csv('data/dev_test.csv')

In [169]:
test_df = test_df[test_df['question1'].notnull()]
test_df = test_df[test_df['question2'].notnull()]

test_question1_int = []
test_question2_int = []
test_ids = []

for index, row in test_df.iterrows():
    test_question1_words = text_to_wordlist(row['question1'])
    test_question2_words = text_to_wordlist(row['question2'])
    question1 = []
    question2 = []
    
    if (len(test_question1_words) > 3) and (len(test_question2_words) > 3):
        for word in test_question1_words:
            question1.append(vocab_to_int.get(word, 0))

        test_question1_int.append(question1)

        for word in test_question2_words:
            question2.append(vocab_to_int.get(word, 0))

        test_question2_int.append(question2)
        test_ids.append(row['test_id'])

In [170]:
test_question1_features = np.zeros((len(test_question1_int), seq_len), dtype=int)
for i, row in enumerate(test_question1_int):
    test_question1_features[i, -len(row):] = np.array(row)[:seq_len]
    
test_question2_features = np.zeros((len(test_question2_int), seq_len), dtype=int)
for i, row in enumerate(test_question2_int):
    test_question2_features[i, -len(row):] = np.array(row)[:seq_len]
    
assert (len(test_question1_features) == len(test_question2_features)), "Num of q1 and q2 features are not equal"

In [None]:
test_acc = []

with graph.as_default():
    saver = tf.train.Saver()
    
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer(), feed_dict={embedding_placeholder: embedding_matrix})
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    
    #test_state = sess.run(initial_state)
    
    for ii, (x1, x2, batch_test_ids) in enumerate(get_test_batches(test_question1_features, test_question2_features, test_ids, batch_size), 1):
        
        

In [195]:
test_acc = []

with graph.as_default():
    saver = tf.train.Saver()
    
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer(), feed_dict={embedding_placeholder: embedding_matrix})
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    
    #test_state = sess.run(initial_state)
    q1_test_int_state = sess.run(q1_initial_state)
    q2_test_int_state = sess.run(q2_initial_state)
    
    for ii, (x1, x2, batch_test_ids) in enumerate(get_test_batches(test_question1_features, test_question2_features, test_ids, batch_size), 1):
        feed = {question1_inputs: x1,
                question2_inputs: x2,
                keep_prob: 1,
                q1_initial_state: q1_test_int_state,
                q2_initial_state: q2_test_int_state
               }
        test_state, distance1, q1_test_int_state, q1_test_int_state = sess.run([question1_final_state, distance, question1_final_state, question2_final_state], feed_dict=feed)
        
        prediction = (distance1 < 0.5).astype(int)
        submission = pd.DataFrame({'test_id':batch_test_ids, 'is_duplicate':prediction.ravel(), 'q1_pred': q1_pred.ravel(), 'q2_pred': q2_pred.ravel()}, columns=["test_id", "is_duplicate", "q1_pred", "q2_pred"])
        submission.to_csv('submission.csv', mode='a', header=False, index=False)

(LSTMStateTuple(c=array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32), h=array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)),)
(LSTMStateTuple(c=array([[  9.17194858e-02,  -8.64592791e-02,  -1.75758421e-01, ...,
         -1.43904015e-02,   2.49351472e-01,   2.17731223e-02],
       [ -1.15225222e-02,  -5.31134345e-02,  -1.23475045e-01, ...,
          2.34776810e-02,   1.26286015e-01,   4.99736518e-02],
       [  4.62286063e-02,   2.26912852e-02,  -5.57401776e-02, ...,
         -1.03118479e-01,   1.60762385e-01,  

In [126]:
a = np.arange(60).reshape(3, 5, 4)

In [127]:
print(a)

[[[ 0  1  2  3]
  [ 4  5  6  7]
  [ 8  9 10 11]
  [12 13 14 15]
  [16 17 18 19]]

 [[20 21 22 23]
  [24 25 26 27]
  [28 29 30 31]
  [32 33 34 35]
  [36 37 38 39]]

 [[40 41 42 43]
  [44 45 46 47]
  [48 49 50 51]
  [52 53 54 55]
  [56 57 58 59]]]


In [129]:
a[:, -1, :]

array([[16, 17, 18, 19],
       [36, 37, 38, 39],
       [56, 57, 58, 59]])